In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import csv
import os
import pandas as pd

In [4]:
# Paths to folders containing English and Italian text files
tsvfile = '/content/drive/MyDrive/Tico19_NMT_LLM/TICO19_En-It.tsv'
test_set_en = '/content/drive/MyDrive/Tico19_NMT_LLM/Sets/test_en.txt'
test_set_it= '/content/drive/MyDrive/Tico19_NMT_LLM/Sets/test_it.txt'
tuning = '/content/drive/MyDrive/Tico19_NMT_LLM/Sets/tuning.txt'
training_en = '/content/drive/MyDrive/Tico19_NMT_LLM/Sets/training_en.txt'
training_it ='/content/drive/MyDrive/Tico19_NMT_LLM/Sets/training_it.txt'
sets_loc = '/content/drive/MyDrive/Tico19_NMT_LLM/Sets'

In [5]:
import csv

def process_tsv(input_tsv, output_dir):
    # Initialize lists to store rows
    every_third = []
    every_fourth = []
    leftovers = []

    # Read the input TSV file
    with open(input_tsv, 'r', encoding='utf-8') as infile:
        reader = csv.DictReader(infile, delimiter='\t')
        rows = list(reader)

    # Extract every third sentence pair (up to 100)
    for i in range(0, len(rows), 3):
        if len(every_third) < 100:
            every_third.append(rows[i])
        else:
            leftovers.append(rows[i])

    # Extract every fourth sentence from the remaining rows (up to 50)
    remaining_rows = [row for row in rows if row not in every_third]
    for i in range(0, len(remaining_rows), 4):
        if len(every_fourth) < 50:
            every_fourth.append(remaining_rows[i])
        else:
            leftovers.append(remaining_rows[i])

    # Extract all leftovers that were not touched
    untouched_rows = [row for row in remaining_rows if row not in every_fourth]

    # Write the first 100 sentence pairs to text files
    with open(test_set_en, 'w', encoding='utf-8') as src_100, open(test_set_it, 'w', encoding='utf-8') as tgt_100:
        for row in every_third:
            src_100.write(row['sourceString'] + '\n')
            tgt_100.write(row['targetString'] + '\n')

    # Write the 50 additional sentences to a text file
    with open(tuning, 'w', encoding='utf-8') as src_50:
        for row in every_fourth:
            src_50.write(row['sourceString'] + '\n')

    # Write leftover sentences to text files
    with open(training_en, 'w', encoding='utf-8') as src_leftover, open(training_it, 'w', encoding='utf-8') as tgt_leftover:
        for row in untouched_rows:
            src_leftover.write(row['sourceString'] + '\n')
            tgt_leftover.write(row['targetString'] + '\n')

# Example usage:
process_tsv(tsvfile, sets_loc)

In [6]:
def check_shared_sentences(file1, file2, file3=None):
    # Read sentences from the first file
    with open(file1, 'r', encoding='utf-8') as f1:
        sentences_file1 = set(f1.readlines())

    # Read sentences from the second file
    with open(file2, 'r', encoding='utf-8') as f2:
        sentences_file2 = set(f2.readlines())

    # Read sentences from the third file, if provided
    if file3:
        with open(file3, 'r', encoding='utf-8') as f3:
            sentences_file3 = set(f3.readlines())
    else:
        sentences_file3 = set()

    # Find shared sentences between the files
    shared_file1_file2 = sentences_file1.intersection(sentences_file2)
    shared_file1_file3 = sentences_file1.intersection(sentences_file3)
    shared_file2_file3 = sentences_file2.intersection(sentences_file3)

    # Display shared items, if any
    if shared_file1_file2:
        print("Shared sentences between file 1 and file 2:")
        for sentence in sorted(shared_file1_file2):
            print(sentence.strip())

    if shared_file1_file3:
        print("\nShared sentences between file 1 and file 3:")
        for sentence in sorted(shared_file1_file3):
            print(sentence.strip())

    if shared_file2_file3:
        print("\nShared sentences between file 2 and file 3:")
        for sentence in sorted(shared_file2_file3):
            print(sentence.strip())

    if not (shared_file1_file2 or shared_file1_file3 or shared_file2_file3):
        print("No shared sentences found.")

# Example usage:
# Files in English
file1 = test_set_en
file2 = tuning
file3 = training_en


check_shared_sentences(file1, file2, file3)


No shared sentences found.


In [7]:
def verify_sentence_coverage(tsvfile, test_set_en, tuning, training_en):
    # Read all sentences from the CSV's sourceString column
    with open(tsvfile, 'r', encoding='utf-8') as infile:
        reader = csv.DictReader(infile, delimiter=',')
        source_sentences = set(row['sourceString'] for row in reader)

    # Read all sentences from the three txt files
    def read_sentences(file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            return set(f.readlines())

    top_100_sentences = read_sentences(test_set_en)
    one_in_four_sentences = read_sentences(tuning)
    leftover_sentences = read_sentences(training_en)

    # Combine sentences from all three files
    combined_sentences = top_100_sentences.union(one_in_four_sentences).union(leftover_sentences)

    # Strip whitespace for consistent matching
    combined_sentences = {sentence.strip() for sentence in combined_sentences}
    source_sentences = {sentence.strip() for sentence in source_sentences}

    # Find missing sentences
    missing_sentences = source_sentences - combined_sentences

    # Output results
    if missing_sentences:
        print("The following sentences from sourceString are missing:")
        for sentence in sorted(missing_sentences):
            print(sentence)
    else:
        print("All sentences from sourceString are covered in the text files.")

verify_sentence_coverage(tsvfile, test_set_en, tuning, training_en)


KeyError: 'sourceString'

In [13]:
def check_sentence_length(file_paths, word_limit=80):
    """
    Checks if there are sentences longer than a specified word limit in the given files.

    Args:
        file_paths (list): List of file paths to check.
        word_limit (int): Maximum allowed word count for a sentence. Default is 80.

    Returns:
        dict: A dictionary with file paths as keys and lists of long sentences as values.
    """
    long_sentences = {}

    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8') as file:
            sentences = file.readlines()
            # Find sentences exceeding the word limit
            long_sentences[file_path] = [sentence.strip() for sentence in sentences if len(sentence.split()) > word_limit]

    return long_sentences


# Example usage:
file_paths = [test_set_en, test_set_it, tuning, training_en, training_it]

long_sentences = check_sentence_length(file_paths)

# Print results
for file, sentences in long_sentences.items():
    if sentences:
        print(f"Sentences longer than {60} words found in {file}:")
        for sentence in sentences:
            print(f"- {sentence}")
    else:
        print(f"No sentences longer than {60} words found in {file}.")


No sentences longer than 60 words found in /content/drive/MyDrive/Tico19_NMT_LLM/Sets/test_en.txt.
No sentences longer than 60 words found in /content/drive/MyDrive/Tico19_NMT_LLM/Sets/test_it.txt.
No sentences longer than 60 words found in /content/drive/MyDrive/Tico19_NMT_LLM/Sets/tuning.txt.
No sentences longer than 60 words found in /content/drive/MyDrive/Tico19_NMT_LLM/Sets/training_en.txt.
No sentences longer than 60 words found in /content/drive/MyDrive/Tico19_NMT_LLM/Sets/training_it.txt.
