# Prepare Test set to be benchmarked across different models 

In [46]:
!pip install deepl

Collecting deepl
  Downloading deepl-1.17.0-py3-none-any.whl.metadata (26 kB)
Downloading deepl-1.17.0-py3-none-any.whl (35 kB)
Installing collected packages: deepl
Successfully installed deepl-1.17.0


## Preprocess ASPEC to same format as other benchmarks 

In [21]:
import os

def split_aspec_data(file_path):
    # Get the directory of the input file
    input_dir = os.path.dirname(file_path)

    # Get the base name of the input file (without extension)
    base_name = os.path.splitext(os.path.basename(file_path))[0]

    # Create the output file paths in the same directory as the input file
    engl_output_path = os.path.join(input_dir, f"{base_name}.raw.en")
    japn_output_path = os.path.join(input_dir, f"{base_name}.raw.ja")

    # Open the input file for reading
    with open(file_path, 'r', encoding='utf-8') as file:
        # Open the output files for writing
        with open(engl_output_path, 'w', encoding='utf-8') as engl_file, open(japn_output_path, 'w', encoding='utf-8') as japn_file:
            # Iterate over each line in the input file
            for line in file:
                # Split the line into sections using the delimiter " ||| "
                sections = line.strip().split(' ||| ')

                # Extract the Japanese and English sections (last two sections)
                japanese_text = sections[-2]
                english_text = sections[-1]

                # Write the Japanese text to the Japanese output file
                japn_file.write(japanese_text + '\n')

                # Write the English text to the English output file
                engl_file.write(english_text + '\n')

    print(f"Data split completed. Output files: {engl_output_path} and {japn_output_path}")

In [23]:
split_aspec_data('./datasets/private/ASPEC/ASPEC-JE/devtest/devtest.txt')
split_aspec_data('./datasets/private/ASPEC/ASPEC-JE/test/test.txt')

Data split completed. Output files: ./datasets/private/ASPEC/ASPEC-JE/devtest/devtest.raw.en and ./datasets/private/ASPEC/ASPEC-JE/devtest/devtest.raw.ja
Data split completed. Output files: ./datasets/private/ASPEC/ASPEC-JE/test/test.raw.en and ./datasets/private/ASPEC/ASPEC-JE/test/test.raw.ja


In [None]:
import os
import subprocess

def tokenize_file(file_path):
    # Get the directory of the input file
    input_dir = os.path.dirname(file_path)

    # Get the base name and extension of the input file
    base_name, ext = os.path.splitext(os.path.basename(file_path))

    # Create the output file path in the same directory as the input file
    output_path = os.path.join(input_dir, f"{base_name}.tok{ext}")

    # Run KyTea on the input file and generate the tokenized output
    subprocess.run(['kytea', '-notags', '-wsconst', 'D', '&lt', '-out', 'tok', '<', file_path, '>', output_path], check=True)

    print(f"Tokenization completed. Output file: {output_path}")

In [42]:
# NOTE this function was really slow --> seconds become minutes, just did it in terminal instead kept it fast 
#tokenize_file('./datasets/private/ASPEC/ASPEC-JE/devtest/devtest.raw.en')
#tokenize_file('./datasets/private/ASPEC/ASPEC-JE/devtest/devtest.raw.ja')

## Create massive dataframe with testing data 

In [12]:
import random
import pandas as pd

def create_parallel_dataframe(engl_path, japn_path, existing_df=None):
    # Read the English and Japanese files
    with open(engl_path, 'r', encoding='utf-8') as engl_file, open(japn_path, 'r', encoding='utf-8') as japn_file:
        engl_lines = engl_file.readlines()
        japn_lines = japn_file.readlines()

    # Get the total number of lines in each file
    total_lines = min(len(engl_lines), len(japn_lines))

    # Check if there are at least 300 lines in both files
    if total_lines < 300:
        raise ValueError("Files must contain at least 300 lines.")

    # Generate a random starting index for the interval
    start_index = random.randint(0, total_lines - 300)

    # Extract the random interval of 300 lines from each file
    engl_subset = engl_lines[start_index:start_index + 300]
    japn_subset = japn_lines[start_index:start_index + 300]

    # Create a new DataFrame with the extracted lines
    new_data = {'English': engl_subset, 'Japanese': japn_subset}
    new_df = pd.DataFrame(new_data)

    # If an existing DataFrame is provided, concatenate the new data with it
    if existing_df is not None:
        df = pd.concat([existing_df, new_df], ignore_index=True)
    else:
        df = new_df

    return df, start_index

In [45]:
kftt_out, kftt_start_index = create_parallel_dataframe('./datasets/public/kftt-data-1.0/data/tok/kyoto-test.en', './datasets/public/kftt-data-1.0/data/tok/kyoto-test.ja') 
kftt_and_phemt, phemt_start_index =  create_parallel_dataframe('./datasets/public/pheMT_final/tok.en', './datasets/public/pheMT_final/tok.ja', kftt_out)
kftt_phemt_aspec, aspec_start_index = create_parallel_dataframe('./datasets/private/ASPEC/ASPEC-JE/devtest/devtest.tok.en', './datasets/private/ASPEC/ASPEC-JE/devtest/devtest.tok.ja', kftt_and_phemt)
print(kftt_phemt_aspec, kftt_start_index, phemt_start_index, aspec_start_index)

                                               English  \
0                Volume 3 consists of 102 chapters .\n   
1                 Volume 4 consists of 32 chapters .\n   
2                    Volume 5 might have been lost .\n   
3                             It takes Ruisan form .\n   
4    There exists only one manuscript that has been...   
..                                                 ...   
895  Capacity   as   human   and  abundant   experi...   
896  In  the  future ,   conferences  are  improved...   
897  Serveillance   of   11   cases  receiving   vi...   
898    Most   of  nursing   persons   are  spouses .\n   
899  No   bedsore   was  observed  in  the   patien...   

                                              Japanese  
0                                        3 巻 102 話 。\n  
1                                         4 話 32 話 。\n  
2                                           5 巻 紛失 ?\n  
3                                      類纂 形態 を と る 。\n  
4          加賀 国 、 

# Do some checks on max characters so i dont go over free api limits and go bankrupt 

In [75]:
def count_total_characters(df):
    # Count the total number of characters in the Japanese column
    total_japanese_chars = df['Japanese'].str.len().sum()

    # Count the total number of characters in the English column
    total_english_chars = df['English'].str.len().sum()

    # Create a dictionary with the counts
    char_counts = {
        "japanese_characters": total_japanese_chars,
        "english_characters": total_english_chars
    }
    print(char_counts) 
    return char_counts

In [77]:
tot_chars = count_total_characters(kftt_phemt_aspec.copy())
total = tot_chars['japanese_characters'] + tot_chars['english_characters']
print(f"Total characters going to API calls = {total}")
assert total < 500000

{'japanese_characters': 59923, 'english_characters': 130927}
Total characters going to API calls = 190850


# API tests 

## deepL

In [87]:
import deepl

def translate_and_save(df, api_key, output_file):
    try:
        # Initialize the DeepL translator
        translator = deepl.Translator(api_key)

        # Open the output file in write mode
        with open(output_file, 'w', encoding='utf-8') as file:
            # Iterate over each row in the DataFrame
            for _, row in df.iterrows():
                japanese_text = row['Japanese']

                # Translate the Japanese text to English
                result = translator.translate_text(japanese_text, target_lang="EN-US")
                translated_text = result.text

                # Write the translated text to the output file
                file.write(translated_text + '\n')

        print(f"Translation completed. Translations saved to: {output_file}")

    except deepl.exceptions.AuthorizationException:
        print("Invalid API key. Please check your API key.")

    except deepl.exceptions.QuotaExceededException:
        print("Quota exceeded. Please check your DeepL usage limits.")

    except deepl.exceptions.DeepLException as e:
        print(f"An error occurred during translation: {str(e)}")

    except IOError:
        print(f"Error writing to file: {output_file}")

In [81]:
def get_api_key(file_path):
    try:
        with open(file_path, 'r') as file:
            api_key = file.readline().strip()
            return api_key
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None
    except IOError:
        print(f"Error reading file: {file_path}")
        return None

In [99]:
def save_column(df, column, output_file):
    try:
        # Open the output file in write mode
        with open(output_file, 'w', encoding='utf-8') as file:
            # Iterate over each row in the DataFrame
            for _, row in df.iterrows():
                japanese_text = row[column]

                # Write the Japanese text to the output file
                file.write(japanese_text + '\n')

        print(f"{column} saved to: {output_file}")

    except IOError:
        print(f"Error writing to file: {output_file}")

In [100]:
# NOTE: i ran this, its commented out so i dont rerun and go over api limit 
#api_key = get_api_key('./datasets/private/apikey.txt')
#save_column(kftt_phemt_aspec, 'Japanese', 'model_outputs/test/jp_to_en/in.txt')
save_column(kftt_phemt_aspec, 'English', 'model_outputs/test/jp_to_en/out.txt')
#translated_english = translate_and_save(kftt_phemt_aspec, api_key, 'model_outputs/test/jp_to_en/deepL/out.txt')
# print("Original Japanese:", kftt_phemt_aspec['Japanese'].iloc[0])
# print("Translated English:", translated_english)

English saved to: model_outputs/test/jp_to_en/out.txt
