# Prepare Test set to be benchmarked across different models 

In [46]:
!pip install deepl

Collecting deepl
  Downloading deepl-1.17.0-py3-none-any.whl.metadata (26 kB)
Downloading deepl-1.17.0-py3-none-any.whl (35 kB)
Installing collected packages: deepl
Successfully installed deepl-1.17.0


## Preprocess ASPEC to same format as other benchmarks 

In [21]:
import os

def split_aspec_data(file_path):
    # Get the directory of the input file
    input_dir = os.path.dirname(file_path)

    # Get the base name of the input file (without extension)
    base_name = os.path.splitext(os.path.basename(file_path))[0]

    # Create the output file paths in the same directory as the input file
    engl_output_path = os.path.join(input_dir, f"{base_name}.raw.en")
    japn_output_path = os.path.join(input_dir, f"{base_name}.raw.ja")

    # Open the input file for reading
    with open(file_path, 'r', encoding='utf-8') as file:
        # Open the output files for writing
        with open(engl_output_path, 'w', encoding='utf-8') as engl_file, open(japn_output_path, 'w', encoding='utf-8') as japn_file:
            # Iterate over each line in the input file
            for line in file:
                # Split the line into sections using the delimiter " ||| "
                sections = line.strip().split(' ||| ')

                # Extract the Japanese and English sections (last two sections)
                japanese_text = sections[-2]
                english_text = sections[-1]

                # Write the Japanese text to the Japanese output file
                japn_file.write(japanese_text + '\n')

                # Write the English text to the English output file
                engl_file.write(english_text + '\n')

    print(f"Data split completed. Output files: {engl_output_path} and {japn_output_path}")

In [23]:
split_aspec_data('./datasets/private/ASPEC/ASPEC-JE/devtest/devtest.txt')
split_aspec_data('./datasets/private/ASPEC/ASPEC-JE/test/test.txt')

Data split completed. Output files: ./datasets/private/ASPEC/ASPEC-JE/devtest/devtest.raw.en and ./datasets/private/ASPEC/ASPEC-JE/devtest/devtest.raw.ja
Data split completed. Output files: ./datasets/private/ASPEC/ASPEC-JE/test/test.raw.en and ./datasets/private/ASPEC/ASPEC-JE/test/test.raw.ja


In [None]:
import os
import subprocess

def tokenize_file(file_path):
    # Get the directory of the input file
    input_dir = os.path.dirname(file_path)

    # Get the base name and extension of the input file
    base_name, ext = os.path.splitext(os.path.basename(file_path))

    # Create the output file path in the same directory as the input file
    output_path = os.path.join(input_dir, f"{base_name}.tok{ext}")

    # Run KyTea on the input file and generate the tokenized output
    subprocess.run(['kytea', '-notags', '-wsconst', 'D', '&lt', '-out', 'tok', '<', file_path, '>', output_path], check=True)

    print(f"Tokenization completed. Output file: {output_path}")

In [42]:
# NOTE this function was really slow --> seconds become minutes, just did it in terminal instead kept it fast 
#tokenize_file('./datasets/private/ASPEC/ASPEC-JE/devtest/devtest.raw.en')
#tokenize_file('./datasets/private/ASPEC/ASPEC-JE/devtest/devtest.raw.ja')

## Create massive dataframe with testing data 

In [9]:
import random
import pandas as pd
random.seed(0) 

def create_parallel_dataframe(engl_path, japn_path, existing_df=None):
    # Read the English and Japanese files
    with open(engl_path, 'r', encoding='utf-8') as engl_file, open(japn_path, 'r', encoding='utf-8') as japn_file:
        engl_lines = engl_file.readlines()
        japn_lines = japn_file.readlines()

    # Get the total number of lines in each file
    total_lines = min(len(engl_lines), len(japn_lines))

    # Check if there are at least 300 lines in both files
    if total_lines < 300:
        raise ValueError("Files must contain at least 300 lines.")

    # Generate a random starting index for the interval
    start_index = random.randint(0, total_lines - 300)

    # Extract the random interval of 300 lines from each file
    engl_subset = engl_lines[start_index:start_index + 300]
    japn_subset = japn_lines[start_index:start_index + 300]

    # Create a new DataFrame with the extracted lines
    new_data = {'English': engl_subset, 'Japanese': japn_subset}
    new_df = pd.DataFrame(new_data)

    # If an existing DataFrame is provided, concatenate the new data with it
    if existing_df is not None:
        df = pd.concat([existing_df, new_df], ignore_index=True)
    else:
        df = new_df

    return df, start_index

In [14]:
kftt_out, kftt_start_index = create_parallel_dataframe('./datasets/public/kftt-data-1.0/data/tok/kyoto-test.en', './datasets/public/kftt-data-1.0/data/tok/kyoto-test.ja') 
kftt_and_phemt, phemt_start_index =  create_parallel_dataframe('./datasets/public/pheMT_final/tok.en', './datasets/public/pheMT_final/tok.ja', kftt_out)
kftt_phemt_aspec, aspec_start_index = create_parallel_dataframe('./datasets/private/ASPEC/ASPEC-JE/devtest/devtest.tok.en', './datasets/private/ASPEC/ASPEC-JE/devtest/devtest.tok.ja', kftt_and_phemt)
print(kftt_phemt_aspec, kftt_start_index, phemt_start_index, aspec_start_index)

                                               English  \
0    Jakugen developed a school later , so there we...   
1      Subsequently , Shukai restored Ohara Shomyo .\n   
2    Tanchi established a stream based on a new for...   
3    Since then , it became the center of Tendai Sh...   
4    The Yuzunembutsu-shu sect , Jodo-shu sect and ...   
..                                                 ...   
895  Lignin   cresol   extracted   by   acetone   i...   
896  A  fiber  mold   u sing   recycled   paper   i...   
897  As  a   result ,  a  composite  with   high   ...   
898  It   can   be separated into  compound  compos...   
899  This  material   reports  the   result   of   ...   

                                              Japanese  
0    また 、 後 に 寂原 が 一派 を な し て 、 大原 に は 2 派 の 系統 の 声...  
1                  のち に 宗快 が 大原 声明 を 再興 する に 至 っ た 。\n  
2             湛智 が 新し い 音楽 理論 に 基づ い た 流れ を 構築 し た 。\n  
3    以降 、 天台 声明 の 中枢 を なし 、 現在 の 天台 声明 に 継承 さ れ て い...  
4    融通 念仏 宗 、 浄土 

# Do some checks on max characters so i dont go over free api limits and go bankrupt 

In [27]:
def count_total_characters(df):
    # Count the total number of characters in the Japanese column
    total_japanese_chars = df['Japanese'].str.len().sum()

    # Count the total number of characters in the English column
    total_english_chars = df['English'].str.len().sum()

    # Create a dictionary with the counts
    char_counts = {
        "japanese_characters": total_japanese_chars,
        "english_characters": total_english_chars
    }
    print(char_counts) 
    return char_counts

In [28]:
tot_chars = count_total_characters(kftt_phemt_aspec.copy())
total = tot_chars['japanese_characters'] + tot_chars['english_characters']
print(f"Total characters going to API calls = {total}")
assert total < 500000

{'japanese_characters': 56690, 'english_characters': 128381}
Total characters going to API calls = 185071


# API tests 

## deepL

In [18]:
import deepl

def translate_and_save(df, api_key, src_language, output_file):
    # src_langauge = 'English' or 'Japanese' 
    try:
        # Initialize the DeepL translator
        translator = deepl.Translator(api_key)

        # Open the output file in write mode
        with open(output_file, 'w', encoding='utf-8') as file:
            # Iterate over each row in the DataFrame
            for _, row in df.iterrows():
                source_text = row[src_language]
                target_language = 'EN-US' if src_language== 'Japanese' else 'JA'

                # Translate the Japanese text to English
                result = translator.translate_text(source_text, target_lang=target_language)
                translated_text = result.text

                # Write the translated text to the output file
                file.write(translated_text + '\n')

        print(f"Translation completed. Translations saved to: {output_file}")

    except deepl.exceptions.AuthorizationException:
        print("Invalid API key. Please check your API key.")

    except deepl.exceptions.QuotaExceededException:
        print("Quota exceeded. Please check your DeepL usage limits.")

    except deepl.exceptions.DeepLException as e:
        print(f"An error occurred during translation: {str(e)}")

    except IOError:
        print(f"Error writing to file: {output_file}")

In [19]:
def get_api_key(file_path):
    try:
        with open(file_path, 'r') as file:
            api_key = file.readline().strip()
            return api_key
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None
    except IOError:
        print(f"Error reading file: {file_path}")
        return None

In [20]:
def save_column(df, column, output_file):
    try:
        # Open the output file in write mode
        with open(output_file, 'w', encoding='utf-8') as file:
            # Iterate over each row in the DataFrame
            for _, row in df.iterrows():
                japanese_text = row[column]

                # Write the Japanese text to the output file
                file.write(japanese_text + '\n')

        print(f"{column} saved to: {output_file}")

    except IOError:
        print(f"Error writing to file: {output_file}")

In [31]:
import json

def write_json_file(path, kftt_start_index, phe_mt_start_index, aspec_start_index):
    # Create a dictionary with the provided indexes
    data = {
        "kftt_start_index": kftt_start_index,
        "phe_mt_start_index": phe_mt_start_index,
        "aspec_start_index": aspec_start_index
    }
    
    # Write the dictionary to a file in JSON format
    with open(path, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=4)

In [40]:
# Save in and out truth values for jp->en 
# save_column(kftt_phemt_aspec, 'Japanese', 'model_outputs/test/jp_to_en/in.txt')
# save_column(kftt_phemt_aspec, 'English', 'model_outputs/test/jp_to_en/out.txt')
# write_json_file('model_outputs/test/jp_to_en/index.json', kftt_start_index, phemt_start_index, aspec_start_index)
# Save in and out truth values for en->jp 
# save_column(kftt_phemt_aspec, 'English', 'model_outputs/test/en_to_jp/in.txt')
# save_column(kftt_phemt_aspec, 'Japanese', 'model_outputs/test/en_to_jp/out.txt')
# write_json_file('model_outputs/test/en_to_jp/index.json', kftt_start_index, phemt_start_index, aspec_start_index)



In [41]:
# NOTE: i ran this, its commented out so i dont rerun and go over api limit 
api_key = get_api_key('./datasets/private/apikey.txt')
translated_english = translate_and_save(kftt_phemt_aspec, api_key, 'Japanese', 'model_outputs/test/jp_to_en/deepL/out.txt')
print("Original Japanese:", kftt_phemt_aspec['Japanese'].iloc[0])
print("Translated English:", translated_english)

Translation completed. Translations saved to: model_outputs/test/jp_to_en/deepL/out.txt
Original Japanese: また 、 後 に 寂原 が 一派 を な し て 、 大原 に は 2 派 の 系統 の 声明 が あ っ た 。

Translated English: None


In [38]:
# NOTE: i ran this, its commented out so i dont rerun and go over api limit 
# api_key = get_api_key('./datasets/private/apikey.txt') 
# print(api_key) 
# print(kftt_phemt_aspec) 
# translated_japanese = translate_and_save(kftt_phemt_aspec, api_key, 'English', 'model_outputs/test/en_to_jp/deepL/out.txt') 
# print("Original English:", kftt_phemt_aspec['English'].iloc[0])
# print("Translated Japanese:", translated_japanese)
 

## Evaluation Tools 

In [52]:
!pip install --upgrade --force-reinstall sacrebleu

Collecting sacrebleu
  Obtaining dependency information for sacrebleu from https://files.pythonhosted.org/packages/df/d5/f07d3c37bd98db883330276d77e7b04b6c50564c68fb95a76e05422a2850/sacrebleu-2.4.2-py3-none-any.whl.metadata
  Using cached sacrebleu-2.4.2-py3-none-any.whl.metadata (58 kB)
Collecting portalocker (from sacrebleu)
  Obtaining dependency information for portalocker from https://files.pythonhosted.org/packages/17/9e/87671efcca80ba6203811540ed1f9c0462c1609d2281d7b7f53cef05da3d/portalocker-2.8.2-py3-none-any.whl.metadata
  Using cached portalocker-2.8.2-py3-none-any.whl.metadata (8.5 kB)
Collecting regex (from sacrebleu)
  Obtaining dependency information for regex from https://files.pythonhosted.org/packages/f1/4b/0477c6076fa63a8f3261e89c69765d5369fb70be644b6df844569970c1a7/regex-2024.4.16-cp310-cp310-macosx_11_0_arm64.whl.metadata
  Using cached regex-2024.4.16-cp310-cp310-macosx_11_0_arm64.whl.metadata (40 kB)
Collecting tabulate>=0.8.9 (from sacrebleu)
  Obtaining dependen

In [60]:
from sacrebleu.sacrebleu.metrics import BLEU

def evaluate_translation(input_file, translated_file, reference_file):
    # Function to read a file and extract non-blank lines
    def read_file(file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            lines = [line.strip() for line in file if line.strip()]
        return lines

    # Read the files
    input_lines = read_file(input_file)
    translated_lines = read_file(translated_file)
    reference_lines = [read_file(reference_file)]  # Note the list wrapping for multiple references support

    # Initialize the BLEU object
    bleu = BLEU()

    # Compute the BLEU score
    score = bleu.corpus_score(translated_lines, reference_lines)

    # Print and return the BLEU score and its detailed breakdown
    print(f"Bleu Score: {score.score}")
    print(f"Full report: {score}")
    return score.score

ModuleNotFoundError: No module named 'portalocker'

In [59]:
print(evaluate_translation('model_outputs/test/jp_to_en/in.txt', 'model_outputs/test/jp_to_en/deepL/out.txt', 'model_outputs/test/jp_to_en/out.txt'))

NameError: name 'evaluate_translation' is not defined

## Google Translate 