# Prepare Test set to be benchmarked across different models 

In [46]:
!pip install deepl

Collecting deepl
  Downloading deepl-1.17.0-py3-none-any.whl.metadata (26 kB)
Downloading deepl-1.17.0-py3-none-any.whl (35 kB)
Installing collected packages: deepl
Successfully installed deepl-1.17.0


## Preprocess ASPEC to same format as other benchmarks 

In [21]:
import os

def split_aspec_data(file_path):
    # Get the directory of the input file
    input_dir = os.path.dirname(file_path)

    # Get the base name of the input file (without extension)
    base_name = os.path.splitext(os.path.basename(file_path))[0]

    # Create the output file paths in the same directory as the input file
    engl_output_path = os.path.join(input_dir, f"{base_name}.raw.en")
    japn_output_path = os.path.join(input_dir, f"{base_name}.raw.ja")

    # Open the input file for reading
    with open(file_path, 'r', encoding='utf-8') as file:
        # Open the output files for writing
        with open(engl_output_path, 'w', encoding='utf-8') as engl_file, open(japn_output_path, 'w', encoding='utf-8') as japn_file:
            # Iterate over each line in the input file
            for line in file:
                # Split the line into sections using the delimiter " ||| "
                sections = line.strip().split(' ||| ')

                # Extract the Japanese and English sections (last two sections)
                japanese_text = sections[-2]
                english_text = sections[-1]

                # Write the Japanese text to the Japanese output file
                japn_file.write(japanese_text + '\n')

                # Write the English text to the English output file
                engl_file.write(english_text + '\n')

    print(f"Data split completed. Output files: {engl_output_path} and {japn_output_path}")

In [23]:
split_aspec_data('./datasets/private/ASPEC/ASPEC-JE/devtest/devtest.txt')
split_aspec_data('./datasets/private/ASPEC/ASPEC-JE/test/test.txt')

Data split completed. Output files: ./datasets/private/ASPEC/ASPEC-JE/devtest/devtest.raw.en and ./datasets/private/ASPEC/ASPEC-JE/devtest/devtest.raw.ja
Data split completed. Output files: ./datasets/private/ASPEC/ASPEC-JE/test/test.raw.en and ./datasets/private/ASPEC/ASPEC-JE/test/test.raw.ja


In [None]:
import os
import subprocess

def tokenize_file(file_path):
    # Get the directory of the input file
    input_dir = os.path.dirname(file_path)

    # Get the base name and extension of the input file
    base_name, ext = os.path.splitext(os.path.basename(file_path))

    # Create the output file path in the same directory as the input file
    output_path = os.path.join(input_dir, f"{base_name}.tok{ext}")

    # Run KyTea on the input file and generate the tokenized output
    subprocess.run(['kytea', '-notags', '-wsconst', 'D', '&lt', '-out', 'tok', '<', file_path, '>', output_path], check=True)

    print(f"Tokenization completed. Output file: {output_path}")

In [42]:
# NOTE this function was really slow --> seconds become minutes, just did it in terminal instead kept it fast 
#tokenize_file('./datasets/private/ASPEC/ASPEC-JE/devtest/devtest.raw.en')
#tokenize_file('./datasets/private/ASPEC/ASPEC-JE/devtest/devtest.raw.ja')

## Create massive dataframe with testing data 

In [9]:
import random
import pandas as pd
random.seed(0) 

def create_parallel_dataframe(engl_path, japn_path, existing_df=None):
    # Read the English and Japanese files
    with open(engl_path, 'r', encoding='utf-8') as engl_file, open(japn_path, 'r', encoding='utf-8') as japn_file:
        engl_lines = engl_file.readlines()
        japn_lines = japn_file.readlines()

    # Get the total number of lines in each file
    total_lines = min(len(engl_lines), len(japn_lines))

    # Check if there are at least 300 lines in both files
    if total_lines < 300:
        raise ValueError("Files must contain at least 300 lines.")

    # Generate a random starting index for the interval
    start_index = random.randint(0, total_lines - 300)

    # Extract the random interval of 300 lines from each file
    engl_subset = engl_lines[start_index:start_index + 300]
    japn_subset = japn_lines[start_index:start_index + 300]

    # Create a new DataFrame with the extracted lines
    new_data = {'English': engl_subset, 'Japanese': japn_subset}
    new_df = pd.DataFrame(new_data)

    # If an existing DataFrame is provided, concatenate the new data with it
    if existing_df is not None:
        df = pd.concat([existing_df, new_df], ignore_index=True)
    else:
        df = new_df

    return df, start_index

In [14]:
kftt_out, kftt_start_index = create_parallel_dataframe('./datasets/public/kftt-data-1.0/data/tok/kyoto-test.en', './datasets/public/kftt-data-1.0/data/tok/kyoto-test.ja') 
kftt_and_phemt, phemt_start_index =  create_parallel_dataframe('./datasets/public/pheMT_final/tok.en', './datasets/public/pheMT_final/tok.ja', kftt_out)
kftt_phemt_aspec, aspec_start_index = create_parallel_dataframe('./datasets/private/ASPEC/ASPEC-JE/devtest/devtest.tok.en', './datasets/private/ASPEC/ASPEC-JE/devtest/devtest.tok.ja', kftt_and_phemt)
print(kftt_phemt_aspec, kftt_start_index, phemt_start_index, aspec_start_index)

                                               English  \
0    Jakugen developed a school later , so there we...   
1      Subsequently , Shukai restored Ohara Shomyo .\n   
2    Tanchi established a stream based on a new for...   
3    Since then , it became the center of Tendai Sh...   
4    The Yuzunembutsu-shu sect , Jodo-shu sect and ...   
..                                                 ...   
895  Lignin   cresol   extracted   by   acetone   i...   
896  A  fiber  mold   u sing   recycled   paper   i...   
897  As  a   result ,  a  composite  with   high   ...   
898  It   can   be separated into  compound  compos...   
899  This  material   reports  the   result   of   ...   

                                              Japanese  
0    また 、 後 に 寂原 が 一派 を な し て 、 大原 に は 2 派 の 系統 の 声...  
1                  のち に 宗快 が 大原 声明 を 再興 する に 至 っ た 。\n  
2             湛智 が 新し い 音楽 理論 に 基づ い た 流れ を 構築 し た 。\n  
3    以降 、 天台 声明 の 中枢 を なし 、 現在 の 天台 声明 に 継承 さ れ て い...  
4    融通 念仏 宗 、 浄土 

# Do some checks on max characters so i dont go over free api limits and go bankrupt 

In [27]:
def count_total_characters(df):
    # Count the total number of characters in the Japanese column
    total_japanese_chars = df['Japanese'].str.len().sum()

    # Count the total number of characters in the English column
    total_english_chars = df['English'].str.len().sum()

    # Create a dictionary with the counts
    char_counts = {
        "japanese_characters": total_japanese_chars,
        "english_characters": total_english_chars
    }
    print(char_counts) 
    return char_counts

In [28]:
tot_chars = count_total_characters(kftt_phemt_aspec.copy())
total = tot_chars['japanese_characters'] + tot_chars['english_characters']
print(f"Total characters going to API calls = {total}")
assert total < 500000

{'japanese_characters': 56690, 'english_characters': 128381}
Total characters going to API calls = 185071


# API tests 

## deepL

In [5]:
import deepl

def translate_and_save(df, api_key, src_language, output_file):
    # src_langauge = 'English' or 'Japanese' 
    try:
        # Initialize the DeepL translator
        translator = deepl.Translator(api_key)

        # Open the output file in write mode
        with open(output_file, 'w', encoding='utf-8') as file:
            # Iterate over each row in the DataFrame
            for _, row in df.iterrows():
                source_text = row[src_language]
                target_language = 'EN-US' if src_language== 'Japanese' else 'JA'

                # Translate the Japanese text to English
                result = translator.translate_text(source_text, target_lang=target_language)
                translated_text = result.text

                # Write the translated text to the output file
                file.write(translated_text + '\n')

        print(f"Translation completed. Translations saved to: {output_file}")

    except deepl.exceptions.AuthorizationException:
        print("Invalid API key. Please check your API key.")

    except deepl.exceptions.QuotaExceededException:
        print("Quota exceeded. Please check your DeepL usage limits.")

    except deepl.exceptions.DeepLException as e:
        print(f"An error occurred during translation: {str(e)}")

    except IOError:
        print(f"Error writing to file: {output_file}")

In [7]:
def get_api_key(file_path):
    try:
        with open(file_path, 'r') as file:
            api_key = file.readline().strip()
            return api_key
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None
    except IOError:
        print(f"Error reading file: {file_path}")
        return None

In [20]:
def save_column(df, column, output_file):
    try:
        # Open the output file in write mode
        with open(output_file, 'w', encoding='utf-8') as file:
            # Iterate over each row in the DataFrame
            for _, row in df.iterrows():
                japanese_text = row[column]

                # Write the Japanese text to the output file
                file.write(japanese_text + '\n')

        print(f"{column} saved to: {output_file}")

    except IOError:
        print(f"Error writing to file: {output_file}")

In [31]:
import json

def write_json_file(path, kftt_start_index, phe_mt_start_index, aspec_start_index):
    # Create a dictionary with the provided indexes
    data = {
        "kftt_start_index": kftt_start_index,
        "phe_mt_start_index": phe_mt_start_index,
        "aspec_start_index": aspec_start_index
    }
    
    # Write the dictionary to a file in JSON format
    with open(path, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=4)

In [40]:
# Save in and out truth values for jp->en 
# save_column(kftt_phemt_aspec, 'Japanese', 'model_outputs/test/jp_to_en/in.txt')
# save_column(kftt_phemt_aspec, 'English', 'model_outputs/test/jp_to_en/out.txt')
# write_json_file('model_outputs/test/jp_to_en/index.json', kftt_start_index, phemt_start_index, aspec_start_index)
# Save in and out truth values for en->jp 
# save_column(kftt_phemt_aspec, 'English', 'model_outputs/test/en_to_jp/in.txt')
# save_column(kftt_phemt_aspec, 'Japanese', 'model_outputs/test/en_to_jp/out.txt')
# write_json_file('model_outputs/test/en_to_jp/index.json', kftt_start_index, phemt_start_index, aspec_start_index)



In [9]:
# NOTE: i ran this, its commented out so i dont rerun and go over api limit 
#api_key = get_api_key('./datasets/private/deepl_apikey.txt')
# translated_english = translate_and_save(kftt_phemt_aspec, api_key, 'Japanese', 'model_outputs/test/jp_to_en/deepL/out.txt')
# print("Original Japanese:", kftt_phemt_aspec['Japanese'].iloc[0])
# print("Translated English:", translated_english)

In [38]:
# NOTE: i ran this, its commented out so i dont rerun and go over api limit 
# api_key = get_api_key('./datasets/private/apikey.txt') 
# print(api_key) 
# print(kftt_phemt_aspec) 
# translated_japanese = translate_and_save(kftt_phemt_aspec, api_key, 'English', 'model_outputs/test/en_to_jp/deepL/out.txt') 
# print("Original English:", kftt_phemt_aspec['English'].iloc[0])
# print("Translated Japanese:", translated_japanese)
 

## Evaluation Tools 

In [77]:
!pip install --upgrade sacrebleu==2.0.0

Collecting sacrebleu==2.0.0
  Obtaining dependency information for sacrebleu==2.0.0 from https://files.pythonhosted.org/packages/fa/63/b3c11f951eafa2dc296862431f29fb12dbe191cb72217cf88ed04c32086b/sacrebleu-2.0.0-py3-none-any.whl.metadata
  Using cached sacrebleu-2.0.0-py3-none-any.whl.metadata (52 kB)
Using cached sacrebleu-2.0.0-py3-none-any.whl (90 kB)
Installing collected packages: sacrebleu
Successfully installed sacrebleu-2.0.0


In [21]:
pip install "sacrebleu[ja]"

Collecting mecab-python3<=1.0.6,>=1.0.5 (from sacrebleu[ja])
  Downloading mecab-python3-1.0.6.tar.gz (77 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.7/77.7 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting ipadic<2.0,>=1.0 (from sacrebleu[ja])
  Using cached ipadic-1.0.0-py3-none-any.whl
Building wheels for collected packages: mecab-python3
  Building wheel for mecab-python3 (setup.py) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[21 lines of output][0m
  [31m   [0m /Users/thomaspett/Desktop/pytorch-test/env/lib/python3.10/site-packages/setuptools/__init__.py:80: _DeprecatedInstaller: setuptools.installer and fetch_build_eggs are deprecated.
  [31m   [0m !!
  [31m   [0m 
  [31m   [0m         *******************

In [3]:
from sacrebleu.metrics import BLEU, CHRF 
def evaluate_translation(input_file, translated_file, reference_file):
    # Function to read a file and extract non-blank lines
    def read_file(file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            lines = [line.strip() for line in file if line.strip()]
        return lines

    # Read the files
    input_lines = read_file(input_file)
    translated_lines = read_file(translated_file)
    reference_lines = [read_file(reference_file)]  # Note the list wrapping for multiple references support
    print(translated_lines[1]) # TODO remove
    print(reference_lines[0][1]) # TODO remove

    # Initialize the BLEU object
    bleu = BLEU()
    chrf = CHRF() 

    # Compute the BLEU score
    score = bleu.corpus_score(translated_lines, reference_lines)
    score2 = chrf.corpus_score(translated_lines, reference_lines)

    # Print and return the BLEU score and its detailed breakdown
    print(f"Bleu Score: {score.score}")
    print(f"CHRF Score: {score2.score}")
    print(f"Full report 1: {score}")
    print(f"Full report 2: {score2}")
    return (score.score, score2.score) 

In [4]:
evaluate_translation(input_file='model_outputs/test/jp_to_en/in.txt', translated_file='model_outputs/test/jp_to_en/deepL/out.txt', reference_file='model_outputs/test/jp_to_en/out.txt')
evaluate_translation(input_file='model_outputs/test/en_to_jp/in.txt', translated_file='model_outputs/test/en_to_jp/deepL/out.txt', reference_file='model_outputs/test/en_to_jp/out.txt')

Later, Soukai revived the Ohara-voice.
Subsequently , Shukai restored Ohara Shomyo .
Bleu Score: 17.431383539831966
CHRF Score: 48.86399062729437
Full report 1: BLEU = 17.43 52.3/24.3/13.2/7.7 (BP = 0.920 ratio = 0.923 hyp_len = 18989 ref_len = 20572)
Full report 2: chrF2 = 48.86
その後、秀海は大原照明を復活させた。
のち に 宗快 が 大原 声明 を 再興 する に 至 っ た 。
Bleu Score: 0.0001015339034000896
CHRF Score: 35.04544963210478
Full report 1: BLEU = 0.00 12.0/10.2/3.7/3.3 (BP = 0.000 ratio = 0.083 hyp_len = 1844 ref_len = 22163)
Full report 2: chrF2 = 35.05


0.0001015339034000896

## Google Translate 

In [39]:
from google.cloud import translate_v2
import pandas as pd

def google_translate_and_save(df, project, api_key, src_language, output_file):
    # src_language = 'English' or 'Japanese'
    try:
        # Initialize the Google Translate client
        #project=project, credentials=api_key TODO remove 
        client = translate_v2.Client(client_options={"api_key": api_key})
        row = df.iloc[0] # TODO REMOVE 
        src_text = row[src_language] # TODO REMOVE
        target_language = 'en-US' if src_language == 'Japanese' else 'ja-JP' # TODO REMOVE 
        print(client.translate(src_text, target_language=target_language)) # TODO REMOVE  
        # Open the output file in write mode
        # with open(output_file, 'w', encoding='utf-8') as file:
        #     # Iterate over each row in the DataFrame
        #     for _, row in df.iterrows():
        #         source_text = row[src_language]
        #         target_language = 'en-US' if src_language == 'Japanese' else 'ja-JP'

        #         # Translate the text
        #         result = client.translate(source_text, target_language=target_language)
        #         translated_text = result['translatedText']

        #         # Write the translated text to the output file
        #         file.write(translated_text + '\n')

        # print(f"Translation completed. Translations saved to: {output_file}")

    except Exception as e:
        print(f"An error occurred: {str(e)}")

In [6]:
import json 
def get_google_project_and_credentials(file_path):
    try:
        # Open the JSON file for reading
        with open(file_path, 'r') as file:
            # Load data from JSON file into a Python dictionary
            data = json.load(file)
        
        # Return the dictionary containing the API key and project information
        return data

    except FileNotFoundError:
        print(f"Error: The file at {file_path} does not exist.")
    except json.JSONDecodeError:
        print("Error: The file is not a valid JSON.")
    except Exception as e:
        print(f"An unexpected error occurred: {str(e)}")


In [18]:
import pandas as pd
import json

def reassemble_test_dataset(json_path, engl_paths, japn_paths):
    """
    Reassembles the dataset from the original files using indices stored in a JSON file.

    Args:
    json_path (str): Path to the JSON file containing the indices.
    engl_paths (list): List of paths to the English files in the order they were originally concatenated.
    japn_paths (list): List of paths to the Japanese files in the order they were originally concatenated.

    Returns:
    pd.DataFrame: The reassembled DataFrame.
    """
    # Read the indices from the JSON file
    with open(json_path, 'r', encoding='utf-8') as json_file:
        indices = json.load(json_file)
    
    # Initialize an empty DataFrame
    df = pd.DataFrame()

    # Extract and concatenate the subsets using the indices
    for engl_path, japn_path, index_key in zip(engl_paths, japn_paths, indices):
        with open(engl_path, 'r', encoding='utf-8') as engl_file, open(japn_path, 'r', encoding='utf-8') as japn_file:
            engl_lines = engl_file.readlines()
            japn_lines = japn_file.readlines()
        
        # Extract the subset of lines using the index
        start_index = indices[index_key]
        engl_subset = engl_lines[start_index:start_index + 300]
        japn_subset = japn_lines[start_index:start_index + 300]

        # Create a new DataFrame with the extracted lines
        new_data = {'English': engl_subset, 'Japanese': japn_subset}
        new_df = pd.DataFrame(new_data)

        # Concatenate the new DataFrame with the existing one
        df = pd.concat([df, new_df], ignore_index=True)
    
    return df

## How to reassemble the previous test set in-case kernel shutdown from here on out

In [24]:
config = get_google_project_and_credentials('datasets/private/google_translate_apikey.json')
json_path = './model_outputs/test/en_to_jp/index.json'
engl_paths = [
    './datasets/public/kftt-data-1.0/data/tok/kyoto-test.en',
    './datasets/public/pheMT_final/tok.en',
    './datasets/private/ASPEC/ASPEC-JE/devtest/devtest.tok.en'
]
japn_paths = [
    './datasets/public/kftt-data-1.0/data/tok/kyoto-test.ja',
    './datasets/public/pheMT_final/tok.ja',
    './datasets/private/ASPEC/ASPEC-JE/devtest/devtest.tok.ja'
]
kftt_phemt_aspec = reassemble_test_dataset(json_path, engl_paths, japn_paths)

In [41]:
import os 
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'datasets/private/gleaming-glass-421121-19561f1764f8.json'
google_translate_and_save(kftt_phemt_aspec, config['project'], config['api_key'], 'Japanese', '') 

{'translatedText': 'Later, Jakuhara formed a sect, and so there were two schools of Shomyo in Ohara.\n', 'detectedSourceLanguage': 'ja', 'input': 'また 、 後 に 寂原 が 一派 を な し て 、 大原 に は 2 派 の 系統 の 声明 が あ っ た 。\n'}
