# Import libraries

In [1]:
# Import libraries
import os
import pandas as pd
import json
import sacrebleu

# Set Paths

In [2]:
# Project root: four directory levels above the current working directory
base_path = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, os.pardir, os.pardir, os.pardir)
)

# Model identifiers; the results folder is named after the part of the
# model id that follows the organisation prefix
company_name = 'meta'
unsloth_model_name = 'unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit'
model_name = unsloth_model_name.split('/')[1]

# Location of the merged results file for this company/model combination
results_path_file = os.path.join(
    base_path, 'results', company_name, model_name, 'merged_results.json'
)

# Show the resolved paths
print(f'Base path: {base_path}')
print(f'Results path: {results_path_file}')

Base path: /cs/student/msc/csml/2023/ngriessh/historical_mt
Results path: /cs/student/msc/csml/2023/ngriessh/historical_mt/results/meta/Meta-Llama-3.1-8B-Instruct-bnb-4bit/merged_results.json


# Set Evaluation Parameters

In [5]:
# Source language of the evaluation run
source_language = "Early Modern Bohemian German"

# Target language and translation direction both follow from the source language
if source_language == "Early Modern Bohemian German":
    target_language = "English"
    translation_direction = "DE_to_EN"
else:
    target_language = "Early Modern Bohemian German"
    translation_direction = "EN_to_DE"

# Which candidate families to evaluate: in-context learning and/or finetuning
evaluate_finetuning = True
evaluate_icl = False

# Whether to evaluate the short/long halves (split at the median word count) separately
median_splitting = False

# Summary of the chosen configuration
print(f'Source Language: {source_language}')
print(f'Target Language: {target_language}')
print(f'Translation Direction: {translation_direction}')
print(f'Evaluate ICL: {evaluate_icl}')
print(f'Evaluate finetuning: {evaluate_finetuning}')
print(f'Median Splitting? {median_splitting}')

Source Language: Early Modern Bohemian German
Target Language: English
Translation Direction: DE_to_EN
Evaluate ICL: False
Evaluate finetuning: True
Median Splitting? False


# Load Results File

In [6]:
# Load the results file. It is parsed as JSON Lines: every non-empty line is
# decoded as one JSON document. The encoding is set explicitly so the
# non-ASCII characters in the texts (ß, ö, ü, ...) are decoded the same way
# regardless of the platform's locale default. Blank lines (e.g. a trailing
# newline at the end of the file) are skipped instead of crashing json.loads.
with open(results_path_file, 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

# Create a DataFrame with one row per decoded JSON record
merged_df = pd.DataFrame(data)

# Check Columns

In [7]:
# Check column names: the source/reference columns plus one column per
# candidate translation stored in the results file
merged_df.columns

Index(['Early Modern Bohemian German', 'English', 'DE_to_EN_finetuning',
       'EN_to_DE_finetuning'],
      dtype='object')

# Construct Candidate Columns

In [8]:
# Define a function to generate candidate columns based on the conditions
def construct_candidate_columns(translation_direction, evaluate_icl, evaluate_finetuning):
    """Build the list of candidate-translation column names to evaluate.

    Args:
        translation_direction: Column-name prefix of the direction, either
            'DE_to_EN' or 'EN_to_DE'.
        evaluate_icl: If True, include one '<direction>_<shots>_example_prompt'
            column per shot count.
        evaluate_finetuning: If True, include the '<direction>_finetuning'
            column (after the ICL columns, if any).

    Returns:
        A list of column names. The list is empty when both flags are False.

    Raises:
        ValueError: If evaluate_icl is True and translation_direction is not
            one of the two supported directions.
    """
    shots = ['000', '001', '002', '004', '008', '016', '032', '064', '128']
    supported_directions = ('DE_to_EN', 'EN_to_DE')

    # Always start from a list so the function cannot hit an unbound local
    # when neither evaluation mode is selected.
    candidate_columns = []

    # Add shots columns
    if evaluate_icl:
        if translation_direction not in supported_directions:
            raise ValueError(
                f"Unsupported translation direction {translation_direction!r}; "
                f"expected one of {supported_directions}"
            )
        candidate_columns.extend(
            f'{translation_direction}_{shot}_example_prompt' for shot in shots
        )

    # Add finetuning column
    if evaluate_finetuning:
        candidate_columns.append(f'{translation_direction}_finetuning')

    return candidate_columns

# Build the candidate columns for the configured direction and evaluation modes,
# then display them as the cell output
candidate_columns = construct_candidate_columns(
    translation_direction=translation_direction,
    evaluate_icl=evaluate_icl,
    evaluate_finetuning=evaluate_finetuning,
)
candidate_columns

['DE_to_EN_finetuning']

# Median Splitting

In [9]:
# Helper column holding the whitespace-delimited word count of each source text
source_language_word_count = f'{source_language}_word_count'
merged_df[source_language_word_count] = merged_df[source_language].map(
    lambda text: len(str(text).split())
)

# The median of the word counts is the threshold between the two splits
median_word_count = merged_df[source_language_word_count].median()

# Rows at or below the median form the short split; rows above it the long split
merged_df_short = merged_df.loc[merged_df[source_language_word_count].le(median_word_count)]
merged_df_long = merged_df.loc[merged_df[source_language_word_count].gt(median_word_count)]

# Print the shapes of the two splits and the threshold used
print("Shape of merged_df_short:", merged_df_short.shape)
print("Shape of merged_df_long:", merged_df_long.shape)
print(f'Median word count in {source_language}: {median_word_count}')

Shape of merged_df_short: (501, 5)
Shape of merged_df_long: (499, 5)
Median word count in Early Modern Bohemian German: 73.0


In [10]:
# Preview the first rows of the short split (word count <= median)
merged_df_short.head()

Unnamed: 0,Early Modern Bohemian German,English,DE_to_EN_finetuning,EN_to_DE_finetuning,Early Modern Bohemian German_word_count
0,1. Ein Ambt befehlich an h. Joachim vom Eberha...,1. An manorial court command is to be made to ...,1. A manorial court's command to lord Joachim ...,1. Ein Ambtsbefehl an herrn Joachim Vom Eberha...,45
4,Ao 1661. Martius. Mildenaw. Christoph blumbrig...,1661. march. Mildenaw. Christoph Blumbrig's co...,In 1661 March. Mildenau. Christoph Blumbrig's ...,1661. Martij Mildenaw. Christoph blumbrig Clag...,16
5,"Schultes, Vnd Schoppen Zue Bernsdorff haben si...",16th July. Schulthess and jurymen in Bernsdorf...,village headman and jurymen of Bernsdorf have ...,16 Julij. Schultheß Vnd Schöppen Zu Bernßdorff...,12
6,1782. Ist der MildenEicher Scholtes mit dem Di...,1782. The Mildeneichen village headman stood f...,"The Mildeneichen Scholtes was summoned, along ...",1782. der Schulteß Mild. Eichen standt bey der...,23
8,3. Novembris. Matz Apelt beschweret sich Vber ...,3. Novembris. Matz Apelt complains against Jac...,"On 3rd November, Matz Apelt complains against ...",3. Novembris. Matz Apelt Claget Vber Jacob Sch...,68


In [11]:
# Preview the first rows of the long split (word count > median)
merged_df_long.head()

Unnamed: 0,Early Modern Bohemian German,English,DE_to_EN_finetuning,EN_to_DE_finetuning,Early Modern Bohemian German_word_count
1,"1. die lehenß Vnderthanen im Winckel, sollen h...",1. The fief serfs in the Winckel shall hencefo...,1. The fief-serfs in Winckel shall henceforth ...,1. Die Lehn Vnderthanen im Winckel sollen Ihro...,109
2,1. dz Sie verschienen freytag als den 29 huig ...,"1. last Friday, the 29th, they came up to the ...","1. That they appeared last Friday, namely on t...","1. Sie seindt am Vorgangenen Freytag, 29. Sept...",419
3,Es ist Vor.12. iharen Vnnd mehr Zu Oberweigßdo...,"12 years ago and more, in Oberweigsdorf, Paul ...",It was 12 years ago in Oberweigsdorf Paul Apel...,12. Jahre Vndt mehr ist Zue Oberweigstorff Pau...,95
7,2. Ist errinnert Worden alle Undt iede alte re...,2. They were reminded to pay all arrears quick...,2. All the old restanten are to be brought in ...,"2. Ihnen alles Rests schnell zu bezahlen, vnd ...",183
10,Vor Vngefehr 14: tagen Kombt des Scholzens Zu ...,About 14 days ago the Scholz's in Bernßdorff's...,About 14 days ago the current shepherd of Bern...,Vor Vier Zehen Tag. Seindt des Scholtzen Zu Be...,360


# BLEU Evaluation

In [12]:
# Select what to evaluate: the complete dataset, or the two median-based halves
if not median_splitting:
    dataset_splits = [merged_df]
    dataset_names = ['merged_df']
else:
    dataset_splits = [merged_df_short, merged_df_long]
    dataset_names = ['merged_df_short', 'merged_df_long']

# Score every candidate column on every selected split
for name, split in zip(dataset_names, dataset_splits):
    # Header identifying the split being evaluated
    print('______________________________________________________________________________________________')
    print(f'Current Dataset: {name}. Source language is {source_language}.')
    print('______________________________________________________________________________________________')

    # A single reference stream (the target-language column), wrapped in an
    # outer list as passed to sacrebleu.corpus_bleu
    references = [split[target_language].tolist()]

    # One corpus-level BLEU score per candidate column
    for col in candidate_columns:
        candidates = split[col].tolist()
        print(f'Evaluation of target language {target_language} and LLM-generated candidates {col}')
        bleu_result = sacrebleu.corpus_bleu(candidates, references)
        bleu_score = bleu_result.score
        print(f"Corpus-level SacreBLEU score for {col}: {bleu_score:.3f}\n")

______________________________________________________________________________________________
Current Dataset: merged_df. Source language is Early Modern Bohemian German.
______________________________________________________________________________________________
Evaluation of target language English and LLM-generated candidates DE_to_EN_finetuning
Corpus-level SacreBLEU score for DE_to_EN_finetuning: 36.703



# Create Text File to Inspect Translation Results

In [13]:
# Function to generate the inspection text for a specific entry
def generate_inspection_text(df, entry_index, translation_directions, output_file):
    """Write one entry's source, reference and candidate translations to a text file.

    Args:
        df: DataFrame containing the columns 'Early Modern Bohemian German'
            and 'English' plus any number of candidate-translation columns.
        entry_index: Positional (0-based) index of the row to inspect; the
            file labels it as entry ``entry_index + 1``.
        translation_directions: Iterable of substrings (e.g. 'DE_to_EN');
            every column whose name contains one of them is written out as a
            candidate translation, in DataFrame column order.
        output_file: Path of the text file to create (overwritten if it exists).

    Raises:
        IndexError: If entry_index is out of bounds for df.
        KeyError: If one of the two required columns is missing.
    """
    # Resolve the row and build the complete text BEFORE opening the file:
    # opening in 'w' mode truncates, so a bad index or a missing column must
    # fail first and leave any existing output file untouched.
    row = df.iloc[entry_index]

    # Source text and reference translation by Sheilagh Ogilvie
    parts = [
        f"Entry {entry_index + 1}:\n",
        f"Early Modern Bohemian German (Sheilagh Ogilvie's Transcription): {row['Early Modern Bohemian German']}\n",
        f"English (Sheilagh Ogilvie): {row['English']}\n\n",
    ]

    # Candidate translations for each requested translation direction.
    # str(col) keeps the substring check valid for non-string column labels.
    for col in df.columns:
        if any(direction in str(col) for direction in translation_directions):
            parts.append(f"{col}: {row[col]}\n")

    # Closing separator
    parts.append("\n" + "=" * 50 + "\n\n")

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("".join(parts))

# Entry to inspect (0-based position), the direction prefixes whose candidate
# columns are included, and the name of the text file to write
entry_index = 0
translation_directions = ['DE_to_EN', 'EN_to_DE']
output_file = 'translation_inspection_entry.txt'

# Write the inspection file for the selected entry
generate_inspection_text(
    df=merged_df,
    entry_index=entry_index,
    translation_directions=translation_directions,
    output_file=output_file,
)
print(f"Text file created: {output_file}")


Text file created: translation_inspection_entry.txt


In [14]:
# Display the full DataFrame, including the word-count column added in the
# median-splitting step
merged_df

Unnamed: 0,Early Modern Bohemian German,English,DE_to_EN_finetuning,EN_to_DE_finetuning,Early Modern Bohemian German_word_count
0,1. Ein Ambt befehlich an h. Joachim vom Eberha...,1. An manorial court command is to be made to ...,1. A manorial court's command to lord Joachim ...,1. Ein Ambtsbefehl an herrn Joachim Vom Eberha...,45
1,"1. die lehenß Vnderthanen im Winckel, sollen h...",1. The fief serfs in the Winckel shall hencefo...,1. The fief-serfs in Winckel shall henceforth ...,1. Die Lehn Vnderthanen im Winckel sollen Ihro...,109
2,1. dz Sie verschienen freytag als den 29 huig ...,"1. last Friday, the 29th, they came up to the ...","1. That they appeared last Friday, namely on t...","1. Sie seindt am Vorgangenen Freytag, 29. Sept...",419
3,Es ist Vor.12. iharen Vnnd mehr Zu Oberweigßdo...,"12 years ago and more, in Oberweigsdorf, Paul ...",It was 12 years ago in Oberweigsdorf Paul Apel...,12. Jahre Vndt mehr ist Zue Oberweigstorff Pau...,95
4,Ao 1661. Martius. Mildenaw. Christoph blumbrig...,1661. march. Mildenaw. Christoph Blumbrig's co...,In 1661 March. Mildenau. Christoph Blumbrig's ...,1661. Martij Mildenaw. Christoph blumbrig Clag...,16
...,...,...,...,...,...
995,Schultes Zum dörffel Ist gefänglich eingeZogen...,village headman in Dörffel Was taken into impr...,The village headman of Dörffel was taken into ...,Schultes Zum dörffel Ist gefenglich eingezogen...,58
996,"Scholz Zeigt an, das hans Bischoffs Sohn Vndt ...",village headman reports that Hans Bischoff's s...,village headman reports that Hans Bischoff's s...,"Schulteß berichtet, Hans Bischoffs Sohn Vndt B...",131
997,Scholz zeigt an das Peter Pilzes 3 Söhne Zu Ku...,village headman reports that Peter Pilze's 3 s...,village headman reports that Peter Pilze's 3 s...,"Schultes berichtet, wie Peter Pilzes 3. Söhne ...",131
998,Scholtzen Zu Waigßdorff Vnd Priedlanz. Eß ist ...,village headmen in Waigßdorff and Priedlanz. I...,village headmen of Waigßdorff and Priedlanz. I...,Scholzen Zu Waigßdorff Vnnd Priedlanz. Ist auß...,80
