# Import libraries

In [1]:
import os
import re
import pandas as pd
from functools import reduce

# Set Paths

In [2]:
# Project root: four directory levels above the current working directory
base_path = os.path.abspath(os.path.join(os.getcwd(), *(['..'] * 4)))

# Model parameters; the identifier has the form '<prefix>/<model>' and only
# the part after the slash is used as the model name
company_name = 'meta'
unsloth_model_name = 'unsloth/Llama-3.2-3B-Instruct-bnb-4bit'
model_name = unsloth_model_name.split('/')[1]

In [4]:
# Directory that holds the result files for the configured company/model
results_dir = os.path.join(base_path, 'results', company_name, model_name)

# Shot counts: 0 followed by the powers of two 2**0 .. 2**7
shots_list = [0] + [2**i for i in range(8)]
#shots_list = [] # change for finetuning

# Echo the resolved configuration
for label, value in (
    ('Base path', base_path),
    ('Results directory', results_dir),
    ('Shots list', shots_list),
):
    print(f'{label}: {value}')

Base path: /cs/student/msc/csml/2023/ngriessh/historical_mt
Results directory: /cs/student/msc/csml/2023/ngriessh/historical_mt/results/meta/Llama-3.2-3B-Instruct-bnb-4bit
Shots list: [0, 1, 2, 4, 8, 16, 32, 64, 128]


# Function to Load JSON Files into DataFrames

In [5]:
def load_json_to_df(folder_path, direction, shot):
    """
    Load one JSON Lines result file into a DataFrame.

    The file name is derived from `direction` and `shot`:
    - integer shot:      "<direction>_<shot zero-padded to 3 digits>_example_prompt.json"
    - shot 'finetuning': "<direction>_finetuning.json"

    A 'text' column is dropped when present. No columns are renamed.

    Parameters:
    - folder_path: Path to the folder containing the JSON file.
    - direction: File-name prefix, e.g. 'DE_to_EN' or 'EN_to_DE'.
    - shot: An integer shot number (bool is rejected) or the string 'finetuning'.

    Returns:
    - A pandas DataFrame with the loaded data, or an empty DataFrame when
      `shot` is invalid, the file is missing, or the file cannot be parsed.
    """
    # bool is a subclass of int; without this guard True/False would be
    # formatted as 001/000 and silently load the 1-shot/0-shot file.
    if isinstance(shot, int) and not isinstance(shot, bool):
        file_name = f"{direction}_{shot:03}_example_prompt.json"
    elif not isinstance(shot, bool) and shot == 'finetuning':
        file_name = f"{direction}_finetuning.json"
    else:
        print(f"Invalid shot value: {shot}")
        return pd.DataFrame()

    file_path = os.path.join(folder_path, file_name)

    # isfile (not exists): a directory with that name must not reach read_json,
    # where it would raise an error that is not a ValueError.
    if not os.path.isfile(file_path):
        print(f"File does not exist: {file_path}")
        return pd.DataFrame()

    print(f"File exists: {file_path}")
    try:
        # One JSON object per line
        df = pd.read_json(file_path, lines=True)
    except ValueError as e:
        print(f"Error reading JSON file {file_name}: {e}")
        return pd.DataFrame()

    # Remove 'text' column if it exists (returns a new frame, no in-place mutation)
    return df.drop(columns=['text'], errors='ignore')

# Load and Merge DataFrames

In [6]:
# Collects one DataFrame per (direction, shot) combination that yields data
dataframes = []

# Walk every direction/shot pair and try to load its result file
for direction in ['DE_to_EN', 'EN_to_DE']:
    for shot in shots_list: # + ['finetuning']: # change for finetuning

        # 'finetuning' results live in their own folder; everything else is under 'icl'
        if shot == 'finetuning':
            base_folder = 'finetuning'
        else:
            base_folder = 'icl'
        folder_path = os.path.join(results_dir, base_folder, direction)
        print(f"\nProcessing directory: {folder_path}")

        df = load_json_to_df(folder_path, direction, shot)
        if df.empty:
            print(f"No data loaded for direction '{direction}', shot: {shot}")
            continue

        print(f"Loaded data for direction '{direction}', shot: {shot}")
        dataframes.append(df)


Processing directory: /cs/student/msc/csml/2023/ngriessh/historical_mt/results/meta/Llama-3.2-3B-Instruct-bnb-4bit/icl/DE_to_EN
File exists: /cs/student/msc/csml/2023/ngriessh/historical_mt/results/meta/Llama-3.2-3B-Instruct-bnb-4bit/icl/DE_to_EN/DE_to_EN_000_example_prompt.json
Loaded data for direction 'DE_to_EN', shot: 0

Processing directory: /cs/student/msc/csml/2023/ngriessh/historical_mt/results/meta/Llama-3.2-3B-Instruct-bnb-4bit/icl/DE_to_EN
File exists: /cs/student/msc/csml/2023/ngriessh/historical_mt/results/meta/Llama-3.2-3B-Instruct-bnb-4bit/icl/DE_to_EN/DE_to_EN_001_example_prompt.json
Loaded data for direction 'DE_to_EN', shot: 1

Processing directory: /cs/student/msc/csml/2023/ngriessh/historical_mt/results/meta/Llama-3.2-3B-Instruct-bnb-4bit/icl/DE_to_EN
File exists: /cs/student/msc/csml/2023/ngriessh/historical_mt/results/meta/Llama-3.2-3B-Instruct-bnb-4bit/icl/DE_to_EN/DE_to_EN_002_example_prompt.json
Loaded data for direction 'DE_to_EN', shot: 2

Processing directo

# Merge All DataFrames

In [7]:
# Columns every merge joins on
merge_keys = ['English', 'Early Modern Bohemian German']

if not dataframes:
    merged_df = pd.DataFrame()
    print("\nNo DataFrames to merge.")
else:
    # Outer merge: fold all DataFrames into one, keeping every key combination
    merged_df = reduce(
        lambda acc, nxt: acc.merge(nxt, on=merge_keys, how='outer'),
        dataframes,
    )
    print("\nAll DataFrames merged successfully.")

    # Inner merge: keeps only key combinations present in every DataFrame
    matches_df = reduce(
        lambda acc, nxt: acc.merge(nxt, on=merge_keys, how='inner'),
        dataframes,
    )
    num_matches = len(matches_df)
    print(f"Number of matches across all DataFrames: {num_matches}")


All DataFrames merged successfully.
Number of matches across all DataFrames: 1000


In [8]:
# Check column names: display the column Index of merged_df as the cell output
merged_df.columns

Index(['Early Modern Bohemian German', 'English',
       'DE_to_EN_000_example_prompt', 'DE_to_EN_001_example_prompt',
       'DE_to_EN_002_example_prompt', 'DE_to_EN_004_example_prompt',
       'DE_to_EN_008_example_prompt', 'DE_to_EN_016_example_prompt',
       'DE_to_EN_032_example_prompt', 'DE_to_EN_064_example_prompt',
       'DE_to_EN_128_example_prompt', 'EN_to_DE_000_example_prompt',
       'EN_to_DE_001_example_prompt', 'EN_to_DE_002_example_prompt',
       'EN_to_DE_004_example_prompt', 'EN_to_DE_008_example_prompt',
       'EN_to_DE_016_example_prompt', 'EN_to_DE_032_example_prompt',
       'EN_to_DE_064_example_prompt', 'EN_to_DE_128_example_prompt'],
      dtype='object')

# Define Column Order and Reorder DataFrame

In [9]:
# Shot counts in ascending order, shared by both directions
ordered_shots = sorted(shots_list)

# Per-direction result columns: one per shot count, then the finetuning column
de_to_en_columns = [f"DE_to_EN_{shot:03}_example_prompt" for shot in ordered_shots] + ["DE_to_EN_finetuning"]
en_to_de_columns = [f"EN_to_DE_{shot:03}_example_prompt" for shot in ordered_shots] + ["EN_to_DE_finetuning"]

# Full target order: the two source-text columns first, then DE_to_EN, then EN_to_DE
column_order = ['Early Modern Bohemian German', 'English', *de_to_en_columns, *en_to_de_columns]

# Keep only the columns that are actually present, in the target order
existing_columns = [name for name in column_order if name in merged_df.columns]
merged_df = merged_df[existing_columns]

print("\nColumns in the merged DataFrame:")
list(merged_df.columns)


Columns in the merged DataFrame:


['Early Modern Bohemian German',
 'English',
 'DE_to_EN_000_example_prompt',
 'DE_to_EN_001_example_prompt',
 'DE_to_EN_002_example_prompt',
 'DE_to_EN_004_example_prompt',
 'DE_to_EN_008_example_prompt',
 'DE_to_EN_016_example_prompt',
 'DE_to_EN_032_example_prompt',
 'DE_to_EN_064_example_prompt',
 'DE_to_EN_128_example_prompt',
 'EN_to_DE_000_example_prompt',
 'EN_to_DE_001_example_prompt',
 'EN_to_DE_002_example_prompt',
 'EN_to_DE_004_example_prompt',
 'EN_to_DE_008_example_prompt',
 'EN_to_DE_016_example_prompt',
 'EN_to_DE_032_example_prompt',
 'EN_to_DE_064_example_prompt',
 'EN_to_DE_128_example_prompt']

# Clean merged_df

In [10]:
# Function to clean text entries
def clean_text(text):
    """
    Normalise whitespace and punctuation spacing in a string.

    The substitutions are applied in order:
    1. runs of two or more whitespace characters become a single space,
    2. runs of line breaks become a single space,
    3. whitespace directly before one of . , ; ! ? is removed,
    4. a typographic apostrophe (’) with whitespace on both sides becomes a
       plain ' with no surrounding whitespace,
    5. any remaining whitespace run becomes a single space.

    The result is stripped of leading and trailing whitespace.
    """
    substitutions = (
        (r'\s{2,}', ' '),
        (r'[\n\r]+', ' '),
        (r'\s+([.,;!?])', r'\1'),
        (r'\s+’\s+', "'"),
        (r'\s+', ' '),
    )

    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text.strip()

In [11]:
# Apply clean_text to every non-missing cell of merged_df.
# Missing values (NaN/None) are passed through unchanged: str() would turn them
# into the literal text 'nan'/'None', after which a dropna() can no longer
# recognise them as missing.
# Building a new frame (instead of assigning column by column) also avoids
# writing into a frame that was created by column selection.
# NOTE(review): assumes every cell is a scalar — pd.isna on a list-like cell
# would return an array; confirm against the input files.
merged_df = merged_df.map(lambda x: x if pd.isna(x) else clean_text(str(x)))

In [12]:
# Flag every cell that is the literal string 'NA' or a real missing value
# (NaN/None). One explicit mask drives both the count and the drop, so the
# reported numbers always describe exactly the cells that caused rows to be removed.
missing_mask = merged_df.isin(['NA']) | merged_df.isna()

# Per-column count of flagged cells, taken before any row is dropped
na_values_removed = missing_mask.sum()

# Drop every row that contains at least one flagged cell
merged_df = merged_df.loc[~missing_mask.any(axis=1)]

# Trim whitespaces and remove line breaks
merged_df = merged_df.map(lambda x: x.strip().replace('\n', '') if isinstance(x, str) else x)

# Reporting the number of 'NA' or missing values removed
print('NA values removed:', na_values_removed)

NA values removed: Early Modern Bohemian German    0
English                         0
DE_to_EN_000_example_prompt     0
DE_to_EN_001_example_prompt     0
DE_to_EN_002_example_prompt     0
DE_to_EN_004_example_prompt     0
DE_to_EN_008_example_prompt     0
DE_to_EN_016_example_prompt     0
DE_to_EN_032_example_prompt     0
DE_to_EN_064_example_prompt     0
DE_to_EN_128_example_prompt     0
EN_to_DE_000_example_prompt     0
EN_to_DE_001_example_prompt     0
EN_to_DE_002_example_prompt     0
EN_to_DE_004_example_prompt     0
EN_to_DE_008_example_prompt     0
EN_to_DE_016_example_prompt     0
EN_to_DE_032_example_prompt     0
EN_to_DE_064_example_prompt     0
EN_to_DE_128_example_prompt     0
dtype: int64


In [13]:
# Check merged_df: display its first rows (DataFrame.head defaults to 5) as the cell output
merged_df.head()

Unnamed: 0,Early Modern Bohemian German,English,DE_to_EN_000_example_prompt,DE_to_EN_001_example_prompt,DE_to_EN_002_example_prompt,DE_to_EN_004_example_prompt,DE_to_EN_008_example_prompt,DE_to_EN_016_example_prompt,DE_to_EN_032_example_prompt,DE_to_EN_064_example_prompt,DE_to_EN_128_example_prompt,EN_to_DE_000_example_prompt,EN_to_DE_001_example_prompt,EN_to_DE_002_example_prompt,EN_to_DE_004_example_prompt,EN_to_DE_008_example_prompt,EN_to_DE_016_example_prompt,EN_to_DE_032_example_prompt,EN_to_DE_064_example_prompt,EN_to_DE_128_example_prompt
0,1. dz Sie verschienen freytag als den 29 huig ...,"1. last Friday, the 29th, they came up to the ...",They arrived on the 29th day in great haste to...,"The village headman of Friedland, named Hans N...",They appeared free on the 29th day of high in ...,They arrived freely on the 29th of high day at...,They arrived freely on the 29th of high day in...,They arrived freely on the 29th of high in gre...,They arrived freely on the 29th of high in gre...,They arrived freely on the 29th of high in gre...,1. That you appeared on the 29th of high day a...,"1. Am letzten Freitag, dem 29sten, kamen sie z...","Der Schulteß Zu Friedland, kam zu dem Burhause...","1. Gestern Freitag, der 29ten, kamen sie in gr...","1. Gestern Freitag, den 29ten, kamen sie in gr...","Von Freitag, den 29. Maien, kamen sie in große...","Dem Tagt Friddays, den 29. Octobris, kamen sie...","Dem Tagelichsten Freitag, den 29. September, k...",Der Herrscher Zu Friedlandt kam am letzten Fre...,Demnach sie alle zu dem Schloss Fridland in gr...
1,Es ist Vor.12. iharen Vnnd mehr Zu Oberweigßdo...,"12 years ago and more, in Oberweigsdorf, Paul ...",It is before the 12th of April in Oberweigßdor...,It is before the 12th of January and more to O...,It is before the 12th of January and more to O...,It is reported. 12 years ago and more To Oberw...,It is before the 12th of their own and more to...,"It is known that Paul Appelt, deceased in Ober...","It is reported that Paul Apelt, deceased in Ob...","It is reported that Paul Apelt, who has been d...","Paul Apelt, who died on the 12th of their ance...",Zwölfig Jahre und mehr zuvor in Oberweigsdorf ...,Zwey jahre und mehr zuvor in Oberrhegisdorf is...,"Twelff Jahre vorher und mehr in Obergerwitz, i...",Twelff Jahre und mehr in Oberweygsdorf ist Pau...,"Twelff Jahre und mehr, in Obergerwiesdorff, Pa...",Vor 12 jaren und mehr in Obergerweigsdorf ist ...,Vier Jahre und mehr Vorher in Oberschleißheim ...,12 jahre und mehr Vorher in Oberweigsdorff sta...,Der alte Paul Apelt in Oberweigsdorff ist 12 j...
2,3. Novembris. Matz Apelt beschweret sich Vber ...,3. Novembris. Matz Apelt complains against Jac...,November. Matz Apelt complains about Jacob Sch...,November. Mathias Apelt complains about Jacob ...,"The complainant, Matz Apelt, complains about J...",November. Matz Apelt complains about Jacob Sch...,The 3rd of November. Matz Apelt complains agai...,"On 3 November, Matz Apelt complains about Jaco...","On 3 November, Matz Apelt complains against Ja...",3. Novembris. Matz Apelt complains against Jac...,3. November. Matz Apelt complains against Jaco...,Dreizehnter November. Matz Apelt klagt gegen J...,Dreys Ember Maii. Matz Apelt klagt wider Jacob...,Dreynembris. Matz Apelt klagt gegen Jacoeb Sch...,Dreissigste November. Matz Apelt klagt gegen J...,Drei Novembres. Matz Apelt klaget gegen Jacobi...,Dem 3. Novembris. Matz Apelt klaget gegen Jacc...,Dem 3. Novemb. Matz Apelt klaget gegen Jaccho ...,"Matz Apelt klaget Jacob Schwedtner, weil er si...",Demnach Matz Apelt Zu Friedlandt Klaget gegen ...
3,"Adam Kötter von d. Wittige, demnach er sich mi...","Adam Koetter from the Wittige, since he disput...","Adam Kötter von der Witte, according to whom h...","The village headman of Wittige, according to w...","Adam Kötter from Wittige, since he has been im...","Adam Kötter from Wittige, since he has quarrel...","The village headman of Wittige, since he had q...","Adam Kötter from Wittige, since he had with Ur...","Adam Kötter from the witty, since he with Ursu...","Adam Kötter from the Wittige, since he quarrel...","Adam Kötter from the Wittige, since he with Vr...","Adam Koetter von dem Wittigen, weil er in Wort...","Der Schultheß zu Witte, hat Adam Kötter von de...","Adam Koetter von der Wittige, demnach er in Wo...","Adam Koetter von Wittig, demnach er mit Ursula...","Adam Koetter von Wittige, demnach er mit Worte...","Dem Adam Koettert Zu Wittig, da er in Worten s...","Demnach Adam Koetter Zu Wittig, weil er mit Wo...","Adam Koetter Von der Wittige hat angelobt, das...","Adam Koetter Von der Wittige, demnach er mit U..."
4,Adam Krauß zumb Einsiedel saget auff sein gewi...,Adam Krauss in Einsiedel declares on his consc...,"Adam Krauß says on his oath that he has sworn,...","Adam Krauß says on his oath, I will do it upon...","Adam Krauß to Einsiedel says, upon his oath, t...",Adam Krauß says to his wife about what has hap...,"Adam Krauß says on his oath, that he wishes to...","Adam Krauß says on his oath, that he wishes to...","Adam Krauß says on his oath, that he wishes to...","Adam Krauß says to his confession, that he int...",Adam Krauß from Einsiedel says that he intends...,Adam Krauß in Einsiedl spricht auf seiner Gewi...,Adam Krauß zu Einsiedl spricht auf seiner Gewi...,Adam Krauß in Einsiedel bekennt sich auf sein ...,"Adam Krauß in Einsiedel, auf seiner Gewissen d...","Adam Krauß in Einsiedel, auf seiner Gewissheit...","Dem Adam Krauss in Einsiedel, auf seiner Gewis...","Dem Adam Krauss in Einsiedel, auf seiner Gewis...",Adam Krauss in Einsiedel klagt auf seiner Gewi...,Adam Krauss in Einsiedel beclagt auf seiner Ge...


# Save the Merged DataFrame

In [14]:
# Target file for the merged results, placed directly in the results directory
output_file = os.path.join(results_dir, "merged_results.json")

# Write one JSON record per line; force_ascii=False keeps non-ASCII characters unescaped
json_options = dict(orient='records', lines=True, force_ascii=False)
merged_df.to_json(output_file, **json_options)

print(f"\nMerged DataFrame saved to {output_file}", 'Done!', sep='\n')


Merged DataFrame saved to /cs/student/msc/csml/2023/ngriessh/historical_mt/results/meta/Llama-3.2-3B-Instruct-bnb-4bit/merged_results.json
Done!


# Summary of Paths Used

In [15]:
# Recap of the paths this notebook worked with
print("\nSummary of Paths Used:")
for label, path in (
    ("Base path", base_path),
    ("Results directory", results_dir),
    ("Output file", output_file),
):
    print(f"{label}: {path}")

# Report, for every direction/shot combination (including 'finetuning'),
# whether the expected input file is present on disk
print("\nData directories and file checks:")
for direction in ['DE_to_EN', 'EN_to_DE']:
    for shot in shots_list + ['finetuning']:
        is_finetuning = shot == 'finetuning'

        if isinstance(shot, int):
            file_name = f"{direction}_{shot:03}_example_prompt.json"
        elif is_finetuning:
            file_name = f"{direction}_finetuning.json"
        else:
            continue  # Skip invalid shot values

        base_folder = 'finetuning' if is_finetuning else 'icl'
        folder_path = os.path.join(results_dir, base_folder, direction)
        file_path = os.path.join(folder_path, file_name)
        file_exists = os.path.exists(file_path)
        print(f"File: {file_path} - Exists: {file_exists}")


Summary of Paths Used:
Base path: /cs/student/msc/csml/2023/ngriessh/historical_mt
Results directory: /cs/student/msc/csml/2023/ngriessh/historical_mt/results/meta/Llama-3.2-3B-Instruct-bnb-4bit
Output file: /cs/student/msc/csml/2023/ngriessh/historical_mt/results/meta/Llama-3.2-3B-Instruct-bnb-4bit/merged_results.json

Data directories and file checks:
File: /cs/student/msc/csml/2023/ngriessh/historical_mt/results/meta/Llama-3.2-3B-Instruct-bnb-4bit/icl/DE_to_EN/DE_to_EN_000_example_prompt.json - Exists: True
File: /cs/student/msc/csml/2023/ngriessh/historical_mt/results/meta/Llama-3.2-3B-Instruct-bnb-4bit/icl/DE_to_EN/DE_to_EN_001_example_prompt.json - Exists: True
File: /cs/student/msc/csml/2023/ngriessh/historical_mt/results/meta/Llama-3.2-3B-Instruct-bnb-4bit/icl/DE_to_EN/DE_to_EN_002_example_prompt.json - Exists: True
File: /cs/student/msc/csml/2023/ngriessh/historical_mt/results/meta/Llama-3.2-3B-Instruct-bnb-4bit/icl/DE_to_EN/DE_to_EN_004_example_prompt.json - Exists: True
Fil