# Import libraries

In [1]:
import os
import re
import pandas as pd
from functools import reduce

# Set Paths

In [2]:
# Base path: four directory levels above the current working directory
parent_hops = ['..'] * 4
base_path = os.path.abspath(os.path.join(os.getcwd(), *parent_hops))

# Model parameters
unsloth_model_name = 'unsloth/gemma-2-9b-it-bnb-4bit'
company_name = 'alphabet'

# The part after the first '/' is used as the model's folder name
model_name = unsloth_model_name.split('/')[1]

In [5]:
# Directory that holds the result files for this company/model combination
results_dir = os.path.join(base_path, 'results', company_name, model_name)

# Shot counts to process: zero-shot first, then the powers of two 1, 2 and 4
shots_list = [0, *(2**i for i in range(3))]
#shots_list = [] # change for finetuning

# Echo the resolved configuration
print(f'Base path: {base_path}')
print(f'Results directory: {results_dir}')
print(f'Shots list: {shots_list}')

Base path: /cs/student/msc/csml/2023/ngriessh/historical_mt
Results directory: /cs/student/msc/csml/2023/ngriessh/historical_mt/results/alphabet/gemma-2-9b-it-bnb-4bit
Shots list: [0, 1, 2, 4]


# Function to Load JSON Files into DataFrames

In [6]:
def load_json_to_df(folder_path, direction, shot):
    """
    Read one JSON-lines results file and return its contents as a DataFrame.

    The file name is derived from `direction` and `shot`. A 'text' column is
    dropped when present; all other columns are returned unchanged. Every
    failure (invalid shot value, missing file, unparsable JSON) is reported
    with print() and results in an empty DataFrame rather than an exception.

    Parameters:
    - folder_path: Path to the folder containing the JSON file.
    - direction: File-name prefix, 'DE_to_EN' or 'EN_to_DE'.
    - shot: An integer shot number (zero-padded to three digits in the
      file name) or the string 'finetuning'.

    Returns:
    - A pandas DataFrame with the loaded data, or an empty DataFrame on failure.
    """
    # Derive the file name for this direction/shot combination
    if isinstance(shot, int):
        padded_shot = f"{shot:03}"
        file_name = f"{direction}_{padded_shot}_example_prompt.json"
    elif shot == 'finetuning':
        file_name = f"{direction}_finetuning.json"
    else:
        print(f"Invalid shot value: {shot}")
        return pd.DataFrame()

    file_path = os.path.join(folder_path, file_name)

    # Guard clause: nothing to load when the file is absent
    if not os.path.exists(file_path):
        print(f"File does not exist: {file_path}")
        return pd.DataFrame()

    print(f"File exists: {file_path}")
    try:
        # The file is parsed as JSON lines (one record per line)
        records = pd.read_json(file_path, lines=True)
    except ValueError as e:
        print(f"Error reading JSON file {file_name}: {e}")
        return pd.DataFrame()

    # Discard the 'text' column if it exists; errors='ignore' makes this a no-op otherwise
    return records.drop(columns=['text'], errors='ignore')

# Load and Merge DataFrames

In [10]:
# Successfully loaded DataFrames are collected here, in processing order
dataframes = []

# Visit every translation direction and every shot value
for direction in ('DE_to_EN', 'EN_to_DE'):
    for shot in shots_list:  # use `shots_list + ['finetuning']` here to include finetuning results
        # Finetuning results are stored under 'finetuning', everything else under 'icl'
        if shot == 'finetuning':
            base_folder = 'finetuning'
        else:
            base_folder = 'icl'
        folder_path = os.path.join(results_dir, base_folder, direction)
        print(f"\nProcessing directory: {folder_path}")

        df = load_json_to_df(folder_path, direction, shot)

        # An empty frame signals that nothing could be loaded; skip it
        if df.empty:
            print(f"No data loaded for direction '{direction}', shot: {shot}")
            continue

        print(f"Loaded data for direction '{direction}', shot: {shot}")
        dataframes.append(df)


Processing directory: /cs/student/msc/csml/2023/ngriessh/historical_mt/results/alphabet/gemma-2-9b-it-bnb-4bit/icl/DE_to_EN
File exists: /cs/student/msc/csml/2023/ngriessh/historical_mt/results/alphabet/gemma-2-9b-it-bnb-4bit/icl/DE_to_EN/DE_to_EN_000_example_prompt.json
Loaded data for direction 'DE_to_EN', shot: 0

Processing directory: /cs/student/msc/csml/2023/ngriessh/historical_mt/results/alphabet/gemma-2-9b-it-bnb-4bit/icl/DE_to_EN
File exists: /cs/student/msc/csml/2023/ngriessh/historical_mt/results/alphabet/gemma-2-9b-it-bnb-4bit/icl/DE_to_EN/DE_to_EN_001_example_prompt.json
Loaded data for direction 'DE_to_EN', shot: 1

Processing directory: /cs/student/msc/csml/2023/ngriessh/historical_mt/results/alphabet/gemma-2-9b-it-bnb-4bit/icl/DE_to_EN
File exists: /cs/student/msc/csml/2023/ngriessh/historical_mt/results/alphabet/gemma-2-9b-it-bnb-4bit/icl/DE_to_EN/DE_to_EN_002_example_prompt.json
Loaded data for direction 'DE_to_EN', shot: 2

Processing directory: /cs/student/msc/csml

# Merge All DataFrames

In [11]:
# Key columns on which all loaded DataFrames are joined
merge_keys = ['English', 'Early Modern Bohemian German']


def _merge_all(frames, how):
    """Fold `frames` pairwise, left to right, into one DataFrame joined on `merge_keys`."""
    return reduce(
        lambda left, right: pd.merge(left, right, on=merge_keys, how=how),
        frames,
    )


if not dataframes:
    # Nothing was loaded: fall back to an empty frame
    merged_df = pd.DataFrame()
    print("\nNo DataFrames to merge.")
else:
    # Outer join: keeps every key combination that occurs in any DataFrame
    merged_df = _merge_all(dataframes, 'outer')
    print("\nAll DataFrames merged successfully.")

    # Inner join: keeps only key combinations present in every DataFrame
    matches_df = _merge_all(dataframes, 'inner')
    num_matches = len(matches_df)
    print(f"Number of matches across all DataFrames: {num_matches}")


All DataFrames merged successfully.
Number of matches across all DataFrames: 1000


In [12]:
# Check column names: display the column index of the merged DataFrame as the cell output
merged_df.columns

Index(['Early Modern Bohemian German', 'English',
       'DE_to_EN_000_example_prompt', 'DE_to_EN_001_example_prompt',
       'DE_to_EN_002_example_prompt', 'DE_to_EN_004_example_prompt',
       'EN_to_DE_000_example_prompt', 'EN_to_DE_001_example_prompt',
       'EN_to_DE_002_example_prompt', 'EN_to_DE_004_example_prompt'],
      dtype='object')

# Define Column Order and Reorder DataFrame

In [13]:
# Shot values in ascending order, shared by both translation directions
ordered_shots = sorted(shots_list)

# DE_to_EN columns: one per shot value, followed by the finetuning column
de_to_en_columns = [f"DE_to_EN_{shot:03}_example_prompt" for shot in ordered_shots] + ["DE_to_EN_finetuning"]

# EN_to_DE columns: same layout as above
en_to_de_columns = [f"EN_to_DE_{shot:03}_example_prompt" for shot in ordered_shots] + ["EN_to_DE_finetuning"]

# Desired order: the two key columns first, then DE_to_EN, then EN_to_DE
column_order = ['Early Modern Bohemian German', 'English', *de_to_en_columns, *en_to_de_columns]

# Keep only the columns that are actually present, preserving the desired order
existing_columns = [col for col in column_order if col in merged_df.columns]
merged_df = merged_df[existing_columns]

print("\nColumns in the merged DataFrame:")
merged_df.columns.tolist()


Columns in the merged DataFrame:


['Early Modern Bohemian German',
 'English',
 'DE_to_EN_000_example_prompt',
 'DE_to_EN_001_example_prompt',
 'DE_to_EN_002_example_prompt',
 'DE_to_EN_004_example_prompt',
 'EN_to_DE_000_example_prompt',
 'EN_to_DE_001_example_prompt',
 'EN_to_DE_002_example_prompt',
 'EN_to_DE_004_example_prompt']

# Clean merged_df

In [14]:
# Function to clean text entries
def clean_text(text):
    """
    Strip model artefacts and prompt labels from a string and normalise its whitespace.

    Parameters:
    - text: The string to clean.

    Returns:
    - The cleaned string, with whitespace runs (including line breaks) collapsed
      to single spaces and leading/trailing whitespace removed.
    """
    # Patterns are removed one after another, case-insensitively, so order matters:
    # - the special tokens come first, because stripping '>' beforehand would
    #   leave '<eos' and '<end_of_turn' behind;
    # - '**Translating Text:**' comes before the bare '*' removal, which would
    #   otherwise destroy its asterisks;
    # - longer labels come before their own prefix ('early modern bohemia'),
    #   otherwise residue such as 'n german' or 'n Text' is left over.
    # NOTE(review): 'english' is removed wherever it occurs, including inside
    # genuine sentence content - confirm that this is intended.
    patterns_to_remove = [
        r'<eos>',  # Remove "<eos>"
        r'<end_of_turn>',  # Remove "<end_of_turn>"
        r'>',  # Remove every remaining '>'
        r'`',  # Remove all backticks
        r'\*\*Translating Text:\*\*',  # Remove "**Translating Text:**"
        r'\*',  # Remove all remaining asterisks
        r'#',  # Remove all hashtags
        r'translation:',  # Remove all occurrences of "translation:"
        r'early modern bohemian german',  # Remove "early modern bohemian german"
        r'early modern bohemian text',  # Remove "Early Modern Bohemian Text"
        r'early modern bohemia',  # Remove "early modern bohemia"
        r'english',  # Remove "English" in any casing
    ]

    # Remove patterns from the text
    for pattern in patterns_to_remove:
        text = re.sub(pattern, '', text, flags=re.IGNORECASE)

    # Remove a ':' left at the start of the string. This runs after the labels
    # are gone so that e.g. "English: ..." does not keep its colon.
    text = re.sub(r'^\s*:', '', text)

    # Remove unnecessary whitespace (including line breaks) before punctuation
    text = re.sub(r'\s+([.,;!?])', r'\1', text)

    # Fix spacing around apostrophes (replacing ’ with ')
    text = re.sub(r'\s+’\s+', "'", text)

    # Collapse every remaining whitespace run, line breaks included, to one space
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [15]:
# Apply clean_text to every string cell in merged_df.
# Non-string cells (e.g. NaN produced by the outer merge) are passed through
# unchanged: wrapping them in str() would turn them into the literal text
# 'nan' / 'None', which dropna() no longer recognises as missing.
merged_df = merged_df.map(lambda x: clean_text(x) if isinstance(x, str) else x)

In [16]:
# Textual markers treated as missing: the literal 'NA' plus the strings that
# str() yields for NaN / None, in case cells were stringified before this point.
missing_markers = ['NA', 'nan', 'None']

# Flag genuine nulls as well as the textual markers, then count them per column
missing_mask = merged_df.isna() | merged_df.isin(missing_markers)
na_values_removed = missing_mask.sum()

# Drop every row that has at least one missing entry
rows_before = len(merged_df)
merged_df = merged_df[~missing_mask.any(axis=1)]

# Trim whitespaces and remove line breaks
merged_df = merged_df.map(lambda x: x.strip().replace('\n', '') if isinstance(x, str) else x)

# Reporting the number of 'NA' or missing values removed
print('NA values removed:', na_values_removed)
print('Rows dropped:', rows_before - len(merged_df))

NA values removed: Early Modern Bohemian German    0
English                         0
DE_to_EN_000_example_prompt     0
DE_to_EN_001_example_prompt     0
DE_to_EN_002_example_prompt     0
DE_to_EN_004_example_prompt     0
EN_to_DE_000_example_prompt     0
EN_to_DE_001_example_prompt     0
EN_to_DE_002_example_prompt     0
EN_to_DE_004_example_prompt     0
dtype: int64


In [17]:
# Check merged_df: display its first rows (head() defaults to five) as the cell output
merged_df.head()

Unnamed: 0,Early Modern Bohemian German,English,DE_to_EN_000_example_prompt,DE_to_EN_001_example_prompt,DE_to_EN_002_example_prompt,DE_to_EN_004_example_prompt,EN_to_DE_000_example_prompt,EN_to_DE_001_example_prompt,EN_to_DE_002_example_prompt,EN_to_DE_004_example_prompt
0,1. dz Sie verschienen freytag als den 29 huig ...,"1. last Friday, the 29th, they came up to the ...",If you appear on Friday as the 29th in great n...,"They appeared on Friday, the 29th, in large nu...",1. When they appeared Friday as the 29th in la...,1. When they appeared Friday as the 29th in la...,"Vergangene Freitag den XXIX., kamen sie zu ein...",Ersterlich zu wissen ist es so gewesen; letzte...,"1. Letzten Freitagm dem 29., kamen sie zahlrei...","1. Letzten Freitagm, dem 29., kamen sie zahlre..."
1,Es ist Vor.12. iharen Vnnd mehr Zu Oberweigßdo...,"12 years ago and more, in Oberweigsdorf, Paul ...",It is before 12th [month] in their village mor...,It was before twelve years ago in Oberweigsdor...,It is before 12 o'clock here and more at Oberw...,Before twelve o’clock yesterday at Oberweigsdo...,Vor zwölff Jahren und mehr ist zu Oberweigsdor...,Vor zwölff Jahren und mehr zu Obernwegsdorff i...,Vor zwölff jahren und mehrn im oberweiggsdorff...,Vor 12 Jaren und mehr ist obir weiggsdorff pau...
2,3. Novembris. Matz Apelt beschweret sich Vber ...,3. Novembris. Matz Apelt complains against Jac...,November 3rd. Matz Apelt complains about Jacob...,November 3rd. Matthias Apel complains about Ja...,3 November. Matthias Apel complains about Jaco...,3 November. Matz Apel complains about Jacob Sc...,Dritte Novēmberis. Matz Apel klagt wider Jakob...,Den 3.Novembres beschwert sich Matts Apelth wi...,Den 3.Novembres. Matts Apel klagt wider Jakob ...,3.Novembres. Matthäus Apel klagte wider Jakob ...
3,"Adam Kötter von d. Wittige, demnach er sich mi...","Adam Koetter from the Wittige, since he disput...","Adam Kötter von der Wittege, inasmuch as he hi...","Adam Kötter von der Witte, inasmuch as he has ...","Adam Kötter from Wittig, because he quarreled ...","Adam Kötter von d. Wittig, since he quarreled ...","Adam Kötter von dem Witige, ob er mit Worten u...","Adame Kötter aus dem Witige, weil er mit Ursel...","Adam Kötter vom Witige, denn er im Wort und wi...","Adam Kötter vom wittigen, demnach er mit Worte..."
4,Adam Krauß zumb Einsiedel saget auff sein gewi...,Adam Krauss in Einsiedel declares on his consc...,Adam Krauss of Einsidtel says on his certain k...,"Adam Krauss of Einsidil says on his word, want...","Adam Krauss of Einsidil says upon his word, wa...","Adam Krauss zum Einsidle says upon his word, w...",Adam Kraus zu Einsidl bekennt auf sein Gewisse...,Adams Kraus zu Einsidil wider sein Gewissen er...,Adam Kraus in Ensiedl erklärt auf sein Gewisse...,Adam krauß in einsiedal erklärt auff sei Gewis...


# Save the Merged DataFrame

In [18]:
# Destination of the merged results, inside the results directory
output_file = os.path.join(results_dir, "merged_results.json")

# Write the merged DataFrame as JSON lines: one record per row
merged_df.to_json(path_or_buf=output_file, orient='records', lines=True)

print(f"\nMerged DataFrame saved to {output_file}")
print('Done!')


Merged DataFrame saved to /cs/student/msc/csml/2023/ngriessh/historical_mt/results/alphabet/gemma-2-9b-it-bnb-4bit/merged_results.json
Done!


# Summary of Paths Used

In [19]:
# Recap of the paths used above
print("\nSummary of Paths Used:")
print(f"Base path: {base_path}")
print(f"Results directory: {results_dir}")
print(f"Output file: {output_file}")

# Report, for every direction and shot value (plus 'finetuning'), whether the expected file exists
print("\nData directories and file checks:")
for direction in ['DE_to_EN', 'EN_to_DE']:
    for shot in shots_list + ['finetuning']:
        if isinstance(shot, int):
            # Integer shots live under 'icl' with a zero-padded shot number in the file name
            base_folder = 'icl'
            file_name = f"{direction}_{shot:03}_example_prompt.json"
        elif shot == 'finetuning':
            base_folder = 'finetuning'
            file_name = f"{direction}_finetuning.json"
        else:
            continue  # Skip invalid shot values

        file_path = os.path.join(results_dir, base_folder, direction, file_name)
        print(f"File: {file_path} - Exists: {os.path.exists(file_path)}")


Summary of Paths Used:
Base path: /cs/student/msc/csml/2023/ngriessh/historical_mt
Results directory: /cs/student/msc/csml/2023/ngriessh/historical_mt/results/alphabet/gemma-2-9b-it-bnb-4bit
Output file: /cs/student/msc/csml/2023/ngriessh/historical_mt/results/alphabet/gemma-2-9b-it-bnb-4bit/merged_results.json

Data directories and file checks:
File: /cs/student/msc/csml/2023/ngriessh/historical_mt/results/alphabet/gemma-2-9b-it-bnb-4bit/icl/DE_to_EN/DE_to_EN_000_example_prompt.json - Exists: True
File: /cs/student/msc/csml/2023/ngriessh/historical_mt/results/alphabet/gemma-2-9b-it-bnb-4bit/icl/DE_to_EN/DE_to_EN_001_example_prompt.json - Exists: True
File: /cs/student/msc/csml/2023/ngriessh/historical_mt/results/alphabet/gemma-2-9b-it-bnb-4bit/icl/DE_to_EN/DE_to_EN_002_example_prompt.json - Exists: True
File: /cs/student/msc/csml/2023/ngriessh/historical_mt/results/alphabet/gemma-2-9b-it-bnb-4bit/icl/DE_to_EN/DE_to_EN_004_example_prompt.json - Exists: True
File: /cs/student/msc/csml/