# Import libraries

In [1]:
import os
import re
import pandas as pd
from functools import reduce

# Set Paths

In [2]:
# Base path: four directory levels above the current working directory
base_path = os.path.abspath(os.path.join(os.getcwd(), *([os.pardir] * 4)))

# Model parameters
company_name = 'alphabet'
unsloth_model_name = 'unsloth/gemma-2-2b-it-bnb-4bit'

# Model name without the leading '<organisation>/' part of the identifier
model_name = unsloth_model_name.split('/')[1]

In [3]:
# Results directory path
results_dir = os.path.join(base_path, 'results', company_name, model_name)

# Switch for the in-context-learning (ICL) shot results.
#   False -> no shot values; the loading loop then only processes 'finetuning'
#   True  -> shot values 0, 1, 2 and 4 are processed as well
# This replaces the previous pattern of building the list and then
# overwriting it with [] on the next line.
include_icl_shots = False

# List of shot values: 0 followed by the powers of two 1, 2, 4
shots_list = ([0] + [2**i for i in range(3)]) if include_icl_shots else []

# Print paths
print(f'Base path: {base_path}')
print(f'Results directory: {results_dir}')
print(f'Shots list: {shots_list}')

Base path: /cs/student/msc/csml/2023/ngriessh/historical_mt
Results directory: /cs/student/msc/csml/2023/ngriessh/historical_mt/results/alphabet/gemma-2-2b-it-bnb-4bit
Shots list: []


# Function to Load JSON Files into DataFrames

In [4]:
def load_json_to_df(folder_path, direction, shot):
    """
    Load one JSON-lines results file into a DataFrame.

    The file name is derived from `direction` and `shot`:
    - integer shot:          "{direction}_{shot:03}_example_prompt.json"
    - shot == 'finetuning':  "{direction}_finetuning.json"

    A 'text' column is dropped if present; no columns are renamed.

    Parameters:
    - folder_path: Path to the folder containing the JSON files.
    - direction: 'DE_to_EN' or 'EN_to_DE'.
    - shot: An integer for shot number or 'finetuning'.

    Returns:
    - A pandas DataFrame with the loaded data, or an empty DataFrame when
      `shot` is invalid, the file is missing (or is not a regular file),
      or the file cannot be parsed.
    """
    # Construct the file name based on 'shot'.
    # bool is a subclass of int, so it is excluded explicitly; otherwise
    # True/False would silently be treated as shot 1/0.
    if isinstance(shot, int) and not isinstance(shot, bool):
        file_name = f"{direction}_{shot:03}_example_prompt.json"
    elif shot == 'finetuning':
        file_name = f"{direction}_finetuning.json"
    else:
        print(f"Invalid shot value: {shot}")
        return pd.DataFrame()

    file_path = os.path.join(folder_path, file_name)

    # isfile (not exists): a directory with that name would make read_json
    # raise an OSError that the ValueError handler below does not catch.
    if not os.path.isfile(file_path):
        print(f"File does not exist: {file_path}")
        return pd.DataFrame()

    print(f"File exists: {file_path}")
    try:
        # Read the JSON-lines file (one record per line) into a DataFrame
        df = pd.read_json(file_path, lines=True)
    except ValueError as e:
        print(f"Error reading JSON file {file_name}: {e}")
        return pd.DataFrame()

    # Remove 'text' column if it exists (returns a new frame, no in-place mutation)
    return df.drop(columns=['text'], errors='ignore')

# Load DataFrames

In [5]:
# Collect one DataFrame per (direction, shot) combination that yields data
dataframes = []

for direction in ('DE_to_EN', 'EN_to_DE'):
    # Every configured shot value, followed by the finetuning results
    for shot in [*shots_list, 'finetuning']:

        # Finetuning results are stored under 'finetuning', all shot results under 'icl'
        base_folder = 'icl' if shot != 'finetuning' else 'finetuning'
        folder_path = os.path.join(results_dir, base_folder, direction)
        print(f"\nProcessing directory: {folder_path}")

        df = load_json_to_df(folder_path, direction, shot)

        # Nothing usable came back: report it and move on to the next combination
        if df.empty:
            print(f"No data loaded for direction '{direction}', shot: {shot}")
            continue

        print(f"Loaded data for direction '{direction}', shot: {shot}")
        dataframes.append(df)


Processing directory: /cs/student/msc/csml/2023/ngriessh/historical_mt/results/alphabet/gemma-2-2b-it-bnb-4bit/finetuning/DE_to_EN
File exists: /cs/student/msc/csml/2023/ngriessh/historical_mt/results/alphabet/gemma-2-2b-it-bnb-4bit/finetuning/DE_to_EN/DE_to_EN_finetuning.json
Loaded data for direction 'DE_to_EN', shot: finetuning

Processing directory: /cs/student/msc/csml/2023/ngriessh/historical_mt/results/alphabet/gemma-2-2b-it-bnb-4bit/finetuning/EN_to_DE
File exists: /cs/student/msc/csml/2023/ngriessh/historical_mt/results/alphabet/gemma-2-2b-it-bnb-4bit/finetuning/EN_to_DE/EN_to_DE_finetuning.json
Loaded data for direction 'EN_to_DE', shot: finetuning


# Merge All DataFrames

In [6]:
# Key columns on which all loaded DataFrames are joined
merge_keys = ['English', 'Early Modern Bohemian German']

if not dataframes:
    # Nothing was loaded: continue with an empty frame
    merged_df = pd.DataFrame()
    print("\nNo DataFrames to merge.")
else:
    # Outer join: keep every key pair that occurs in at least one DataFrame
    merged_df = reduce(
        lambda acc, nxt: acc.merge(nxt, on=merge_keys, how='outer'),
        dataframes,
    )
    print("\nAll DataFrames merged successfully.")

    # Inner join: keep only the key pairs present in every DataFrame
    matches_df = reduce(
        lambda acc, nxt: acc.merge(nxt, on=merge_keys, how='inner'),
        dataframes,
    )
    num_matches = len(matches_df)
    print(f"Number of matches across all DataFrames: {num_matches}")


All DataFrames merged successfully.
Number of matches across all DataFrames: 1000


In [7]:
# Inspect the column names of merged_df (displayed as the cell output)
merged_df.columns

Index(['Early Modern Bohemian German', 'English', 'DE_to_EN_finetuning',
       'EN_to_DE_finetuning'],
      dtype='object')

# Define Column Order and Reorder DataFrame

In [8]:
# Shot values in ascending order, shared by both translation directions
ordered_shots = sorted(shots_list)

# DE_to_EN result columns: one per shot, then the finetuning column
de_to_en_columns = [
    f"DE_to_EN_{shot:03}_example_prompt" for shot in ordered_shots
] + ["DE_to_EN_finetuning"]

# EN_to_DE result columns: one per shot, then the finetuning column
en_to_de_columns = [
    f"EN_to_DE_{shot:03}_example_prompt" for shot in ordered_shots
] + ["EN_to_DE_finetuning"]

# Desired layout: the two text columns first, then DE_to_EN, then EN_to_DE results
column_order = [
    'Early Modern Bohemian German',
    'English',
    *de_to_en_columns,
    *en_to_de_columns,
]

# Keep only the columns that are actually present, in the desired order
existing_columns = [name for name in column_order if name in merged_df.columns]
merged_df = merged_df[existing_columns]

print("\nColumns in the merged DataFrame:")
list(merged_df.columns)


Columns in the merged DataFrame:


['Early Modern Bohemian German',
 'English',
 'DE_to_EN_finetuning',
 'EN_to_DE_finetuning']

# Clean merged_df

In [9]:
# Function to clean text entries
def clean_text(text):
    """
    Remove prompt/markup residue from a text and normalise its whitespace.

    Each pattern below is removed case-insensitively, one after another, so
    the order matters: a pattern must come before any shorter pattern that
    would destroy part of it.

    Parameters:
    - text: The string to clean.

    Returns:
    - The cleaned string: patterns removed, whitespace (including line
      breaks) collapsed to single spaces, no space before '.,;!?', and
      leading/trailing whitespace stripped.
    """
    patterns_to_remove = [
        # Special tokens go first: they contain '>' and would no longer
        # match once the bare '>' pattern below has run.
        r'<eos>',  # Remove "<eos>"
        r'<end_of_turn>',  # Remove "<end_of_turn>"
        r'>',  # Remove every '>' (anywhere in the text)
        r'^:',  # Remove ':' when it is the very first character
        r'`',  # Remove all backticks
        r'\*\*Translating Text:\*\*',  # Remove "**Translating Text:**" before the bare '*'
        r'\*',  # Remove all remaining asterisks
        r'#',  # Remove all hashtags
        r'translation:',  # Remove all occurrences of "translation:"
        r'English',  # Remove "English" in any casing (matching is case-insensitive)
        # Longer phrases before their shared prefix "early modern bohemia";
        # the other way round leaves "n german" / "n Text" behind.
        r'early modern bohemian german',
        r'Early Modern Bohemian Text',
        r'early modern bohemia',
    ]

    # Remove patterns from the text
    for pattern in patterns_to_remove:
        text = re.sub(pattern, '', text, flags=re.IGNORECASE)

    # Remove whitespace (including line breaks) before punctuation
    text = re.sub(r'\s+([.,;!?])', r'\1', text)

    # Replace a whitespace-surrounded ’ with a plain, unspaced apostrophe
    text = re.sub(r'\s+’\s+', r"'", text)

    # Collapse every remaining whitespace run (spaces, tabs, line breaks)
    # into a single space and trim both ends
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [10]:
def _clean_cell(value):
    """Clean one cell with clean_text, passing missing values through unchanged."""
    # str(NaN) / str(None) would yield the literal texts 'nan' / 'None',
    # which a subsequent dropna() can no longer recognise as missing.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return value
    return clean_text(str(value))


# Apply clean_text to each column in merged_df (missing values stay missing)
for column in merged_df.columns:
    merged_df[column] = merged_df[column].apply(_clean_cell)

In [11]:
# Treat the placeholder string 'NA' as a missing value.
# The dict form makes pandas replace with None itself, independent of how a
# given pandas version interprets a positional value of None.
merged_with_missing = merged_df.replace({'NA': None})

# Count the missing cells per column on the same frame that dropna() sees,
# so 'NA' placeholders, None and NaN are all included in the report
na_values_removed = merged_with_missing.isna().sum()

# Drop every row that contains at least one missing value
merged_df = merged_with_missing.dropna()

# Trim whitespaces and remove line breaks
merged_df = merged_df.map(lambda x: x.strip().replace('\n', '') if isinstance(x, str) else x)

# Reporting the number of 'NA' or missing values removed
print('NA values removed:', na_values_removed)

NA values removed: Early Modern Bohemian German    0
English                         0
DE_to_EN_finetuning             0
EN_to_DE_finetuning             0
dtype: int64


In [12]:
# Preview the first rows of merged_df after cleaning and NA removal
merged_df.head()

Unnamed: 0,Early Modern Bohemian German,English,DE_to_EN_finetuning,EN_to_DE_finetuning
0,1. Ein Ambt befehlich an h. Joachim vom Eberha...,1. An manorial court command is to be made to ...,A manorial court's command was issued on appli...,An einen Ambt befehlich an den h.: Jochim Vonn...
1,"1. die lehenß Vnderthanen im Winckel, sollen h...",1. The fief serfs in the Winckel shall hencefo...,The fief-serfs are supposed henceforth for ent...,1. die Lehen Vnterthanen im winckeln sollen hi...
2,1. dz Sie verschienen freytag als den 29 huig ...,"1. last Friday, the 29th, they came up to the ...",That on last Friday namely the 29th they came ...,1st Vnterschreibungen am Freytag Verwichener Z...
3,Es ist Vor.12. iharen Vnnd mehr Zu Oberweigßdo...,"12 years ago and more, in Oberweigsdorf, Paul ...",Twelve years ago in Upper Weigsdorff died off ...,Vor Vngefehr Zwantzig Jahren vnd mehr ist Zu o...
4,Ao 1661. Martius. Mildenaw. Christoph blumbrig...,1661. march. Mildenaw. Christoph Blumbrig's co...,March Anno 1661 in mildenau; complaint of Chri...,Anno p 1661 Martij


# Save the Merged DataFrame

In [13]:
# Save the merged DataFrame to a JSON file
output_file = os.path.join(results_dir, "merged_results.json")
merged_df.to_json(output_file, orient='records', lines=True)

print(f"\nMerged DataFrame saved to {output_file}")
print('Done!')


Merged DataFrame saved to /cs/student/msc/csml/2023/ngriessh/historical_mt/results/alphabet/gemma-2-2b-it-bnb-4bit/merged_results.json
Done!


# Summary of Paths Used

In [14]:
# Recap of the main paths used by this notebook
print("\nSummary of Paths Used:")
print(f"Base path: {base_path}")
print(f"Results directory: {results_dir}")
print(f"Output file: {output_file}")

# Report, for every expected input file, whether it exists on disk
print("\nData directories and file checks:")
for direction in ('DE_to_EN', 'EN_to_DE'):
    for shot in [*shots_list, 'finetuning']:
        # Expected file name: zero-padded shot number or the finetuning file
        if isinstance(shot, int):
            formatted_shot = f"{shot:03}"
            file_name = f"{direction}_{formatted_shot}_example_prompt.json"
        elif shot == 'finetuning':
            file_name = f"{direction}_finetuning.json"
        else:
            continue  # Skip invalid shot values

        # Finetuning files are under 'finetuning', shot files under 'icl'
        base_folder = 'icl' if shot != 'finetuning' else 'finetuning'
        folder_path = os.path.join(results_dir, base_folder, direction)
        file_path = os.path.join(folder_path, file_name)

        file_exists = os.path.exists(file_path)
        print(f"File: {file_path} - Exists: {file_exists}")


Summary of Paths Used:
Base path: /cs/student/msc/csml/2023/ngriessh/historical_mt
Results directory: /cs/student/msc/csml/2023/ngriessh/historical_mt/results/alphabet/gemma-2-2b-it-bnb-4bit
Output file: /cs/student/msc/csml/2023/ngriessh/historical_mt/results/alphabet/gemma-2-2b-it-bnb-4bit/merged_results.json

Data directories and file checks:
File: /cs/student/msc/csml/2023/ngriessh/historical_mt/results/alphabet/gemma-2-2b-it-bnb-4bit/finetuning/DE_to_EN/DE_to_EN_finetuning.json - Exists: True
File: /cs/student/msc/csml/2023/ngriessh/historical_mt/results/alphabet/gemma-2-2b-it-bnb-4bit/finetuning/EN_to_DE/EN_to_DE_finetuning.json - Exists: True
