# Import libraries

In [1]:
import os
import pandas as pd
import json



# Set paths

In [2]:
# Project root: five directory levels above the current working directory
base_path = os.path.abspath(os.path.join(os.getcwd(), *(['..'] * 5)))

# Model identification (the model name is the part after the '/' of the hub id)
company_name = 'meta'
unsloth_model_name = 'unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit'
model_name = unsloth_model_name.split('/')[1]

# Both translation directions that are evaluated
translation_directions = ['DE_to_EN', 'EN_to_DE']

# Shot counts: zero-shot followed by the powers of two 1, 2, 4, ..., 64
shots_list = [0] + [2**exponent for exponent in range(7)]

# Zero-padded three-digit shot labels, plus the finetuning setting
formatted_shots_list = [format(shot, '03d') for shot in shots_list] + ['finetuning']

# Directory the model results are read from
results_dir_path = os.path.join(base_path, 'results', company_name, model_name)

# Directory the converted files are written to
input_files_dir_path = os.path.join(
    base_path,
    'scripts',
    company_name,
    model_name,
    'evaluation',
    'metricx_evaluation',
    'input_files',
)

# Show the resolved configuration, one setting per line
print(
    f'Base path: {base_path}',
    f'Model name: {model_name}',
    f'Company name: {company_name}',
    f'Translation directions: {translation_directions}',
    f'Shots list: {shots_list}',
    f'Formatted shots list: {formatted_shots_list}',
    f'Results directory path: {results_dir_path}',
    f'Input files directory path: {input_files_dir_path}',
    sep='\n',
)

Base path: /Users/niclasgriesshaber/Desktop/historical_mt
Model name: Meta-Llama-3.1-8B-Instruct-bnb-4bit
Company name: meta
Translation directions: ['DE_to_EN', 'EN_to_DE']
Shots list: [0, 1, 2, 4, 8, 16, 32, 64]
Formatted shots list: ['000', '001', '002', '004', '008', '016', '032', '064', 'finetuning']
Results directory path: /Users/niclasgriesshaber/Desktop/historical_mt/results/meta/Meta-Llama-3.1-8B-Instruct-bnb-4bit
Input files directory path: /Users/niclasgriesshaber/Desktop/historical_mt/scripts/meta/Meta-Llama-3.1-8B-Instruct-bnb-4bit/evaluation/metricx_evaluation/input_files


# Function to load a JSON Lines file

In [3]:
# Function to load a JSON Lines file (one JSON object per line)
def load_file(file_path):
    """Load a JSON Lines file into a list of parsed objects.

    Parameters
    ----------
    file_path : str or os.PathLike
        Path to a text file that holds one JSON document per line.

    Returns
    -------
    list
        The parsed JSON value of every non-blank line, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default encoding (the data contains non-ASCII characters).
    with open(file_path, 'r', encoding='utf-8') as f:
        # Skip whitespace-only lines (e.g. a trailing empty line), which
        # json.loads would otherwise reject.
        data = [json.loads(line) for line in f if line.strip()]
    return data

# Construct all input files for MetricX
The converted files are saved in the `input_files` folder.

In [8]:
# Convert every result file (all shot settings + finetuning, both directions)
# into a two-column JSON Lines file with the keys 'source' and 'hypothesis',
# stored under input_files_dir_path/<learning_type>/<translation_direction>/.
for translation_direction in translation_directions:

    # Define source and target language based on translation_direction
    if translation_direction == 'DE_to_EN':
        source_language = "Early Modern Bohemian German"
        target_language = "English"
    else:
        source_language = "English"
        target_language = "Early Modern Bohemian German"

    # For a given translation_direction go through all shots + finetuning
    for shot in formatted_shots_list:
        print(f'Processing {translation_direction}_{shot}')

        # The hypothesis column carries the same name as the result file (without '.json')
        if shot == 'finetuning':
            learning_type = 'finetuning'
            hypothesis_column = f'{translation_direction}_{shot}'
        else:
            learning_type = 'icl'
            hypothesis_column = f'{translation_direction}_{shot}_example_prompt'
        file_path = os.path.join(results_dir_path, learning_type, translation_direction, f'{hypothesis_column}.json')

        # Load json file as DataFrame
        print(f'Load file: {file_path}')
        current_df = pd.DataFrame(load_file(file_path))
        print(current_df.columns)

        # Transform the DataFrame: keep two columns and rename them.
        # NOTE(review): the *target-language* column is exported under the key
        # 'source' (source_language is never used) -- confirm that this is the
        # field the downstream MetricX run expects for this text.
        transformed_df = current_df[[target_language, hypothesis_column]].rename(columns={target_language: 'source', hypothesis_column: 'hypothesis'})

        # Converting each row to a JSON object, one per line
        json_lines = transformed_df.to_json(orient='records', lines=True)

        # Construct path to input file
        input_file_path = os.path.join(input_files_dir_path, learning_type, translation_direction, f'{hypothesis_column}.json')

        # Create the output folder if needed so the cell also runs on a fresh
        # checkout instead of failing with FileNotFoundError
        os.makedirs(os.path.dirname(input_file_path), exist_ok=True)

        # Save the transformed DataFrame to a file
        with open(input_file_path, 'w', encoding='utf-8') as f:
            f.write(json_lines)
        print(f'Saved to {input_file_path}')

# Print message
print('All files saved successfully!')

Processing DE_to_EN_000
Load file: /Users/niclasgriesshaber/Desktop/historical_mt/results/meta/Meta-Llama-3.1-8B-Instruct-bnb-4bit/icl/DE_to_EN/DE_to_EN_000_example_prompt.json
Index(['Early Modern Bohemian German', 'English', 'text',
       'DE_to_EN_000_example_prompt'],
      dtype='object')
Saved to /Users/niclasgriesshaber/Desktop/historical_mt/scripts/meta/Meta-Llama-3.1-8B-Instruct-bnb-4bit/evaluation/metricx_evaluation/input_files/icl/DE_to_EN/DE_to_EN_000_example_prompt.json
Processing DE_to_EN_001
Load file: /Users/niclasgriesshaber/Desktop/historical_mt/results/meta/Meta-Llama-3.1-8B-Instruct-bnb-4bit/icl/DE_to_EN/DE_to_EN_001_example_prompt.json
Index(['Early Modern Bohemian German', 'English', 'text',
       'DE_to_EN_001_example_prompt'],
      dtype='object')
Saved to /Users/niclasgriesshaber/Desktop/historical_mt/scripts/meta/Meta-Llama-3.1-8B-Instruct-bnb-4bit/evaluation/metricx_evaluation/input_files/icl/DE_to_EN/DE_to_EN_001_example_prompt.json
Processing DE_to_EN_00

In [25]:
# Single-file prototype: zero-shot ICL result for DE -> EN and its output location
file_result_path = os.path.join(
    results_dir_path, 'icl', 'DE_to_EN', 'DE_to_EN_000_example_prompt.json'
)
# NOTE(review): unlike the conversion loop, this path has no 'DE_to_EN'
# sub-folder, so the file lands directly in input_files/icl -- confirm intended.
save_input_file_path = os.path.join(
    input_files_dir_path, 'icl', 'DE_to_EN_000_example_prompt.json'
)

In [26]:
# Parse the result file, which holds one JSON object per line
with open(file_result_path, 'r') as result_file:
    data = list(map(json.loads, result_file))

# Turn the parsed records into a DataFrame
df = pd.DataFrame(data)

In [27]:
# Columns to keep, mapped to the names they are exported under.
# NOTE(review): 'English' is exported as 'source' here -- confirm this is the
# intended field for the English text in the DE -> EN setting.
column_mapping = {'English': 'source', 'DE_to_EN_000_example_prompt': 'hypothesis'}

# Keep only those columns and rename them
df_selected = df[list(column_mapping)].rename(columns=column_mapping)

# Serialise the rows as JSON Lines (one JSON object per row)
json_lines = df_selected.to_json(orient='records', lines=True)

In [28]:
# Write the JSON Lines text to the prototype output file
with open(save_input_file_path, 'w') as output_file:
    output_file.write(json_lines)