# Import libraries

In [1]:
import pandas as pd
import os

# Load training dataset

In [6]:
# Root of the historical_mt directory: two levels above the current working directory
base_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))

# Translation direction used for the ICL prompts: "DE_to_EN" or "EN_to_DE"
translation_direction = "EN_to_DE"

# Dataset directory and the training spreadsheet inside it
dataset_path = os.path.join(base_path, 'data', 'datasets')
train_dataset_path = os.path.join(dataset_path, "train_dataset.xlsx")

# Directory that holds the ICL example prompts
icl_examples_path = os.path.join(base_path, 'data', 'icl_examples')

# Show the resolved paths, one per line
print(train_dataset_path, icl_examples_path, sep="\n")

/cs/student/msc/csml/2023/ngriessh/historical_mt/data/datasets/train_dataset.xlsx
/cs/student/msc/csml/2023/ngriessh/historical_mt/data/icl_examples


# Load Excel file

In [7]:
# Read the training spreadsheet into a DataFrame and preview its first five rows
train_dataset = pd.read_excel(io=train_dataset_path)
train_dataset.head(n=5)

Unnamed: 0,Early Modern Bohemian German,English
0,"Der Schulteß Zu Mildenaw, hatt Hanß Nichten vn...",The village headman of Mildenau names Hans Nic...
1,"Jacob Seidel von hainerßdorff, demnach er Mich...","Jacob Seidel from Hainersdorf, since he attack..."
2,"Auf Grundtmans Klage antwortett der Scholtz, d...",To Grundtman's complaint the village headman a...
3,"Hans Nicht Von Mildenaw, demnach er mit seiner...","Hans Nicht von Mildenau, since he got into gre..."
4,"Richter Sagett Vnd clagett, dz Joachim Jacobiz...",The magistrate complains that Joachim Jacobiz ...


# Functions to create and save in-context learning (ICL) example prompts

In [8]:
# Define the zero-example prompt with just an empty line.
def create_base_prompt():
    """Return the base prompt, which consists of a single empty line."""
    empty_line = "\n"
    return empty_line

# Write the generated prompt to a file for 0, 1, 2, 4, 8, 16, ..., 128 examples.
def save_prompts_for_exponential_examples(train_dataset, direction=None, output_dir=None):
    """Write few-shot (ICL) prompt files for 0, 1, 2, 4, ..., 128 examples.

    One file named ``<NNN>_example_prompt.txt`` (``NNN`` is the requested
    example count, zero-padded to three digits) is written per count into
    ``<output_dir>/<direction>/``. Examples are taken from the top of
    ``train_dataset`` in row order.

    Args:
        train_dataset: DataFrame with the columns
            'Early Modern Bohemian German' and 'English'.
        direction: "DE_to_EN" puts the German text first and the English
            translation second; any other value puts the English text first
            and the German translation second. Defaults to the notebook-level
            ``translation_direction``.
        output_dir: Root directory for the prompt files. Defaults to the
            notebook-level ``icl_examples_path``.

    Notes:
        If the dataset has fewer rows than a requested count, the prompt
        contains only the available rows; the file name still carries the
        requested count.
    """
    # Fall back to the notebook-level configuration when not given explicitly.
    if direction is None:
        direction = translation_direction
    if output_dir is None:
        output_dir = icl_examples_path

    # Template for a single example, with the required line breaks
    if direction == "DE_to_EN":
        example_prompt = """Example {example_num}\nEarly Modern Bohemian German:\n{Old_German}\n\nTranslation to English:\n{Old_English}\n\n\n"""
    else:
        example_prompt = """Example {example_num}\nEnglish:\n{Old_English}\n\nTranslation to Early Modern Bohemian German:\n{Old_German}\n\n\n"""

    # Example counts to generate: 0, 1, 2, 4, ..., 128
    example_counts = [0] + [2**i for i in range(8)]

    # Build the example pool once (only as many rows as the largest prompt needs)
    # instead of iterating over the whole dataset for every prompt size.
    pool = train_dataset.head(max(example_counts))
    examples = [
        {"Old_German": german, "Old_English": english}
        for german, english in zip(pool['Early Modern Bohemian German'], pool['English'])
    ]

    # Create few-shot prompts
    def create_few_shot_prompt(num_examples=0):
        """Return the prompt text containing up to ``num_examples`` examples."""
        base_prompt = create_base_prompt()

        # Number of examples actually available for this prompt
        count = min(num_examples, len(examples))

        # No header without examples; singular/plural follows the emitted count
        if count == 0:
            examples_header = ""
        elif count == 1:
            examples_header = "### Example Translation:\n\n"
        else:
            examples_header = "### Example Translations:\n\n"

        formatted_examples = "".join(
            example_prompt.format(
                Old_German=example["Old_German"],
                Old_English=example["Old_English"],
                example_num=position,
            )
            for position, example in enumerate(examples[:count], start=1)
        )

        # Only the trailing newline is conditional. Previously the conditional
        # expression covered the whole concatenation, so the zero-example
        # prompt came out empty instead of the base prompt's single empty line.
        trailing_newline = "\n" if count > 0 else ""
        return base_prompt + examples_header + formatted_examples.strip() + trailing_newline

    # Make sure the direction-specific output directory exists before writing
    target_dir = os.path.join(output_dir, direction)
    os.makedirs(target_dir, exist_ok=True)

    for num_examples in example_counts:
        prompt = create_few_shot_prompt(num_examples=num_examples)
        # Format the num_examples with 3 digits, e.g., 000, 001, 002...
        formatted_num_examples = f"{num_examples:03}"
        save_file_path = os.path.join(target_dir, f"{formatted_num_examples}_example_prompt.txt")

        # Explicit UTF-8: the historical German text contains non-ASCII characters
        with open(save_file_path, 'w', encoding='utf-8') as f:
            f.write(prompt)
        print(f"Saved prompt with {num_examples} examples to {save_file_path}")

# Construct ICL examples

In [9]:
# Announce the configured translation direction, then write all prompt files for it
status_message = f"Constructing ICL examples for {translation_direction} translation direction"
print(status_message)
save_prompts_for_exponential_examples(train_dataset)

Constructing ICL examples for EN_to_DE translation direction
Saved prompt with 0 examples to /cs/student/msc/csml/2023/ngriessh/historical_mt/data/icl_examples/EN_to_DE/000_example_prompt.txt
Saved prompt with 1 examples to /cs/student/msc/csml/2023/ngriessh/historical_mt/data/icl_examples/EN_to_DE/001_example_prompt.txt
Saved prompt with 2 examples to /cs/student/msc/csml/2023/ngriessh/historical_mt/data/icl_examples/EN_to_DE/002_example_prompt.txt
Saved prompt with 4 examples to /cs/student/msc/csml/2023/ngriessh/historical_mt/data/icl_examples/EN_to_DE/004_example_prompt.txt
Saved prompt with 8 examples to /cs/student/msc/csml/2023/ngriessh/historical_mt/data/icl_examples/EN_to_DE/008_example_prompt.txt
Saved prompt with 16 examples to /cs/student/msc/csml/2023/ngriessh/historical_mt/data/icl_examples/EN_to_DE/016_example_prompt.txt
Saved prompt with 32 examples to /cs/student/msc/csml/2023/ngriessh/historical_mt/data/icl_examples/EN_to_DE/032_example_prompt.txt
Saved prompt with 64