In [1]:
import os

import pandas as pd
import torch
from datasets import Dataset
from transformers import T5Tokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer

In [2]:
def generate_samples_csv(
    df,                        # pandas DataFrame containing both text columns
    model_path,                # path to your fine-tuned model
    source_col,                # column name for source text
    target_col,                # column name for reference translation
    direction_prompt,          # e.g. "translate MSA to EGY:"
    num_samples=10,            # maximum number of random examples to sample
    output_file="output.csv"   # path of the CSV to save
):
    """Translate a random sample of rows with a fine-tuned model and save a comparison CSV.

    Rows where either ``source_col`` or ``target_col`` is null are dropped, up to
    ``num_samples`` of the remaining rows are drawn with a fixed seed (29), each
    source text is prefixed with ``direction_prompt`` and passed through the
    model, and the result is written to ``output_file`` with three columns:
    the source text, the reference ("Original ... Translation") and the model
    output ("Generated ... Translation").

    Args:
        df: DataFrame that contains ``source_col`` and ``target_col``.
        model_path: Checkpoint location given to ``from_pretrained`` for both
            the model and the tokenizer.
        source_col: Name of the column holding the text to translate.
        target_col: Name of the column holding the reference translation.
        direction_prompt: Task prefix placed before every source text.
        num_samples: Upper bound on the number of rows sampled; when fewer
            usable rows exist, all of them are used.
        output_file: Destination CSV path. Missing parent directories are
            created.

    Raises:
        ValueError: If no row has both columns populated.
    """
    # 1. Sample from the data
    candidates = df[[source_col, target_col]].dropna()
    if candidates.empty:
        raise ValueError(
            f"No rows with both '{source_col}' and '{target_col}' available to sample from."
        )
    # Clamp the request: DataFrame.sample raises when n exceeds the number of rows.
    sample_size = min(num_samples, len(candidates))
    sample_df = candidates.sample(n=sample_size, random_state=29).reset_index(drop=True)

    # 2. Load model & tokenizer (fall back to CPU when no GPU is visible)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device).eval()
    tokenizer = T5Tokenizer.from_pretrained(model_path)

    # 3. Generate translations, one source sentence at a time
    predictions = []
    with torch.no_grad():
        for source_text in sample_df[source_col]:
            input_text = f"{direction_prompt} {source_text}"
            inputs = tokenizer(
                input_text, return_tensors="pt", truncation=True, max_length=128
            ).to(model.device)
            outputs = model.generate(**inputs, max_new_tokens=128)
            decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
            predictions.append(decoded.strip())

    # 4. Save to CSV
    output_df = pd.DataFrame({
        f"{source_col}": sample_df[source_col],
        f"Original {target_col} Translation": sample_df[target_col],
        f"Generated {target_col} Translation": predictions
    })

    # Create the destination folder first so a fresh checkout does not fail at
    # the very last step; non-path targets (e.g. open buffers) are passed through.
    if isinstance(output_file, (str, os.PathLike)):
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

    output_df.to_csv(output_file, index=False)


In [3]:
# Input tables; the calls below read the "MSA"/"EGY" and "MSA"/"LEV" columns from them.
em_df = pd.read_csv("./data/combined_egy_dev.csv")
lm_df = pd.read_csv("./data/combined_lev_dev.csv")

output_folder = "./generated_samples/"

em_model_path = "./arat5-styletransfer_egy_msa"
lm_model_path = "./arat5-styletransfer_lev_msa"

# One entry per translation direction: (data, checkpoint, source column, target column).
sample_jobs = [
    (em_df, em_model_path, "MSA", "EGY"),
    (lm_df, lm_model_path, "MSA", "LEV"),
    (lm_df, lm_model_path, "LEV", "MSA"),
    (em_df, em_model_path, "EGY", "MSA"),
]

# The prompt and the output file name are both derived from the column pair.
for job_df, job_model_path, src, tgt in sample_jobs:
    generate_samples_csv(
        df=job_df,
        model_path=job_model_path,
        source_col=src,
        target_col=tgt,
        direction_prompt=f"translate {src} to {tgt}:",
        num_samples=10,
        output_file=f"{output_folder}Samples_{src}_{tgt}.csv",
    )

In [6]:
def csv_to_formatted_txt(csv_file, txt_file):
    """Convert a CSV file into a plain-text listing with one labelled block per row.

    For every CSV row, each column is written as a ``<column name>:`` line
    followed by a line holding that row's value, in column order. Rows are
    separated by an empty line.

    Args:
        csv_file: Path of the CSV file to read; its first row is the header.
        txt_file: Path of the UTF-8 text file to write (overwritten if present).
    """
    # Read every cell as a literal string. Default type/NA inference would turn
    # an empty cell into the text "nan" and rewrite values such as "007" as 7.
    table = pd.read_csv(csv_file, dtype=str, keep_default_na=False)

    with open(txt_file, 'w', encoding='utf-8') as out:
        for record in table.itertuples(index=False, name=None):
            for column, value in zip(table.columns, record):
                out.write(f"{column}:\n{value}\n")
            out.write("\n")  # empty line between rows

# Render each generated sample CSV as a text file with the same base name.
for sample_name in ("Samples_MSA_EGY", "Samples_MSA_LEV", "Samples_LEV_MSA", "Samples_EGY_MSA"):
    csv_to_formatted_txt(f"{output_folder}{sample_name}.csv", f"{output_folder}{sample_name}.txt")