In [2]:
from transformers import T5Tokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer
from datasets import Dataset
import pandas as pd

In [6]:
# Load the parallel MSA/EGY corpus. Rows missing either side are dropped here:
# the .astype(str) casts below would otherwise turn a missing cell into the
# literal text "nan" and feed it to the model as a real sentence.
df = pd.read_csv('./data/combined_egy_train.csv').dropna(subset=['MSA', 'EGY'])

# Create MSA → EGY samples (task prefix + source sentence, paired with its target)
msa_to_egy = pd.DataFrame({
    'input': 'translate MSA to EGY: ' + df['MSA'].astype(str),
    'target': df['EGY'].astype(str)
})

# Create EGY → MSA samples (same sentence pairs, opposite direction)
egy_to_msa = pd.DataFrame({
    'input': 'translate EGY to MSA: ' + df['EGY'].astype(str),
    'target': df['MSA'].astype(str)
})

# Concatenate both directions; ignore_index gives the result a fresh 0..n-1 index
combined_df = pd.concat([msa_to_egy, egy_to_msa], ignore_index=True)

# Convert to Hugging Face dataset
dataset = Dataset.from_pandas(combined_df)

# Load tokenizer and model from the same pretrained checkpoint
BASE_CHECKPOINT = "UBC-NLP/AraT5v2-base-1024"
tokenizer = T5Tokenizer.from_pretrained(BASE_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(BASE_CHECKPOINT)

In [None]:
# Tokenize function
def preprocess(examples):
    """Tokenize a batch of prompt/target pairs for seq2seq training.

    Args:
        examples: Batch mapping with an "input" list (prefixed source
            sentences) and a "target" list (reference sentences).

    Returns:
        The tokenizer output for the inputs, with a "labels" entry holding the
        target token ids where every padding id has been replaced by -100.
    """
    # A single call encodes both sides. `text_target` replaces the deprecated
    # `as_target_tokenizer()` context manager and stores the target ids under
    # "labels". Both sides are truncated and padded to 128 tokens.
    model_inputs = tokenizer(
        examples["input"],
        text_target=examples["target"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )

    # Swap padding ids for -100, the label value that PyTorch's cross-entropy
    # loss ignores by default, so padded positions do not contribute to the loss.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(token_id if token_id != pad_id else -100) for token_id in label_seq]
        for label_seq in model_inputs["labels"]
    ]
    return model_inputs


# Hold out 20% of the examples for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): `dataset` stacks both translation directions, so the two
# directions of one sentence pair can end up in different splits — confirm
# that this overlap between train and eval is acceptable.
split_dataset = dataset.train_test_split(seed=29, test_size=0.2)

# Run `preprocess` over every split, one batch of examples at a time
tokenized = split_dataset.map(function=preprocess, batched=True)

Map:   0%|          | 0/28958 [00:00<?, ? examples/s]



Map:   0%|          | 0/7240 [00:00<?, ? examples/s]

In [None]:
# One location for checkpoints, the final model, and the tokenizer files
OUTPUT_DIR = "./arat5-styletransfer_egy_msa"

# Training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,  # effective train batch of 4 × 2 = 8 per device
    num_train_epochs=3,
    learning_rate=3e-4,
    eval_strategy="epoch",
    save_strategy="epoch",           # must match eval_strategy for load_best_model_at_end
    logging_strategy="steps",        # ← tells it to use logging_steps
    logging_steps=500,               # ← logs every 500 steps
    logging_dir="./logs",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    fp16=True
)


# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    # `tokenizer=` is deprecated on Trainer; `processing_class=` is its replacement
    processing_class=tokenizer,
)

# Train
trainer.train()

# Save the model and the tokenizer side by side so the directory can be reloaded as one unit
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,2.8915,2.446289
2,2.2713,2.236637
3,1.7661,2.187968


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


('./arat5-styletransfer\\tokenizer_config.json',
 './arat5-styletransfer\\special_tokens_map.json',
 './arat5-styletransfer\\spiece.model',
 './arat5-styletransfer\\added_tokens.json')

In [3]:
import torch
from transformers import AutoModelForSeq2SeqLM, T5Tokenizer

# Load your fine-tuned model from the output directory
model_path = "./arat5-styletransfer_egy_msa"

model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained(model_path)

# Use the GPU when one is available and fall back to the CPU otherwise,
# so this cell also runs on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Switch to inference mode and move the weights to the chosen device
model.eval().to(device)

T5ForConditionalGeneration(
  (shared): Embedding(110208, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(110208, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo

In [11]:
# Dev split with parallel "MSA" and "EGY" columns
test_df = pd.read_csv("./data/combined_egy_dev.csv")

# Both columns as text.
# NOTE(review): astype(str) renders a missing cell as the literal text "nan" —
# confirm the dev file has no empty cells.
msa_sentences = test_df["MSA"].astype(str)
egy_sentences = test_df["EGY"].astype(str)

# MSA → EGY evaluation pairs, using the same task prefix as in training
test_data_msa_egy = pd.DataFrame({
    "input": "translate MSA to EGY: " + msa_sentences,
    "target": egy_sentences,
})

# EGY → MSA evaluation pairs (opposite direction of the same rows)
test_data_egy_msa = pd.DataFrame({
    "input": "translate EGY to MSA: " + egy_sentences,
    "target": msa_sentences,
})

# Wrap each direction as its own Hugging Face Dataset
test_dataset_msa_egy = Dataset.from_pandas(test_data_msa_egy)
test_dataset_egy_msa = Dataset.from_pandas(test_data_egy_msa)

In [14]:
from tqdm import tqdm
import evaluate

def get_bleu_score(dataset, model, tokenizer):
    """Generate an output for every example and report the BLEU score.

    Args:
        dataset: Iterable of examples, each exposing "input" and "target" strings.
        model: Model providing a ``device`` attribute and a ``generate`` method.
        tokenizer: Tokenizer used to encode each prompt and decode the output.

    Returns:
        The "bleu" value computed by the ``evaluate`` BLEU metric; the same
        value is also printed.
    """
    metric = evaluate.load("bleu")
    hypotheses, gold = [], []

    for row in tqdm(dataset):
        # Encode the prompt and place it on the same device as the model
        encoded = tokenizer(row["input"], return_tensors="pt").to(model.device)

        # Generation is capped at 128 new tokens; only the first returned sequence is decoded
        generated = model.generate(**encoded, max_new_tokens=128)
        decoded = tokenizer.decode(generated[0], skip_special_tokens=True)

        hypotheses.append(decoded.strip())
        # The metric takes a list of references per prediction, hence the inner list
        gold.append([row["target"].strip()])

    score = metric.compute(predictions=hypotheses, references=gold)["bleu"]
    print("BLEU score:", score)
    return score

In [15]:
# Score the MSA → EGY direction first, then EGY → MSA, with the same model and tokenizer
bleu_score_msa_egy = get_bleu_score(
    dataset=test_dataset_msa_egy, model=model, tokenizer=tokenizer
)
bleu_score_egy_msa = get_bleu_score(
    dataset=test_dataset_egy_msa, model=model, tokenizer=tokenizer
)

100%|██████████| 1200/1200 [06:09<00:00,  3.25it/s]


BLEU score: 0.14787988242839034


100%|██████████| 1200/1200 [05:57<00:00,  3.36it/s]

BLEU score: 0.1950627783740159





In [16]:
import pickle

# Write both directions' BLEU values to one pickle file, keyed by direction
with open("AraT5_v2_EGY_MSA_bleu_scores.pkl", mode="wb") as out_file:
    scores_by_direction = dict(
        msa_to_egy=bleu_score_msa_egy,
        egy_to_msa=bleu_score_egy_msa,
    )
    pickle.dump(scores_by_direction, out_file)
