In [1]:
from transformers import T5Tokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer
from datasets import Dataset
import pandas as pd

In [2]:
df = pd.read_csv('./data/combined_lev_train.csv')

msa_to_lev = pd.DataFrame({
    'input': 'translate MSA to LEV: ' + df['MSA'].astype(str),
    'target': df['LEV'].astype(str)
})

lev_to_msa = pd.DataFrame({
    'input': 'translate LEV to MSA: ' + df['LEV'].astype(str),
    'target': df['MSA'].astype(str)
})

# Concatenate both directions
combined_df = pd.concat([msa_to_lev, lev_to_msa], ignore_index=True)

# Convert to Hugging Face dataset
dataset = Dataset.from_pandas(combined_df)

# Load tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("UBC-NLP/AraT5v2-base-1024")
model = AutoModelForSeq2SeqLM.from_pretrained("UBC-NLP/AraT5v2-base-1024")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [3]:
# Tokenize function
def preprocess(examples):
    model_inputs = tokenizer(
        examples["input"], max_length=128, truncation=True, padding="max_length"
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["target"], max_length=128, truncation=True, padding="max_length"
        )

    labels["input_ids"] = [
        [(label if label != tokenizer.pad_token_id else -100) for label in label_seq]
        for label_seq in labels["input_ids"]
    ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


split_dataset = dataset.train_test_split(test_size=0.2, seed=29)

# Tokenize both splits
tokenized = split_dataset.map(preprocess, batched=True)

Map:   0%|          | 0/20961 [00:00<?, ? examples/s]



Map:   0%|          | 0/5241 [00:00<?, ? examples/s]

In [4]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./arat5-styletransfer_lev_msa",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,  # simulates batch size of 8
    num_train_epochs=3,
    learning_rate=3e-4,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",        # ← tells it to use logging_steps
    logging_steps=500,               # ← logs every 500 steps
    logging_dir="./logs",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    fp16=False
)


# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    tokenizer=tokenizer,
)

# Train
trainer.train()

# Save the model
trainer.save_model('./arat5-styletransfer_lev_msa')
tokenizer.save_pretrained('./arat5-styletransfer_lev_msa')

  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,2.7473,2.24242
2,1.5398,2.030324


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


('./arat5-styletransfer_lev_msa\\tokenizer_config.json',
 './arat5-styletransfer_lev_msa\\special_tokens_map.json',
 './arat5-styletransfer_lev_msa\\spiece.model',
 './arat5-styletransfer_lev_msa\\added_tokens.json')

In [5]:
from transformers import AutoModelForSeq2SeqLM, T5Tokenizer

# Load your fine-tuned model from the output directory
model_path = "./arat5-styletransfer_lev_msa"

model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained(model_path)

model.eval().to("cuda")  # or "cpu" if no GPU

T5ForConditionalGeneration(
  (shared): Embedding(110208, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(110208, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo

In [6]:
test_df = pd.read_csv("./data/combined_lev_dev.csv")

# Format like you did for training
test_data_msa_lev = pd.DataFrame({
    "input": "translate MSA to LEV: " + test_df["MSA"].astype(str),
    "target": test_df["LEV"].astype(str)
})

# Convert to Hugging Face Dataset
test_dataset_msa_lev = Dataset.from_pandas(test_data_msa_lev)

test_data_lev_msa = pd.DataFrame({
    "input": "translate LEV to MSA: " + test_df["LEV"].astype(str),
    "target": test_df["MSA"].astype(str)
})

# Convert to Hugging Face Dataset
test_dataset_lev_msa = Dataset.from_pandas(test_data_lev_msa)

In [7]:
from tqdm import tqdm
import evaluate

def get_bleu_score(dataset, model, tokenizer):
    predictions = []
    references = []

    bleu = evaluate.load("bleu")

    for example in tqdm(dataset):
        input_text = example["input"]
        reference = example["target"]

        # Tokenize input and move to model device
        inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

        # Generate translation
        outputs = model.generate(**inputs, max_new_tokens=128)
        prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)

        predictions.append(prediction.strip())
        references.append([reference.strip()])  # BLEU expects list of references

    bleu_score = bleu.compute(predictions=predictions, references=references)
    print("BLEU score:", bleu_score["bleu"])
    return bleu_score["bleu"]

In [8]:
bleu_score_msa_lev = get_bleu_score(test_dataset_msa_lev, model, tokenizer)
bleu_score_lev_msa = get_bleu_score(test_dataset_lev_msa, model, tokenizer)

100%|██████████| 1200/1200 [05:25<00:00,  3.69it/s]


BLEU score: 0.16974441065536836


100%|██████████| 1200/1200 [05:30<00:00,  3.63it/s]

BLEU score: 0.23496710733624357





In [9]:
import pickle

# Save the BLEU score to a file
with open("AraT5_v2_LEV_MSA_bleu_scores.pkl", "wb") as f:
    pickle.dump({
        "msa_to_lev": bleu_score_msa_lev,
        "lev_to_msa": bleu_score_lev_msa
    }, f)
