In [1]:
from transformers import T5Tokenizer, AutoModelForSeq2SeqLM
from datasets import Dataset
import pandas as pd

In [2]:
# Load the baseline AraT5v2 model
model_name = "UBC-NLP/AraT5v2-base-1024"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

model.eval().to("cuda")  # or "cpu" if no GPU

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


T5ForConditionalGeneration(
  (shared): Embedding(110208, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(110208, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo

In [3]:
test_df = pd.read_csv("./data/combined_lev_dev.csv")

# Format like you did for training
test_data_msa_lev = pd.DataFrame({
    "input": "translate MSA to LEV: " + test_df["MSA"].astype(str),
    "target": test_df["LEV"].astype(str)
})

# Convert to Hugging Face Dataset
test_dataset_msa_lev = Dataset.from_pandas(test_data_msa_lev)

test_data_lev_msa = pd.DataFrame({
    "input": "translate LEV to MSA: " + test_df["LEV"].astype(str),
    "target": test_df["MSA"].astype(str)
})

# Convert to Hugging Face Dataset
test_dataset_lev_msa = Dataset.from_pandas(test_data_lev_msa)

In [4]:
from tqdm import tqdm
import evaluate

def get_bleu_score(dataset, model, tokenizer):
    predictions = []
    references = []

    bleu = evaluate.load("bleu")

    for example in tqdm(dataset):
        input_text = example["input"]
        reference = example["target"]

        # Tokenize input and move to model device
        inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

        # Generate translation
        outputs = model.generate(**inputs, max_new_tokens=128)
        prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)

        predictions.append(prediction.strip())
        references.append([reference.strip()])  # BLEU expects list of references

    bleu_score = bleu.compute(predictions=predictions, references=references)
    print("BLEU score:", bleu_score["bleu"])
    return bleu_score["bleu"]

In [5]:
bleu_score_msa_lev = get_bleu_score(test_dataset_msa_lev, model, tokenizer)
bleu_score_lev_msa = get_bleu_score(test_dataset_lev_msa, model, tokenizer)

100%|██████████| 1200/1200 [04:09<00:00,  4.81it/s]


BLEU score: 0.0


100%|██████████| 1200/1200 [03:44<00:00,  5.35it/s]


BLEU score: 0.0


In [6]:
import pickle

# Save the BLEU score to a file
with open("Baseline_LEV_MSA_bleu_scores.pkl", "wb") as f:
    pickle.dump({
        "msa_to_lev": bleu_score_msa_lev,
        "lev_to_msa": bleu_score_lev_msa
    }, f)