In [None]:
import os
import pandas as pd
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
from peft import PeftModel

# Hugging Face credentials. `setdefault` keeps a real token that is already
# exported in the environment instead of clobbering it with the placeholder;
# the placeholder is only written when HF_TOKEN is not set at all.
os.environ.setdefault("HF_TOKEN", "YOUR_HF_TOKEN")

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Value passed as `max_length` to `model.generate` inside `translate`.
MAX_LENGTH = 256

In [None]:
# Checkpoint identifiers: the PEFT adapter checkpoint and the base model it is
# attached to.
peft_model_id = "./madlad-lora-final"
base_model_name = "jbochi/madlad400-3b-mt"

# Load the base seq2seq weights in bfloat16 without a device map, then move
# the whole model to the selected device in a separate step.
base_model = T5ForConditionalGeneration.from_pretrained(
    base_model_name,
    dtype=torch.bfloat16,
    device_map=None,
)
base_model = base_model.to(device)

# The tokenizer is read from the adapter checkpoint path, not the base model.
tokenizer = T5Tokenizer.from_pretrained(peft_model_id)

# Wrap the base model with the adapter weights stored at `peft_model_id`.
model = PeftModel.from_pretrained(base_model, peft_model_id)

In [None]:
def translate(text, model, tokenizer, tgt_lang):
    """Translate a single string into ``tgt_lang`` using beam search.

    Args:
        text: Source text to translate.
        model: Model exposing ``parameters()`` and ``generate(...)``.
        tokenizer: Callable tokenizer whose output supports ``.to(device)``
            and which provides ``batch_decode``.
        tgt_lang: Target language code; it is embedded in the ``<2{tgt_lang}>``
            prefix placed in front of ``text``.

    Returns:
        The first decoded output sequence, with special tokens stripped.
    """
    # Send the inputs to wherever the model's weights live instead of a
    # hard-coded "cuda", so the function also works on CPU-only machines.
    model_device = next(model.parameters()).device
    inputs = tokenizer(f"<2{tgt_lang}> {text}", return_tensors="pt").to(model_device)
    outputs = model.generate(
        **inputs, max_length=MAX_LENGTH, num_beams=5, early_stopping=True
    )
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

In [None]:
# Load the test table that the translation cell below reads from.
bouquet_test = pd.read_parquet("bouquet_test_ce.parquet")

# Fail fast: the translation cell reads the `ce`, `ru` and `en` columns, and a
# missing one would otherwise only surface after earlier (slow) translation
# passes have already run.
missing_columns = {"ce", "ru", "en"} - set(bouquet_test.columns)
if missing_columns:
    raise KeyError(
        f"bouquet_test_ce.parquet is missing expected column(s): {sorted(missing_columns)}"
    )

In [None]:
# (source column, target language) pairs to translate; each pair yields a new
# "<src>2<tgt>_madlad" column, added in this order.
translation_directions = [("ce", "ru"), ("ru", "ce"), ("ce", "en"), ("en", "ce")]

for src_lang, tgt_lang in translation_directions:
    # `translate` requires the tokenizer as its third positional argument;
    # omitting it raises TypeError. `tgt_lang` is bound as a default so each
    # lambda keeps the language of its own iteration.
    bouquet_test[f"{src_lang}2{tgt_lang}_madlad"] = bouquet_test[src_lang].map(
        lambda text, tgt_lang=tgt_lang: translate(text, model, tokenizer, tgt_lang)
    )

In [None]:
# Write the table, including the added translation columns, to a new parquet
# file alongside the input.
bouquet_test.to_parquet(path="bouquet_test_ce_overall.parquet")