In [None]:
!pip -q install -U transformers datasets evaluate sacrebleu accelerate sentencepiece

In [None]:
!pip -q install sacremoses

In [None]:
# Mount Google Drive at /content/drive; the dataset and result folders
# configured below live under that mount point.
from google.colab import drive
drive.mount("/content/drive")

In [None]:
import os, time, json
import numpy as np
from datasets import load_dataset, DatasetDict
import evaluate
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)

In [None]:
# Root of the frozen dataset splits: one sub-folder per language pair,
# read by load_frozen_split().
DATA_ROOT = "/content/drive/MyDrive/dataset_splits_opus100_10k"
# Root of all outputs: run_marianmt_direction() creates one sub-folder per direction.
OUT_ROOT  = "/content/drive/MyDrive/results_marianmt"
# Create the results root up front; a no-op when it already exists.
os.makedirs(OUT_ROOT, exist_ok=True)

In [None]:
# Metric objects used by eval_bleu_chrf(); loaded once and shared by every run.
bleu, chrf = (evaluate.load(metric_id) for metric_id in ("sacrebleu", "chrf"))

In [None]:
# (source language, target language) -> pretrained checkpoint to start from.
PAIR_TO_MODEL = dict(
    [
        (("en", "id"), "Helsinki-NLP/opus-mt-en-id"),
        (("id", "en"), "Helsinki-NLP/opus-mt-id-en"),
        (("en", "vi"), "Helsinki-NLP/opus-mt-en-vi"),
        (("vi", "en"), "Helsinki-NLP/opus-mt-vi-en"),
        (("en", "ko"), "Helsinki-NLP/opus-mt-tc-big-en-ko"),
        (("ko", "en"), "Helsinki-NLP/opus-mt-ko-en"),
    ]
)


In [None]:
# Both directions of a language pair read the same dataset folder.
PAIR_TO_FOLDER = {
    direction: folder
    for folder, (first, second) in (
        ("en_id", ("en", "id")),
        ("en_vi", ("en", "vi")),
        ("en_ko", ("en", "ko")),
    )
    for direction in ((first, second), (second, first))
}

In [None]:
def load_frozen_split(folder_name: str) -> DatasetDict:
    """Load the train/validation/test CSV files of one language-pair folder.

    Args:
        folder_name: Sub-folder of ``DATA_ROOT`` that contains ``train.csv``,
            ``val.csv`` and ``test.csv``.

    Returns:
        A ``DatasetDict`` with the keys ``train``, ``validation`` and ``test``.
    """
    csv_by_split = {"train": "train.csv", "validation": "val.csv", "test": "test.csv"}
    loaded = {}
    for split_name, csv_name in csv_by_split.items():
        csv_path = os.path.join(DATA_ROOT, folder_name, csv_name)
        # Each file is loaded on its own and requested via split="train".
        loaded[split_name] = load_dataset("csv", data_files=csv_path, split="train")
    return DatasetDict(**loaded)

In [None]:
def maybe_swap_columns(ds: DatasetDict, reverse: bool) -> DatasetDict:
    """Exchange the ``source`` and ``target`` columns when ``reverse`` is true.

    Args:
        ds: Dataset dictionary with ``train``, ``validation`` and ``test`` splits.
        reverse: When false, ``ds`` itself is returned untouched.

    Returns:
        ``ds`` unchanged, or a new ``DatasetDict`` whose three splits have the
        two columns swapped.
    """
    def _swap(ex):
        return {"source": ex["target"], "target": ex["source"]}

    if reverse:
        return DatasetDict(
            {split: ds[split].map(_swap) for split in ("train", "validation", "test")}
        )
    return ds

In [None]:
def tokenize_dataset(ds: DatasetDict, tokenizer, max_len=128) -> DatasetDict:
    """Tokenize the ``source``/``target`` columns of every split for seq2seq training.

    Args:
        ds: Dataset dictionary with ``train``, ``validation`` and ``test`` splits.
        tokenizer: Callable tokenizer; it is called once on the sources and once
            with ``text_target=`` on the targets.
        max_len: Truncation length applied to both sources and targets.

    Returns:
        A ``DatasetDict`` in which the original columns of each split are replaced
        by the tokenizer outputs plus a ``labels`` column holding the target ids.
    """
    def _tok(batch):
        encoded = tokenizer(batch["source"], max_length=max_len, truncation=True)
        target_encoding = tokenizer(
            text_target=batch["target"], max_length=max_len, truncation=True
        )
        encoded["labels"] = target_encoding["input_ids"]
        return encoded

    tokenized = {}
    for split in ("train", "validation", "test"):
        raw_split = ds[split]
        tokenized[split] = raw_split.map(
            _tok, batched=True, remove_columns=raw_split.column_names
        )
    return DatasetDict(tokenized)

In [None]:
def generate_predictions(model, tokenizer, texts, max_len=128, batch_size=16):
    """Translate ``texts`` in batches and return the decoded strings.

    Args:
        model: Generation model; it is switched to eval mode and its ``device``
            attribute decides where the encoded batches are moved.
        tokenizer: Tokenizer used to encode the inputs and decode the outputs.
        texts: Sliceable sequence of input strings.
        max_len: Truncation length for the inputs and ``max_length`` for generation.
        batch_size: Number of texts processed per ``generate`` call.

    Returns:
        A list with one decoded string per input, in input order.
    """
    model.eval()
    decoded = []
    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]
        encoded = tokenizer(
            chunk,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_len,
        )
        on_device = {name: tensor.to(model.device) for name, tensor in encoded.items()}
        with torch.no_grad():
            generated = model.generate(**on_device, max_length=max_len)
        decoded += tokenizer.batch_decode(generated, skip_special_tokens=True)
    return decoded

In [None]:
def eval_bleu_chrf(preds, refs):
    """Score predictions against single references with the global BLEU and chrF metrics.

    Args:
        preds: Predicted strings.
        refs: Reference strings, one per prediction; each is wrapped in its own
            one-element list before being handed to the metrics.

    Returns:
        ``(bleu_score, chrf_score)`` as Python floats, taken from the ``"score"``
        entry of each metric result.
    """
    scores = []
    for metric in (bleu, chrf):
        outcome = metric.compute(predictions=preds, references=[[ref] for ref in refs])
        scores.append(outcome["score"])
    bleu_score, chrf_score = scores
    return float(bleu_score), float(chrf_score)

import torch

In [None]:
def run_marianmt_direction(src_lang: str, tgt_lang: str,
                          epochs=2, batch_size=16, lr=5e-5,
                          max_len=128, sample_n=10):
    """Benchmark one translation direction before and after fine-tuning.

    The checkpoint registered in ``PAIR_TO_MODEL`` is scored on the frozen test
    split (BLEU and chrF), fine-tuned on the train split with the validation
    split evaluated every epoch, and then scored on the test split again.

    Args:
        src_lang: Source language code; ``(src_lang, tgt_lang)`` must be a key
            of ``PAIR_TO_MODEL``.
        tgt_lang: Target language code.
        epochs: Number of training epochs.
        batch_size: Per-device train/eval batch size, also used for the
            inference batches.
        lr: Learning rate passed to the trainer.
        max_len: Truncation length for tokenization and ``max_length`` for
            generation.
        sample_n: Number of evenly spaced test examples written to the
            before/after sample file.

    Returns:
        The metrics dictionary that is also written to ``metrics.json``.

    Raises:
        AssertionError: If the direction is not registered in ``PAIR_TO_MODEL``.

    Side effects:
        Writes ``checkpoints/``, ``metrics.json``, ``samples_before_after.csv``
        and ``final_model/`` under ``OUT_ROOT/<src>_to_<tgt>_marianmt``.
    """
    assert (src_lang, tgt_lang) in PAIR_TO_MODEL, (
        f"Unsupported direction {src_lang}->{tgt_lang}; "
        f"expected one of {sorted(PAIR_TO_MODEL)}"
    )

    model_name = PAIR_TO_MODEL[(src_lang, tgt_lang)]
    folder_name = PAIR_TO_FOLDER[(src_lang, tgt_lang)]
    reverse = (src_lang != "en")  # because saved datasets are EN->X canonical

    run_id = f"{src_lang}_to_{tgt_lang}_marianmt"
    out_dir = os.path.join(OUT_ROOT, run_id)
    os.makedirs(out_dir, exist_ok=True)

    # ---- Load frozen data ----
    ds = load_frozen_split(folder_name)
    ds = maybe_swap_columns(ds, reverse=reverse)

    # ---- Load model/tokenizer ----
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    model = model.float()
    # NOTE(review): kept from the original; whether this flag also affects
    # generate() depends on the installed transformers version - confirm.
    model.config.use_cache = False

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # Materialise the test columns as plain lists so that slicing and integer
    # indexing behave the same whatever column type `datasets` returns.
    test_src = list(ds["test"]["source"])
    test_ref = list(ds["test"]["target"])

    def _timed_translate(current_model):
        """Translate the test sources; return (predictions, elapsed seconds)."""
        started = time.perf_counter()
        preds = generate_predictions(
            current_model, tokenizer, test_src, max_len=max_len, batch_size=batch_size
        )
        return preds, time.perf_counter() - started

    # ---- Baseline inference on test set ----
    baseline_preds, baseline_time = _timed_translate(model)
    baseline_bleu, baseline_chrf = eval_bleu_chrf(baseline_preds, test_ref)

    # ---- Tokenize for fine-tuning ----
    tok_ds = tokenize_dataset(ds, tokenizer, max_len=max_len)
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

    # ---- Trainer ----
    args = Seq2SeqTrainingArguments(
        output_dir=os.path.join(out_dir, "checkpoints"),
        eval_strategy="epoch",
        save_strategy="epoch",
        # Keep only the checkpoint with the lowest validation loss.
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        save_total_limit=1,

        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        fp16=False,
        bf16=False,
        report_to="none",
        seed=42,

        label_smoothing_factor=0.1,
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=args,
        train_dataset=tok_ds["train"],
        eval_dataset=tok_ds["validation"],
        data_collator=data_collator,
    )

    # ---- Fine-tune ----
    train_started = time.perf_counter()
    trainer.train()
    train_time = time.perf_counter() - train_started

    model = trainer.model
    model.eval()

    # ---- After-FT inference on test set ----
    finetuned_preds, finetuned_time = _timed_translate(model)
    finetuned_bleu, finetuned_chrf = eval_bleu_chrf(finetuned_preds, test_ref)

    # ---- Save metrics ----
    gpu_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu"
    n_params = sum(p.numel() for p in model.parameters())

    metrics = {
        "direction": f"{src_lang}->{tgt_lang}",
        "model": model_name,
        "dataset_folder": folder_name,
        "reverse_columns_used": bool(reverse),
        "max_len": max_len,
        "epochs": epochs,
        "batch_size": batch_size,
        "learning_rate": lr,
        "gpu": gpu_name,
        "n_params": int(n_params),
        "baseline": {
            "bleu": baseline_bleu,
            "chrf": baseline_chrf,
            "inference_time_sec": float(baseline_time),
        },
        "finetuned": {
            "bleu": finetuned_bleu,
            "chrf": finetuned_chrf,
            "inference_time_sec": float(finetuned_time),
        },
        "train_time_sec": float(train_time),
        "trainer_log_history": trainer.state.log_history,
    }

    with open(os.path.join(out_dir, "metrics.json"), "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=2)

    # ---- Save sample outputs for error analysis ----
    # Evenly spaced indices across the test set (at most `sample_n` of them).
    idx = np.linspace(0, len(test_src)-1, num=min(sample_n, len(test_src)), dtype=int).tolist()
    samples = [
        {
            "source": test_src[k],
            "reference": test_ref[k],
            "before_ft": baseline_preds[k],
            "after_ft": finetuned_preds[k],
        }
        for k in idx
    ]

    import pandas as pd
    pd.DataFrame(samples).to_csv(os.path.join(out_dir, "samples_before_after.csv"), index=False, encoding="utf-8")

    # Save final model (optional, but useful)
    model.save_pretrained(os.path.join(out_dir, "final_model"))
    tokenizer.save_pretrained(os.path.join(out_dir, "final_model"))

    print(f"Done {src_lang}->{tgt_lang}")
    print(f"Baseline:  BLEU={baseline_bleu:.2f}, chrF={baseline_chrf:.2f}")
    print(f"Fine-tuned: BLEU={finetuned_bleu:.2f}, chrF={finetuned_chrf:.2f}")
    print(f"Saved to: {out_dir}")

    return metrics

In [None]:
# Quick sanity check: translate a few English sentences with the pretrained
# EN->KO checkpoint before starting the fine-tuning runs.
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "Helsinki-NLP/opus-mt-tc-big-en-ko"
# Pick the device the same way run_marianmt_direction() does, so the cell also
# runs on a runtime without a GPU instead of failing on a hard-coded "cuda".
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device).eval()

texts = [
    "Are you seeing anyone?",
    "We need to find cover now.",
    "I can't believe you did that.",
    "This is not what I expected.",
    "Where are you going tonight?"
]

enc = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=96).to(device)
out = model.generate(**enc, max_length=96, num_beams=4)
print(tokenizer.batch_decode(out, skip_special_tokens=True))

In [None]:
# Each language pair is run in both directions: EN->X first, then X->EN.
directions = [
    pair
    for lang in ("id", "vi", "ko")
    for pair in (("en", lang), (lang, "en"))
]

# Collect the metrics dictionary returned for every direction, in run order.
all_metrics = []
for src, tgt in directions:
    all_metrics.append(
        run_marianmt_direction(
            src,
            tgt,
            epochs=1,
            batch_size=8,
            lr=1e-5,
            max_len=96,
        )
    )