In [None]:
!nvidia-smi
import torch
print("CUDA available:", torch.cuda.is_available())

In [None]:
!pip -q install -U \
  "pandas==2.2.2" \
  "numpy>=2.0,<2.1" \
  "pyarrow>=15,<18" \
  "huggingface_hub>=0.33.5,<2.0" \
  "datasets==3.6.0" \
  "transformers>=4.41.0" \
  "accelerate>=0.33.0" \
  "evaluate>=0.4.2" \
  "rouge-score>=0.1.2" \
  "bert-score>=0.3.13" \
  "sentencepiece>=0.2.0" \
  "sacrebleu>=2.4.0"


  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone


In [None]:
import os

# Send signal 9 to this very process, i.e. terminate the running kernel.
# NOTE(review): presumably used to force a runtime restart after the pip
# install above -- confirm; this cell will stop any "Run All" at this point.
own_pid = os.getpid()
os.kill(own_pid, 9)


In [None]:
import os, time, random

# Environment configuration has to happen BEFORE `datasets` (and, through it,
# `fsspec`) is imported: setting these variables after the import, as was done
# previously, comes too late to influence libraries that read their settings
# from the environment when they are first imported.
# NOTE(review): both libraries are understood to read these at import time --
# confirm against the pinned versions.
os.environ["FSSPEC_HTTP_TIMEOUT"] = "3600"
os.environ["HF_DATASETS_OFFLINE"] = "0"

import numpy as np
import torch

from datasets import load_dataset, concatenate_datasets, get_dataset_config_names
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)
import evaluate

# --- Reproducibility: seed every RNG this notebook touches -------------------
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# --- Data sources ------------------------------------------------------------
# Each entry is (dataset name, config name, language tag, split); the loading
# loop unpacks the tuples in exactly this order.
DATA_SOURCES = [
    ("GEM/wiki_lingua", "en", "en", "train"),
    ("GEM/wiki_lingua", "de", "de", "train"),
    ("GEM/wiki_lingua", "fr", "fr", "train"),
    ("mlsum", "de", "de", "train"),
    ("mlsum", "fr", "fr", "train"),
    ("csebuetnlp/xlsum", "english", "en", "train"),
    ("csebuetnlp/xlsum", "french", "fr", "train"),
]

# Upper bound on rows taken from each source before filtering.
MAX_PER_SOURCE = 600
# Fraction of the combined data held out for evaluation.
TEST_SIZE = 0.05

# Minimum character lengths a document / summary needs to be kept.
MIN_DOC_CHARS = 80
MIN_SUM_CHARS = 10

# --- Model / tokenization settings -------------------------------------------
# NOTE(review): the values below are not used in the cells visible here;
# presumably they are consumed by the tokenization and training cells further
# down -- confirm.
PREFIX = "summarize"
MAX_INPUT_LENGTH = 384
MAX_TARGET_LENGTH = 96

MODEL_NAME = "google/mt5-small"
OUT_DIR = "./mt5-multilingual-summarizer"
FINAL_DIR = "./mt5-multilingual-summarizer-final"

In [None]:
def safe_load_dataset(name, config=None, split="train", max_retries=3, retry_delay=10):
    """Load a dataset split with `load_dataset`, retrying when loading fails.

    Args:
        name: Dataset identifier passed to `load_dataset`.
        config: Optional configuration name passed to `load_dataset`.
        split: Split to load (defaults to "train").
        max_retries: Total number of attempts; must be at least 1.
        retry_delay: Seconds to wait between failed attempts (defaults to 10).

    Returns:
        Whatever `load_dataset` returns for the requested split.

    Raises:
        ValueError: If `max_retries` is smaller than 1 (previously this
            silently returned None).
        Exception: The error from the final attempt is re-raised unchanged.
    """
    if max_retries < 1:
        raise ValueError(f"max_retries must be >= 1, got {max_retries}")

    for attempt in range(1, max_retries + 1):
        try:
            ds = load_dataset(
                name,
                config,
                split=split,
                # SECURITY: this lets dataset loading code from the repository
                # run locally; only use with sources you trust.
                trust_remote_code=True,
                download_mode="reuse_dataset_if_exists",
                verification_mode="no_checks",
            )
        except Exception as err:
            if attempt == max_retries:
                # Out of attempts: surface the original error to the caller.
                raise
            # Report the failure so that retries are not invisible.
            print(
                f"Loading {name}/{config} failed "
                f"(attempt {attempt}/{max_retries}): {err!r}; "
                f"retrying in {retry_delay}s"
            )
            time.sleep(retry_delay)
        else:
            print(f"Loaded {name}/{config}")
            return ds

In [None]:
def config_supported(dataset_name, config_name):
    """Report whether `config_name` is among the configs of `dataset_name`.

    The lookup is best-effort: if listing the configurations (or the
    membership check itself) raises, the config is assumed to be supported
    and True is returned.
    """
    try:
        found = config_name in get_dataset_config_names(dataset_name)
    except Exception:
        # Could not determine the answer; optimistically let the caller try.
        found = True
    return found

In [None]:
def standardize_row(example):
    """Map a source-specific row onto a common {"document", "summary"} schema.

    For each output field a list of candidate keys is tried in order and the
    first truthy value wins. If no candidate is truthy, the value found under
    the last candidate key is used (None when that key is absent).
    """

    def pick(*keys):
        # Mirrors an `a or b or c` chain: stop at the first truthy value,
        # otherwise fall through to whatever the final key yielded.
        value = None
        for key in keys:
            value = example.get(key)
            if value:
                break
        return value

    return {
        "document": pick("text", "article", "source", "document"),
        "summary": pick("summary", "target", "highlights"),
    }

In [None]:
def is_valid(example):
    """Return True when a standardized row is usable.

    A row qualifies when both "document" and "summary" are strings and are at
    least MIN_DOC_CHARS and MIN_SUM_CHARS characters long, respectively.
    """
    document = example["document"]
    summary = example["summary"]

    # Reject rows where either field is missing or not text.
    if not isinstance(document, str) or not isinstance(summary, str):
        return False
    if len(document) < MIN_DOC_CHARS:
        return False
    return len(summary) >= MIN_SUM_CHARS

In [None]:
def add_lang(example, lang):
    """Tag `example` with its language code under the "lang" key.

    The mapping is modified in place and the same object is returned.
    """
    example.update(lang=lang)
    return example

In [None]:
# Build one balanced, shuffled corpus out of every configured source.
datasets_list = []

for ds_name, cfg, lang, split in DATA_SOURCES:
    if not config_supported(ds_name, cfg):
        # Make skipped sources visible instead of dropping them silently.
        print(f"Skipping {ds_name}/{cfg}: config not available")
        continue

    ds = safe_load_dataset(ds_name, cfg, split=split)
    # Cap the source first so the per-row work below stays cheap.
    ds = ds.select(range(min(len(ds), MAX_PER_SOURCE)))
    ds = ds.map(standardize_row, remove_columns=ds.column_names)
    ds = ds.filter(is_valid)
    # Pass the language through fn_kwargs rather than a per-iteration lambda.
    ds = ds.map(add_lang, fn_kwargs={"lang": lang})

    if len(ds) == 0:
        # An empty source would force min_len to 0 below and wipe out the
        # whole corpus, so leave it out and say so.
        print(f"Skipping {ds_name}/{cfg}: no valid examples after filtering")
        continue

    datasets_list.append(ds)

if not datasets_list:
    # Fail with a clear message instead of min() raising on an empty sequence.
    raise RuntimeError(
        "No usable data sources were loaded; check DATA_SOURCES and network access."
    )

# Balance the sources: every source contributes as many rows as the smallest.
min_len = min(len(source_ds) for source_ds in datasets_list)
datasets_list = [
    source_ds.shuffle(seed=SEED).select(range(min_len))
    for source_ds in datasets_list
]

full = concatenate_datasets(datasets_list).shuffle(seed=SEED)
print("Total examples:", len(full))

In [None]:
# Hold out a TEST_SIZE fraction of the combined data for evaluation; the fixed
# seed keeps the partition the same from run to run.
splits = full.train_test_split(seed=SEED, test_size=TEST_SIZE)
train_ds, eval_ds = splits["train"], splits["test"]

print("Train:", len(train_ds), "Eval:", len(eval_ds))