Verifikimi i perdorimit te GPU NVIDIA CUDA

In [None]:
# Run the nvidia-smi command-line tool through the notebook's shell escape.
!nvidia-smi
import torch
# Print whether PyTorch reports a usable CUDA device.
print("CUDA available:", torch.cuda.is_available())

Instalimi i paketave te nevojshme per projektin:

In [None]:
# Quietly (-q) install or upgrade (-U) the packages the notebook relies on,
# using the version constraints listed below.
# NOTE(review): "transformers" has no upper bound here - confirm that the
# release pip resolves still accepts the arguments used by the training cell
# (e.g. Seq2SeqTrainer(tokenizer=...)).
!pip -q install -U \
  "pandas==2.2.2" \
  "numpy>=2.0,<2.1" \
  "pyarrow>=15,<18" \
  "huggingface_hub>=0.33.5,<2.0" \
  "datasets==3.6.0" \
  "transformers>=4.41.0" \
  "accelerate>=0.33.0" \
  "evaluate>=0.4.2" \
  "rouge-score>=0.1.2" \
  "bert-score>=0.3.13" \
  "sentencepiece>=0.2.0" \
  "sacrebleu>=2.4.0"


Rifreskim i mjedisit punues pas instalimit te paketave te projektit

In [None]:
import os
# Send signal 9 to the current process. The notebook heading describes this as
# refreshing the working environment after the package installation; nothing
# placed after this line in the same cell will run.
os.kill(os.getpid(), 9)

Pergatitja e listes se burimeve te te dhenave dhe modelit

In [None]:
import os, time, random
import numpy as np
import torch

from datasets import load_dataset, concatenate_datasets, get_dataset_config_names
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)
import evaluate

# NOTE(review): these variables are assigned after `datasets` has already been
# imported above - confirm the libraries read them lazily; otherwise they need
# to be set before the imports to have any effect.
os.environ["FSSPEC_HTTP_TIMEOUT"] = "3600"
os.environ["HF_DATASETS_OFFLINE"] = "0"

# One seed shared by the Python, NumPy and PyTorch random number generators,
# and reused later for dataset shuffling and the train/eval split.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Each entry is (dataset name, config name, language tag, split); the
# dataset-building loop unpacks the tuples in exactly this order.
DATA_SOURCES = [
    ("GEM/wiki_lingua", "en", "en", "train"),
    ("GEM/wiki_lingua", "de", "de", "train"),
    ("GEM/wiki_lingua", "fr", "fr", "train"),
    ("mlsum", "de", "de", "train"),
    ("mlsum", "fr", "fr", "train"),
    ("csebuetnlp/xlsum", "english", "en", "train"),
    ("csebuetnlp/xlsum", "french", "fr", "train"),
]

# Maximum number of rows taken from each source (applied before filtering).
MAX_PER_SOURCE = 600
# Fraction of the combined dataset held out as the evaluation split.
TEST_SIZE = 0.05

# Minimum character lengths a document / summary must have to pass is_valid.
MIN_DOC_CHARS = 80
MIN_SUM_CHARS = 10

# Task prefix placed in front of every model input, and the token-length caps
# used when tokenizing inputs and target summaries.
PREFIX = "summarize"
MAX_INPUT_LENGTH = 384
MAX_TARGET_LENGTH = 96

# Base checkpoint, Trainer output directory, and the directory the final
# model and tokenizer are saved to.
MODEL_NAME = "google/mt5-small"
OUT_DIR = "./mt5-multilingual-summarizer"
FINAL_DIR = "./mt5-multilingual-summarizer-final"

Funksioni per shkarkimin e burimeve te te dhenave dhe standardizimi i te dhenave:

In [None]:
def safe_load_dataset(name, config=None, split="train", max_retries=3, retry_delay=10):
    """Load a dataset split, retrying when the load raises.

    Args:
        name: Dataset identifier passed to ``load_dataset``.
        config: Optional dataset configuration name.
        split: Split to load.
        max_retries: Total number of attempts; must be at least 1.
        retry_delay: Seconds to wait between two attempts.

    Returns:
        Whatever ``load_dataset`` returns for the requested split.

    Raises:
        ValueError: If ``max_retries`` is smaller than 1 (previously the
            function silently returned ``None`` in that case).
        Exception: The error of the last attempt is re-raised unchanged once
            all attempts have failed.
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    for attempt in range(1, max_retries + 1):
        try:
            ds = load_dataset(
                name,
                config,
                split=split,
                # NOTE: this lets loading code shipped with the dataset
                # repository run locally; only use it with trusted sources.
                trust_remote_code=True,
                download_mode="reuse_dataset_if_exists",
                verification_mode="no_checks",
            )
        except Exception as exc:
            # Broad catch is deliberate: any failure is retried, and the last
            # one propagates to the caller with its original traceback.
            if attempt == max_retries:
                raise
            print(
                f"Loading {name}/{config} failed "
                f"(attempt {attempt}/{max_retries}): {exc!r}; "
                f"retrying in {retry_delay}s"
            )
            time.sleep(retry_delay)
        else:
            print(f"Loaded {name}/{config}")
            return ds

In [None]:
def config_supported(dataset_name, config_name):
    """Tell whether *config_name* is listed among the configs of *dataset_name*.

    The check is best-effort: if the config list cannot be obtained (any
    exception), the config is assumed to be supported and ``True`` is returned.
    """
    try:
        known_configs = get_dataset_config_names(dataset_name)
        supported = config_name in known_configs
    except Exception:
        supported = True
    return supported

In [None]:
def standardize_row(example):
    """Project a raw row onto the common ``document`` / ``summary`` schema.

    For each output field the candidate keys are tried in order and the first
    truthy value wins; when none is truthy, the value found under the last
    candidate key (possibly ``None``) is used.
    """
    def first_truthy(*keys):
        value = None
        for key in keys:
            value = example.get(key)
            if value:
                break
        return value

    return {
        "document": first_truthy("text", "article", "source", "document"),
        "summary": first_truthy("summary", "target", "highlights"),
    }

In [None]:
def is_valid(example):
    """Return True when both fields are strings of at least the minimum length.

    ``document`` must have at least ``MIN_DOC_CHARS`` characters and
    ``summary`` at least ``MIN_SUM_CHARS``.
    """
    doc_text = example["document"]
    sum_text = example["summary"]

    # Non-string values (e.g. None from a missing field) are rejected outright.
    if not (isinstance(doc_text, str) and isinstance(sum_text, str)):
        return False
    if len(doc_text) < MIN_DOC_CHARS:
        return False
    return len(sum_text) >= MIN_SUM_CHARS

In [None]:
def add_lang(example, lang):
    """Store *lang* under the ``"lang"`` key of *example* and return that same object."""
    example.update(lang=lang)
    return example

Pergatitja e listes se dataseteve

In [None]:
# Build one standardized, language-tagged dataset per entry of DATA_SOURCES,
# then balance the sources and merge them into a single shuffled dataset.
datasets_list = []

for ds_name, cfg, lang, split in DATA_SOURCES:
    if not config_supported(ds_name, cfg):
        # Report the skip so a missing source does not go unnoticed.
        print(f"Skipping {ds_name}/{cfg}: config not available")
        continue

    ds = safe_load_dataset(ds_name, cfg, split)
    ds = ds.select(range(min(len(ds), MAX_PER_SOURCE)))
    ds = ds.map(standardize_row, remove_columns=ds.column_names)
    ds = ds.filter(is_valid)
    # Pass the language through fn_kwargs instead of a lambda closing over the
    # loop variable, so the value is bound explicitly for this source.
    ds = ds.map(add_lang, fn_kwargs={"lang": lang})

    if len(ds) == 0:
        # An empty source would force min_len to 0 below and discard the data
        # of every other source, so leave it out instead.
        print(f"Skipping {ds_name}/{cfg}: no valid examples after filtering")
        continue

    datasets_list.append(ds)

if not datasets_list:
    raise RuntimeError("No data source produced any valid example.")

# Truncate every source to the size of the smallest one so that all sources
# contribute the same number of examples.
min_len = min(len(ds) for ds in datasets_list)
datasets_list = [ds.shuffle(seed=SEED).select(range(min_len)) for ds in datasets_list]

full = concatenate_datasets(datasets_list).shuffle(seed=SEED)
print("Total examples:", len(full))

Ndarja e dataseteve per trajnim dhe evaluim

In [None]:
# Hold out TEST_SIZE of the combined data for evaluation; the fixed seed keeps
# the split reproducible.
splits = full.train_test_split(test_size=TEST_SIZE, seed=SEED)
train_ds, eval_ds = splits["train"], splits["test"]

print("Train:", len(train_ds), "Eval:", len(eval_ds))

Pergatitja e tokenizuesit

In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def preprocess(batch):
    """Tokenize a batch of examples into model inputs and labels.

    Inputs are formatted as ``"<PREFIX> (<lang>): <document>"`` and truncated
    to ``MAX_INPUT_LENGTH`` tokens; summaries are tokenized as targets,
    truncated to ``MAX_TARGET_LENGTH`` tokens, and stored under ``"labels"``.
    No padding is applied here; that is left to the data collator.
    """
    inputs = [
        f"{PREFIX} ({lang}): {doc}"
        for lang, doc in zip(batch["lang"], batch["document"])
    ]

    model_inputs = tokenizer(
        inputs,
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
        padding=False,
    )

    labels = tokenizer(
        text_target=batch["summary"],
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
        padding=False,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_tok = train_ds.map(preprocess, batched=True, remove_columns=train_ds.column_names)
eval_tok  = eval_ds.map(preprocess,  batched=True, remove_columns=eval_ds.column_names)

# The collator's `model` argument expects a model object, not a checkpoint
# name; the string passed previously was silently ignored, so it is omitted.
# Label padding uses -100, the value the evaluation cell later maps back to
# the pad token before decoding.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    label_pad_token_id=-100,
)

print("Tokenized:", len(train_tok), len(eval_tok))

Trajnimi i modelit me parametrat e caktuar

In [None]:
# Load the base checkpoint. Fall back to the CPU when no CUDA device is
# available instead of failing on a hard-coded "cuda" placement.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

training_args = Seq2SeqTrainingArguments(
    output_dir=OUT_DIR,
    # Evaluate and checkpoint on the same step interval, keep at most two
    # checkpoints, and reload the one with the best eval_loss at the end.
    eval_strategy="steps",
    eval_steps=250,
    save_steps=250,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",

    learning_rate=1e-5,
    warmup_ratio=0.05,
    max_grad_norm=0.5,

    # 4 examples per device x 4 accumulation steps per optimizer update.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=3,

    fp16=False,
    logging_steps=25,
    report_to=[],
)

# NOTE(review): confirm that the installed transformers release still accepts
# the `tokenizer=` argument; the install cell does not bound the version.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=eval_tok,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

Ruajtja e modelit per perdorime te ardhshme

In [None]:
# Write the trained model and the tokenizer to the same directory; the
# inference cell later loads both from FINAL_DIR.
trainer.save_model(FINAL_DIR)
tokenizer.save_pretrained(FINAL_DIR)
print("Saved to:", FINAL_DIR)

In [None]:
Testimi i modelit pas trajnimit me metrikat ROUGE dhe BERTScore

In [None]:
import torch
import evaluate
from tqdm import tqdm
from torch.utils.data import DataLoader

print("Loading metrics...")
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

eval_batch_size = 4
test_loader = DataLoader(eval_tok, batch_size=eval_batch_size, shuffle=False, collate_fn=data_collator)

print("Starting generation on test set...")
model.eval()

# Move batches to wherever the model currently lives rather than assuming CUDA.
eval_device = model.device

predictions = []
references = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating"):
        batch = {k: v.to(eval_device) for k, v in batch.items()}
        generated_ids = model.generate(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            max_length=64,
            num_beams=4,
            length_penalty=2.0,
            early_stopping=True,
            no_repeat_ngram_size=3,
        )

        decoded_preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

        # The collator pads labels with -100; swap those positions for the
        # pad token id so the tokenizer can decode the references.
        labels = batch["labels"].cpu().numpy()
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        predictions.extend(decoded_preds)
        references.extend(decoded_labels)

print("\nComputing ROUGE...")
rouge_results = rouge.compute(predictions=predictions, references=references)
print(f"ROUGE-1: {rouge_results['rouge1']:.4f}")
print(f"ROUGE-2: {rouge_results['rouge2']:.4f}")
print(f"ROUGE-L: {rouge_results['rougeL']:.4f}")

print("\nComputing BERTScore (this takes a moment)...")
# NOTE(review): the evaluation data mixes several languages while lang="en"
# is passed here - confirm this is intended alongside the multilingual model_type.
bert_results = bertscore.compute(
    predictions=predictions,
    references=references,
    lang="en",
    model_type="distilbert-base-multilingual-cased",
    verbose=True,
)

print(f"BERTScore F1 (Mean): {np.mean(bert_results['f1']):.4f}")

Gjenerimi i permbledhjeve ne test cases te ndryshme

In [None]:
import re
import torch
from transformers import MT5ForConditionalGeneration, AutoTokenizer

# Prefer the GPU when PyTorch can see one, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Reload the saved model in inference mode on the selected device.
model = MT5ForConditionalGeneration.from_pretrained(FINAL_DIR)
model = model.to(device)
model.eval()

# Tokenizer saved next to the model.
tok = AutoTokenizer.from_pretrained(FINAL_DIR, use_fast=True)

# Prefix placed in front of every text passed to the model by summarize().
TRAIN_PREFIX = "summarize"

def capitalize_first_letter(text: str) -> str:
    """Upper-case the first alphabetic character of *text*.

    Characters before it (digits, spaces, punctuation) are left untouched.
    The text is returned unchanged when it contains no alphabetic character.
    """
    first_alpha = next((pos for pos, ch in enumerate(text) if ch.isalpha()), None)
    if first_alpha is None:
        return text
    head, letter, tail = text[:first_alpha], text[first_alpha], text[first_alpha + 1:]
    return head + letter.upper() + tail

def summarize(text, num_beams=6, max_new_tokens=96, *, lang=None):
    """Generate a cleaned-up summary of *text* with the fine-tuned model.

    Args:
        text: Document to summarize.
        num_beams: Beam width used for generation.
        max_new_tokens: Upper bound on the number of generated tokens.
        lang: Optional language tag (e.g. ``"en"``, ``"de"``, ``"fr"``). When
            given, the prompt has the same ``"<prefix> (<lang>): <text>"``
            shape as the inputs built by ``preprocess`` for training. When
            omitted, the prompt is ``"<prefix>: <text>"``.

    Returns:
        The decoded summary with leftover sentinel/marker text removed,
        whitespace collapsed and the first letter capitalized.
    """
    # Previously the prefix was glued directly onto the text ("summarize" +
    # text), which matches neither the separator nor the language tag the
    # model was trained with.
    if lang:
        prompt = f"{TRAIN_PREFIX} ({lang}): {text}"
    else:
        prompt = f"{TRAIN_PREFIX}: {text}"

    inputs = tok(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_INPUT_LENGTH,
    ).to(device)

    with torch.no_grad():
        ids = model.generate(
            **inputs,
            num_beams=num_beams,
            max_new_tokens=max_new_tokens,
            min_new_tokens=20,
            length_penalty=1.1,
            repetition_penalty=1.1,
            no_repeat_ngram_size=3,
            early_stopping=True,
        )

    s = tok.decode(ids[0], skip_special_tokens=True)

    # Post-processing: drop stray "▁" markers and <extra_id_N> sentinels,
    # collapse whitespace, and strip a leading language phrase if present.
    s = s.replace("▁", " ")
    s = re.sub(r"<extra_id_\d+>", "", s)
    s = re.sub(r"\s+", " ", s).strip()
    s = re.sub(r"^(in English:|en français:|auf Deutsch zusammen:)\s*", "", s, flags=re.IGNORECASE).strip()

    return capitalize_first_letter(s)

# (language label, document) pairs used for a qualitative check of the model.
test_cases = [
    ("EN", """Researchers at a university lab reported a new battery design that charges faster and lasts longer.
They said the prototype uses a modified electrolyte that reduces degradation. Independent testing is still limited,
but early results suggest improved cycle life and better performance in cold temperatures."""),

    ("EN", """A city council debated a new transportation plan that would add dedicated bus lanes, expand bike routes,
and increase parking fees downtown. Supporters argued it would reduce traffic and emissions, while critics warned
about impacts on small businesses and push traffic into residential areas."""),

    ("FR", """Selon un rapport récent, la hausse des températures aggrave les épisodes de pollution dans plusieurs grandes villes.
Les experts recommandent de renforcer les transports publics, de limiter la circulation lors des pics et d’améliorer
la surveillance de la qualité de l’air."""),

    ("FR", """Une entreprise a annoncé le lancement d’un service de traduction automatique destiné aux PME.
Le produit promet une meilleure prise en compte du contexte et des expressions idiomatiques, mais certains spécialistes
rappellent que les erreurs restent possibles dans les domaines juridiques et médicaux."""),

    ("DE", """Mehrere Regionen meldeten ungewöhnlich hohe Temperaturen und eine Zunahme von Starkregenereignissen.
Forschende erklären, dass wärmere Luft mehr Feuchtigkeit speichern kann, was die Intensität von Niederschlägen erhöht.
Kommunen investieren in bessere Entwässerung, Hitzeschutzpläne und Frühwarnsysteme."""),

    ("DE", """Ein großer Einzelhändler kündigte an, Filialen umzubauen, um mehr Platz für Abholstationen und Rücksendungen zu schaffen.
Das Unternehmen reagiert damit auf den wachsenden Onlinehandel. Kritiker befürchten längere Wartezeiten durch weniger Personal."""),

    ("EN", """Cybersecurity firms have observed a sharp rise in phishing attacks targeting remote workers via SMS and social media.
Hackers are using AI tools to craft convincing messages that mimic company executives or IT support.
Experts advise corporations to implement stricter multi-factor authentication and regular employee training to combat this threat."""),

    ("EN", """The housing market has cooled significantly as central banks raised interest rates to combat inflation.
Real estate agents report that homes are sitting on the market longer, and price reductions are becoming common.
While this makes buying harder for first-time owners due to mortgage costs, it may eventually stabilize skyrocketing property values."""),

    ("EN", """Biologists are sounding the alarm over the rapid decline of wild bee populations due to pesticide use and habitat loss.
Since bees are responsible for pollinating a third of the food we eat, their disappearance could threaten global food security.
Conservationists are urging farmers to plant wildflower strips and reduce chemical usage to support pollinators."""),

    ("EN", """Major streaming services are introducing ad-supported subscription tiers and cracking down on password sharing to boost revenue.
This shift comes as the market becomes saturated and production costs for original content soar.
Subscribers have expressed mixed reactions, with some welcoming cheaper options while others threaten to cancel memberships."""),

    ("FR", """Le projet du Grand Paris Express, qui prévoit l'extension massive du réseau de métro, transforme déjà la banlieue parisienne.
De nouvelles gares sortent de terre, attirant les promoteurs immobiliers et faisant grimper les prix des logements aux alentours.
Les élus locaux espèrent que ce réseau réduira les inégalités territoriales en désenclavant des quartiers jusqu'ici mal desservis."""),

    ("FR", """La consommation de produits biologiques a enregistré une baisse inédite cette année, frappée par l'inflation alimentaire.
Les consommateurs, soucieux de leur pouvoir d'achat, se tournent vers des alternatives moins coûteuses ou les marques de distributeurs.
Les agriculteurs bio demandent une aide d'urgence à l'État pour éviter des faillites en cascade dans la filière."""),

    ("FR", """Le festival de Cannes a ouvert ses portes dans un climat de polémique concernant la place des plateformes de streaming au cinéma.
Alors que certains réalisateurs défendent l'expérience unique de la salle obscure, d'autres estiment que le streaming permet de financer des œuvres audacieuses.
Le jury devra trancher entre tradition et modernité lors de la remise de la Palme d'or."""),

    ("DE", """Die Debatte über die Vier-Tage-Woche gewinnt in Deutschland an Fahrt, da Pilotprojekte positive Ergebnisse zeigen.
Befürworter argumentieren, dass weniger Arbeitszeit die Produktivität steigert und krankheitsbedingte Ausfälle reduziert.
Arbeitgeberverbände warnen jedoch, dass dies in Zeiten des Fachkräftemangels die Wirtschaft schwächen und Kosten erhöhen könnte."""),

    ("DE", """Wegen zahlreicher Baustellen und technischer Störungen hat die Deutsche Bahn erneut ihre Pünktlichkeitsziele verfehlt.
Der Konzern kündigte an, das Schienennetz in den kommenden Jahren grundlegend zu sanieren, was zunächst zu noch mehr Sperrungen führen wird.
Verkehrsminister fordern ein besseres Baustellenmanagement, um die Geduld der Fahrgäste nicht überzustrapazieren."""),

    ("DE", """Der Trend zu sogenannten Balkonkraftwerken boomt, da immer mehr Mieter ihren eigenen Solarstrom produzieren wollen.
Vereinfachte bürokratische Regeln und sinkende Preise für Solarmodule haben die Nachfrage sprunghaft ansteigen lassen.
Energieexperten sehen darin einen wichtigen, wenn auch kleinen, Baustein für die Energiewende in privaten Haushalten."""),

    ("FR", """Les viticulteurs du sud de la France font face à des vendanges de plus en plus précoces en raison du réchauffement climatique.
La chaleur intense modifie le taux de sucre et l'acidité du raisin, obligeant les producteurs à adapter leurs techniques de vinification.
Certains envisagent même de planter de nouvelles variétés de cépages plus résistantes à la sécheresse."""),

    ("DE", """Archäologen haben in Bayern ein gut erhaltenes Schwert aus der Bronzezeit entdeckt, das über 3000 Jahre alt ist.
Der Fund gilt als Sensation, da die Waffe noch ihren metallischen Glanz besitzt und reich verziert ist.
Historiker erhoffen sich dadurch neue Erkenntnisse über die Handwerkskunst und Handelsbeziehungen der damaligen Zeit."""),

]

for lang, doc in test_cases:
    print(f"\n[{lang}]")
    print("DOCUMENT:", doc)
    # The training data used lowercase language tags ("en", "de", "fr").
    print("SUMMARY:", summarize(doc, lang=lang.lower()))