In [1]:
!pip install -q transformers datasets evaluate sentencepiece accelerate rouge_score streamlit gradio

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m566.1/566.1 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m38.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m100.6 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m76.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m41.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0

In [8]:


import os
import random
import torch
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
)
import evaluate


MODEL_NAME = "t5-base"  
OUTPUT_DIR = "models/t5_summarization"
MAX_INPUT_LENGTH = 512
MAX_TARGET_LENGTH = 128
TRAIN_BATCH = 4
EVAL_BATCH = 4
NUM_EPOCHS = 3
LR = 5e-5
SEED = 42
DATASET_SAMPLE_SIZE = 0.05  

TRAIN_CSV = "/kaggle/input/cnndata/train.csv"
VAL_CSV = "/kaggle/input/cnndata/validation.csv"
TEST_CSV = "/kaggle/input/cnndata/test.csv"

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)
random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)


Device: cuda


In [10]:

for path in [TRAIN_CSV, VAL_CSV, TEST_CSV]:
    print("Path:", path, "Exists?", os.path.exists(path))

train_df = pd.read_csv(TRAIN_CSV)
val_df   = pd.read_csv(VAL_CSV)
test_df  = pd.read_csv(TEST_CSV)

print("\nTrain shape:", train_df.shape)
print("Validation shape:", val_df.shape)
print("Test shape:", test_df.shape)

print("\nTrain columns:", train_df.columns.tolist())
train_df.head(3)


Path: /kaggle/input/cnndata/train.csv Exists? True
Path: /kaggle/input/cnndata/validation.csv Exists? True
Path: /kaggle/input/cnndata/test.csv Exists? True

Train shape: (287113, 3)
Validation shape: (13368, 3)
Test shape: (11490, 3)

Train columns: ['id', 'article', 'highlights']


Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,00027e965c8264c35cc1bc55556db388da82b07f,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."


In [11]:

def df_to_datasetdict(train_df, val_df, test_df):
    ds_train = Dataset.from_pandas(train_df.reset_index(drop=True))
    ds_val   = Dataset.from_pandas(val_df.reset_index(drop=True))
    ds_test  = Dataset.from_pandas(test_df.reset_index(drop=True))
    return DatasetDict({"train": ds_train, "validation": ds_val, "test": ds_test})

dataset = df_to_datasetdict(train_df, val_df, test_df)

print("Original sizes:")
for s, d in dataset.items():
    print(f"  {s}: {len(d)}")

def sample_dataset(dataset: DatasetDict, frac: float, seed: int = SEED):
    if frac >= 1.0:
        return dataset
    sampled = {}
    for split, ds in dataset.items():
        n = max(1, int(len(ds) * frac))
        sampled[split] = ds.shuffle(seed=seed).select(range(n))
        print(f"Sampled {len(sampled[split])}/{len(ds)} from {split}")
    return DatasetDict(sampled)

dataset = sample_dataset(dataset, DATASET_SAMPLE_SIZE)


Original sizes:
  train: 287113
  validation: 13368
  test: 11490
Sampled 14355/287113 from train
Sampled 668/13368 from validation
Sampled 574/11490 from test


In [12]:
def preprocess_examples(examples, tokenizer, model_name=MODEL_NAME):
    inputs = examples.get("article") or examples.get("text") or examples.get("content")
    targets = examples.get("highlights") or examples.get("summary") or examples.get("target")
    if inputs is None or targets is None:
        raise ValueError("error")
    if "t5" in model_name.lower():
        inputs = ["summarize: " + str(x).strip() for x in inputs]
    else:
        inputs = [str(x).strip() for x in inputs]
    model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, truncation=True)
    labels = tokenizer(targets, max_length=MAX_TARGET_LENGTH, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [13]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)
print("Loaded:", MODEL_NAME)


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Loaded: t5-base


In [14]:
tokenized_datasets = dataset.map(
    lambda examples: preprocess_examples(examples, tokenizer),
    batched=True,
    remove_columns=dataset["train"].column_names,
    desc="Tokenizing dataset",
)

print(tokenized_datasets)


Tokenizing dataset:   0%|          | 0/14355 [00:00<?, ? examples/s]

Tokenizing dataset:   0%|          | 0/668 [00:00<?, ? examples/s]

Tokenizing dataset:   0%|          | 0/574 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 14355
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 668
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 574
    })
})


In [15]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
rouge = evaluate.load("rouge")

def postprocess_text(preds, labels):
    preds = [p.strip() for p in preds]
    labels = [l.strip() for l in labels]
    return preds, labels

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = [[(l if l != -100 else tokenizer.pad_token_id) for l in lab] for lab in labels]
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v, 4) for k, v in result.items()}
    gen_lens = [len(tokenizer.encode(p)) for p in decoded_preds]
    result["gen_len"] = round(sum(gen_lens) / max(1, len(gen_lens)), 2)
    return result


Downloading builder script: 0.00B [00:00, ?B/s]

In [21]:

from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="steps",           
    eval_steps=500 if len(tokenized_datasets["train"]) > 500 else 100,
    save_strategy="steps",
    save_steps=500 if len(tokenized_datasets["train"]) > 500 else 100,
    per_device_train_batch_size=TRAIN_BATCH,
    per_device_eval_batch_size=EVAL_BATCH,
    predict_with_generate=True,
    num_train_epochs=NUM_EPOCHS,
    learning_rate=LR,
    logging_steps=100,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),
    gradient_accumulation_steps=4,
    seed=SEED,
    report_to=[],
    load_best_model_at_end=True,
    metric_for_best_model="rougeL",
    greater_is_better=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("Trainer initialized successfully ✅")


Trainer initialized successfully ✅


  trainer = Seq2SeqTrainer(


In [22]:
train_result = trainer.train()
trainer.save_model(OUTPUT_DIR)
print("Training finished. Model saved to:", OUTPUT_DIR)


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
500,1.5571,1.579499,0.2528,0.1213,0.207,0.2072,20.87
1000,1.5099,1.57727,0.2539,0.1229,0.2092,0.2093,20.87


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Training finished. Model saved to: models/t5_summarization


In [23]:
test_metrics = trainer.evaluate(tokenized_datasets["test"])
print("Test metrics (ROUGE):")
for k, v in test_metrics.items():
    print(f"  {k}: {v}")




Test metrics (ROUGE):
  eval_loss: 1.5522174835205078
  eval_rouge1: 0.2685
  eval_rouge2: 0.1309
  eval_rougeL: 0.2206
  eval_rougeLsum: 0.2206
  eval_gen_len: 20.87
  eval_runtime: 81.6292
  eval_samples_per_second: 7.032
  eval_steps_per_second: 0.882
  epoch: 3.0


In [26]:
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

MODEL_PATH = OUTPUT_DIR
print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()
print("Model ready ✅")

def generate_summary(text, max_length, min_length, beam_width):
    if not text.strip():
        return "Please enter some text to summarize."
    input_text = "summarize: " + text.strip()
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    with torch.no_grad():
        summary_ids = model.generate(
            **inputs,
            max_length=max_length,
            min_length=min_length,
            num_beams=beam_width,
            early_stopping=True,
            no_repeat_ngram_size=3,
            length_penalty=2.0
        )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

with gr.Blocks(title="Text Summarizer") as demo:
    gr.Markdown("# ✏️ Text Summarization Demo")
    gr.Markdown("Fine-tuned T5/BART model that generates short summaries from long text.")
    with gr.Row():
        with gr.Column(scale=3):
            input_text = gr.Textbox(label="Enter your text:", lines=10, placeholder="Paste your article here...")
            summarize_btn = gr.Button("Generate Summary")
            output_summary = gr.Textbox(label="Summary:", lines=6)
            gr.Examples(
                examples=[
                    ["The global economy is facing challenges due to rising inflation and policy changes."],
                    ["AI is transforming industries, healthcare, and automation worldwide."],
                    ["Pakistan won the cricket match after a thrilling chase led by Babar Azam."]
                ],
                inputs=input_text
            )
        with gr.Column(scale=1):
            gr.Markdown("### Settings")
            max_length = gr.Slider(50, 200, value=100, step=10, label="Max Length")
            min_length = gr.Slider(10, 60, value=20, step=5, label="Min Length")
            beam_width = gr.Slider(1, 6, value=4, step=1, label="Beam Width")
    summarize_btn.click(fn=generate_summary, inputs=[input_text, max_length, min_length, beam_width], outputs=output_summary)

print("\nLaunching Gradio App...")
demo.launch(share=True)


Loading model...
Model ready ✅

Launching Gradio App...
* Running on local URL:  http://127.0.0.1:7861
* Running on public URL: https://6de5f7f3040f766edd.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


