In [1]:
# requirements.txt
!pip install transformers datasets pandas scikit-learn nltk gradio kagglehub evaluate rouge_score --quiet

In [2]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [3]:
!pip install kaggle



In [4]:
!kaggle datasets download -d gowrishankarp/newspaper-text-summarization-cnn-dailymail


Dataset URL: https://www.kaggle.com/datasets/gowrishankarp/newspaper-text-summarization-cnn-dailymail
License(s): CC0-1.0
newspaper-text-summarization-cnn-dailymail.zip: Skipping, found more recently modified local copy (use --force to force download)


In [5]:
!unzip newspaper-text-summarization-cnn-dailymail.zip -d cnn_dailymail_dataset


Archive:  newspaper-text-summarization-cnn-dailymail.zip
replace cnn_dailymail_dataset/cnn_dailymail/test.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: no
replace cnn_dailymail_dataset/cnn_dailymail/train.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: no
replace cnn_dailymail_dataset/cnn_dailymail/validation.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: no


In [6]:
# Import only essential libraries
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer
from datasets import Dataset
import evaluate
import gradio as gr
import kagglehub

In [7]:
def load_quick_data():
    """Load only a small sample for quick training"""
    print("Loading small dataset sample...")

    # Load just 500 samples from train.csv
    train_path = "cnn_dailymail_dataset/cnn_dailymail/train.csv"
    df = pd.read_csv(train_path, nrows=200)

    # Show dataset structure
    print(f"Dataset columns: {df.columns.tolist()}")
    print(f"Sample size: {len(df)}")
    print("\nFirst sample:")
    print(f"Article: {df.iloc[0]['article'][:100]}...")
    print(f"Highlights: {df.iloc[0]['highlights']}")

    # Use 400 for training, 100 for testing
    train_df = df.head(150)
    test_df = df.tail(50)

    return train_df, test_df

train_df, test_df = load_quick_data()

Loading small dataset sample...
Dataset columns: ['id', 'article', 'highlights']
Sample size: 200

First sample:
Article: By . Associated Press . PUBLISHED: . 14:11 EST, 25 October 2013 . | . UPDATED: . 15:36 EST, 25 Octob...
Highlights: Bishop John Folda, of North Dakota, is taking time off after being diagnosed .
He contracted the infection through contaminated food in Italy .
Church members in Fargo, Grand Forks and Jamestown could have been exposed .


In [8]:
# Use the smallest T5 model
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

print("✅ Model loaded successfully!")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


✅ Model loaded successfully!


In [9]:
def quick_preprocess(examples):
    """Minimal preprocessing for CNN/DailyMail format"""
    inputs = ["summarize: " + str(doc) for doc in examples['article']]
    targets = [str(doc) for doc in examples['highlights']]

    # Tokenize with minimal settings
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Convert to datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Quick preprocessing
tokenized_train = train_dataset.map(quick_preprocess, batched=True, batch_size=16)
tokenized_test = test_dataset.map(quick_preprocess, batched=True, batch_size=16)

print("✅ Preprocessing completed!")

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

✅ Preprocessing completed!


In [17]:

# ULTRA-LIGHT TRAINING - NO METRICS DURING TRAINING
training_args = TrainingArguments(
    output_dir="./t5-tiny-cpu",
    overwrite_output_dir=True,

    # Ultra-light settings
    per_device_train_batch_size=1,  # Batch size 1 to save memory
    per_device_eval_batch_size=1,
    dataloader_pin_memory=False,

    # Very quick training
    num_train_epochs=3,  # Only 1 epoch
    max_steps=30,  # Only 30 steps

    # DISABLE evaluation during training to save memory
    # evaluation_strategy="no",  # No evaluation during training
    save_strategy="no",
    logging_steps=5,

    # Force CPU and minimal workers
    no_cuda=True,
    dataloader_num_workers=0,

    # Disable all logging to save memory
    report_to="none",
)

# Simple trainer WITHOUT compute_metrics
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    tokenizer=tokenizer,
    # REMOVED eval_dataset and compute_metrics to save memory
)

print("🚀 Starting ultra-light training (1 epoch, 30 steps)...")
trainer.train()

# Save the model
trainer.save_model("./t5-tiny-final")
tokenizer.save_pretrained("./t5-tiny-final")
print("✅ Training completed! Model saved.")

  trainer = Trainer(


🚀 Starting ultra-light training (1 epoch, 30 steps)...


Step,Training Loss
5,2.3106
10,1.9788
15,1.9692
20,1.424
25,2.2507
30,1.891


✅ Training completed! Model saved.


In [18]:
def quick_evaluation_after_training():
    """Evaluate after training to avoid memory issues"""
    print("\n🧪 Running quick evaluation...")

    # Test on just 3 samples
    for i in range(min(3, len(test_df))):
        original = test_df.iloc[i]['article']
        reference = test_df.iloc[i]['highlights']

        # Simple generation without metrics
        input_text = "summarize: " + str(original)[:200]
        inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=128, truncation=True)

        summary_ids = model.generate(
            inputs,
            max_length=50,
            num_beams=1,  # Use greedy search to save memory
            early_stopping=True,
        )

        generated = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        print(f"\n📄 Example {i+1}:")
        print(f"Reference: {reference}")
        print(f"Generated: {generated}")
        print("-" * 40)

quick_evaluation_after_training()


🧪 Running quick evaluation...

📄 Example 1:
Reference: Jane Wiggett was found dead in her Cheltenham home on Friday .
Police want to arrest her ex-husband in connection with her murder .
Danny Spencer, 57, is believed to be looking to flee the country .
Generated: Danny Spencer's ex-wife Jane was found dead in her home.
----------------------------------------

📄 Example 2:
Reference: Peggy Drexler: Male models at Rick Owens' Paris runway show wore clothes that exposed their penises .
She says as we become harder to shock, the culture keeps going lower and lower. The bottom is in sight .
Generated: a show in Paris on tuesday. he sent male models down the runway wearing clothes with peepholes.
----------------------------------------

📄 Example 3:
Reference: The Midwest has been hit hard in the recession, but some jobs are available .
CNN affiliates: Offbeat industries like brewing and chocolate making are faring well .
Nursing and health care are also seen as stable .
WRTV: The Census

In [19]:
def simple_summarize(text):
    """Minimal memory summarization"""
    input_text = "summarize: " + str(text)[:200]  # Very short input

    inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=128, truncation=True)

    summary_ids = model.generate(
        inputs,
        max_length=50,
        num_beams=1,  # Greedy search
        early_stopping=True,
    )

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Quick demo
def create_simple_demo():
    iface = gr.Interface(
        simple_summarize,
        gr.Textbox(lines=4, placeholder="Enter short text..."),
        gr.Textbox(lines=2, label="Summary"),
        title="Tiny Summarizer",
        examples=[["This is a short example text that will be summarized quickly to demonstrate the model capabilities."]]
    )
    return iface

print("🎯 Launching simple demo...")
demo = create_simple_demo()
demo.launch(share=True)

🎯 Launching simple demo...
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://46a0ee64eeadf6b57c.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [20]:
# Create a zip file of your trained model
import shutil
import os

def zip_and_download_model():
    """Zip the model folder and provide download link"""
    model_path = "./t5-tiny-final"

    if os.path.exists(model_path):
        # Create zip file
        shutil.make_archive("t5_summarization_model", 'zip', model_path)

        print("✅ Model zipped successfully!")
        print("📁 File: t5_summarization_model.zip")

        # For Google Colab
        try:
            from google.colab import files
            files.download("t5_summarization_model.zip")
            print("📥 Download started automatically in Colab")
        except:
            print("📍 If you're in Colab, the download should start automatically")
            print("📍 If you're in local Jupyter, right-click and download the file")
    else:
        print("❌ Model folder not found. Let me check what's available:")
        print(os.listdir("."))

# Run the download
zip_and_download_model()

✅ Model zipped successfully!
📁 File: t5_summarization_model.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

📥 Download started automatically in Colab
