In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# 1. Install necessary libraries
!pip install transformers datasets sentencepiece accelerate -U
!pip install evaluate rouge_score sacrebleu

import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch
import evaluate
from google.colab import files
import re
from typing import Dict, List

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Collecting transformers
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading transformers-4.57.3-py3-none-any.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m140.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-4.4.1-py3-none-any.whl (511 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m43.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl (47.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling c

In [None]:
import pandas as pd
from datasets import Dataset
import re # Make sure 're' is imported in a previous cell

def load_data_from_local():
    """Loads files directly from the Colab local disk, assuming they are already uploaded."""

    # Check for files and load them
    try:
        df1 = pd.read_csv('news_summary.csv', encoding='iso-8859-1')
        df2 = pd.read_csv('news_summary_more.csv', encoding='iso-8859-1')
        print("Data loaded successfully from local directory.")
    except FileNotFoundError as e:
        print(f"Error: {e}. Please ensure both files are uploaded to the root Colab directory.")
        raise

    return df1, df2

def preprocess_and_merge(df_summary, df_more):
    """
    Cleans, selects columns based on the file-specific structure, and merges the two datasets.
    (This function remains unchanged from your original code.)
    """

    # 1. Standardize columns for news_summary
    df_summary = df_summary[['ctext', 'text']]
    df_summary.columns = ['article', 'headline']

    # 2. Standardize columns for news_summary_more
    df_more = df_more[['text', 'headlines']]
    df_more.columns = ['article', 'headline']

    # 3. Combine and Clean
    df_combined = pd.concat([df_summary, df_more], ignore_index=True)

    def clean_text(text):
        if isinstance(text, str):
            text = text.lower()
            text = re.sub(r'http\S+', '', text)
            text = re.sub(r'[^a-z0-9\s.,]', '', text)
            return text.strip()
        return ""

    df_combined['article'] = df_combined['article'].apply(clean_text)
    df_combined['headline'] = df_combined['headline'].apply(clean_text)

    # Filtering and renaming
    df_combined.drop_duplicates(subset=['article', 'headline'], inplace=True)
    df_combined.dropna(inplace=True)
    df_combined = df_combined[df_combined['article'].str.len() > 50]
    df_combined = df_combined[df_combined['headline'].str.len() > 10]

    # Rename to 'summary' for model compatibility
    df_combined = df_combined.rename(columns={'headline': 'summary'})

    return df_combined

# Execute Data Loading and Preprocessing
# CHANGE THIS LINE: Call the new silent loading function
df1, df2 = load_data_from_local()
df_processed = preprocess_and_merge(df1, df2)

# Convert to Hugging Face Dataset and split
dataset = Dataset.from_pandas(df_processed)
dataset = dataset.train_test_split(test_size=0.05, seed=42)

print(f"\n--- Data Processing Complete ---")
print(f"Total processed samples: {len(df_processed)}")
print(f"Training samples: {len(dataset['train'])}")
print(f"Test samples: {len(dataset['test'])}")
print("Sample Article and Headline:")
print(f"Article: {dataset['train'][0]['article'][:100]}...")
print(f"Headline: {dataset['train'][0]['summary']}")

Data loaded successfully from local directory.

--- Data Processing Complete ---
Total processed samples: 102768
Training samples: 97629
Test samples: 5139
Sample Article and Headline:
Article: the kimpton de witt hotel in amsterdam has trained staff members on how to capture instagramready ph...
Headline: hotel trains staff on how to take instagram photos of guests


In [None]:
# --- Tokenizer and Model Loading ---
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)

# --- Tokenization Parameters ---
max_input_length = 512
max_target_length = 64

def preprocess_function(examples):
    # T5 requires the input to be prefixed
    inputs = [f"summarize: {text}" for text in examples["article"]]

    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        truncation=True,
        padding="max_length"
    )

    # Tokenize labels
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["summary"],
            max_length=max_target_length,
            truncation=True,
            padding="max_length"
        )

    # Use -100 as padding token id for labels
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocessing to the dataset
tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=['article', 'summary', '__index_level_0__'])

print("\n--- Data Tokenization Complete ---")
print("Tokenized training data sample keys:", tokenized_datasets["train"][0].keys())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/97629 [00:00<?, ? examples/s]



Map:   0%|          | 0/5139 [00:00<?, ? examples/s]


--- Data Tokenization Complete ---
Tokenized training data sample keys: dict_keys(['input_ids', 'attention_mask', 'labels'])


In [6]:
# Cell 4: Model Setup, Metrics, and Training (Corrected to RESUME from checkpoint)

from transformers import DataCollatorForSeq2Seq

# --- PERSISTENT GOOGLE DRIVE PATH DEFINITION ---
DRIVE_PATH_BASE = '/content/drive/MyDrive/T5_Headline_Project/'
MODEL_OUTPUT_DIR = DRIVE_PATH_BASE + "t5_headline_generator_final"
LOGS_DIR = DRIVE_PATH_BASE + "logs_final"

# --- CHECKPOINT PATH DEFINITION ---
# You MUST verify this path in your Google Drive file browser.
# The structure is usually: output_dir/checkpoint-XXXXX
LAST_CHECKPOINT_PATH = '/content/drive/MyDrive/T5_Headline_Project/t5_headline_generator_final/checkpoint-24408'
# Assuming 'checkpoint-24408' is the last saved checkpoint *folder*


# --- Model Loading ---
# NOTE: The model is loaded as a base model here, but the resume_from_checkpoint
# argument will correctly load the weights from your drive folder during trainer.train().
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

# --- Metrics Loading and Computation Function (Unchanged) ---
rouge_metric = evaluate.load("rouge")
bleu_metric = evaluate.load("sacrebleu")

def compute_metrics(eval_pred: tuple) -> Dict[str, float]:
    """Computes ROUGE and SacreBLEU scores."""
    predictions, labels = eval_pred
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    vocab_size = tokenizer.vocab_size
    predictions = np.where(
        (predictions < 0) | (predictions >= vocab_size),
        tokenizer.pad_token_id,
        predictions
    )
    predictions = predictions.astype(np.int64)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds = ["\n".join(pred.strip().split()) for pred in decoded_preds]
    decoded_labels = [["\n".join(label.strip().split())] for label in decoded_labels]
    rouge_results = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels)
    bleu_results = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)
    results = {
        "rouge1_fmeasure": rouge_results["rouge1"],
        "rouge2_fmeasure": rouge_results["rouge2"],
        "rougeL_fmeasure": rouge_results["rougeL"],
        "sacrebleu": bleu_results["score"]
    }
    return results

# --- Training Arguments and Trainer Setup ---
training_args = Seq2SeqTrainingArguments(
    output_dir=MODEL_OUTPUT_DIR,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=LOGS_DIR,
    logging_steps=500,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),
    report_to=["none"]
)

# Initialize the Data Collator and Trainer
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# --- Fine-tuning the Model (Resume from Checkpoint) ---
print("\n--- Resuming Model Fine-Tuning ---")

# Pass the path to the latest checkpoint to resume training
trainer.train(resume_from_checkpoint=LAST_CHECKPOINT_PATH)

# Save the final fine-tuned model and tokenizer
FINAL_SAVE_PATH = DRIVE_PATH_BASE + "final_model_weights"
model.save_pretrained(FINAL_SAVE_PATH)
tokenizer.save_pretrained(FINAL_SAVE_PATH)

print("\n--- Model Training Complete and Saved to Google Drive ---")

  trainer = Seq2SeqTrainer(



--- Resuming Model Fine-Tuning ---


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Rouge1 Fmeasure,Rouge2 Fmeasure,Rougel Fmeasure,Sacrebleu
3,0.4261,0.394515,0.48787,0.264293,0.454074,18.949182


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].



--- Model Training Complete and Saved to Google Drive ---


In [10]:
# --- Inference Example ---
def generate_headline(text, model, tokenizer, max_length=64):
    input_text = f"summarize: {text}"

    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True
    ).to(device)

    # Use beam search for higher quality generation
    outputs = model.generate(
        **inputs,
        max_length=max_length,
        num_beams=4,
        early_stopping=True
    )

    headline = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return headline

# Test with a sample article
sample_article = "The technology giant announced a massive restructuring today, leading to the departure of several key executives and a complete overhaul of its research and development division, signaling a new direction for the company's focus on AI and machine learning."
generated_headline = generate_headline(sample_article, model, tokenizer)

print("\n--- Inference Test ---")
print(f"Article: {sample_article[:100]}...")
print(f"Generated Headline: **{generated_headline}**")


--- Inference Test ---
Article: The technology giant announced a massive restructuring today, leading to the departure of several ke...
Generated Headline: **technology giant announces massive restructuring**
