In [None]:
!pip install -q "transformers==4.38.2" "datasets==2.18.0" "accelerate==0.27.2" \
                 "huggingface_hub==0.20.3" "peft==0.9.0" "rouge_score" "nltk" \
                 "jax==0.4.23" "jaxlib==0.4.23"

import os
os._exit(0)

In [1]:
import os
import torch
import nltk
import numpy as np
import evaluate
from datasets import load_dataset
from transformers import (
    T5Tokenizer, 
    T5ForConditionalGeneration,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq
)
from peft import LoraConfig, get_peft_model, TaskType
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"


MODEL_NAME = "t5-small"
OUTPUT_DIR = "/kaggle/working/t5-summarizer-lora"
DATA_PATH = "/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/"
TRAIN_FILE = os.path.join(DATA_PATH, "train.csv")
VAL_FILE = os.path.join(DATA_PATH, "validation.csv")
TEST_FILE = os.path.join(DATA_PATH, "test.csv")
TRAIN_SAMPLES = 50000
VAL_SAMPLES = 5000
TEST_SAMPLES = 5000
MAX_INPUT_LENGTH = 1024
MAX_TARGET_LENGTH = 128
TASK_PREFIX = "summarize: "
nltk.download('punkt', quiet=True)


print(f"Loading tokenizer for {MODEL_NAME}...")
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
print("Tokenizer loaded.")


data_files = { "train": TRAIN_FILE, "validation": VAL_FILE, "test": TEST_FILE }
full_dataset = load_dataset("csv", data_files=data_files)
print("Subsetting the dataset...")
train_dataset = full_dataset['train'].shuffle(seed=30).select(range(TRAIN_SAMPLES))
val_dataset = full_dataset['validation'].shuffle(seed=30).select(range(VAL_SAMPLES))
test_dataset = full_dataset['test'].shuffle(seed=30).select(range(TEST_SAMPLES))

def preprocess_function(examples):
    inputs = [TASK_PREFIX + str(doc) for doc in examples['article']]
    model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, truncation=True, padding="max_length")
    labels = tokenizer(text_target=examples['highlights'], max_length=MAX_TARGET_LENGTH, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

print("Applying tokenization to all subsets...")
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True, num_proc=os.cpu_count())
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True, num_proc=os.cpu_count())
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True, num_proc=os.cpu_count())
print("Data preprocessing complete.")


print(f"Loading base model: {MODEL_NAME}")
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

print("Configuring LoRA...")
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)
model = get_peft_model(model, lora_config)
print("Model wrapped with LoRA.")
model.print_trainable_parameters() 

print("Initializing Seq2Seq data collator...")
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
print("Model and data collator loaded.")

rouge = evaluate.load("rouge")
def compute_metrics(eval_preds):
    predictions, labels = eval_preds
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels[labels == -100] = tokenizer.pad_token_id
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {key: value * 100 for key, value in result.items()}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}
print("ROGE compute_metrics function defined.")

if not torch.cuda.is_available():
    print("WARNING: GPU not found. Training will be extremely slow.")

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    
    evaluation_strategy="no", 
    save_strategy="no",        
    load_best_model_at_end=False,
    warmup_steps=300,
    weight_decay=0.01,
    logging_dir=f"{OUTPUT_DIR}/logs",
    logging_strategy="steps",
    logging_steps=100,
    report_to="none",
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("--- Starting Model Fine-Tuning ---")
torch.cuda.empty_cache() # Clear cache
trainer.train()

print("--- Training Complete ---")
print("Model is trained.")

2025-11-02 12:21:38.233613: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762086098.257666   27686 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762086098.265026   27686 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Loading tokenizer for t5-small...


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Tokenizer loaded.
Subsetting the dataset...
Applying tokenization to all subsets...


Map (num_proc=4):   0%|          | 0/50000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/5000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/5000 [00:00<?, ? examples/s]

Data preprocessing complete.
Loading base model: t5-small
Configuring LoRA...
Model wrapped with LoRA.
trainable params: 294,912 || all params: 60,801,536 || trainable%: 0.4850403779272945
Initializing Seq2Seq data collator...
Model and data collator loaded.
ROGE compute_metrics function defined.


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


--- Starting Model Fine-Tuning ---




Step,Training Loss
100,9.5602
200,8.3349
300,4.2286
400,1.5958
500,1.2833
600,1.1919
700,1.1754
800,1.1563
900,1.142
1000,1.1436


--- Training Complete ---
Model is trained.


In [2]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from peft import PeftModel

BASE_MODEL_NAME = "t5-small"
LORA_MODEL_PATH = "/kaggle/working/t5-summarizer-lora"

print(f"Loading base model: {BASE_MODEL_NAME}")
base_model = T5ForConditionalGeneration.from_pretrained(BASE_MODEL_NAME)

print(f"Loading LoRA adapters from: {LORA_MODEL_PATH}")
model = PeftModel.from_pretrained(base_model, LORA_MODEL_PATH)

tokenizer = T5Tokenizer.from_pretrained(LORA_MODEL_PATH)

print("Merging LoRA adapters into the base model...")
model = model.merge_and_unload()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

print(f"Fine-tuned model is loaded, merged, and on {device}.")

Loading base model: t5-small


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading LoRA adapters from: /kaggle/working/t5-summarizer-lora
Merging LoRA adapters into the base model...
Fine-tuned model is loaded, merged, and on cuda.


In [3]:
import evaluate
import nltk
from tqdm import tqdm 


rouge = evaluate.load("rouge")
NUM_SAMPLES_TO_EVAL = 50

try:
    eval_samples = test_dataset.select(range(NUM_SAMPLES_TO_EVAL))
except NameError:
    print("Error: 'test_dataset' not in memory. Please re-run Phase 1, Cell 4.")
    raise

predictions = []
references = []

print(f"--- Generating {NUM_SAMPLES_TO_EVAL} summaries for evaluation ---")

for sample in tqdm(eval_samples):
    article = sample['article']
    reference_summary = sample['highlights']
    prompt = TASK_PREFIX + article
    input_ids = tokenizer(
        prompt, 
        max_length=MAX_INPUT_LENGTH, 
        truncation=True, 
        padding="max_length", 
        return_tensors="pt"
    ).input_ids.to(device) 

    
    output_ids = model.generate(
        input_ids=input_ids,
        max_length=MAX_TARGET_LENGTH,
        num_beams=4,  
        early_stopping=True,
        no_repeat_ngram_size=2 
    )
    
    
    predicted_summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    
    predictions.append(predicted_summary)
    references.append(reference_summary)

print("Generation complete.")

print("\n--- Calculating Final ROUGE Scores ---")


cleaned_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in predictions]
cleaned_refs = ["\n".join(nltk.sent_tokenize(ref.strip())) for ref in references]

result = rouge.compute(
    predictions=cleaned_preds, 
    references=cleaned_refs, 
    use_stemmer=True
)
result = {key: value * 100 for key, value in result.items()}
result = {k: round(v, 4) for k, v in result.items()}

print(result)

print("\n--- Example Summaries (from Test Set) ---")
for i in range(3):
    print(f"\n===== EXAMPLE {i+1} =====")
    print(f"\nARTICLE:\n{eval_samples[i]['article'][:700]}...")
    print(f"\nREFERENCE SUMMARY (Human):\n{references[i]}")
    print(f"\nMODEL SUMMARY (Generated):\n{predictions[i]}")

print("\n\nTask 2 is complete.")

--- Generating 50 summaries for evaluation ---


100%|██████████| 50/50 [01:00<00:00,  1.22s/it]


Generation complete.

--- Calculating Final ROUGE Scores ---
{'rouge1': 38.5214, 'rouge2': 16.8723, 'rougeL': 25.6611, 'rougeLsum': 34.6478}

--- Example Summaries (from Test Set) ---

===== EXAMPLE 1 =====

ARTICLE:
US officials are expected to stop prosecuting families of American hostages who communicate with kidnappers abroad or raise funds and pay ransoms. A National Counterterrorism Center advisory group, ordered by the White House, is expected to recommend what would mark a radical shift in US hostage policy, ABC news reported on Sunday. The NCTC interviewed families of hostages, including the parents of journalist James Foley, who was killed by Islamic State fighters. The family of a US contractor held by Al-Qaeda militants, Warren Weinstein (above), confirmed they paid a ransom in an attempt to secure his release which was reported in the amount of $250,000 . Foley's mother Diane said that offic...

REFERENCE SUMMARY (Human):
Senior official said there will be 'zero chance' th

In [4]:

!zip -r t5_lora_model.zip /kaggle/working/t5-summarizer-lora

print("t5_lora_model.zip created successfully.")
print("Please download it from the /kaggle/working/ directory in the file browser.")

  adding: kaggle/working/t5-summarizer-lora/ (stored 0%)
  adding: kaggle/working/t5-summarizer-lora/adapter_model.safetensors (deflated 8%)
  adding: kaggle/working/t5-summarizer-lora/tokenizer_config.json (deflated 94%)
  adding: kaggle/working/t5-summarizer-lora/special_tokens_map.json (deflated 85%)
  adding: kaggle/working/t5-summarizer-lora/added_tokens.json (deflated 83%)
  adding: kaggle/working/t5-summarizer-lora/adapter_config.json (deflated 51%)
  adding: kaggle/working/t5-summarizer-lora/README.md (deflated 66%)
  adding: kaggle/working/t5-summarizer-lora/spiece.model (deflated 48%)
  adding: kaggle/working/t5-summarizer-lora/training_args.bin (deflated 51%)
t5_lora_model.zip created successfully.
Please download it from the /kaggle/working/ directory in the file browser.
