title


In [1]:
import pandas as pd

train_df = pd.read_csv('samsum_csv_data/train.csv')
validate_df = pd.read_csv('samsum_csv_data/validation.csv')

poc_train_df = train_df.head(500)
poc_validate_df = validate_df.head(100)

In [2]:
from datasets import Dataset # Import the Dataset class
from transformers import AutoTokenizer

# Convert pandas DataFrames to Hugging Face Dataset objects
# This is a lightweight conversion, doesn't copy data unnecessarily
hf_train_dataset = Dataset.from_pandas(poc_train_df)
hf_validate_dataset = Dataset.from_pandas(poc_validate_df)

print(f"\nHugging Face Train Dataset size: {len(hf_train_dataset)}")
print(f"Hugging Face Validation Dataset size: {len(hf_validate_dataset)}")

# Define the model name for the tokenizer (using 'bart-base' for efficiency)
MODEL_NAME = "t5-small"

# Load the tokenizer. This downloads the vocabulary and tokenization rules.
print(f"\nLoading tokenizer for model: {MODEL_NAME}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


  from .autonotebook import tqdm as notebook_tqdm



Hugging Face Train Dataset size: 500
Hugging Face Validation Dataset size: 100

Loading tokenizer for model: t5-small...


In [3]:
# Assuming MAX_INPUT_LENGTH and MAX_TARGET_LENGTH are already defined
# Consider if T5-small fits 1024. For a very small model, 512 might be more common,
# but 1024 is generally fine if memory allows with batch size 1.
MAX_INPUT_LENGTH = 1024 # BART's typical maximum input length
MAX_TARGET_LENGTH = 128 # Reasonable maximum length for summaries
def preprocess_function(examples):
    # --- IMPORTANT T5 ADJUSTMENT: Add a task prefix to the input dialogue ---
    # T5 models expect a task prefix like "summarize: " for summarization tasks.
    inputs_with_prefix = [f"summarize: {dialogue}" for dialogue in examples["dialogue"]]

    model_inputs = tokenizer(
        inputs_with_prefix, # Use the dialogues with the T5 prefix
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
        padding="max_length"
    )

    labels = tokenizer(
        text_target=examples["summary"],
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
        padding="max_length"
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

print(f"\nPreprocessing training data (tokenizing and aligning lengths)...")
tokenized_hf_train_dataset = hf_train_dataset.map(
    preprocess_function,
    batched=True, # Process examples in batches for speed
    remove_columns=['id', 'dialogue', 'summary'] # Remove original text columns
)

print(f"Preprocessing validation data...")
tokenized_hf_validate_dataset = hf_validate_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=['id', 'dialogue', 'summary']
)

print("\nData preprocessing complete.")
print("Sample of tokenized training data structure:")
print(tokenized_hf_train_dataset[0].keys()) # Show what keys are now in the dataset
print(f"Input IDs length: {len(tokenized_hf_train_dataset[0]['input_ids'])}")
print(f"Labels length: {len(tokenized_hf_train_dataset[0]['labels'])}")


Preprocessing training data (tokenizing and aligning lengths)...


Map: 100%|██████████| 500/500 [00:00<00:00, 3688.73 examples/s]


Preprocessing validation data...


Map: 100%|██████████| 100/100 [00:00<00:00, 3773.48 examples/s]


Data preprocessing complete.
Sample of tokenized training data structure:
dict_keys(['input_ids', 'attention_mask', 'labels'])
Input IDs length: 1024
Labels length: 128





In [7]:
import torch
from transformers import AutoModelForSeq2SeqLM, TrainingArguments, Trainer
# step 4
# Ensure the dataset format is set to PyTorch tensors
# This is crucial before passing to the Trainer
tokenized_hf_train_dataset.set_format("torch")
tokenized_hf_validate_dataset.set_format("torch")

# Load the pre-trained BART model for sequence-to-sequence tasks
# This downloads the model weights and architecture for 'bart-base'
MODEL_NAME = "t5-small" # Using the same model name as for tokenizer
print(f"\nLoading t5 model for sequence-to-sequence: {MODEL_NAME}...")
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to("cpu")
print(f"Model device after loading: {model.device}") # <-- ADD THIS LINE
print("Model loaded successfully.")

# Configure Training Arguments
# These define how the training will proceed (epochs, batch size, logging, etc.)
output_dir = "./t5_samsum_poc_results" # Directory to save model checkpoints and logs
print(f"\nSetting up Training Arguments. Output directory: {output_dir}")
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,                  # Number of training epochs (small for POC)
    per_device_train_batch_size=1,       # Batch size per GPU/CPU for training
    per_device_eval_batch_size=2,        # Batch size per GPU/CPU for evaluation
    warmup_steps=10,                     # Number of steps for learning rate warmup
    weight_decay=0.01,                   # L2 regularization to prevent overfitting
    logging_dir=f"{output_dir}/logs",   # Directory for TensorBoard logs
    logging_steps=5,                     # Log training metrics every N steps
    eval_strategy="steps",               # Evaluate every N steps 
    eval_steps=10,                       # How often to run evaluation 
    save_steps=10,                       # How often to save model checkpoints 
    report_to="none",                    # Do not report to external services like Weights & Biases 
    gradient_accumulation_steps=4,       # Accumulate gradients over N steps 
    # predict_with_generate is NOT a TrainingArguments parameter in your version,
    # it will be passed directly to the Trainer.
    load_best_model_at_end=True,         # Load the best model found during training 
    metric_for_best_model="rouge1",      # Metric to monitor for best model selection 
    greater_is_better=True,              # Higher ROUGE-1 is better 
)

print("\nTraining Arguments configured.")


Loading t5 model for sequence-to-sequence: t5-small...
Model device after loading: cpu
Model loaded successfully.

Setting up Training Arguments. Output directory: ./t5_samsum_poc_results

Training Arguments configured.


In [8]:
# step 5
import evaluate
import numpy as np

# Load the ROUGE metric (if not already loaded in the current kernel session)
print("\nLoading ROUGE metric...")
rouge_metric = evaluate.load("rouge")

# Define the compute_metrics function
def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    # --- IMPORTANT FIX HERE: Handle tuple output from Trainer ---
    # If predictions is a tuple, assume the actual logits/generated IDs are the first element.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # --- END IMPORTANT FIX ---

    # Now, check the shape of predictions. If it's 3D, it's likely logits.
    # We need to convert logits to token IDs by taking the argmax.
    if predictions.ndim == 3:
        predictions = np.argmax(predictions, axis=-1)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Post-process for ROUGE: remove extra whitespace and newlines
    decoded_preds = ["\n".join(pred.strip().split()) for pred in decoded_preds]
    decoded_labels = ["\n".join(label.strip().split()) for label in decoded_labels]

    result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    result = {k: round(v * 100, 4) for k, v in result.items()}
    return result

print("Metric computation function defined.")
if len(tokenized_hf_train_dataset) > 0:
    sample_batch = tokenized_hf_train_dataset[0]
    if isinstance(sample_batch, dict):
        # Check a tensor from the sample batch (e.g., 'input_ids')
        if 'input_ids' in sample_batch and isinstance(sample_batch['input_ids'], torch.Tensor):
            print(f"Sample train input_ids device: {sample_batch['input_ids'].device}")
        else:
            print("Sample batch does not contain 'input_ids' tensor, or it's not a tensor.")
    else:
        print("Sample batch is not a dictionary.")
else:
    print("Training dataset is empty, cannot check sample batch device.")

# Initialize the Hugging Face Trainer (this part remains the same as it worked last time)
print("\nInitializing the Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_hf_train_dataset,
    eval_dataset=tokenized_hf_validate_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

print("Trainer initialized successfully. Ready for training.")


Loading ROUGE metric...
Metric computation function defined.
Sample train input_ids device: cpu

Initializing the Trainer...


  trainer = Trainer(


Trainer initialized successfully. Ready for training.


In [9]:
#step6

# Start the training process
print("\nStarting model training...")
train_result = trainer.train()
print("Training complete.")

# Save the trained model and tokenizer
trainer.save_model() # Saves the model and tokenizer to the output_dir specified in TrainingArguments
# For good measure, you can also save the tokenizer explicitly if desired (though save_model usually handles it)
# tokenizer.save_pretrained(training_args.output_dir)

# Save training metrics (optional, but good for tracking progress)
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

print(f"\nModel and tokenizer saved to: {training_args.output_dir}")
print("Training metrics logged and saved.")

# Optionally, you can also run a final evaluation on the validation set after training
print("\nRunning final evaluation on the validation set...")
eval_metrics = trainer.evaluate(eval_dataset=tokenized_hf_validate_dataset)
trainer.log_metrics("eval", eval_metrics)
trainer.save_metrics("eval", eval_metrics)
print("Final evaluation complete.")
print(eval_metrics) # Print the evaluation results


Starting model training...


Step,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 598.00 MiB. GPU 0 has a total capacity of 5.61 GiB of which 596.75 MiB is free. Including non-PyTorch memory, this process has 4.30 GiB memory in use. Of the allocated memory 3.58 GiB is allocated by PyTorch, and 647.66 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)