# Preliminary Steps for T5

## Preprocessing

- **Text Cleaning**: Clean the text data, removing any irrelevant characters, correcting formatting issues, and standardizing text for better model performance.

- **Tokenization**: Convert the text into token IDs the T5 model can consume, using the `T5Tokenizer` that matches the pretrained checkpoint (`t5-base`).

- **Data Splitting**: Split the dataset into training, validation, and test sets (80% / 10% / 10%, with a fixed random seed for reproducibility).

In [None]:
!pip install transformers



In [None]:
import pandas as pd
from transformers import T5Tokenizer
from sklearn.model_selection import train_test_split

# Load the dataset
# Reads the raw CSV from the current working directory. The rest of this cell
# reads the 'abstract' and 'title' columns and later saves 'ID', 'title',
# 'topic', 'abstract', 'Date' and 'keywords', so all of them must be present.
data = pd.read_csv('NYT_Dataset.csv')

# Data Cleaning Function
def clean_text(text):
    """Return a trimmed, single-line version of *text*.

    Args:
        text: One cell value from the dataset. Missing values (``None``/NaN)
            and non-string scalars are accepted.

    Returns:
        ``""`` for missing values; otherwise the value as a string with
        surrounding whitespace removed and every line break replaced by a
        single space.
    """
    if pd.isna(text):
        return ""  # Missing values become empty strings so they can be filtered later
    # Non-string scalars (e.g. a numeric-looking title parsed as a number)
    # have no .strip(); convert them instead of crashing.
    text = str(text)
    # Replace "\r\n" first so a Windows line ending becomes one space and
    # no stray carriage return is left behind.
    return text.strip().replace("\r\n", " ").replace("\r", " ").replace("\n", " ")

# Normalise the two text columns used as model input (abstract) and target (title)
data['abstract'] = data['abstract'].map(clean_text)
data['title'] = data['title'].map(clean_text)

# Initialize T5 Tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-base')

# Function to tokenize text
def tokenize_texts(text):
    """Encode *text* with the module-level T5 ``tokenizer``.

    The encoding is truncated to at most 512 tokens and requested as a
    PyTorch tensor (``return_tensors="pt"``). Adjust ``max_length`` if your
    texts need a different budget.
    """
    encode_options = {
        "max_length": 512,
        "truncation": True,
        "return_tensors": "pt",
    }
    return tokenizer.encode(text, **encode_options)

# Tokenize abstracts (inputs) and titles (targets); the encodings are stored
# as extra columns next to the cleaned text
data['tokenized_abstracts'] = data['abstract'].apply(tokenize_texts)
data['tokenized_titles'] = data['title'].apply(tokenize_texts)

# Split the data into training, validation, and test sets:
# first hold out 20% of the rows, then cut that hold-out in half
train, dev = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)
validation, test = train_test_split(
    dev,
    test_size=0.5,
    random_state=42,
)

# Display the structure of the split data
print(f"Training Set: {len(train)} samples")
print(f"Validation Set: {len(validation)} samples")
print(f"Test Set: {len(test)} samples")

# Save the cleaned dataset (original columns only, without the token columns)
columns_to_save = ['ID', 'title', 'topic', 'abstract', 'Date', 'keywords']
cleaned_subset = data.loc[:, columns_to_save]
cleaned_subset.to_csv('cleaned_NYT_Dataset.csv', index=False)
print(f"Dataset saved as {'cleaned_NYT_Dataset.csv'}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Training Set: 85204 samples
Validation Set: 10651 samples
Test Set: 10651 samples
Dataset saved as cleaned_NYT_Dataset.csv


# Model Setup

## Data Formatting & PyTorch Data Loaders

- **Format the Data**: Adjust the dataset to be in a format that the T5 model can consume directly during training (e.g., abstract text, title text).

- **Dataset Loading and Batching**: Utilize PyTorch's DataLoader to handle batching and data shuffling.

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the pre-trained T5 model and tokenizer
model = T5ForConditionalGeneration.from_pretrained('t5-base')
tokenizer = T5Tokenizer.from_pretrained('t5-base')

# Prefer the GPU when one is available (highly recommended for training
# speed); otherwise fall back to the CPU
import torch
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
model.to(device)

print(f"Model loaded and configured. Using device: {device}")

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Model loaded and configured. Using device: cuda


In [None]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch

# Function to format the data for T5
def format_for_t5(row):
    """Turn one dataset row into a T5 input/target text pair.

    The abstract is prefixed with the ``"summarize: "`` task prompt to form
    the input; the title is passed through unchanged as the target.

    Args:
        row: Mapping-like object with ``'abstract'`` and ``'title'`` entries.

    Returns:
        A dict with the keys ``'input_text'`` and ``'target_text'``.
    """
    prompt = f"summarize: {row['abstract']}"
    return {"input_text": prompt, "target_text": row['title']}

# Apply the formatting function to each split and drop rows with an empty title
def _format_split(split):
    """Return *split* as an input_text/target_text frame with non-empty targets.

    Args:
        split: DataFrame with the 'abstract' and 'title' columns read by
            ``format_for_t5``.

    Returns:
        A new DataFrame with the columns 'input_text' and 'target_text',
        restricted to rows whose target is not blank.
    """
    formatted = pd.DataFrame(list(split.apply(format_for_t5, axis=1)))
    has_title = formatted['target_text'].str.strip() != ''
    # .copy() yields an independent frame, so adding columns to it later does
    # not trigger pandas' SettingWithCopyWarning.
    return formatted[has_title].copy()


formatted_train_data = _format_split(train)
formatted_validation_data = _format_split(validation)
formatted_test_data = _format_split(test)

In [None]:
# Function to tokenize the data for T5
def tokenize_for_t5(input_text, target_text):
    """Encode an input/target text pair with the module-level ``tokenizer``.

    Both texts are truncated and padded to a fixed length: 512 tokens for the
    input and 128 tokens for the target.

    Returns:
        A ``(input_ids, target_ids)`` tuple.
    """
    shared_options = {"truncation": True, "padding": "max_length"}
    return (
        tokenizer.encode(input_text, max_length=512, **shared_options),
        tokenizer.encode(target_text, max_length=128, **shared_options),
    )

# Apply tokenization
def _encode_row(row):
    """Tokenize the input/target text pair held in one formatted row."""
    return tokenize_for_t5(row['input_text'], row['target_text'])


# result_type="expand" spreads the returned pair over the two new columns
formatted_train_data[['input_ids', 'target_ids']] = formatted_train_data.apply(
    _encode_row, axis=1, result_type="expand"
)
formatted_validation_data[['input_ids', 'target_ids']] = formatted_validation_data.apply(
    _encode_row, axis=1, result_type="expand"
)
formatted_test_data[['input_ids', 'target_ids']] = formatted_test_data.apply(
    _encode_row, axis=1, result_type="expand"
)

# Define a custom dataset class
class T5Dataset(Dataset):
    """Dataset over a frame that has 'input_ids' and 'target_ids' columns.

    Each item is a dict of tensors with the keys 'input_ids',
    'attention_mask' and 'labels'.
    """

    def __init__(self, data):
        # Kept as-is; rows are looked up positionally in __getitem__
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        token_ids = row['input_ids']
        pad_id = tokenizer.pad_token_id
        # Mask is 1 for every non-padding token and 0 for padding
        mask_values = [int(token != pad_id) for token in token_ids]
        return {
            'input_ids': torch.tensor(token_ids),
            'attention_mask': torch.tensor(mask_values),
            'labels': torch.tensor(row['target_ids']),
        }

# Instantiate datasets (one per split)
train_dataset, validation_dataset, test_dataset = map(
    T5Dataset,
    (formatted_train_data, formatted_validation_data, formatted_test_data),
)

# Create DataLoaders; only the training split is shuffled
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
validation_loader = DataLoader(dataset=validation_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)


# Training & Evaluation

In [None]:
!pip install tqdm



In [None]:
!pip install nltk rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [None]:
import nltk
from rouge import Rouge
from nltk.translate.bleu_score import corpus_bleu

# Function to convert ids to text
def ids_to_text(ids, tokenizer):
    """Decode *ids* with *tokenizer*, leaving special tokens out of the text."""
    decoded = tokenizer.decode(ids, skip_special_tokens=True)
    return decoded

# Function to calculate BLEU and ROUGE scores
def calculate_metrics(references, hypotheses):
    """Score generated texts against their references.

    Args:
        references: Reference strings, one per example.
        hypotheses: Generated strings, aligned with *references*.

    Returns:
        A ``(bleu_score, rouge_score)`` tuple: the corpus-level BLEU and the
        averaged result of ``Rouge.get_scores``.
    """
    # BLEU works on whitespace tokens; each hypothesis has exactly one
    # reference, wrapped in its own list.
    reference_tokens = [[reference.split()] for reference in references]
    hypothesis_tokens = [hypothesis.split() for hypothesis in hypotheses]
    bleu_score = corpus_bleu(reference_tokens, hypothesis_tokens)

    # ROUGE takes the raw strings, hypotheses first
    rouge_score = Rouge().get_scores(hypotheses, references, avg=True)

    return bleu_score, rouge_score

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
import torch


# Training parameters
epochs = 3
batch_size = 16  # Not read by this cell; train_loader was built with its own batch size
# Early-stopping bookkeeping; defined here but not read by the loop below
best_val_loss = float('inf')
patience_counter = 0
patience = 2

# Set up optimizer and scheduler (linear decay over all steps, no warmup)
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)
total_training_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_training_steps
)

# Training loop
for epoch in range(epochs):
    # Re-assert training mode every epoch in case an evaluation cell was run
    # and left the model in eval mode.
    model.train()
    total_loss = 0.0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}")
    # Count steps explicitly: tqdm only refreshes its own `n` counter when it
    # redraws, so using it as the divisor skews the running average.
    for step, batch in enumerate(progress_bar, start=1):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # Targets are padded to a fixed length; set padding positions to -100
        # so the loss ignores them instead of training on pad tokens.
        labels = labels.masked_fill(labels == tokenizer.pad_token_id, -100)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        progress_bar.set_postfix(loss=f"{total_loss / step:.2f}")

    # Report the 1-based epoch number, matching the progress bar label
    print(f"Epoch {epoch + 1} Training Loss: {total_loss / len(train_loader):.2f}")

    # Save a checkpoint at the end of each epoch. The file name and the stored
    # 'epoch' stay 0-based; the scheduler state is included so the learning
    # rate schedule can be restored along with the optimizer.
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
    }, f'checkpoint_epoch_{epoch}.pth')


In [None]:
import time

def validate_and_calculate_scores(model, val_loader, tokenizer):
    """Compute validation loss and BLEU/ROUGE for generated titles.

    For every batch the model both generates a title (for the text metrics)
    and runs a forward pass with labels (for the loss). Tensors are moved to
    the module-level ``device``. The model is left in eval mode.

    Args:
        model: Model providing ``generate`` and a labelled forward pass.
        val_loader: Iterable of batches with 'input_ids', 'attention_mask'
            and 'labels' tensors.
        tokenizer: Tokenizer used to decode labels and generated ids.

    Returns:
        ``(mean_loss, bleu, rouge, evaluation_time)``. ``mean_loss`` is the
        summed batch loss divided by the number of batches, and
        ``evaluation_time`` covers the generation/loss loop only. ``bleu``
        and ``rouge`` are both ``0`` if the metric computation raised
        ``ValueError``.

    Raises:
        ValueError: If no example with a non-empty reference was collected.
    """
    model.eval()
    total_loss = 0
    references = []
    hypotheses = []
    start_time = time.time()  # Start timing

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            # Padding positions must not count towards the loss. Mask a copy
            # so the original ids remain decodable below.
            loss_labels = labels.masked_fill(labels == tokenizer.pad_token_id, -100)

            # Generate outputs
            outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=50, no_repeat_ngram_size=2, early_stopping=True)
            loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=loss_labels).loss
            total_loss += loss.item()

            for label, output in zip(labels, outputs):
                decoded_label = tokenizer.decode(label, skip_special_tokens=True).strip()
                decoded_output = tokenizer.decode(output, skip_special_tokens=True).strip()
                # Skip examples whose reference decodes to nothing
                if decoded_label:
                    references.append(decoded_label)
                    hypotheses.append(decoded_output)

    evaluation_time = time.time() - start_time  # End timing
    print(f"Evaluation took {evaluation_time:.2f} seconds")

    if not references or not hypotheses:
        raise ValueError("No valid data found for evaluation. Please check your dataset.")

    try:
        bleu, rouge = calculate_metrics(references, hypotheses)
    except ValueError as e:
        # Best effort: report the problem and fall back to zero scores so the
        # loss is still returned.
        print(f"Error calculating metrics: {e}")
        bleu, rouge = 0, 0

    return total_loss / len(val_loader), bleu, rouge, evaluation_time

# Run evaluation independently of the training loop, on the validation split
validation_results = validate_and_calculate_scores(model, validation_loader, tokenizer)
val_loss, bleu_score, rouge_scores, eval_time = validation_results
print(f"Validation Loss: {val_loss:.2f}, BLEU Score: {bleu_score:.2f}, ROUGE Scores: {rouge_scores}, Evaluation Time: {eval_time:.2f} seconds")

In [None]:
def display_input_output_pairs(model, data_loader, tokenizer, num_pairs=5):
    """Print input, generated title and reference title for a few examples.

    Only the first batch of *data_loader* is used. Tensors are moved to the
    module-level ``device``. The model is left in eval mode.

    Args:
        model: Model providing ``generate``.
        data_loader: Iterable of batches with 'input_ids', 'attention_mask'
            and 'labels' tensors.
        tokenizer: Tokenizer used to decode ids back to text.
        num_pairs: Maximum number of examples to print. Fewer are shown when
            the first batch holds fewer rows.
    """
    model.eval()
    batch = next(iter(data_loader))  # Get one batch from the DataLoader
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    # Generate outputs
    outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=50, no_repeat_ngram_size=2, early_stopping=True)

    print("Displaying Input-Output Pairs:\n")
    # Clamp to the batch size so a small batch (or a large num_pairs) cannot
    # index past the end of the tensors.
    shown = max(0, min(num_pairs, len(input_ids)))
    for i in range(shown):
        input_text = tokenizer.decode(input_ids[i], skip_special_tokens=True)
        generated_title = tokenizer.decode(outputs[i], skip_special_tokens=True)
        reference_title = tokenizer.decode(labels[i], skip_special_tokens=True)

        print(f"Input Abstract {i+1}: {input_text}")
        print(f"Generated Title {i+1}: {generated_title}")
        print(f"Reference Title {i+1}: {reference_title}\n")

# Requires model, validation_loader and tokenizer from the earlier cells
display_input_output_pairs(
    model,
    validation_loader,
    tokenizer,
    num_pairs=5,
)
