In [1]:
!pip install rouge nltk tqdm transformers

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


# I - Preprocessing

- **Data Cleaning**: Applies a cleaning function to the 'abstract' and 'title' columns.

- **Tokenization**: Applies a T5-small tokenizer to the abstracts and titles in the dataset, handling truncatin and tensor conversion.

- **Data Splitting**: Splitted into train-val-test, 80-10-10.

In [2]:
# Imports for data handling and machine learning
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the data
data = pd.read_csv('NYT_Dataset.csv')

# Function to clean text fields
def clean_text(text):
    return "" if pd.isna(text) else text.strip().replace("\n", " ")

# Function to format keywords from list-like string to a clean comma-separated string
def format_keywords(keyword_str):
    return ', '.join(eval(keyword_str)) if pd.notna(keyword_str) else ''

# Clean and format data fields
data['abstract'] = data['abstract'].apply(clean_text)
data['title'] = data['title'].apply(clean_text)
data['keywords'] = data['keywords'].apply(format_keywords)

# Combine keywords with abstracts
data['combined_abstract'] = data.apply(lambda x: f"{x['abstract']} Keywords: {x['keywords']}", axis=1)

# Initialize T5 tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Function to tokenize text
def tokenize_texts(text):
    return tokenizer.encode(text, max_length=512, truncation=True, return_tensors="pt")

# Apply tokenization to the combined abstracts and titles
data['tokenized_abstracts'] = data['combined_abstract'].apply(tokenize_texts)
data['tokenized_titles'] = data['title'].apply(tokenize_texts)

# Split the data into train, validation, and test sets
train, dev = train_test_split(data, test_size=0.2, random_state=42)
validation, test = train_test_split(dev, test_size=0.5, random_state=42)

# Save the cleaned and preprocessed data to a new CSV for further use
data[['ID', 'title', 'topic', 'combined_abstract', 'Date', 'keywords']].to_csv('cleaned_NYT_Dataset.csv', index=False)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


- **Model Configuration**: Pre-trained T5-small model for conditional generation configured to a device.

- **Data Formatting**: Prepares the data in a format suitable for T5, filtering non-empty target texts.

- **Tokenize Input**: Input and Target texts are tokenized inth T5 format, with padding and max length constraints.

In [3]:
# Model setup and configuration
model = T5ForConditionalGeneration.from_pretrained('t5-small')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Data formatting for T5
def format_for_t5(row):
    return {
        "input_text": f"summarize: {row['abstract']} Keywords: {row['keywords']}",
        "target_text": row['title']
    }

# Apply formatting to each split
formatted_train_data = train.apply(format_for_t5, axis=1).tolist()
formatted_validation_data = validation.apply(format_for_t5, axis=1).tolist()
formatted_test_data = test.apply(format_for_t5, axis=1).tolist()

# Convert to DataFrame and filter empty target texts
formatted_train_data = pd.DataFrame(formatted_train_data)
formatted_validation_data = pd.DataFrame(formatted_validation_data)
formatted_test_data = pd.DataFrame(formatted_test_data)

formatted_train_data = formatted_train_data[formatted_train_data['target_text'].str.strip() != '']
formatted_validation_data = formatted_validation_data[formatted_validation_data['target_text'].str.strip() != '']
formatted_test_data = formatted_test_data[formatted_test_data['target_text'].str.strip() != '']

# Filter and tokenize for T5
def tokenize_for_t5(input_text, target_text):
    input_ids = tokenizer.encode(input_text, truncation=True, padding="max_length", max_length=512)
    target_ids = tokenizer.encode(target_text, truncation=True, padding="max_length", max_length=128)
    return input_ids, target_ids

# Apply tokenization and expand into new columns
formatted_train_data[['input_ids', 'target_ids']] = formatted_train_data.apply(
    lambda row: tokenize_for_t5(row['input_text'], row['target_text']), axis=1, result_type="expand"
)
formatted_validation_data[['input_ids', 'target_ids']] = formatted_validation_data.apply(
    lambda row: tokenize_for_t5(row['input_text'], row['target_text']), axis=1, result_type="expand"
)
formatted_test_data[['input_ids', 'target_ids']] = formatted_test_data.apply(
    lambda row: tokenize_for_t5(row['input_text'], row['target_text']), axis=1, result_type="expand"
)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

- **Custom T5 Dataset**: Handles the dataset in a way compatible with PyTorch, including managing input IDs, attention masks, and labels.

- **Data Loaders**: Initialized to facilitate efficient batch processing during training and evaluation.

In [4]:
# Custom Dataset and DataLoader
class T5Dataset(Dataset):
    def __init__(self, data):
        self.data = data
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        input_ids = torch.tensor(item['input_ids'])
        labels = torch.tensor(item['target_ids'])
        attention_mask = torch.tensor([1 if token != tokenizer.pad_token_id else 0 for token in input_ids])
        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}

train_dataset = T5Dataset(formatted_train_data)
validation_dataset = T5Dataset(formatted_validation_data)
test_dataset = T5Dataset(formatted_test_data)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
validation_loader = DataLoader(validation_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# II - Training & Evaluation

- **ID-to-Text Conversion**: Converts tokenized IDs back into natural language using the tokenizer.

- **Evaluation w/ Metrics & Samples**: BLEU and ROUGE scores are used to evaluate the text outputs, and some sample generated/real pairs of titles are produced to visualize the real quality.

In [5]:
# Imports for NLP metrics and progress tracking
import nltk
from rouge import Rouge
from nltk.translate.bleu_score import corpus_bleu
from tqdm import tqdm
import torch
import time

# Imports from Huggingface Transformers
from transformers import AdamW, get_linear_schedule_with_warmup

# Define utility functions
def ids_to_text(ids, tokenizer):
    return tokenizer.decode(ids, skip_special_tokens=True)

def calculate_metrics(references, hypotheses):
    bleu_score = corpus_bleu([[ref.split()] for ref in references], [hyp.split() for hyp in hypotheses])
    rouge = Rouge()
    rouge_score = rouge.get_scores(hypotheses, references, avg=True)
    return bleu_score, rouge_score

In [6]:
import sys

# Training parameters and setup
epochs = 2
batch_size = 16  # already defined, repeated for clarity
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * epochs)

# Training loop
def train_model(model, train_loader, optimizer, scheduler, device, epochs=3):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        progress_bar = tqdm(total=len(train_loader), desc=f"Epoch {epoch + 1}", file=sys.stdout)  # Ensure tqdm writes to stdout directly
        for batch in train_loader:
            optimizer.zero_grad()
            inputs = {key: value.to(device) for key, value in batch.items()}
            outputs = model(**inputs)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()
            progress_bar.update(1)  # Manually update the progress bar
            progress_bar.set_postfix(loss=f"{total_loss / progress_bar.n:.2f}")
        progress_bar.close()  # Close the progress bar at the end of each epoch

        print(f"Epoch {epoch} Training Loss: {total_loss / len(train_loader):.2f}")
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
        }, f'checkpoint_epoch_{epoch}.pth')



In [7]:
# Train and evaluate
train_model(model, train_loader, optimizer, scheduler, device, epochs)

Epoch 1: 100%|██████████| 5326/5326 [34:49<00:00,  2.55it/s, loss=0.51]
Epoch 0 Training Loss: 0.51
Epoch 2: 100%|██████████| 5326/5326 [34:49<00:00,  2.55it/s, loss=0.41]
Epoch 1 Training Loss: 0.41
Epoch 3: 100%|██████████| 5326/5326 [34:50<00:00,  2.55it/s, loss=0.40]
Epoch 2 Training Loss: 0.40




ValueError: Reference is empty.

In [21]:
def validate_and_calculate_scores(model, val_loader, tokenizer, device):
    model.eval()
    total_loss = 0
    references, hypotheses = [], []
    start_time = time.time()

    with torch.no_grad():
        for batch in val_loader:
            inputs = {key: value.to(device) for key, value in batch.items() if key != 'labels'}
            outputs = model.generate(**inputs, max_length=50, no_repeat_ngram_size=2, early_stopping=True)
            loss = model(**inputs, labels=batch['labels'].to(device)).loss
            total_loss += loss.item()

            for label, output in zip(batch['labels'], outputs):
                ref = ids_to_text(label, tokenizer)
                hyp = ids_to_text(output, tokenizer)

                if ref.strip() and hyp.strip():  # Only add non-empty references and hypotheses
                    references.append(ref)
                    hypotheses.append(hyp)

    # Only calculate metrics if there are valid references and hypotheses
    if references and hypotheses:
        metrics = calculate_metrics(references, hypotheses)
    else:
        print("Warning: No valid references or hypotheses were collected. Metrics cannot be calculated.")
        metrics = {'bleu_score': 0, 'rouge_score': {'rouge-1': {'f': 0, 'p': 0, 'r': 0}, 'rouge-2': {'f': 0, 'p': 0, 'r': 0}, 'rouge-l': {'f': 0, 'p': 0, 'r': 0}}}

    evaluation_time = time.time() - start_time
    return total_loss / len(val_loader), metrics, evaluation_time

In [22]:
val_loss, (bleu_score, rouge_scores), eval_time = validate_and_calculate_scores(model, validation_loader, tokenizer, device)


In [23]:
print(f"Validation Loss: {val_loss:.2f}, BLEU Score: {bleu_score:.2f}, ROUGE Scores: {rouge_scores}, Evaluation Time: {eval_time:.2f} seconds")

Validation Loss: 0.36, BLEU Score: 0.04, ROUGE Scores: {'rouge-1': {'r': 0.18308436294934585, 'p': 0.2040952874566997, 'f': 0.18728738779170992}, 'rouge-2': {'r': 0.058922172416697456, 'p': 0.0643394003201377, 'f': 0.05978576251215122}, 'rouge-l': {'r': 0.1714839715377435, 'p': 0.19070280405406656, 'f': 0.1751893408772694}}, Evaluation Time: 456.43 seconds


In [17]:
def display_input_output_pairs(model, data_loader, tokenizer, num_pairs=5):
    model.eval()
    batch = next(iter(data_loader))  # Get one batch from the DataLoader
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    # Generate outputs
    outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=50, no_repeat_ngram_size=2, early_stopping=True)

    print("Displaying Input-Output Pairs:\n")
    for i in range(num_pairs):
        input_text = tokenizer.decode(input_ids[i], skip_special_tokens=True)
        generated_title = tokenizer.decode(outputs[i], skip_special_tokens=True)
        reference_title = tokenizer.decode(labels[i], skip_special_tokens=True)

        print(f"Input Abstract {i+1}: {input_text}")
        print(f"Generated Title {i+1}: {generated_title}")
        print(f"Reference Title {i+1}: {reference_title}\n")

# Assuming model, validation_loader, and tokenizer are already defined and set up
display_input_output_pairs(model, validation_loader, tokenizer, num_pairs=5)


Displaying Input-Output Pairs:

Input Abstract 1: summarize: The Most Rev. Rowan Williams, the archbishop of Canterbury, announced a consensus to seek the pact among all parties to the ecclesiastical controversy over homosexuality. Keywords: Christians and Christianity, Homosexuality, Williams, Rowan, ANGLICAN CHURCHES, Religion and Churches
Generated Title 1: Archbishop of Canterbury Defends a Covenant on Homosexuality
Reference Title 1: Anglicans to Seek Pact to Prevent a Schism

Input Abstract 2: summarize: President Petro O. Poroshenko ordered forces to halt their fire in the east, but a rebel leader said the accord did not apply to the contested town of Debaltseve. Keywords: Ukraine, Debaltseve (Ukraine), Russia, Zakharchenko, Aleksandr, Poroshenko, Petro Olekseyevich, People's Militia (Donetsk People's Republic), Europe
Generated Title 2: Ukraine: President Orders Fire in East
Reference Title 2: Ukraine Cease-Fire Goes Into Effect, but Rebel Leader in Key Town Repudiates Accord

