# Preliminary Steps for T5 with Separate Attention Layer

## Preprocessing

- **Text Cleaning**: Clean the text data, removing any irrelevant characters, correcting formatting issues, and standardizing text for better model performance.

- **Tokenization**: Convert text data into a format suitable for the T5 model, typically using a tokenizer specific to T5.

- **Data Splitting**: Split the dataset into training, validation, and testing sets.

In [None]:
!pip install transformers



In [None]:
import pandas as pd
from transformers import T5Tokenizer
from sklearn.model_selection import train_test_split

# Load the dataset
# The cells below read the 'abstract', 'title', 'topic', 'keywords', 'ID' and
# 'Date' columns from this frame.
data = pd.read_csv('NYT_Dataset.csv')

# Data Cleaning Function
def clean_text(text):
    """Return *text* stripped of surrounding whitespace with newlines flattened.

    Args:
        text: A cell value from the dataset. Usually a string, but missing
            values (NaN/None) and non-string scalars are accepted too.

    Returns:
        An empty string for missing values; otherwise the string form of
        *text* with leading/trailing whitespace removed and every newline
        replaced by a single space.
    """
    if pd.isna(text):
        return ""  # Missing cells become empty strings so downstream code sees str only
    if not isinstance(text, str):
        # Columns parsed as numbers would otherwise fail on .strip()
        text = str(text)
    return text.strip().replace("\n", " ")

# Normalise both free-text columns in place (abstract first, then title)
for text_column in ('abstract', 'title'):
    data[text_column] = data[text_column].apply(clean_text)

# Initialize T5 Tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Function to tokenize text
def tokenize_texts(text):
    """Encode *text* with the module-level T5 tokenizer.

    The input is truncated to at most 512 tokens and the ids are returned as
    a PyTorch tensor. Adjust ``max_length`` if your texts need a different cap.
    """
    encode_options = {
        "max_length": 512,
        "truncation": True,
        "return_tensors": "pt",
    }
    return tokenizer.encode(text, **encode_options)

# Tokenize abstracts (inputs) first, then titles (targets)
for source_column, token_column in (
    ('abstract', 'tokenized_abstracts'),
    ('title', 'tokenized_titles'),
):
    data[token_column] = data[source_column].apply(tokenize_texts)

# 80% train; the remaining 20% is halved into validation and test
train, dev = train_test_split(data, test_size=0.2, random_state=42)
validation, test = train_test_split(dev, test_size=0.5, random_state=42)

# Report how many rows landed in each split
print(f"Training Set: {len(train)} samples")
print(f"Validation Set: {len(validation)} samples")
print(f"Test Set: {len(test)} samples")

# Persist only the original (cleaned) columns, not the tokenized tensors
columns_to_save = ['ID', 'title', 'topic', 'abstract', 'Date', 'keywords']
data.to_csv('cleaned_NYT_Dataset.csv', columns=columns_to_save, index=False)
print(f"Dataset saved as {'cleaned_NYT_Dataset.csv'}")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Training Set: 85204 samples
Validation Set: 10651 samples
Test Set: 10651 samples
Dataset saved as cleaned_NYT_Dataset.csv


## Word2Vec Embedding for Keywords

In [None]:
import gensim.downloader as api
import numpy as np

# Load pre-trained Word2Vec embeddings
word_vectors = api.load('word2vec-google-news-300')  # 300-dimensional vectors


def get_embedding(keyword):
    """Return the Word2Vec vector for *keyword*, or zeros when it is unknown.

    Args:
        keyword: The token to look up in ``word_vectors``.

    Returns:
        A 1-D NumPy array. Out-of-vocabulary keywords yield an all-zero
        vector with the same length and dtype as the loaded vectors, so
        results can be stacked without shape or dtype surprises.
    """
    try:
        return word_vectors[keyword]
    except KeyError:
        # Size and dtype come from the loaded model rather than a literal 300,
        # so the fallback stays valid if a different embedding set is loaded.
        return np.zeros(word_vectors.vector_size, dtype=word_vectors.vectors.dtype)




In [None]:
def get_keyword_ids(keywords, max_keywords, keyword_to_id):
    """Convert a comma-separated keyword string into a fixed-length id list.

    Args:
        keywords: Comma-separated keywords. Non-string values (for example
            NaN or None from a missing cell) are treated as "no keywords".
        max_keywords: Length of the returned list; extra keywords are dropped.
        keyword_to_id: Mapping from keyword string to integer id.

    Returns:
        A list of exactly ``max_keywords`` ids (for non-negative
        ``max_keywords``). Unknown keywords and padding positions are both
        encoded as 0.
    """
    if isinstance(keywords, str):
        keyword_ids = [
            keyword_to_id.get(keyword, 0)
            for keyword in keywords.split(',')[:max_keywords]
        ]
    else:
        # Missing cells arrive as float NaN / None and have no .split()
        keyword_ids = []
    # Multiplying by a non-positive count yields [], so no length check is needed
    keyword_ids += [0] * (max_keywords - len(keyword_ids))
    return keyword_ids

In [None]:
import pandas as pd

# Collect every distinct keyword; each non-missing 'keywords' cell is one
# comma-separated string.
unique_keywords = set()
for keyword_field in data['keywords'].dropna():
    unique_keywords.update(keyword_field.split(','))

# Sort before numbering: set iteration order for strings can change between
# interpreter sessions, which would silently reshuffle ids between runs and
# invalidate anything saved that depends on them.
# NOTE(review): ids start at 0, and get_keyword_ids also uses 0 for unknown
# keywords and padding, so the first keyword is indistinguishable from padding.
keyword_to_id = {keyword: i for i, keyword in enumerate(sorted(unique_keywords))}

## Initialization of T5 Model, Tokenizer, and Device Setup

- Category and Keyword Embedding
- Integration in Data Preparation



In [None]:
import pandas as pd

# Count the distinct values of the 'topic' column
unique_categories = data['topic'].unique()
num_categories = len(unique_categories)
print(f"Number of unique categories: {num_categories}")

# Count the distinct keywords; each non-missing 'keywords' cell holds
# several keywords separated by commas
all_keywords = set()
for keyword_list in data['keywords'].dropna().str.split(','):
    all_keywords.update(keyword_list)
num_keywords = len(all_keywords)
print(f"Number of unique keywords: {num_keywords}")


Number of unique categories: 3
Number of unique keywords: 48268


In [None]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import T5ForConditionalGeneration, T5Tokenizer
from torch.nn.utils.rnn import pad_sequence

# Load the tokenizer and the T5 model
# NOTE: 'tokenizer' is re-created here from the same 't5-small' checkpoint
# that the preprocessing cell already loaded.
tokenizer = T5Tokenizer.from_pretrained('t5-small')
t5_model = T5ForConditionalGeneration.from_pretrained('t5-small')
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
t5_model.to(device)


class T5CustomDataset(Dataset):
    """Dataset yielding tokenized abstract/title pairs plus keyword embeddings.

    Each item is a dict with:
        ``input_ids``: ids of ``"summarize: <abstract>"`` padded to ``max_len``.
        ``labels``: ids of the title padded to ``max_len``.
        ``keyword_embeddings``: a ``(max_keywords, embedding_dim)`` float
            tensor; rows for unknown keywords and unused slots are zero.
        ``attention_mask``: 1 where ``input_ids`` is not the pad token, else 0.

    Args:
        data: DataFrame with 'abstract', 'title' and 'keywords' columns.
        tokenizer: Callable tokenizer exposing ``pad_token_id``.
        keyword_vectors: Either a keyword -> vector lookup supporting ``in``
            and ``[]`` (a dict, or an object exposing ``vector_size``), or a
            2-D tensor of vectors addressed through ``keyword_to_id``.
        max_len: Padding/truncation length for both inputs and labels.
        max_keywords: Number of keyword rows per sample.
        keyword_to_id: Required only when ``keyword_vectors`` is a tensor;
            maps each keyword to its row index in that tensor.

    Raises:
        TypeError: If ``keyword_vectors`` is a tensor and ``keyword_to_id`` is
            missing (keywords could not be resolved to rows).
        ValueError: If the embedding size cannot be inferred from an empty
            mapping.
    """

    def __init__(self, data, tokenizer, keyword_vectors, max_len=512, max_keywords=10, keyword_to_id=None):
        self.tokenizer = tokenizer
        self.data = data
        self.keyword_vectors = keyword_vectors
        self.max_len = max_len
        self.max_keywords = max_keywords
        self.keyword_to_id = keyword_to_id
        # Resolved once so every sample gets the same width, including samples
        # in which no keyword is found.
        self.embedding_dim = self._infer_embedding_dim()

    def _infer_embedding_dim(self):
        """Work out the keyword vector width from ``keyword_vectors``."""
        vectors = self.keyword_vectors
        if torch.is_tensor(vectors):
            if self.keyword_to_id is None:
                raise TypeError(
                    "keyword_vectors is a tensor; pass keyword_to_id so keywords "
                    "can be mapped to rows, or pass a keyword -> vector mapping"
                )
            return vectors.shape[-1]
        if hasattr(vectors, 'vector_size'):
            return int(vectors.vector_size)
        first_vector = next(iter(vectors.values()), None)
        if first_vector is None:
            raise ValueError("cannot infer the embedding size from an empty keyword_vectors mapping")
        return torch.as_tensor(first_vector).shape[-1]

    def _lookup(self, keyword):
        """Return the vector stored for *keyword*, or None when there is none."""
        if torch.is_tensor(self.keyword_vectors):
            row = self.keyword_to_id.get(keyword)
            return None if row is None else self.keyword_vectors[row]
        # Membership test first so lookups never insert keys into
        # default-producing mappings.
        if keyword in self.keyword_vectors:
            return self.keyword_vectors[keyword]
        return None

    def _embed_keywords(self, raw_keywords):
        """Build the zero-padded ``(max_keywords, embedding_dim)`` matrix."""
        matrix = torch.zeros(self.max_keywords, self.embedding_dim)
        if not isinstance(raw_keywords, str):
            # Missing 'keywords' cells are float NaN; treat them as no keywords
            return matrix
        for row, keyword in enumerate(raw_keywords.split(',')[:self.max_keywords]):
            vector = self._lookup(keyword)
            if vector is not None:
                matrix[row] = torch.as_tensor(vector, dtype=matrix.dtype)
        return matrix

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        input_text = f"summarize: {item['abstract']}"
        target_text = item['title']

        encoding = self.tokenizer(input_text, max_length=self.max_len, truncation=True, padding="max_length", return_tensors="pt")
        target_encoding = self.tokenizer(target_text, max_length=self.max_len, truncation=True, padding="max_length", return_tensors="pt")

        # return_tensors="pt" adds a leading batch axis of size 1; drop it
        input_ids = encoding['input_ids'].squeeze(0)
        labels = target_encoding['input_ids'].squeeze(0)

        return {
            'input_ids': input_ids,
            'labels': labels,
            'keyword_embeddings': self._embed_keywords(item['keywords']),
            'attention_mask': input_ids.ne(self.tokenizer.pad_token_id).int()
        }

def collate_fn(batch):
    """Merge a list of per-sample dicts into one dict of batched tensors.

    Every field is stacked along a new leading dimension, so all samples must
    carry equally-shaped tensors for each key.
    """
    def stack_field(field):
        return torch.stack([sample[field] for sample in batch])

    return {
        'input_ids': stack_field('input_ids'),
        'labels': stack_field('labels'),
        'attention_mask': stack_field('attention_mask'),
        'keyword_embeddings': stack_field('keyword_embeddings'),
    }


from collections import defaultdict

batch_size = 16


def _zero_keyword_vector():
    """Return the all-zero vector used for keywords without a Word2Vec entry."""
    return torch.zeros(word_vectors.vector_size)


# Assuming word_vectors is already loaded and keyword_to_id is properly initialized.
# T5CustomDataset looks keywords up by *string* (via .get / []), so the vectors
# must live in a keyword -> tensor mapping; a stacked tensor has neither method.
# A defaultdict is used because the dataset also indexes the mapping with a
# placeholder key to learn the vector shape, and that lookup must yield a zero
# vector instead of raising KeyError.
keyword_vectors = defaultdict(_zero_keyword_vector)
for kw in keyword_to_id:
    keyword_vectors[kw] = torch.from_numpy(word_vectors[kw]) if kw in word_vectors else _zero_keyword_vector()

# Only the training loader is shuffled; evaluation order stays fixed.
train_loader = DataLoader(T5CustomDataset(train, tokenizer, keyword_vectors), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
validation_loader = DataLoader(T5CustomDataset(validation, tokenizer, keyword_vectors), batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(T5CustomDataset(test, tokenizer, keyword_vectors), batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

class ModifiedT5Model(nn.Module):
    """T5 wrapper that fuses an averaged keyword embedding into every token.

    The keyword vectors of a sample are averaged, tiled along the sequence,
    concatenated with the token embeddings and projected back to ``emb_dim``
    by ``special_attention`` before being handed to T5 as ``inputs_embeds``.

    Args:
        t5_model: The wrapped T5 model; must provide ``get_input_embeddings()``,
            a ``config.pad_token_id`` and accept ``inputs_embeds``.
        emb_dim: Width of the T5 token embeddings (output width of the fusion).
        keyword_dim: Width of one keyword vector. Defaults to 300 to match the
            word2vec-google-news-300 vectors loaded in this file; pass
            ``keyword_dim=emb_dim`` to get the former ``2 * emb_dim`` input.
    """

    def __init__(self, t5_model, emb_dim, keyword_dim=300):
        super().__init__()
        self.t5_model = t5_model
        # Input width is token width + keyword width because of concatenation
        self.special_attention = nn.Linear(emb_dim + keyword_dim, emb_dim)

    def forward(self, input_ids, attention_mask, labels, keyword_embeddings):
        """Run the keyword-enhanced forward pass.

        Returns:
            ``(loss, logits)``; ``loss`` is None when T5 reports no loss.
        """
        input_embeddings = self.t5_model.get_input_embeddings()(input_ids)

        # Average over the keyword axis, then repeat for every token position.
        # Cast so float64/float16 keyword tensors cannot break the concat.
        keyword_summary = keyword_embeddings.mean(dim=1).to(input_embeddings.dtype)
        tiled_keywords = keyword_summary.unsqueeze(1).expand(-1, input_ids.size(1), -1)
        combined_embeddings = torch.cat((input_embeddings, tiled_keywords), dim=-1)

        enhanced_embeddings = self.special_attention(combined_embeddings)

        if labels is not None:
            pad_token_id = self.t5_model.config.pad_token_id
            if pad_token_id is not None:
                # -100 is the index the loss ignores; without this the padded
                # tail of every title contributes to the loss. masked_fill
                # returns a copy, so the caller's labels stay decodable.
                labels = labels.masked_fill(labels == pad_token_id, -100)

        outputs = self.t5_model(inputs_embeds=enhanced_embeddings, attention_mask=attention_mask, labels=labels)

        return getattr(outputs, 'loss', None), outputs.logits

# Derive the sizes from the objects already loaded instead of repeating
# literals that would silently go stale if the dataset or checkpoint changes.
num_keywords = len(keyword_to_id)
emb_dim = t5_model.config.d_model  # Embedding dimension of the loaded T5 model (its hidden size)
modified_t5_model = ModifiedT5Model(t5_model, emb_dim).to(device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Training & Evaluation

- **Embedding Handling**: For each batch, the training loop passes the keyword embeddings to the model, which averages them into a single vector per sample and combines that vector with the text input embeddings to enhance them.

- **Model Input Adjustments**: Depending on how your model is designed to accept input embeddings, you might need to modify the `model.forward` method or how the embeddings are passed to the model. This example simply concatenates the averaged keyword embedding with each text embedding and projects the result back to the model dimension with a linear layer, for demonstration. In practice, you might want to integrate them more seamlessly, perhaps through a custom layer or attention mechanism as discussed previously.

- **Checkpointing and Monitoring**: The training progress and checkpointing remain the same, ensuring you can monitor the training process and recover or start from checkpoints if necessary.

## Metric Evaluation

In [None]:
!pip install tqdm



In [None]:
!pip install nltk rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [None]:
import nltk
from rouge import Rouge
from nltk.translate.bleu_score import corpus_bleu

# Function to convert ids to text
def ids_to_text(ids, tokenizer):
    """Decode a sequence of token ids into text, dropping special tokens."""
    decoded_text = tokenizer.decode(ids, skip_special_tokens=True)
    return decoded_text

# Function to calculate BLEU and ROUGE scores
def calculate_metrics(references, hypotheses):
    """Score generated texts against references with BLEU and ROUGE.

    Args:
        references: Reference strings, one per example.
        hypotheses: Generated strings, aligned with ``references``.

    Returns:
        ``(bleu_score, rouge_score)``: the corpus-level BLEU value and the
        averaged ROUGE scores as returned by ``Rouge.get_scores``.
    """
    # corpus_bleu expects a list of reference *lists* per hypothesis, each
    # reference and hypothesis being a list of whitespace-split tokens.
    tokenized_references = [[reference.split()] for reference in references]
    tokenized_hypotheses = [hypothesis.split() for hypothesis in hypotheses]
    bleu_score = corpus_bleu(tokenized_references, tokenized_hypotheses)

    # ROUGE works on the raw strings and takes hypotheses first
    rouge_score = Rouge().get_scores(hypotheses, references, avg=True)

    return bleu_score, rouge_score

## Training Loop

In [None]:
# transformers' own AdamW is deprecated in favour of the PyTorch optimizer and
# is no longer importable from recent releases, so use torch.optim directly.
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm
import torch

# 'model' is the ModifiedT5Model instance, whose forward takes keyword embeddings
model = modified_t5_model
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)

# Training parameters
epochs = 2
batch_size = 16
# NOTE: the three values below are not read by this loop (no early stopping
# is performed here); they are kept for cells that may rely on them.
best_val_loss = float('inf')
patience_counter = 0
patience = 2

# One scheduler step per optimizer step, decaying linearly over the whole run
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * epochs)

# Training loop
for epoch in range(epochs):
    # Re-enter train mode every epoch in case an evaluation switched it off
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}")
    # Iterate the bar itself (not the loader) so it advances, and count steps
    # explicitly so the displayed value is a true running mean.
    for step, batch in enumerate(progress_bar, start=1):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        keyword_embeddings = batch['keyword_embeddings'].to(device)

        loss, logits = model(input_ids, attention_mask, labels, keyword_embeddings)

        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        progress_bar.set_postfix(loss=f"{total_loss / step:.2f}")

    # Save a checkpoint at the end of each epoch; the scheduler state is
    # included so a resumed run continues the same learning-rate decay.
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
    }, f'checkpoint_epoch_{epoch}.pth')

# total_loss is the summed batch loss of the final epoch
print(f"Training completed with total loss: {total_loss}")

In [None]:
import time

def validate_and_calculate_scores(model, val_loader, tokenizer):
    """Evaluate the keyword-enhanced model: mean loss, BLEU, ROUGE and timing.

    Args:
        model: A ModifiedT5Model-style module exposing ``t5_model`` and
            ``special_attention`` and returning ``(loss, logits)`` from
            ``model(input_ids, attention_mask, labels, keyword_embeddings)``.
        val_loader: Loader yielding dicts with 'input_ids', 'attention_mask',
            'labels' and 'keyword_embeddings' tensors.
        tokenizer: Tokenizer used to decode ids; must expose ``pad_token_id``.

    Returns:
        ``(mean_loss, bleu, rouge, evaluation_seconds)``. ``bleu`` and
        ``rouge`` are both 0 when metric computation raises ValueError.

    Raises:
        ValueError: If no sample with a non-empty reference was collected.
    """
    model.eval()
    # Follow the model's own placement rather than a module-level variable
    run_device = next(model.parameters()).device
    total_loss = 0
    references = []
    hypotheses = []
    start_time = time.time()  # Start timing

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(run_device)
            attention_mask = batch['attention_mask'].to(run_device)
            labels = batch['labels'].to(run_device)
            keyword_embeddings = batch['keyword_embeddings'].to(run_device)

            # The wrapper's forward returns a (loss, logits) tuple
            loss, _ = model(input_ids, attention_mask, labels, keyword_embeddings)
            total_loss += loss.item()

            # The wrapper has no generate(); rebuild the same keyword-enhanced
            # embeddings its forward uses and let the inner T5 decode from them.
            token_embeddings = model.t5_model.get_input_embeddings()(input_ids)
            keyword_summary = keyword_embeddings.mean(dim=1).to(token_embeddings.dtype)
            tiled_keywords = keyword_summary.unsqueeze(1).expand(-1, input_ids.size(1), -1)
            enhanced_embeddings = model.special_attention(torch.cat((token_embeddings, tiled_keywords), dim=-1))
            outputs = model.t5_model.generate(inputs_embeds=enhanced_embeddings, attention_mask=attention_mask, max_length=50, no_repeat_ngram_size=2, early_stopping=True)

            # Negative ids (e.g. -100 loss masks) cannot be decoded
            decodable_labels = labels.masked_fill(labels < 0, tokenizer.pad_token_id)
            for label, output in zip(decodable_labels, outputs):
                decoded_label = tokenizer.decode(label, skip_special_tokens=True).strip()
                decoded_output = tokenizer.decode(output, skip_special_tokens=True).strip()
                if decoded_label:
                    references.append(decoded_label)
                    hypotheses.append(decoded_output)

    evaluation_time = time.time() - start_time  # End timing
    print(f"Evaluation took {evaluation_time:.2f} seconds")

    if not references or not hypotheses:
        raise ValueError("No valid data found for evaluation. Please check your dataset.")

    try:
        bleu, rouge = calculate_metrics(references, hypotheses)
    except ValueError as e:
        print(f"Error calculating metrics: {e}")
        bleu, rouge = 0, 0

    return total_loss / len(val_loader), bleu, rouge, evaluation_time

# Run evaluation independently
# Unpacks the four values returned by validate_and_calculate_scores:
# average loss, BLEU, ROUGE and elapsed seconds.
val_loss, bleu_score, rouge_scores, eval_time = validate_and_calculate_scores(model, validation_loader, tokenizer)
print(f"Validation Loss: {val_loss:.2f}, BLEU Score: {bleu_score:.2f}, ROUGE Scores: {rouge_scores}, Evaluation Time: {eval_time:.2f} seconds")

In [None]:
def display_input_output_pairs(model, data_loader, tokenizer, attribute_embeddings=None, category_map=None, keyword_map=None, num_pairs=5):
    """Print abstracts, generated titles and reference titles for one batch.

    Two input layouts are supported:

    * Keyword-embedding batches (the default, ``attribute_embeddings=None``):
      the batch carries 'keyword_embeddings' and ``model`` exposes
      ``t5_model`` and ``special_attention``.
    * Id-based batches: the batch carries 'category_ids' and 'keyword_ids',
      and ``attribute_embeddings(category_ids, keyword_ids)`` returns the
      category and keyword embeddings that are added to the text embeddings.

    Args:
        model: The model to generate with.
        data_loader: Loader to draw a single batch from.
        tokenizer: Tokenizer used for decoding; must expose ``pad_token_id``.
        attribute_embeddings: Optional callable for the id-based layout.
        category_map: Optional id -> category name mapping (id-based layout).
        keyword_map: Optional id -> keyword mapping (id-based layout).
        num_pairs: Maximum number of examples to print.
    """
    model.eval()
    batch = next(iter(data_loader))  # Get one batch from the DataLoader
    run_device = next(model.parameters()).device
    # A wrapper keeps the underlying T5 in .t5_model; a bare model is used as is
    base_model = getattr(model, 't5_model', model)

    input_ids = batch['input_ids'].to(run_device)
    attention_mask = batch['attention_mask'].to(run_device)
    labels = batch['labels'].to(run_device)

    category_ids = None
    keyword_ids = None
    with torch.no_grad():
        text_embeddings = base_model.get_input_embeddings()(input_ids)

        if attribute_embeddings is not None:
            # Id-based layout: add category + keyword embeddings to every token
            category_ids = batch['category_ids'].to(run_device)
            keyword_ids = batch['keyword_ids'].to(run_device)
            category_emb, keyword_emb = attribute_embeddings(category_ids, keyword_ids)
            extra_emb = category_emb + keyword_emb
            enhanced_text_embeddings = text_embeddings + extra_emb.unsqueeze(1).expand(-1, text_embeddings.size(1), -1)
        else:
            # Keyword-embedding layout: average, tile, concatenate and project
            keyword_embeddings = batch['keyword_embeddings'].to(run_device)
            keyword_summary = keyword_embeddings.mean(dim=1).to(text_embeddings.dtype)
            tiled_keywords = keyword_summary.unsqueeze(1).expand(-1, text_embeddings.size(1), -1)
            enhanced_text_embeddings = model.special_attention(torch.cat((text_embeddings, tiled_keywords), dim=-1))

        outputs = base_model.generate(inputs_embeds=enhanced_text_embeddings, attention_mask=attention_mask, max_length=50, no_repeat_ngram_size=2, early_stopping=True)

    # Negative ids (e.g. -100 loss masks) cannot be decoded
    decodable_labels = labels.masked_fill(labels < 0, tokenizer.pad_token_id)

    print("Displaying Input-Output Pairs:\n")
    for i in range(min(num_pairs, len(batch['input_ids']))):
        input_text = tokenizer.decode(input_ids[i], skip_special_tokens=True)
        generated_title = tokenizer.decode(outputs[i], skip_special_tokens=True)
        reference_title = tokenizer.decode(decodable_labels[i], skip_special_tokens=True)

        print(f"Input Abstract {i+1}: {input_text}")
        print(f"Generated Title {i+1}: {generated_title}")
        print(f"Reference Title {i+1}: {reference_title}")

        if category_ids is not None and category_map is not None:
            print(f"Category: {category_map[category_ids[i].item()]}")
        if keyword_ids is not None and keyword_map is not None:
            # Convert to plain ints: tensor elements never match dict keys
            keywords_names = [keyword_map[kw_id] for kw_id in keyword_ids[i].tolist() if kw_id in keyword_map]
            print(f"Keywords: {', '.join(keywords_names)}\n")
        else:
            print()

# Assuming model, validation_loader and tokenizer are already defined and set up.
# The loaders in this file yield keyword embeddings, so no attribute lookups are passed.
display_input_output_pairs(model, validation_loader, tokenizer, num_pairs=5)
