## Fine-tune and train T5-small
### The following code was run in Google Colab for better performance

In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import sentence_bleu
from tqdm import tqdm
import torch
import pandas as pd
import numpy as np
import nltk

# Ensure NLTK resources are available
# Downloads the 'punkt' data package into the local nltk_data directory (no-op if already present).
# NOTE(review): the BLEU scoring in this notebook tokenizes with str.split(), so punkt
# may not actually be required — confirm before removing.
nltk.download('punkt')


[nltk_data] Downloading package punkt to /Users/owendolan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
### USE THE FOLLOWING PATH IF ON COLAB
"""
from google.colab import drive
drive.mount('/content/drive')
file_path = '/content/drive/MyDrive/email_summaries_data.json'
df = pd.read_json(file_path, lines=True)
"""

### USE THE FOLLOWING IF RUNNING LOCALLY
# the dataset is JSON Lines: one {"email": ..., "summary": ...} record per line
file_path = "../../data/email_summaries_data.json"
df = pd.read_json(file_path, lines=True)

# fail fast when either required column is absent
assert {'email', 'summary'}.issubset(df.columns), "JSON must contain 'email' and 'summary' columns."

# hold out 20% of the rows for validation; the fixed seed keeps the split reproducible
train_texts, val_texts, train_summaries, val_summaries = train_test_split(
    df['email'],
    df['summary'],
    test_size=0.2,
    random_state=42,
)

print(f"training: {len(train_texts)}", f"validation: {len(val_texts)}", sep="\n")


training: 96956
validation: 24239


In [5]:
class EmailSummaryDataset(Dataset):
    """Torch dataset pairing each email with its reference summary as fixed-length token tensors.

    Args:
        texts: positionally indexable collection of input emails (a pandas
            Series, or any plain sequence such as a list).
        summaries: reference summaries, aligned position-by-position with ``texts``.
        tokenizer: callable invoked as
            ``tokenizer(text, max_length=..., padding="max_length", truncation=True, return_tensors="pt")``
            and returning a mapping with ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_length: number of tokens every input and every summary is padded/truncated to.
    """

    # initializes the dataset with texts, summaries, a tokenizer, and max token length
    def __init__(self, texts, summaries, tokenizer, max_length=256):
        self.texts = texts
        self.summaries = summaries
        self.tokenizer = tokenizer
        self.max_length = max_length

    # returns the number of examples in the dataset
    def __len__(self):
        return len(self.texts)

    @staticmethod
    def _item_at(collection, idx):
        """Return the element at position ``idx`` for a pandas object or a plain sequence."""
        # pandas objects must be read through .iloc: after train_test_split their
        # index labels are shuffled, so label-based [] lookup would fetch the wrong row
        positional = getattr(collection, "iloc", None)
        return collection[idx] if positional is None else positional[idx]

    def _encode(self, text):
        """Tokenize one string to exactly ``max_length`` tokens, returned as PyTorch tensors."""
        return self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",  # returns tensors in PyTorch format
        )

    # gets one example input text and summary at the given index
    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for example ``idx``, each 1-D."""
        inputs = self._encode(self._item_at(self.texts, idx))
        labels = self._encode(self._item_at(self.summaries, idx))

        # squeeze(0) drops only the leading batch axis added by return_tensors="pt";
        # a bare squeeze() would also collapse the token axis when max_length == 1
        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": labels["input_ids"].squeeze(0),
        }


In [6]:
# method to train and evaluate model given hyperparameters 
def train_and_evaluate_model(learning_rate=5e-5, batch_size=4, num_beams=2, epochs=1):
    """Fine-tune t5-small on the training split, then score it with BLEU on the validation split.

    Reads the notebook-level ``train_texts`` / ``train_summaries`` /
    ``val_texts`` / ``val_summaries`` splits and wraps them in
    ``EmailSummaryDataset``.

    Args:
        learning_rate: learning rate passed to AdamW.
        batch_size: batch size used by both the training and validation loaders.
        num_beams: beam width used by ``model.generate`` during evaluation.
        epochs: number of passes over the training data.

    Returns:
        Tuple ``(avg_train_loss, avg_bleu_score)``: the mean per-batch training
        loss of the last epoch (``nan`` when ``epochs`` is 0) and the mean
        sentence-level BLEU over every validation example.
    """
    # load tokenizer and model
    tokenizer = T5Tokenizer.from_pretrained("t5-small")
    model = T5ForConditionalGeneration.from_pretrained("t5-small")
    device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # prepare datasets and dataloaders
    train_dataset = EmailSummaryDataset(train_texts, train_summaries, tokenizer)
    val_dataset = EmailSummaryDataset(val_texts, val_summaries, tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # set optimizer to adam
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    # defined up front so the return statement still works when epochs == 0
    avg_train_loss = float("nan")

    # part 1, training loop
    for epoch in range(epochs):
        model.train()
        print(f"Epoch {epoch + 1}/{epochs}")

        total_loss = 0.0  # running sum of per-batch losses for this epoch

        for batch in tqdm(train_loader, desc="Training", ncols=100):

            # perform a single training step: calculate the loss and update the weights using AdamW
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # summaries are padded to a fixed length; -100 is the index the loss ignores,
            # so padded positions no longer contribute to the loss or the gradients
            labels = labels.masked_fill(labels == tokenizer.pad_token_id, -100)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # compute training loss
        avg_train_loss = total_loss / len(train_loader)
        print(f"Average Training Loss: {avg_train_loss:.4f}")

    # part 2, evaluation loop
    model.eval()
    total_bleu = 0.0  # running sum of sentence-level BLEU scores
    num_scored = 0    # number of validation examples scored so far
    with torch.no_grad():

        for batch in tqdm(val_loader, desc="Evaluating", ncols=100):

            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            # generate predictions
            outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, num_beams=num_beams, max_length=150, early_stopping=True)

            # decode every prediction and reference in the batch, not just the first one;
            # the references still hold real pad ids here, which skip_special_tokens strips
            predicted_summaries = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            reference_summaries = tokenizer.batch_decode(batch["labels"], skip_special_tokens=True)

            # calculate bleu per example on whitespace tokens
            for predicted_summary, reference_summary in zip(predicted_summaries, reference_summaries):
                total_bleu += sentence_bleu([reference_summary.split()], predicted_summary.split())
                num_scored += 1

    # average over examples actually scored (the last batch may be smaller than batch_size)
    avg_bleu_score = total_bleu / num_scored if num_scored else 0.0
    print(f"average BLEU Score: {avg_bleu_score:.4f}")

    # return bleu and loss
    return avg_train_loss, avg_bleu_score


In [None]:
# configuration 1: baseline (same values as the function's own defaults)
learning_rate, batch_size, num_beams = 5e-5, 4, 2

# fine-tune, then score on the validation split
train_loss, bleu_score = train_and_evaluate_model(
    learning_rate=learning_rate,
    batch_size=batch_size,
    num_beams=num_beams,
)

print(
    "Configuration 1: Default Hyperparameters",
    f"Training Loss: {train_loss:.4f}",
    f"BLEU Score: {bleu_score:.4f}",
    sep="\n",
)

In [None]:
# configuration 2: learning rate raised to 1e-4, other settings as in configuration 1
learning_rate, batch_size, num_beams = 1e-4, 4, 2

# fine-tune, then score on the validation split
train_loss, bleu_score = train_and_evaluate_model(
    learning_rate=learning_rate,
    batch_size=batch_size,
    num_beams=num_beams,
)

print(
    "Configuration 2: Higher Learning Rate",
    f"Training Loss: {train_loss:.4f}",
    f"BLEU Score: {bleu_score:.4f}",
    sep="\n",
)

Epoch 1/1


Training Epoch 1: 100%|████████████████████████████████████| 24239/24239 [16:22:56<00:00,  2.43s/it]


In [None]:
# configuration 3: batch size raised to 8, other settings as in configuration 1
learning_rate, batch_size, num_beams = 5e-5, 8, 2

# fine-tune, then score on the validation split
train_loss, bleu_score = train_and_evaluate_model(
    learning_rate=learning_rate,
    batch_size=batch_size,
    num_beams=num_beams,
)

print(
    "Configuration 3: Larger Batch Size",
    f"Training Loss: {train_loss:.4f}",
    f"BLEU Score: {bleu_score:.4f}",
    sep="\n",
)

In [None]:
# configuration 4: learning rate lowered to 1e-5 and beam width raised to 4
learning_rate, batch_size, num_beams = 1e-5, 4, 4

# fine-tune, then score on the validation split
train_loss, bleu_score = train_and_evaluate_model(
    learning_rate=learning_rate,
    batch_size=batch_size,
    num_beams=num_beams,
)

print(
    "Configuration 4: More Beams, Lower Learning Rate",
    f"Training Loss: {train_loss:.4f}",
    f"BLEU Score: {bleu_score:.4f}",
    sep="\n",
)

In [None]:
"""
commented out so the model does not overwrite 

The below code was used to save the particular model with the best performance to be called from the summarize_t5 output. 
"""

# NOTE(review): the snippet below is intentionally disabled. To re-enable it on a
# fresh kernel, `os` must be imported, and `model` / `tokenizer` must exist at
# notebook scope — train_and_evaluate_model creates them locally and returns only
# (avg_train_loss, avg_bleu_score).
# NOTE(review): this writes to "fine_tuned_summary_t5", while the loading cell
# reads from "saved_t5_summary_model" — confirm which directory is intended.

# # Directory to save the model
# output_dir = "fine_tuned_summary_t5"

# if not os.path.exists(output_dir):
#     os.makedirs(output_dir)

# # Save model and tokenizer
# model.save_pretrained(output_dir)
# tokenizer.save_pretrained(output_dir)

# print(f"Model and tokenizer saved to {output_dir}")


Model and tokenizer saved to fine_tuned_summary_t5


In [None]:
# load the fine-tuned model for testing
# NOTE(review): the save cell writes to "fine_tuned_summary_t5"; confirm this directory name
model = T5ForConditionalGeneration.from_pretrained("saved_t5_summary_model")
tokenizer = T5Tokenizer.from_pretrained("saved_t5_summary_model")

# `device` only exists inside train_and_evaluate_model, so it is selected again here
# (same preference order) to keep this cell runnable on a fresh kernel
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# generate a summary for a test email
test_email = "I'm free for coffee on the first day of the conference, around 3 pm. There's a great little café near the conference venue."
inputs = tokenizer(test_email, return_tensors="pt", max_length=512, truncation=True).to(device)

# pass the attention mask explicitly alongside the input ids
outputs = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=150,
    num_beams=2,
    early_stopping=True,
)

print("Generated Summary:")
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


Generated Summary:
Michael is available for coffee on the first day of the conference, at 3 pm, near the conference venue. He offers to meet for coffee on the first day of the conference and offers a café near the venue.
