## Fine tune + train GPT2-distilled 
### the following code was run in a google colab for better performance

In [5]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import sentence_bleu
from tqdm import tqdm
import torch
import pandas as pd
import nltk

nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/owendolan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
### USE THE FOLLOWING PATH IF ON COLAB
"""
from google.colab import drive
drive.mount('/content/drive')
file_path = '/content/drive/MyDrive/email_summaries_data.json'
df = pd.read_json(file_path, lines=True)
"""

### USE THE FOLLOWING IF RUNNING LOCALLY
file_path = "../../data/email_summaries_data.json"
df = pd.read_json(file_path, lines=True)

# ensure json has the correct format
assert 'email' in df.columns and 'summary' in df.columns, "JSON must contain 'email' and 'summary' columns."

# split the data into training and validation sets
train_texts, val_texts, train_summaries, val_summaries = train_test_split(df['email'], df['summary'], test_size=0.2, random_state=42)

print(f"Training samples: {len(train_texts)}")
print(f"Validation samples: {len(val_texts)}")

Training samples: 96956
Validation samples: 24239


In [16]:
class EmailSummaryDataset(Dataset):

    # initializes the dataset with texts, summaries, a tokenizer, and max token length
    def __init__(self, texts, summaries, tokenizer, max_length=256):
        self.texts = texts
        self.summaries = summaries
        self.tokenizer = tokenizer
        self.max_length = max_length

    # returns the number of examples in the dataset
    def __len__(self):
        return len(self.texts)

    # gets one example input text and summary at the given index
    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        summary = self.summaries.iloc[idx]

        # concatenate text and summary as a single sequence for GPT-2
        combined = f"Summarize: {text} Summary: {summary}"

        # tokenizes the summary similar to above with input
        inputs = self.tokenizer(
            combined,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # returns a dictionary containing input IDs, attention masks, and labels
        return {
            "input_ids": inputs["input_ids"].squeeze(),
            "attention_mask": inputs["attention_mask"].squeeze()
        }


In [17]:
# method to train and evaluate model given hyperparameters 
def train_and_evaluate_model(learning_rate=5e-5, batch_size=4, max_length=256, epochs=1):

    # load tokenizer and model
    tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
    model = GPT2LMHeadModel.from_pretrained("distilgpt2")
    
    # set padding token and adjust padding side -- this fixes an issue with inputs being wrong size (not sure why this is)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"  # Correct padding for decoder-only models
    model.config.pad_token_id = tokenizer.pad_token_id
    
    # resize the token embeddings
    model.resize_token_embeddings(len(tokenizer))
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # setup the datasets
    train_dataset = EmailSummaryDataset(train_texts, train_summaries, tokenizer, max_length)
    val_dataset = EmailSummaryDataset(val_texts, val_summaries, tokenizer, max_length)

    # setup the data loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # set optimizer to adam
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    # start training loop
    model.train()
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        total_loss = 0

        # perform a single training step calculate the loss and update the weights using Adam
        for batch in tqdm(train_loader, desc="Training", ncols=100):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            # calculate outputs and loss 
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
            loss = outputs.loss
            total_loss += loss.item()

            # perform back propigation to update values
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # calculate training loss 
        avg_train_loss = total_loss / len(train_loader)
        print(f"Average Training Loss: {avg_train_loss:.4f}")

    # run model evaluation loop
    model.eval()

    total_bleu = 0 # set bleu score
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Evaluating", ncols=100):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            # generate summaries
            outputs = model.generate(
                input_ids=input_ids, 
                attention_mask=attention_mask, 
                max_new_tokens=50  
            )
            predicted_summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

            # gets reference summary + calculates BLEU
            reference_summary = tokenizer.decode(input_ids[0], skip_special_tokens=True)
            bleu_score = sentence_bleu([reference_summary.split()], predicted_summary.split())
            total_bleu += bleu_score

    # print bleu score 
    avg_bleu_score = total_bleu / len(val_loader)
    print(f"Average BLEU Score: {avg_bleu_score:.4f}")

    return avg_train_loss, avg_bleu_score # return values


In [None]:
# configuration 1:

learning_rate = 5e-5
batch_size = 4
max_length = 256
epochs = 1

train_loss, bleu_score = train_and_evaluate_model(learning_rate, batch_size, max_length, epochs)
print(f"Configuration 1: Default Hyperparameters")
print(f"Training Loss: {train_loss:.4f}")
print(f"BLEU Score: {bleu_score:.4f}")

In [None]:
# configuration 2:

learning_rate = 1e-4
batch_size = 4
max_length = 256
epochs = 1

train_loss, bleu_score = train_and_evaluate_model(learning_rate, batch_size, max_length, epochs)
print(f"Configuration 2: Higher Learning Rate")
print(f"Training Loss: {train_loss:.4f}")
print(f"BLEU Score: {bleu_score:.4f}")

NameError: name 'train_and_evaluate_model' is not defined

In [None]:
# configuration 3: 

learning_rate = 5e-5
batch_size = 8
max_length = 256
epochs = 1

train_loss, bleu_score = train_and_evaluate_model(learning_rate, batch_size, max_length, epochs)
print(f"Configuration 3: Larger Batch Size")
print(f"Training Loss: {train_loss:.4f}")
print(f"BLEU Score: {bleu_score:.4f}")

In [None]:
# configuration 4:

learning_rate = 5e-5
batch_size = 4
max_length = 512
epochs = 1

train_loss, bleu_score = train_and_evaluate_model(learning_rate=5e-5, batch_size=4, max_length=512, epochs=1)
print(f"Configuration 4: Longer Sequences")
print(f"Training Loss: {train_loss:.4f}")
print(f"BLEU Score: {bleu_score:.4f}")

In [None]:
"""
commented out so the model does not overwrite 

The below code was used to save the particular model with the best performance to be called from the summarize_t5 output. 
"""

# output_dir = "./gpt2-finetuned-email-summary"
# model.save_pretrained(output_dir)
# tokenizer.save_pretrained(output_dir)
# print(f"Model saved to {output_dir}")


Model saved to ./gpt2-finetuned-email-summary


In [None]:
"""
TEST: LOAD MODEL IN 
"""

# fine-tuned model path
model_path = "./gpt2-finetuned-email-summary"

# load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# Set model to evaluation mode
model.eval()


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
# test input
test_input = "we should eat yogurt, what do you think? I love sailing"
inputs = tokenizer(test_input, return_tensors="pt") # tokenize

# model output test
with torch.no_grad():
    outputs = model.generate(
        inputs["input_ids"],
        max_length=100,  # Maximum length of the generated text
        num_beams=5,    # Beam search for better results
        no_repeat_ngram_size=2,  # Avoid repetition
        early_stopping=True      # Stop generating early if end is reached
    )

# decode and print the generated text
output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Input:", test_input)
print("Generated Output:", output_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Input: we should eat yogurt, what do you think? I love sailing
Generated Output: we should eat yogurt, what do you think? I love sailing. and.. to.
