In [2]:
# Import dependencies
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.utils.data import DataLoader, Dataset
import torch
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split


In [None]:
# define dataset class
class EmailSummaryDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        email = row['email']
        summary = row['summary']

        input_text = f"Email: {email} Summary:"
        target_text = f"{summary}"

        input_encodings = self.tokenizer(
            input_text, max_length=self.max_length, truncation=True, padding="max_length", return_tensors="pt"
        )
        target_encodings = self.tokenizer(
            target_text, max_length=self.max_length, truncation=True, padding="max_length", return_tensors="pt"
        )

        return {
            "input_ids": input_encodings["input_ids"].squeeze(),
            "attention_mask": input_encodings["attention_mask"].squeeze(),
            "labels": target_encodings["input_ids"].squeeze(),
        }

# load the JSON dataset
file_path = "/Users/owendolan/Desktop/nlp-email-client/backend/data/email_summaries_data.json"
data = pd.read_json(file_path, lines=True)  # Adjust filename
train_data, val_data = train_test_split(data, test_size=0.1)

# initialize tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 does not have a padding token, so use EOS

# create datasets
train_dataset = EmailSummaryDataset(train_data, tokenizer)
val_dataset = EmailSummaryDataset(val_data, tokenizer)


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

In [None]:
# create data-loaders
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=4)


In [None]:
# load pre-trained GPT-2 model
model = GPT2LMHeadModel.from_pretrained("distilgpt2")
model.resize_token_embeddings(len(tokenizer))  # Adjust token embeddings for new pad_token

# move model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)


model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
# training setup
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
num_epochs = 1
gradient_accumulation_steps = 8  # Accumulate gradients to simulate larger batch sizes
max_grad_norm = 1.0

# training loop
model.train()
for epoch in range(num_epochs):
    epoch_loss = 0
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")

    for batch_idx, batch in enumerate(progress_bar):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss = loss / gradient_accumulation_steps  # Normalize loss for gradient accumulation
        loss.backward()

        # Gradient clipping and optimizer step
        if (batch_idx + 1) % gradient_accumulation_steps == 0 or (batch_idx + 1) == len(train_dataloader):
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            optimizer.zero_grad()

        epoch_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())

    print(f"Epoch {epoch+1} finished. Loss: {epoch_loss / len(train_dataloader)}")


Epoch 1/1: 100%|██████████| 27269/27269 [20:27:17<00:00,  2.70s/it, loss=0.0685]   

Epoch 1 finished. Loss: 0.08955101794531983





In [None]:
# model validation loop
model.eval()
val_loss = 0

with torch.no_grad():
    for batch in tqdm(val_dataloader, desc="Validating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        val_loss += loss.item()

print(f"Validation Loss: {val_loss / len(val_dataloader)}")


Validating: 100%|██████████| 3030/3030 [38:47<00:00,  1.30it/s]

Validation Loss: 0.6914495435976746





In [None]:
# save the fine tuned model
output_dir = "./gpt2-finetuned-email-summary"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Model saved to {output_dir}")


Model saved to ./gpt2-finetuned-email-summary


In [None]:
"""
TEST: LOAD MODEL IN 
"""

# fine-tuned model path
model_path = "./gpt2-finetuned-email-summary"

# load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# Set model to evaluation mode
model.eval()


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
# test input
test_input = "we should eat yogurt, what do you think? I love sailing"
inputs = tokenizer(test_input, return_tensors="pt") # tokenize

# model output test
with torch.no_grad():
    outputs = model.generate(
        inputs["input_ids"],
        max_length=100,  # Maximum length of the generated text
        num_beams=5,    # Beam search for better results
        no_repeat_ngram_size=2,  # Avoid repetition
        early_stopping=True      # Stop generating early if end is reached
    )

# decode and print the generated text
output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Input:", test_input)
print("Generated Output:", output_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Input: we should eat yogurt, what do you think? I love sailing
Generated Output: we should eat yogurt, what do you think? I love sailing. and.. to.
