In [2]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import torch
import pandas as pd
import os
from tqdm import tqdm


In [4]:
# Read the newline-delimited JSON export (one email/summary record per line).
file_path = "data/email_summaries_data.json"
df = pd.read_json(file_path, lines=True)

# Fail fast if either required column is absent from the loaded frame.
assert {'email', 'summary'}.issubset(df.columns), "JSON must contain 'email' and 'summary' columns."

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
train_texts, val_texts, train_summaries, val_summaries = train_test_split(
    df['email'],
    df['summary'],
    random_state=42,
    test_size=0.2,
)

# Report how many examples ended up in each split.
print(f"Training size: {len(train_texts)}")
print(f"Validation size: {len(val_texts)}")


Training size: 96956
Validation size: 24239


In [18]:
class EmailSummaryDataset(Dataset):
    """Map-style dataset that tokenizes (email, summary) pairs on access.

    Args:
        texts: Positionally indexable (``.iloc``) collection of source emails,
            e.g. a pandas Series.
        summaries: Collection of target summaries, aligned with ``texts`` and
            indexable the same way.
        tokenizer: Callable tokenizer that accepts ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors`` and returns a mapping with
            ``"input_ids"`` and ``"attention_mask"`` tensors of shape
            ``(1, max_length)``.
        max_length: Length every encoded sequence is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``summaries`` differ in length.
    """

    # Target value written over padding positions so the loss skips them
    # (-100 is the default ``ignore_index`` of PyTorch's cross-entropy loss).
    IGNORE_INDEX = -100

    def __init__(self, texts, summaries, tokenizer, max_length=256):
        if len(texts) != len(summaries):
            raise ValueError(
                f"texts and summaries must have the same length, "
                f"got {len(texts)} and {len(summaries)}."
            )
        self.texts = texts
        self.summaries = summaries
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of (email, summary) pairs."""
        return len(self.texts)

    def _encode(self, value):
        """Tokenize one string, padded and truncated to ``max_length``."""
        return self.tokenizer(
            value,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

    def __getitem__(self, idx):
        """Return the encoded example at position ``idx``.

        Returns:
            dict with 1-D tensors of length ``max_length``: ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"``. Padding positions in
            ``"labels"`` hold ``IGNORE_INDEX`` instead of the pad token id.
        """
        source = self._encode(self.texts.iloc[idx])
        target = self._encode(self.summaries.iloc[idx])

        # squeeze(0) removes only the batch axis added by return_tensors="pt";
        # a bare squeeze() would also collapse the sequence axis when it is 1.
        labels = target["input_ids"].squeeze(0).clone()
        # Without this mask the loss would also be computed on padding tokens.
        labels[target["attention_mask"].squeeze(0) == 0] = self.IGNORE_INDEX

        return {
            "input_ids": source["input_ids"].squeeze(0),
            "attention_mask": source["attention_mask"].squeeze(0),
            "labels": labels,
        }


In [19]:
# Tokenizer that matches the pretrained "t5-small" checkpoint.
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Wrap each split in the dataset that tokenizes examples on access.
train_dataset = EmailSummaryDataset(
    texts=train_texts, summaries=train_summaries, tokenizer=tokenizer
)
val_dataset = EmailSummaryDataset(
    texts=val_texts, summaries=val_summaries, tokenizer=tokenizer
)

# Mini-batch size shared by both loaders.
batch_size = 4
# Only the training loader shuffles; the validation loader does not.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(dataset=val_dataset, shuffle=False, batch_size=batch_size)


In [4]:
# Load the pretrained "t5-small" checkpoint that will be fine-tuned.
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Pick the best available backend: CUDA GPU first, then Apple MPS, then CPU.
# The getattr guard covers PyTorch builds that do not ship torch.backends.mps.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
model.to(device)

# AdamW over all model parameters with a fixed learning rate.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Training configuration
epochs = 1          # number of full passes over train_loader
log_interval = 10   # report the training loss every this many steps


In [None]:
# Fine-tune the model: one optimizer step per mini-batch, for `epochs` passes
# over train_loader. The mean loss of the last `log_interval` steps is shown
# on the progress bar so convergence can be monitored during the run.
for epoch in range(epochs):
    model.train()
    print(f"Epoch {epoch + 1}/{epochs}")

    progress = tqdm(train_loader, desc=f"Training Epoch {epoch + 1}", ncols=100)
    # Accumulate on the training device; converting to a Python float only at
    # logging time avoids a host/device sync on every step.
    running_loss = torch.zeros((), device=device)

    for step, batch in enumerate(progress):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Passing labels makes the model return the training loss directly.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.detach()
        if (step + 1) % log_interval == 0:
            progress.set_postfix(loss=f"{(running_loss / log_interval).item():.4f}")
            running_loss.zero_()



Epoch 1/1


Training Epoch 1: 100%|████████████████████████████████████| 24239/24239 [16:22:56<00:00,  2.43s/it]


In [None]:
"""
commented out so the model does not overwrite 
"""

# Disabled save step: everything below is commented out, so running this cell
# writes nothing to disk.
# NOTE(review): this code targets "fine_tuned_summary_t5", while the later test
# cell loads from "saved_t5_summary_model" - confirm which directory holds the
# checkpoint before re-enabling. Any "saved to ..." output shown under this cell
# cannot come from the code as written (the print is commented out).

# # Directory to save the model
# output_dir = "fine_tuned_summary_t5"

# if not os.path.exists(output_dir):
#     os.makedirs(output_dir)

# # Save model and tokenizer
# model.save_pretrained(output_dir)
# tokenizer.save_pretrained(output_dir)

# print(f"Model and tokenizer saved to {output_dir}")


Model and tokenizer saved to fine_tuned_summary_t5


In [5]:
# Reload the fine-tuned checkpoint and its tokenizer from disk for a smoke test.
# NOTE(review): the (disabled) save cell writes to "fine_tuned_summary_t5", not
# to this directory - confirm that "saved_t5_summary_model" is the intended checkpoint.
model = T5ForConditionalGeneration.from_pretrained("saved_t5_summary_model")
tokenizer = T5Tokenizer.from_pretrained("saved_t5_summary_model")

# `device` comes from the model-setup cell, which must have been run first.
model.to(device)

# Generate a summary for a test email
test_email = "I'm free for coffee on the first day of the conference, around 3 pm. There's a great little café near the conference venue."
inputs = tokenizer(test_email, return_tensors="pt", max_length=512, truncation=True).to(device)

# Pass the attention mask along with the input ids so generation stays correct
# if this cell is reused with padded (batched) inputs.
outputs = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=150,
    num_beams=2,
    early_stopping=True,
)

print("Generated Summary:")
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


Generated Summary:
Michael is available for coffee on the first day of the conference, at 3 pm, near the conference venue. He offers to meet for coffee on the first day of the conference and offers a café near the venue.
