In [2]:
%pip install transformers datasets torch rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=8ed5c30dd4f6419c93401fad12226f53b7325f6deaf0567785a1eb4bac39a0a2
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2
Note: you may need to restart the kernel to use updated packages.


In [4]:
import torch
import pandas as pd
from transformers import BartTokenizer, BartForConditionalGeneration, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from datasets import load_metric
from sklearn.model_selection import train_test_split
import torch.nn as nn

# Load dataset from CSV
df = pd.read_csv('/kaggle/input/ft-email-dataset/merged_email_data.csv')

# Custom dataset class
class EmailDataset(Dataset):
    '''
    Custom dataset class for email summarization task

    Args:
        tokenizer: PreTrainedTokenizer for tokenizing the text data
        data: DataFrame containing the text data
        max_length: Maximum length of the input sequence
        summary_length: Length of the target summary sequence

    Returns:
        Dictionary containing the tokenized input sequence and target summary sequence
    '''
    def __init__(self, tokenizer, data, max_length=512, summary_length=128):
        '''
        Initialize the dataset class
        
        Args:
            tokenizer: PreTrainedTokenizer for tokenizing the text data
            data: DataFrame containing the text data
            max_length: Maximum length of the input sequence
            summary_length: Length of the target summary sequence
            
        Returns:
            None
        '''
        self.tokenizer = tokenizer
        self.data = data
        self.max_length = max_length
        self.summary_length = summary_length

    def __len__(self):
        '''
        Return the length of the dataset
        
        Args:
            None
            
        Returns:
            Length of the dataset
        '''
        return len(self.data)

    def __getitem__(self, idx):
        '''
        Return the tokenized input sequence and target summary sequence
        
        Args:
            idx: Index of the dataset
            
        Returns:
            Dictionary containing the tokenized input sequence and target summary sequence
        '''
        item = self.data.iloc[idx]
        thread = item['body']  # Assuming 'body' contains the email threads
        summary = item['summary']
        
        model_input = self.tokenizer(thread, max_length=self.max_length, truncation=True, padding='max_length', return_tensors="pt")
        with self.tokenizer.as_target_tokenizer():
            labels = self.tokenizer(summary, max_length=self.summary_length, truncation=True, padding='max_length', return_tensors="pt")
        
        model_input["labels"] = labels["input_ids"].squeeze()
        
        return {key: val.squeeze() for key, val in model_input.items()}

# Initialize tokenizer
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

# Split dataset into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# Prepare train and validation datasets
train_dataset = EmailDataset(tokenizer, train_df)
val_dataset = EmailDataset(tokenizer, val_df)

# Initialize DataLoader for train and validation sets
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)  # Increased batch size for faster training
val_dataloader = DataLoader(val_dataset, batch_size=8)  # Increased batch size for faster evaluation

# Initialize BART model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bart_model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')
bart_model.to(device)

# Evaluation function with ROUGE scores
def evaluate(model, dataloader):
    '''
    Evaluate the model on the validation set using ROUGE scores
    
    Args:
        model: PreTrainedModel for evaluation
        dataloader: DataLoader for the validation set
        
    Returns:
        Dictionary containing ROUGE scores
    '''
    rouge = load_metric("rouge")
    model.eval()
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        with torch.no_grad():
            generated_ids = model.generate(input_ids, attention_mask=attention_mask, max_length=128, num_beams=4, early_stopping=True)
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in generated_ids]
            references = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in labels]
            
            rouge.add_batch(predictions=preds, references=references)

    result = rouge.compute()
    return {key: value.mid.fmeasure * 100 for key, value in result.items()}

# Evaluate before fine-tuning
print("Evaluating before fine-tuning...")
before_scores = evaluate(bart_model, val_dataloader)
print(before_scores)

# Prepare optimizer and scheduler
optimizer = AdamW(bart_model.parameters(), lr=5e-5)
epochs = 5
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Fine-tuning
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    bart_model.train()
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{epochs}"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = bart_model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

# Evaluate after fine-tuning
print("Evaluating after fine-tuning...")
after_scores = evaluate(bart_model, val_dataloader)
print(after_scores)

print("Model evaluation and fine-tuning complete.")

# Save the fine-tuned model
torch.save(bart_model.state_dict(), '/kaggle/working/fine-tuned_bart_model.pt')

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Evaluating before fine-tuning...


Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]



{'rouge1': 20.1922447996289, 'rouge2': 6.297088181855775, 'rougeL': 13.43923420820447, 'rougeLsum': 13.452363592347027}
Epoch 1/5




Epoch 1/5:   0%|          | 0/2440 [00:00<?, ?it/s]

Epoch 2/5


Epoch 2/5:   0%|          | 0/2440 [00:00<?, ?it/s]

Epoch 3/5


Epoch 3/5:   0%|          | 0/2440 [00:00<?, ?it/s]

Epoch 4/5


Epoch 4/5:   0%|          | 0/2440 [00:00<?, ?it/s]

Epoch 5/5


Epoch 5/5:   0%|          | 0/2440 [00:00<?, ?it/s]

Evaluating after fine-tuning...
{'rouge1': 44.83073283674907, 'rouge2': 24.159861008906237, 'rougeL': 32.51058352181336, 'rougeLsum': 32.52312024917855}
Model evaluation and fine-tuning complete.
