### Necessary Installations

### Importing the Libraries

In [2]:
import torch
import pandas as pd
from transformers import BartTokenizer, BartForConditionalGeneration, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from datasets import load_metric
from sklearn.model_selection import train_test_split
import torch.nn as nn
from bert_score import score as bert_score
from tqdm.auto import tqdm
from transformers import logging as hf_logging
from random import sample

# Set Transformers logger to error only to suppress warnings
hf_logging.set_verbosity_error()

In [3]:
# Load dataset from CSV
df = pd.read_csv('/kaggle/input/emaildata/merged_email_data.csv')

# Custom dataset class
class EmailDataset(Dataset):
    """
    A custom PyTorch Dataset for email summarization.

    Attributes:
        tokenizer (PreTrainedTokenizer): Tokenizer for processing text.
        data (DataFrame): DataFrame containing the dataset.
        max_length (int): Maximum length of the tokenized input text.
        summary_length (int): Maximum length of the tokenized summary text.
    """
    
    def __init__(self, tokenizer, data, max_length=512, summary_length=128):
        """
        Initializes the Dataset object with data and configuration.
        """
        
        self.tokenizer = tokenizer
        self.data = data
        self.max_length = max_length
        self.summary_length = summary_length

    def __len__(self):
        """Returns the number of items in the dataset."""
        
        return len(self.data)

    def __getitem__(self, idx):
        """
        Retrieves an item by its index.

        Args:
            idx (int): Index of the item to retrieve.

        Returns:
            A dictionary containing input_ids, attention_mask, and labels for the model.
        """
        
        item = self.data.iloc[idx]
        thread = item['body'] # Email thread text.
        summary = item['summary'] # Summary text.
        
        # Tokenize the email thread.
        model_input = self.tokenizer(thread, max_length=self.max_length, truncation=True, padding='max_length', return_tensors="pt")
        
        # Tokenize the summary.
        with self.tokenizer.as_target_tokenizer():
            labels = self.tokenizer(summary, max_length=self.summary_length, truncation=True, padding='max_length', return_tensors="pt")
        
        model_input["labels"] = labels["input_ids"].squeeze()
        
        return {key: val.squeeze() for key, val in model_input.items()}

# Initialize the tokenizer for BART.
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

# Split dataset into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# Initialize the custom Dataset objects for training and validation.
train_dataset = EmailDataset(tokenizer, train_df)
val_dataset = EmailDataset(tokenizer, val_df)
print("Prepared Dataset")

# Initialize DataLoader objects for batch processing of training and validation datasets.
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)  # Increased batch size for faster training
val_dataloader = DataLoader(val_dataset, batch_size=8)  # Increased batch size for faster evaluation

# Prepare the BART model and move it to the appropriate device (GPU or CPU).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bart_model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')
bart_model.to(device)

# Evaluation function with ROUGE scores
def evaluate(model, dataloader, device):
    """
    Evaluates the model on a given dataset using ROUGE and BERTScore metrics.

    Args:
        model (PreTrainedModel): The model to evaluate.
        dataloader (DataLoader): DataLoader providing the dataset for evaluation.
        device (torch.device): The device to run the evaluation on.

    Returns:
        A tuple of dictionaries containing ROUGE scores and BERT scores.
    """
    
    rouge = load_metric("rouge")
    bert_scores = {"precision": [], "recall": [], "f1": []}  # To store BERTScore
    model.eval() # Set the model to evaluation mode.
    
    progress_bar = tqdm(dataloader, desc="Evaluating", leave=False)
    
    all_preds = []
    all_references = []

    for batch in progress_bar:
        # Move batch to the specified device.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        with torch.no_grad():
            # Generate summaries.
            generated_ids = model.generate(input_ids, attention_mask=attention_mask, max_length=128, num_beams=4, early_stopping=True)
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in generated_ids]
            references = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in labels]
            
            all_preds.extend(preds)
            all_references.extend(references)

            # Add predictions and references to ROUGE for batch evaluation.
            rouge.add_batch(predictions=preds, references=references)

    # BERTScore evaluation on a random subset to manage computation time.
    sample_size = 100  # Number of samples to evaluate BERTScore on.
    if len(all_preds) > sample_size:
        sampled_indices = sample(range(len(all_preds)), sample_size)
        sampled_preds = [all_preds[i] for i in sampled_indices]
        sampled_references = [all_references[i] for i in sampled_indices]
    else:
        sampled_preds, sampled_references = all_preds, all_references

    # Calculate BERTScore for the sampled predictions and references.
    P, R, F1 = bert_score(sampled_preds, sampled_references, lang="en", verbose=False)
    bert_scores["precision"].extend(P.tolist())
    bert_scores["recall"].extend(R.tolist())
    bert_scores["f1"].extend(F1.tolist())

    # Calculate average BERT scores.
    rouge_result = rouge.compute()
    rouge_scores = {key: value.mid.fmeasure * 100 for key, value in rouge_result.items()}

    avg_bert_scores = {
        "precision": sum(bert_scores["precision"]) / len(bert_scores["precision"]),
        "recall": sum(bert_scores["recall"]) / len(bert_scores["recall"]),
        "f1": sum(bert_scores["f1"]) / len(bert_scores["f1"]),
    }

    return rouge_scores, avg_bert_scores


# Perform evaluation before fine-tuning to establish a baseline.
print("Evaluating before fine-tuning...")
before_rouge_scores, before_bert_scores = evaluate(bart_model, val_dataloader, device)
print("ROUGE Scores:", before_rouge_scores)
print("BERT Scores:", before_bert_scores)


# Prepare the optimizer and learning rate scheduler for fine-tuning.
optimizer = AdamW(bart_model.parameters(), lr=5e-5)
epochs = 5
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)


# Fine-tuning loop.
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    bart_model.train() # Set the model to training mode.
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{epochs}"):
        optimizer.zero_grad() # Clear previous gradients.
        
        # Move batch to the specified device.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass: compute predictions and loss.
        outputs = bart_model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward() # Backward pass: compute gradient of the loss with respect to model parameters.
        optimizer.step() # Update parameters.
        scheduler.step() # Update learning rate schedule.

        
# Perform evaluation after fine-tuning to see improvements.
print("Evaluating after fine-tuning...")
after_rouge_scores, after_bert_scores = evaluate(bart_model, val_dataloader, device)
print("ROUGE Scores:", after_rouge_scores)
print("BERT Scores:", after_bert_scores)

print("Model evaluation and fine-tuning complete.")


# Save the fine-tuned model
torch.save(bart_model.state_dict(), '/kaggle/working/fine-tuned_bart_model.pt')

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Prepared Dataset


model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Evaluating before fine-tuning...


Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

Evaluating:   0%|          | 0/272 [00:00<?, ?it/s]



tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

ROUGE Scores: {'rouge1': 20.182589220382653, 'rouge2': 6.30270462679845, 'rougeL': 13.441462805083676, 'rougeLsum': 13.451910632836627}
BERT Scores: {'precision': 0.7851705265045166, 'recall': 0.831574295759201, 'f1': 0.8070542371273041}
Epoch 1/5




Epoch 1/5:   0%|          | 0/2440 [00:00<?, ?it/s]

Epoch 2/5


Epoch 2/5:   0%|          | 0/2440 [00:00<?, ?it/s]

Epoch 3/5


Epoch 3/5:   0%|          | 0/2440 [00:00<?, ?it/s]

Epoch 4/5


Epoch 4/5:   0%|          | 0/2440 [00:00<?, ?it/s]

Epoch 5/5


Epoch 5/5:   0%|          | 0/2440 [00:00<?, ?it/s]

Evaluating after fine-tuning...


Evaluating:   0%|          | 0/272 [00:00<?, ?it/s]

ROUGE Scores: {'rouge1': 44.502404179489844, 'rouge2': 23.63224348670577, 'rougeL': 32.01880399850611, 'rougeLsum': 32.043367763927286}
BERT Scores: {'precision': 0.8973617559671402, 'recall': 0.8901673817634582, 'f1': 0.893643033504486}
Model evaluation and fine-tuning complete.
