In [2]:
# Setup

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
print(f"Device: {device}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")


  from .autonotebook import tqdm as notebook_tqdm


Using device: cpu
Device: cpu
CUDA available: False


In [3]:
# Dataset Class

class EmailSpamDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=512):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        row = self.df.iloc[index]
        text = f"Subject: {row['subject']} [SEP] Body: {row['body']}"

        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'url_feature': torch.tensor(row['urls'], dtype=torch.float32),
            'label': torch.tensor(row['label'], dtype=torch.long)
        }
    

In [5]:
# Transformer Model

class SpamDetectionTransformer(nn.Module):
    def __init__(self, model_name='distilbert-base-uncased', num_classes=2, dropout=0.3):
        super(SpamDetectionTransformer, self).__init__()

        self.transformer = AutoModel.from_pretrained(model_name)
        self.hidden_size = self.transformer.config.hidden_size

        self.feature_fc = nn.Linear(1, 32)

        self.classifier = nn.Sequential(
            nn.Linear(self.hidden_size + 32, 256),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, num_classes)
        )

    def forward(self, input_ids, attention_mask, url_feature):
        outputs = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        pooled_output = outputs.last_hidden_state[:, 0, :]

        url_embedding = torch.relu(self.feature_fc(url_feature.unsqueeze(1)))
        combined = torch.cat([pooled_output, url_embedding], dim=1)

        logits = self.classifier(combined)
        return logits
    

In [6]:
# Training Function

def train_epoch(model, dataloader, optimizer, scheduler, criterion, device):
    model.train()
    total_loss = 0
    predictions, true_labels = [], []

    progress_bar = tqdm(dataloader, desc='Training')
    for batch in progress_bar:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        url_feature = batch['url_feature'].to(device)
        labels = batch['label'].to(device)

        logits = model(input_ids, attention_mask, url_feature)

        loss = criterion(logits, labels)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        predictions.extend(preds)
        true_labels.extend(labels.cpu().numpy())

        progress_bar.set_postfix({'loss': loss.item()})

    avg_loss = total_loss / len(dataloader)
    accuracy = accuracy_score(true_labels, predictions)

    return avg_loss, accuracy


In [7]:
# Evaluation Function

def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    predictions, true_labels = [], []

    with torch.no_grad():
        progress_bar = tqdm(dataloader, desc='Evaluating')
        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            url_feature = batch['url_feature'].to(device)
            labels = batch['label'].to(device)

            logits = model(input_ids, attention_mask, url_feature)
            loss = criterion(logits, labels)

            total_loss += loss.item()
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            predictions.extend(preds)
            true_labels.extend(labels.cpu().numpy())

    avg_loss = total_loss / len(dataloader)
    accuracy = accuracy_score(true_labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, predictions, average='binary'
    )

    return avg_loss, accuracy, precision, recall, f1, predictions, true_labels


In [8]:
# Preperation of data and model

MODEL_NAME = 'distilbert-base-uncased'
MAX_LENGTH = 512
BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 2e-5

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

CEAS_08 = pd.read_csv("datasets/CEAS_08.csv", encoding="utf-8", on_bad_lines="skip")
ENRON = pd.read_csv("datasets/Enron.csv", encoding="utf-8", on_bad_lines="skip")
LING = pd.read_csv("datasets/Ling.csv", encoding="utf-8", on_bad_lines="skip")
NAZARIO = pd.read_csv("datasets/Nazario.csv", encoding="utf-8", on_bad_lines="skip")
NAZARIO_5 = pd.read_csv("datasets/Nazario_5.csv", encoding="utf-8", on_bad_lines="skip")
NIGERIAN_FRAUD = pd.read_csv("datasets/Nigerian_Fraud.csv", encoding="utf-8", on_bad_lines="skip")
NIGERIAN_5 = pd.read_csv("datasets/Nigerian_5.csv", encoding="utf-8", on_bad_lines="skip")
SPAMASSASSIN = pd.read_csv("datasets/SpamAssasin.csv", encoding="utf-8", on_bad_lines="skip")

dfs = [CEAS_08, ENRON, LING, NAZARIO, NAZARIO_5, NIGERIAN_FRAUD, NIGERIAN_5, SPAMASSASSIN]
df = pd.concat(dfs, axis=0, ignore_index=True)

print("Cleaning Data")

print(f"Filling {df['sender'].isnull().sum()} empty senders")
df['sender'] = df['sender'].fillna("[NO_SENDER]")

print("Removing receiver (not useful for classification)")
df = df.drop('sender', axis=1)
df = df.drop('receiver', axis=1)
df = df.drop('date', axis=1)

""" df['date'] = pd.to_datetime(df['date'], errors="coerce", utc=True)
date_missing = df['date'].isnull().sum()
print(f"Filling {date_missing} dates")
if date_missing > 0:
    median_date = df['date'].median()
    df['date'] = df['date'].fillna(median_date) """

both_missing = df['subject'].isnull() & df['body'].isnull()
print(f"Dropping {both_missing.sum()} rows with both subject and body missing")
if both_missing.sum() > 0:
    print("Dropping rows with no text content at all...")
    df = df[~both_missing]

print(f"Filling {df['subject'].isnull().sum()} empty subjects")
df['subject'] = df['subject'].fillna('[NO_SUBJECT]')

print(f"Filling {df['body'].isnull().sum()} empty bodies")
df['body'] = df['body'].fillna('[NO_BODY]')

print(f"Filling {df['urls'].isnull().sum()} empty URLs")
df['urls'] = df['urls'].apply(
    lambda x: 0 if x == '[]' or x == 0 else 1
)

train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42, stratify=df['label'])
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df['label'])

train_dataset = EmailSpamDataset(train_df, tokenizer, MAX_LENGTH)
val_dataset = EmailSpamDataset(val_df, tokenizer, MAX_LENGTH)
test_dataset = EmailSpamDataset(test_df, tokenizer, MAX_LENGTH)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

model = SpamDetectionTransformer(model_name=MODEL_NAME)
model = model.to(device)

print(f"Model loaded with {sum(p.numel() for p in model.parameters())} parameters")


Cleaning Data
Filling 33297 empty senders
Removing receiver (not useful for classification)
Dropping 0 rows with both subject and body missing
Filling 503 empty subjects
Filling 1 empty bodies
Filling 32626 empty URLs
Model loaded with 66601154 parameters


In [None]:
# Preperation for training

criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.1)

total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps
)


In [None]:
# Training Loop

best_val_f1 = 0
train_losses, val_losses = [], []
train_accs, val_accs = [], []

for epoch in range(EPOCHS):
    print(f"{'='*50}")
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    print(f"{'='*50}")

    # Training
    train_loss, train_acc = train_epoch(
        model, train_loader, optimizer, scheduler, criterion, device
    )

    # Validation
    val_loss, val_acc, val_prec, val_rec, val_f1, _, _ = evaluate(
        model, val_loader, criterion, device
    )

    train_losses.append(train_loss)
    val_losses.append(val_loss)
    train_accs.append(train_acc)
    val_accs.append(val_acc)

    print(f"\nTrain Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
    print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")
    print(f"Val Precision: {val_prec:.4f} | Val Recall: {val_rec:.4f} | Val F1: {val_f1:.4f}")

    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        torch.save(model.state_dict(), 'models/best_spam_model.pt')
        print(f"✓ Saved best model with F1: {val_f1:.4f}")


In [None]:
# Evaluation

if device.type == 'cpu':
    model.load_state_dict(torch.load('models/best_spam_model.pt', map_location=torch.device('cpu')))
else:
    model.load_state_dict(torch.load('models/best_spam_model.pt'))

test_loss, test_acc, test_prec, test_rec, test_f1, predictions, true_labels = evaluate(model, test_loader, criterion, device)

print(f"{'='*50}")
print(f"Test Set Results")
print(f"{'='*50}")
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")
print(f"Test Precision: {test_prec:.4f}")
print(f"Test Recall: {test_rec:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")

print(f"\nConfusion Matrix:")
cm = confusion_matrix(true_labels, predictions)
print(cm)

print(f"\nClassification Report")
print(classification_report(true_labels, predictions, target_names=['Ham', 'Spam']))

Evaluating: 100%|██████████| 647/647 [17:32<00:00,  1.63s/it]

Test Set Results
Test Loss: 0.0100
Test Accuracy: 0.9976
Test Precision: 0.9976
Test Recall: 0.9978
Test F1 Score: 0.9977

Confusion Matrix:
[[4953   13]
 [  12 5361]]

Classification Report
              precision    recall  f1-score   support

         Ham       1.00      1.00      1.00      4966
        Spam       1.00      1.00      1.00      5373

    accuracy                           1.00     10339
   macro avg       1.00      1.00      1.00     10339
weighted avg       1.00      1.00      1.00     10339






: 