In [1]:
import gc
import json
import os
import re

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
#from tqdm import tqdm

# Seed the NumPy and PyTorch global RNGs so that weight initialisation and
# DataLoader shuffling start from the same state on every fresh run.
# The value matches the random_state passed to train_test_split below.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Prefer the GPU when one is visible; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [2]:
CSV_PATH = "unified_phishing_dataset.csv"

# Load the raw e-mail table from disk.
df = pd.read_csv(CSV_PATH)

print("Raw columns:", df.columns.tolist())
print(df.head())

# Keep only what we need and drop rows with missing text or label.
# The explicit .copy() makes the result an independent frame, so the column
# assignments below never write into a slice of the raw table (which is what
# triggers pandas' SettingWithCopyWarning).
df = (
    df[["subject", "body", "label"]]
    .dropna(subset=["subject", "body", "label"])
    .copy()
)

# Combine subject + body into one text field
df["text"] = df["subject"].astype(str) + " " + df["body"].astype(str)

# Make sure labels are stored as Python-compatible ints
df["label"] = df["label"].astype(int)

print("Dataset size:", len(df))
print(df[["text", "label"]].head())

Raw columns: ['email_id', 'sender', 'receiver', 'date', 'subject', 'body', 'urls', 'label', 'source_dataset']
    email_id                                             sender  \
0  CEAS_08_0                   young esposito <young@iworld.de>   
1  CEAS_08_1                       mok <ipline's1983@icable.ph>   
2  CEAS_08_2  daily top 10 <karmandeep-opengevl@universalnet...   
3  CEAS_08_3                 michael parker <ivqrnai@pobox.com>   
4  CEAS_08_4  gretchen suggs <externalsep1@loanofficertool.com>   

                                         receiver  \
0                     user4@gvc.ceas-challenge.cc   
1                   user2.2@gvc.ceas-challenge.cc   
2                   user2.9@gvc.ceas-challenge.cc   
3  spamassassin dev <xrh@spamassassin.apache.org>   
4                   user2.2@gvc.ceas-challenge.cc   

                              date  \
0  Tue, 05 Aug 2008 16:31:02 -0700   
1  Tue, 05 Aug 2008 18:31:03 -0500   
2  Tue, 05 Aug 2008 20:28:00 -1200   
3  Tue, 05 Aug 2

In [3]:
# Hold out 20% of the rows for validation. The split is stratified on the
# label column and uses a fixed random_state so it is repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=df["label"])
train_df, val_df = train_test_split(df, **split_options)

print(f"Train size: {len(train_df)}")
print(f"Val size: {len(val_df)}")
train_df.head()

Train size: 61072
Val size: 15269


Unnamed: 0,subject,body,label,text
35921,CNN Alerts: My Custom Alert,CNN Alerts: My Custom Alert\n\n\n\n\n\n\n \n\n...,1,CNN Alerts: My Custom Alert CNN Alerts: My Cus...
5507,You do not want to buy their shops unknown in ...,Man's improving formula effective for for 90% ...,1,You do not want to buy their shops unknown in ...
13554,123,There�s something in store for you! Fist-rate ...,1,123 There�s something in store for you! Fist-r...
57122,= ? iso - 8859 - 7 ? q ? = 5 b = 3 f = 5 d _ l...,"premiere source for x : a : n : a : x , v : a ...",1,= ? iso - 8859 - 7 ? q ? = 5 b = 3 f = 5 d _ l...
3378,Re: [Python-3000] Python 2.6 and 3.0,"Barry Warsaw wrote:\n> From the follow ups, it...",0,Re: [Python-3000] Python 2.6 and 3.0 Barry War...


In [4]:
def simple_tokenize(text):
    """
    Turn a string into a list of lowercase tokens.

    The text is lowercased, every run of characters other than ``a-z``,
    ``0-9``, ``@``, ``.``, ``-``, ``_``, ``/`` and the space is replaced by a
    single space, and the result is split on whitespace.

    Args:
        text: the string to tokenize.

    Returns:
        The tokens in order of appearance (an empty list if none remain).
    """
    junk = re.compile(r"[^a-z0-9@.\-_/ ]+")
    return junk.sub(" ", text.lower()).split()


# Count token occurrences, looking at the training split only.
word_freq = {}
for email_text in train_df["text"]:
    for token in simple_tokenize(email_text):
        if token in word_freq:
            word_freq[token] += 1
        else:
            word_freq[token] = 1

# Tokens seen fewer than `min_freq` times are left out of the vocabulary.
min_freq = 2
vocab_words = [word for word in word_freq if word_freq[word] >= min_freq]

# Index 0 is <PAD>, index 1 is <UNK>; the kept tokens follow in sorted order.
itos = ["<PAD>", "<UNK>"]
itos.extend(sorted(vocab_words))
stoi = dict((word, index) for index, word in enumerate(itos))

vocab_size = len(itos)
print("Vocab size:", vocab_size)
print("Sample vocab items:", list(stoi.items())[:20])


def numericalize(tokens, stoi_map, unk_idx=1):
    """
    Translate tokens into their integer ids.

    Args:
        tokens: iterable of token strings.
        stoi_map: mapping from token to integer id.
        unk_idx: id used for any token missing from ``stoi_map``.

    Returns:
        A list with one id per input token, in the same order.
    """
    ids = []
    for token in tokens:
        ids.append(stoi_map.get(token, unk_idx))
    return ids

Vocab size: 201623
Sample vocab items: [('<PAD>', 0), ('<UNK>', 1), ('-', 2), ('--', 3), ('---', 4), ('----', 5), ('-----', 6), ('------', 7), ('-------', 8), ('--------', 9), ('---------', 10), ('----------', 11), ('-----------', 12), ('------------', 13), ('-------------', 14), ('--------------', 15), ('---------------', 16), ('----------------', 17), ('-----------------', 18), ('------------------', 19)]


In [5]:
class EmailDataset(Dataset):
    """
    Map-style dataset yielding ``(token_ids, label)`` tensor pairs.

    NOTE(review): a later cell of this notebook defines ``EmailDataset``
    again, and that later definition is the one the data loaders are built
    from. This version accepts the same optional ``max_length`` argument so
    the two definitions agree on their signature.

    Args:
        df: table with a "text" column and a "label" column.
        stoi_map: mapping from token string to integer id.
        max_length: keep at most this many tokens per email; ``None``
            (the default) disables truncation.
    """
    def __init__(self, df, stoi_map, max_length=None):
        self.texts = df["text"].tolist()
        self.labels = df["label"].tolist()
        self.stoi_map = stoi_map
        self.max_length = max_length

    def __len__(self):
        """Number of emails in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize email ``idx`` and return ``(ids, label)`` as long tensors."""
        raw_text = self.texts[idx]
        label = self.labels[idx]

        toks = simple_tokenize(raw_text)
        # Optional cap on sequence length; skipped when max_length is None.
        if self.max_length is not None:
            toks = toks[:self.max_length]
        ids = numericalize(toks, self.stoi_map)

        return torch.tensor(ids, dtype=torch.long), torch.tensor(label, dtype=torch.long)


def collate_batch(batch):
    """
    Combine ``(sequence, label)`` pairs into one padded batch.

    Sequences are right-padded with 0 up to the longest sequence in the
    batch; labels are stacked into a single tensor.

    Returns:
        ``(padded_sequences, labels)`` with the batch dimension first.
    """
    token_seqs, targets = zip(*batch)
    return (
        pad_sequence(token_seqs, batch_first=True, padding_value=0),
        torch.stack(targets),
    )

In [6]:
MAX_SEQ_LENGTH = 512  # Maximum tokens per email (adjust if needed)

class EmailDataset(Dataset):
    """
    Map-style dataset yielding ``(token_ids, label)`` tensor pairs, with each
    email cut down to at most ``max_length`` tokens.

    Args:
        df: table with a "text" column and a "label" column.
        stoi_map: mapping from token string to integer id.
        max_length: maximum number of tokens kept per email.
    """
    def __init__(self, df, stoi_map, max_length=512):
        self.stoi_map = stoi_map
        self.max_length = max_length
        self.texts = df["text"].tolist()
        self.labels = df["label"].tolist()

    def __len__(self):
        """Number of emails in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize email ``idx`` and return ``(ids, label)`` as long tensors."""
        toks = simple_tokenize(self.texts[idx])

        # Truncate over-long emails (the original author added this cap to
        # prevent out-of-memory errors).
        toks = toks[:self.max_length] if len(toks) > self.max_length else toks

        token_ids = torch.tensor(numericalize(toks, self.stoi_map), dtype=torch.long)
        target = torch.tensor(self.labels[idx], dtype=torch.long)
        return token_ids, target


def collate_batch(batch):
    """
    Collate a list of ``(sequence, label)`` samples for the DataLoader.

    Every sequence is right-padded with 0 to the length of the longest one
    in this batch, and the labels are stacked along a new first dimension.

    Returns:
        ``(padded_sequences, labels)`` with the batch dimension first.
    """
    seq_column, label_column = zip(*batch)
    label_tensor = torch.stack(label_column, dim=0)
    padded = pad_sequence(seq_column, batch_first=True, padding_value=0)
    return padded, label_tensor


# Both splits share the vocabulary and the same per-email token cap.
train_dataset, val_dataset = (
    EmailDataset(split_df, stoi, max_length=MAX_SEQ_LENGTH)
    for split_df in (train_df, val_df)
)

BATCH_SIZE = 32

# Only the training loader shuffles; the validation loader keeps dataset order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)

# Sanity check: pull one training batch and report the loader configuration.
batch_x, batch_y = next(iter(train_loader))
rule = "=" * 60
print(rule)
print("DataLoader Configuration")
print(rule)
print(f"Batch size: {BATCH_SIZE}")
print(f"Max sequence length: {MAX_SEQ_LENGTH}")
print(f"Train batches: {len(train_loader)}")
print(f"Val batches: {len(val_loader)}")
print("\nSample batch:")
print(f"  Batch X shape: {batch_x.shape}")  # Should be [32, <=512]
print(f"  Batch y shape: {batch_y.shape}")  # Should be [32]
print(f"  Actual max length in batch: {batch_x.shape[1]}")
print(f"  First 50 token ids: {batch_x[0][:50]}")
print(f"  First label: {batch_y[0]}")
print(rule)

DataLoader Configuration
Batch size: 32
Max sequence length: 512
Train batches: 1909
Val batches: 478

Sample batch:
  Batch X shape: torch.Size([32, 512])
  Batch y shape: torch.Size([32])
  Actual max length in batch: 512
  First 50 token ids: tensor([ 55461,  62223, 184472,  12088, 182324,  62223, 184472,  12088,  89813,
         55461, 184472, 192530,  37054, 176616,  39482, 142868,  40710,  11582,
         16331,  19805,  24900, 151360,  74748, 184472,  12088, 192530,  11711,
        135273, 135251, 103654,   5116, 190104,   8212, 110453, 110898,  31193,
         67053,  70310, 110453, 110898,  31193, 187014, 110453, 110898, 182324,
        135273, 135251,  55459, 165245, 113438])
  First label: 1


In [7]:
class CustomCNN_Text(nn.Module):
    """
    Custom CNN for text classification with medium depth.

    Architecture: Embedding + 4 blocks of (Conv1d -> BatchNorm1d -> ReLU ->
    MaxPool1d) + global average pooling + 3 fully connected layers.
    Per the original author's note, this mirrors the structure of a
    ``CustomCNN_Image`` model (not defined in this notebook) with Conv1d in
    place of Conv2d.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: size of each token embedding.
        num_classes: number of output logits.
        dropout: dropout probability applied after fc1 and fc2.

    Input:  LongTensor of token ids, shape [B, T]; id 0 is treated as padding.
    Output: FloatTensor of logits, shape [B, num_classes].
    """

    # Each of the four MaxPool1d(kernel_size=2, stride=2) stages halves the
    # sequence length (rounding down), so an input with fewer than 2**4 time
    # steps would leave the last pooling stage with nothing to output.
    MIN_SEQ_LEN = 2 ** 4

    def __init__(self, vocab_size, embed_dim=128, num_classes=2, dropout=0.5):
        super().__init__()

        # Embedding layer (row 0 is the padding vector)
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

        # Block 1: embed_dim -> 64 channels
        self.conv1 = nn.Conv1d(embed_dim, 64, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm1d(64)
        self.relu1 = nn.ReLU(inplace=True)
        self.pool1 = nn.MaxPool1d(kernel_size=2, stride=2)

        # Block 2: 64 -> 128 channels
        self.conv2 = nn.Conv1d(64, 128, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm1d(128)
        self.relu2 = nn.ReLU(inplace=True)
        self.pool2 = nn.MaxPool1d(kernel_size=2, stride=2)

        # Block 3: 128 -> 256 channels
        self.conv3 = nn.Conv1d(128, 256, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm1d(256)
        self.relu3 = nn.ReLU(inplace=True)
        self.pool3 = nn.MaxPool1d(kernel_size=2, stride=2)

        # Block 4: 256 -> 512 channels
        self.conv4 = nn.Conv1d(256, 512, kernel_size=3, padding=1)
        self.bn4 = nn.BatchNorm1d(512)
        self.relu4 = nn.ReLU(inplace=True)
        self.pool4 = nn.MaxPool1d(kernel_size=2, stride=2)

        # Global Average Pooling (adaptive pooling to fixed size)
        self.global_pool = nn.AdaptiveAvgPool1d(1)

        # Fully Connected Layers
        self.fc1 = nn.Linear(512, 256)
        self.relu_fc1 = nn.ReLU(inplace=True)
        self.dropout1 = nn.Dropout(dropout)

        self.fc2 = nn.Linear(256, 128)
        self.relu_fc2 = nn.ReLU(inplace=True)
        self.dropout2 = nn.Dropout(dropout)

        self.fc3 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Map a [B, T] batch of token ids to [B, num_classes] logits."""
        # Right-pad very short batches with the PAD id (0) so the four
        # pooling stages always have at least one time step left. Batches
        # that are already long enough pass through untouched.
        missing = self.MIN_SEQ_LEN - x.size(1)
        if missing > 0:
            x = nn.functional.pad(x, (0, missing), value=0)

        # Embedding: [B, T] -> [B, T, E]
        x = self.embedding(x)

        # Transpose for Conv1d: [B, T, E] -> [B, E, T]
        x = x.transpose(1, 2)

        # Block 1
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu1(x)
        x = self.pool1(x)

        # Block 2
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu2(x)
        x = self.pool2(x)

        # Block 3
        x = self.conv3(x)
        x = self.bn3(x)
        x = self.relu3(x)
        x = self.pool3(x)

        # Block 4
        x = self.conv4(x)
        x = self.bn4(x)
        x = self.relu4(x)
        x = self.pool4(x)

        # Global pooling over the remaining time steps: [B, 512, T'] -> [B, 512, 1]
        x = self.global_pool(x)
        x = x.squeeze(-1)  # [B, 512]

        # FC layers
        x = self.fc1(x)
        x = self.relu_fc1(x)
        x = self.dropout1(x)

        x = self.fc2(x)
        x = self.relu_fc2(x)
        x = self.dropout2(x)

        x = self.fc3(x)

        return x


# Build the classifier with one output logit per distinct label value and
# move it to the selected device.
num_classes = df["label"].nunique()
model = CustomCNN_Text(vocab_size=vocab_size, embed_dim=128, num_classes=num_classes, dropout=0.5)
model = model.to(device)

print(model)

# Tally parameter counts: every parameter, and the subset that receives gradients.
param_sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]
total_params = sum(size for size, _ in param_sizes)
trainable_params = sum(size for size, needs_grad in param_sizes if needs_grad)
print(f"\nTotal parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

CustomCNN_Text(
  (embedding): Embedding(201623, 128, padding_idx=0)
  (conv1): Conv1d(128, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu1): ReLU(inplace=True)
  (pool1): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv1d(64, 128, kernel_size=(3,), stride=(1,), padding=(1,))
  (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu2): ReLU(inplace=True)
  (pool2): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv1d(128, 256, kernel_size=(3,), stride=(1,), padding=(1,))
  (bn3): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu3): ReLU(inplace=True)
  (pool3): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv4): Conv1d(256, 512, kernel_size=(3,), stride=(1,), padding=(1,))
  (bn4): BatchNorm1d(512

In [8]:
# Cross-entropy over the model's logits, optimised with Adam at lr=1e-3.
criterion = nn.CrossEntropyLoss()
optimizer = Adam(params=model.parameters(), lr=1e-3)

print(f"Criterion: {criterion}")
print(f"Optimizer: {optimizer}")

Criterion: CrossEntropyLoss()
Optimizer: Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    weight_decay: 0
)


In [9]:
def train_one_epoch(model, loader, optimizer, criterion, device):
    """
    Run one optimisation pass over ``loader``.

    Args:
        model: network to train; switched to training mode.
        loader: iterable of ``(batch_x, batch_y)`` tensor pairs.
        optimizer: optimizer stepping ``model``'s parameters.
        criterion: loss called as ``criterion(logits, batch_y)``.
        device: device each batch is moved to before the forward pass.

    Returns:
        ``(avg_loss, avg_acc)``: the sample-weighted mean loss and the
        fraction of correctly classified samples over the whole pass.

    Raises:
        ValueError: if ``loader`` yields no samples.
    """
    model.train()
    total_loss = 0.0
    total_correct = 0
    total_count = 0

    # Iterate the loader directly: tqdm is not imported in this notebook
    # (its import line is commented out), so wrapping the loader in
    # tqdm(...) raised NameError.
    for batch_x, batch_y in loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        optimizer.zero_grad()

        logits = model(batch_x)
        loss = criterion(logits, batch_y)

        loss.backward()
        optimizer.step()

        # Weight the batch loss by the batch size so a smaller final batch
        # does not skew the epoch average.
        batch_size = batch_x.size(0)
        total_loss += loss.item() * batch_size

        preds = torch.argmax(logits, dim=1)
        total_correct += (preds == batch_y).sum().item()
        total_count += batch_size

    if total_count == 0:
        raise ValueError("train_one_epoch: loader yielded no samples")

    avg_loss = total_loss / total_count
    avg_acc = total_correct / total_count
    return avg_loss, avg_acc


def eval_one_epoch(model, loader, criterion, device):
    """
    Evaluate ``model`` on every batch of ``loader`` without updating it.

    Args:
        model: network to evaluate; switched to eval mode.
        loader: iterable of ``(batch_x, batch_y)`` tensor pairs.
        criterion: loss called as ``criterion(logits, batch_y)``.
        device: device each batch is moved to before the forward pass.

    Returns:
        ``(avg_loss, avg_acc)``: the sample-weighted mean loss and the
        fraction of correctly classified samples.

    Raises:
        ValueError: if ``loader`` yields no samples.
    """
    model.eval()
    total_loss = 0.0
    total_correct = 0
    total_count = 0

    with torch.no_grad():
        # Iterate the loader directly: tqdm is not imported in this notebook
        # (its import line is commented out), so wrapping the loader in
        # tqdm(...) raised NameError.
        for batch_x, batch_y in loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)

            logits = model(batch_x)
            loss = criterion(logits, batch_y)

            # Weight by batch size so the last (possibly smaller) batch
            # contributes proportionally to the average.
            batch_size = batch_x.size(0)
            total_loss += loss.item() * batch_size

            preds = torch.argmax(logits, dim=1)
            total_correct += (preds == batch_y).sum().item()
            total_count += batch_size

    if total_count == 0:
        raise ValueError("eval_one_epoch: loader yielded no samples")

    avg_loss = total_loss / total_count
    avg_acc = total_correct / total_count
    return avg_loss, avg_acc

In [10]:
# Clear GPU memory before training.
# Garbage-collect first so tensors that are no longer referenced are freed
# before the CUDA cache is emptied. Every CUDA query is guarded because the
# notebook falls back to the CPU when no GPU is visible, and the unguarded
# device queries fail in that case.
gc.collect()

print("="*60)
print("GPU Memory Status")
print("="*60)
if torch.cuda.is_available():
    torch.cuda.empty_cache()

    gpu_props = torch.cuda.get_device_properties(0)
    allocated_bytes = torch.cuda.memory_allocated(0)
    print(f"Device: {torch.cuda.get_device_name(0)}")
    print(f"Total Memory: {gpu_props.total_memory / 1e9:.2f} GB")
    print(f"Allocated: {allocated_bytes / 1e9:.2f} GB")
    print(f"Cached: {torch.cuda.memory_reserved(0) / 1e9:.2f} GB")
    print(f"Available: {(gpu_props.total_memory - allocated_bytes) / 1e9:.2f} GB")
else:
    print("CUDA is not available - running on CPU, no GPU memory to report.")
print("="*60)

GPU Memory Status
Device: NVIDIA GeForce RTX 4060 Laptop GPU
Total Memory: 8.59 GB
Allocated: 0.11 GB
Cached: 0.11 GB
Available: 8.48 GB


In [11]:
EPOCHS = 10

# Best validation accuracy seen so far, and a CPU snapshot of the weights
# that produced it (None until the first epoch finishes).
best_val_acc = 0.0
best_state_dict = None

# Per-epoch metrics; one value is appended to each list per completed epoch.
history = {
    "train_loss": [],
    "train_acc": [],
    "val_loss": [],
    "val_acc": []
}

print(f"Starting training for {EPOCHS} epochs...")
print(f"Total training batches: {len(train_loader)}")
print(f"Total validation batches: {len(val_loader)}")
print("="*70)

for epoch in range(EPOCHS):
    print(f"\n{'='*70}")
    print(f"Epoch {epoch+1}/{EPOCHS}")
    print(f"{'='*70}")
    
    # ============================================
    # TRAINING PHASE
    # ============================================
    model.train()
    running_loss = 0.0
    running_correct = 0
    running_total = 0
    
    total_train_batches = len(train_loader)
    print_every = max(1, total_train_batches // 10)  # Print 10 times per epoch
    
    print(f"[TRAINING] Processing {total_train_batches} batches...")
    
    for batch_idx, (batch_x, batch_y) in enumerate(train_loader):
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        optimizer.zero_grad()

        logits = model(batch_x)
        loss = criterion(logits, batch_y)

        loss.backward()
        optimizer.step()

        # Weight the batch loss by batch size so the epoch average is per-sample.
        running_loss += loss.item() * batch_x.size(0)

        preds = torch.argmax(logits, dim=1)
        running_correct += (preds == batch_y).sum().item()
        running_total += batch_x.size(0)
        
        # Print progress (loss is the current batch's; accuracy is cumulative)
        if (batch_idx + 1) % print_every == 0 or (batch_idx + 1) == total_train_batches:
            current_acc = running_correct / running_total
            progress = (batch_idx + 1) / total_train_batches * 100
            print(f"  [{progress:5.1f}%] Batch {batch_idx+1:4d}/{total_train_batches} | "
                  f"Loss: {loss.item():.4f} | Acc: {current_acc:.4f}")

    train_loss = running_loss / running_total
    train_acc = running_correct / running_total
    
    # ============================================
    # VALIDATION PHASE
    # ============================================
    print(f"\n[VALIDATION] Processing {len(val_loader)} batches...")
    
    model.eval()
    running_loss = 0.0
    running_correct = 0
    running_total = 0
    
    with torch.no_grad():
        for batch_idx, (batch_x, batch_y) in enumerate(val_loader):
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)

            logits = model(batch_x)
            loss = criterion(logits, batch_y)

            running_loss += loss.item() * batch_x.size(0)

            preds = torch.argmax(logits, dim=1)
            running_correct += (preds == batch_y).sum().item()
            running_total += batch_x.size(0)
    
    val_loss = running_loss / running_total
    val_acc = running_correct / running_total
    
    # ============================================
    # SAVE METRICS & PRINT SUMMARY
    # ============================================
    history["train_loss"].append(train_loss)
    history["train_acc"].append(train_acc)
    history["val_loss"].append(val_loss)
    history["val_acc"].append(val_acc)
    
    print(f"\n{'-'*70}")
    print(f"[Epoch {epoch+1}/{EPOCHS} Summary]")
    print(f"  Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} ({train_acc*100:.2f}%)")
    print(f"  Val Loss:   {val_loss:.4f}   | Val Acc:   {val_acc:.4f} ({val_acc*100:.2f}%)")
    
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        # Take an independent copy of every tensor. When the model already
        # lives on the CPU, .cpu() hands back the live tensor itself, so
        # without .clone() later epochs would silently overwrite this
        # "best" snapshot.
        best_state_dict = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
        print(f"  New BEST model! (val_acc={best_val_acc:.4f})")
    else:
        print(f"  (Best val_acc so far: {best_val_acc:.4f})")
    
    print(f"{'-'*70}")

print(f"\n{'='*70}")
print("=== Training Complete ===")
print(f"{'='*70}")
print(f"Best Validation Accuracy: {best_val_acc:.4f} ({best_val_acc*100:.2f}%)")

Starting training for 10 epochs...
Total training batches: 1909
Total validation batches: 478

Epoch 1/10
[TRAINING] Processing 1909 batches...
  [ 10.0%] Batch  190/1909 | Loss: 0.1877 | Acc: 0.8447
  [ 19.9%] Batch  380/1909 | Loss: 0.1261 | Acc: 0.8918
  [ 29.9%] Batch  570/1909 | Loss: 0.1986 | Acc: 0.9113
  [ 39.8%] Batch  760/1909 | Loss: 0.5213 | Acc: 0.9240
  [ 49.8%] Batch  950/1909 | Loss: 0.0385 | Acc: 0.9322
  [ 59.7%] Batch 1140/1909 | Loss: 0.1866 | Acc: 0.9379
  [ 69.7%] Batch 1330/1909 | Loss: 0.1272 | Acc: 0.9417
  [ 79.6%] Batch 1520/1909 | Loss: 0.0075 | Acc: 0.9455
  [ 89.6%] Batch 1710/1909 | Loss: 0.0192 | Acc: 0.9490
  [ 99.5%] Batch 1900/1909 | Loss: 0.0033 | Acc: 0.9522
  [100.0%] Batch 1909/1909 | Loss: 0.4040 | Acc: 0.9523

[VALIDATION] Processing 478 batches...

----------------------------------------------------------------------
[Epoch 1/10 Summary]
  Train Loss: 0.1363 | Train Acc: 0.9523 (95.23%)
  Val Loss:   0.0722   | Val Acc:   0.9775 (97.75%)
  New

In [12]:
# Persist the best weights (if any epoch produced a snapshot) and the
# vocabulary needed to numericalize text the same way later.
if best_state_dict is not None:
    torch.save(best_state_dict, "best_custom_cnn_text_2.pth")
    print("Saved best model -> best_custom_cnn_text_2.pth")

with open("vocab_text_2.json", "w") as f:
    json.dump({"itos": itos}, f, indent=2)
print("Saved vocab_text_2.json")

print("\nTraining curves:")
# Walk the recorded history rather than range(EPOCHS): if training stopped
# early, or EPOCHS was edited after the run, indexing by EPOCHS would raise
# IndexError or report the wrong number of rows.
recorded_epochs = zip(
    history["train_acc"],
    history["val_acc"],
    history["train_loss"],
    history["val_loss"],
)
for epoch_no, (tr_acc, va_acc, tr_loss, va_loss) in enumerate(recorded_epochs, start=1):
    print(
        f"Epoch {epoch_no:02d}: "
        f"train_acc={tr_acc:.4f}, "
        f"val_acc={va_acc:.4f}, "
        f"train_loss={tr_loss:.4f}, "
        f"val_loss={va_loss:.4f}"
    )

Saved best model -> best_custom_cnn_text_2.pth
Saved vocab_text_2.json

Training curves:
Epoch 01: train_acc=0.9523, val_acc=0.9775, train_loss=0.1363, val_loss=0.0722
Epoch 02: train_acc=0.9888, val_acc=0.9807, train_loss=0.0362, val_loss=0.0729
Epoch 03: train_acc=0.9947, val_acc=0.9895, train_loss=0.0186, val_loss=0.0405
Epoch 04: train_acc=0.9961, val_acc=0.9866, train_loss=0.0128, val_loss=0.0682
Epoch 05: train_acc=0.9979, val_acc=0.9861, train_loss=0.0076, val_loss=0.0615
Epoch 06: train_acc=0.9984, val_acc=0.9903, train_loss=0.0061, val_loss=0.0485
Epoch 07: train_acc=0.9983, val_acc=0.9878, train_loss=0.0085, val_loss=0.0749
Epoch 08: train_acc=0.9989, val_acc=0.9876, train_loss=0.0044, val_loss=0.0689
Epoch 09: train_acc=0.9993, val_acc=0.9900, train_loss=0.0034, val_loss=0.0772
Epoch 10: train_acc=0.9990, val_acc=0.9887, train_loss=0.0048, val_loss=0.0699
