In [6]:
pip install transformers torch pandas scikit-learn kagglehub



In [7]:
import pandas as pd
from transformers import DistilBertTokenizer
import kagglehub
path = kagglehub.dataset_download("ashfakyeafi/spam-email-classification")

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/ashfakyeafi/spam-email-classification/versions/3


In [8]:
pip install kaggle



In [9]:
from google.colab import files
import os
uploaded = files.upload()

!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


In [10]:
#checking the import was successful
!ls

sample_data


In [11]:
!kaggle datasets download -d ashfakyeafi/spam-email-classification
!unzip spam-email-classification.zip

Dataset URL: https://www.kaggle.com/datasets/ashfakyeafi/spam-email-classification
License(s): apache-2.0
Archive:  spam-email-classification.zip
  inflating: email.csv               


In [12]:
df = pd.read_csv('email.csv')
print(df.head())

  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...


## PreProcessing Data Set



In [13]:
df['Category'] = df['Category'].str.lower().str.strip()  # Convert to lowercase and remove whitespace
df = df[df['Category'].isin(['spam', 'ham'])]  # Keep only valid categories
df = df.dropna(subset=['Category'])  # Drop rows with NaN in Category
df['Category'] = df['Category'].map({'spam': 1, 'ham': 0})
texts = df['Message'].tolist()
labels = df['Category'].tolist()
print("Unique labels after cleaning:", set(labels))
print("Any NaN values?", any(pd.isna(x) for x in labels))

Unique labels after cleaning: {0, 1}
Any NaN values? False


## Integrating LLM
For this self-project, I am going to be using DistilBertTokenizer from Hugging Face to train and classify email into either spam or ham.

In [15]:
from transformers import DistilBertTokenizer
from transformers import MobileBertTokenizer, MobileBertForSequenceClassification
tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased")

import pickle
# Pre-tokenize
encodings = tokenizer(texts, truncation=True, padding=True, max_length=128)
with open("tokenized_data.pkl", "wb") as f:
    pickle.dump(encodings, f)
# Load during training
with open("tokenized_data.pkl", "rb") as f:
    encodings = pickle.load(f)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/847 [00:00<?, ?B/s]

In [16]:
pip install torch



In [17]:
import torch

class EmailDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item
    def __len__(self):
        return len(self.labels)

In [18]:
import wandb
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjoonseoyang0613[0m ([33mjoonseoyang0613-uc-berkeley[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

### Training Phase

This is where I trainned the DistilBert model using the train, test split dataset that I attained from kaggle with message content and label on each message as either spam or ham.

In [19]:
model = MobileBertForSequenceClassification.from_pretrained("google/mobilebert-uncased", num_labels=2)

# Create dataset
dataset = EmailDataset(encodings, labels)

# Split into train/test
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

# Training arguments (optimized as discussed earlier)
from transformers import TrainingArguments, Trainer, DistilBertForSequenceClassification
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="no",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=100,
    fp16=True,
    report_to="wandb",

)

# Load model
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
model.config.problem_type = "single_label_classification"

# Train
from transformers import Adafactor
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    optimizers=(Adafactor(model.parameters(), scale_parameter=False, relative_step=False, warmup_init=False, lr=2e-5), None)
)

trainer.train()

pytorch_model.bin:   0%|          | 0.00/147M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/147M [00:00<?, ?B/s]

Some weights of MobileBertForSequenceClassification were not initialized from the model checkpoint at google/mobilebert-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
100,0.1386
200,0.0445


Step,Training Loss
100,0.1386
200,0.0445
300,0.0516
400,0.0383
500,0.0358


TrainOutput(global_step=558, training_loss=0.05616378057814841, metrics={'train_runtime': 6085.061, 'train_samples_per_second': 1.465, 'train_steps_per_second': 0.092, 'total_flos': 295203597904896.0, 'train_loss': 0.05616378057814841, 'epoch': 2.0})

In [23]:
predictions = trainer.predict(test_dataset)
preds = predictions.predictions.argmax(-1)
from sklearn.metrics import classification_report

# Access labels from the test dataset, not the original dataset
# test_dataset.dataset.labels would have referred to the labels of the full dataset
print(classification_report([test_dataset.dataset.labels[i] for i in test_dataset.indices], preds, target_names=["ham", "spam"]))


              precision    recall  f1-score   support

         ham       0.99      1.00      1.00       940
        spam       0.99      0.97      0.98       175

    accuracy                           0.99      1115
   macro avg       0.99      0.99      0.99      1115
weighted avg       0.99      0.99      0.99      1115



## Overview of the Output
The training took 558 steps with approximate running time of 1.7 hours. The total loss from training is 0.05 which is very low, indicating that the model has learned well from the dataset. The training samples per second was 1.465 while training steps per second was 0.092. These two metrics are quite low, indicating inefficiency in hardware utilization.

# Classification Report
There are two classes: ham (non spam) & spam.

Within the dataset, there are total of 940 ham samples and 175 spam samples totalling of 1115 samples. This is a decent spread within the dataset, but still demonstrates a high concentration of ham email samples than spam. This raises a concern of undersampling spam emails.

1. Class-wise Performance


Ham (Non-Spam Emails):

Precision: 0.99 → 99% of emails predicted as ham were actually ham. Very high, meaning the model rarely misclassifies a spam email as ham (false positives are rare).
Recall: 1.00 → 100% of actual ham emails were correctly identified as ham. Perfect recall, meaning no ham emails were missed (no false negatives).
F1-Score: 1.00 → Perfect balance of precision and recall for ham.
Support: 940 samples, which is expected since the dataset is imbalanced (more ham than spam).

Spam Emails:

Precision: 0.99 → 99% of emails predicted as spam were actually spam. Very high, meaning the model rarely misclassifies a ham email as spam (false positives are rare).
Recall: 0.97 → 97% of actual spam emails were correctly identified as spam. Slightly lower than for ham, meaning a few spam emails were missed (false negatives exist).
F1-Score: 0.98 → Still very high, but slightly lower than ham due to the lower recall.
Support: 175 samples, reflecting the class imbalance (fewer spam emails).

2. Overall Performance

Accuracy: 0.99 → 99% of all predictions were correct. This is an excellent result, but we need to be cautious because the dataset is imbalanced (84% ham, 16% spam). Accuracy can be misleading in such cases.
Macro Average:
Precision, Recall, F1-Score: All 0.99. This averages the metrics across classes without considering class imbalance, showing that the model performs well for both classes.
Weighted Average:
Precision, Recall, F1-Score: All 0.99. This weights the metrics by the support of each class, confirming strong performance even when accounting for the imbalance.

3. Key Insights

High Performance: The model (MobileBERT) achieves excellent results across all metrics, with 99% accuracy, precision, recall, and F1-score. This suggests that the model has learned to distinguish between ham and spam emails very effectively.

Class Imbalance: The dataset is imbalanced (940 ham vs. 175 spam in the test set, roughly 84% ham). Despite this, the model performs well on the minority class (spam), with a recall of 0.97, meaning it misses only a small fraction of spam emails.

False Negatives for Spam: The recall for spam (0.97) indicates that 3% of spam emails were misclassified as ham. For 175 spam emails, this means approximately 175 * 0.03 ≈ 5 spam emails were missed. In a real-world email filter, this could mean a few spam emails reach the inbox, which might be acceptable depending on the use case.

False Positives: The precision for both classes is 0.99, meaning false positives (e.g., ham classified as spam) are rare. This is critical for an email filter, as users don’t want legitimate emails sent to the spam folder.

### Testing

In [28]:
def classify_email(email_text):
    inputs = tokenizer(email_text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    if 'token_type_ids' in inputs:
      del inputs['token_type_ids']

    outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    pred = probs.argmax().item()
    return "spam" if pred == 1 else "ham"

# Test it
email = "Win a free iPhone now!!! Click here!"
print("likeley " + classify_email(email) + "!")  # Likely "spam"

likeley spam!
