**1. Install Packages & Mount Google Drive**

In [None]:
!pip install transformers datasets scikit-learn accelerate --quiet

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**2. Load and Preprocess Dataset**

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset

dataset_path = "/content/drive/MyDrive/spam_classifier/mail-spam-detector.csv"

df = pd.read_csv(dataset_path, encoding="latin-1")

print("Columns:", df.columns)
print(df.head())


Columns: Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')
     v1                                                 v2 Unnamed: 2  \
0  spam  Win the newest Ã¥ÃHarry Potter and the Order ...        NaN   
1   ham         Excellent, I'll see what riley's plans are        NaN   
2  spam  URGENT! Your Mobile number has been awarded wi...        NaN   
3   ham         Somebody should go to andros and steal ice        NaN   
4  spam  You have 1 new voicemail. Please call 08719181503        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


In [4]:
df = df.rename(columns={df.columns[0]: "label", df.columns[1]: "text"})
df = df.dropna(subset=["label", "text"])
df = df[df["label"].isin(["ham", "spam"])]

label2id = {"ham": 0, "spam": 1}
id2label = {0: "ham", 1: "spam"}
df["label_id"] = df["label"].map(label2id)

print("Cleaned dataset size:", df.shape)
print(df["label"].value_counts())

train_df, val_df = train_test_split(
    df,
    test_size=0.1,
    random_state=42,
    stratify=df["label_id"]
)

train_dataset = Dataset.from_pandas(train_df[["text", "label_id"]])
val_dataset   = Dataset.from_pandas(val_df[["text", "label_id"]])

Cleaned dataset size: (12000, 6)
label
ham     8000
spam    4000
Name: count, dtype: int64


**3. Tokenization**

In [5]:
import os
os.environ["WANDB_DISABLED"] = "true"

from transformers import AutoTokenizer

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)

train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset   = val_dataset.map(tokenize, batched=True)

train_dataset = train_dataset.rename_column("label_id", "labels")
val_dataset   = val_dataset.rename_column("label_id", "labels")

train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/10800 [00:00<?, ? examples/s]

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

**4. Load Model and Training Arguments**

In [6]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    id2label=id2label,
    label2id=label2id
)

training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/spam_classifier/email-spam-detector",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_steps=50,
    save_steps=200,
    save_total_limit=2,
    fp16=True,
    do_eval=True,
    eval_strategy="steps",
    load_best_model_at_end=True
)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


**5. Metrics and Trainer**

In [7]:
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


**6. Train the Model**

In [8]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
50,0.1732,0.092931,0.975833,1.0,0.9275,0.962387
100,0.0745,0.063724,0.981667,0.992188,0.9525,0.971939
150,0.0671,0.056591,0.983333,0.989691,0.96,0.974619
200,0.0264,0.050747,0.990833,0.992405,0.98,0.986164
250,0.0629,0.039268,0.991667,0.9875,0.9875,0.9875
300,0.0135,0.038889,0.991667,0.982673,0.9925,0.987562
350,0.0192,0.031133,0.993333,0.987562,0.9925,0.990025
400,0.0398,0.044531,0.991667,0.994924,0.98,0.987406
450,0.0287,0.018996,0.9975,0.997494,0.995,0.996245
500,0.0261,0.017626,0.9975,0.995012,0.9975,0.996255


TrainOutput(global_step=2025, training_loss=0.016364938918455148, metrics={'train_runtime': 329.2784, 'train_samples_per_second': 98.397, 'train_steps_per_second': 6.15, 'total_flos': 1072985929113600.0, 'train_loss': 0.016364938918455148, 'epoch': 3.0})

**7. Save Model**

In [12]:
save_path = "/content/drive/MyDrive/spam_classifier/email-spam-detector"
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)

print(f"Model and tokenizer saved to: {save_path}")

Model and tokenizer saved to: /content/drive/MyDrive/spam_classifier/email-spam-detector


**8. Load Model and Test**

In [13]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
import os

model_path = "/content/drive/MyDrive/spam_classifier/email-spam-detector"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model.eval()




DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [15]:
test_texts = [
    "Thanks for help, here is your 100 dollars! Click this link to claim your reward now",
    "Congratulations! You’ve won $5000! Claim it now by clicking this link",
    "Thanks for help, here is your 100 dollars",
    "Congratulations! You have won a free prize. Click here now!",
    "The delivery was late and the product quality is very poor"
]

inputs = tokenizer(test_texts, truncation=True, padding=True, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
    preds = np.argmax(outputs.logits.detach().numpy(), axis=1)

id2label = {0: "ham", 1: "spam"}
for text, pred in zip(test_texts, preds):
    print(f"Text: {text[:80]}...")
    print(f"Prediction: {id2label[pred]}\n")

Text: Thanks for help, here is your 100 dollars! Click this link to claim your reward ...
Prediction: spam

Text: Congratulations! You’ve won $5000! Claim it now by clicking this link...
Prediction: spam

Text: Thanks for help, here is your 100 dollars...
Prediction: ham

Text: Congratulations! You have won a free prize. Click here now!...
Prediction: spam

Text: The delivery was late and the product quality is very poor...
Prediction: ham

