In [5]:
# Imports and Setup
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
)

RANDOM_STATE = 42  # For reproducibility

In [None]:
# Load and Clean Dataset
# Expected columns: "Category" (ham/spam), "Message" (text)
df = pd.read_csv("mail_l7_dataset.csv")

# Basic cleaning: replace NaNs with empty strings.
df = df.where(pd.notnull(df), "")

# Encode labels: spam -> 0, ham -> 1 
df.loc[df["Category"].str.lower().str.strip() == "spam", "Category"] = 0
df.loc[df["Category"].str.lower().str.strip() == "ham",  "Category"] = 1

df.head()

In [8]:
# Split features (X) and target (y)
X = df["Message"].astype(str)
y = df["Category"].astype(int)

In [9]:
# Train/test split (stratified)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

print("=== SPLIT SIZES ===")
print("Train:", X_train.shape[0], " | Test:", X_test.shape[0])

=== SPLIT SIZES ===
Train: 4457  | Test: 1115


In [10]:
# Text → TF-IDF features
tfidf = TfidfVectorizer(min_df=1, stop_words="english", lowercase=True)
X_train_features = tfidf.fit_transform(X_train)
X_test_features  = tfidf.transform(X_test)

print("\n=== TF-IDF SHAPES ===")
print("X_train:", X_train_features.shape, " | X_test:", X_test_features.shape)


=== TF-IDF SHAPES ===
X_train: (4457, 7440)  | X_test: (1115, 7440)


In [11]:
# Train Logistic Regression (baseline)
lr = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
lr.fit(X_train_features, y_train)
lr_pred = lr.predict(X_test_features)

In [12]:
# Train Random Forest (advanced)
rf = RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE)
rf.fit(X_train_features, y_train)
rf_pred = rf.predict(X_test_features.toarray())
# Note: convert sparse matrix to dense with .toarray() for RF

In [27]:
# Train Naive Bayes (alternative)
nb = MultinomialNB()
nb.fit(X_train_features, y_train)
nb_pred = nb.predict(X_test_features)

In [30]:
# Helper function: Print main metrics
def print_metrics(model_name, y_true, y_pred):
    print(f"\n{model_name} Performance:")
    print("Accuracy :", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred, pos_label=0))
    print("Recall   :", recall_score(y_true, y_pred, pos_label=0))
    print("F1-score :", f1_score(y_true, y_pred, pos_label=0))

# Helper function: Print confusion matrix
def show_confusion(model_name, y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred, labels=[1, 0])
    print(f"\n{model_name} Confusion Matrix:")
    print(pd.DataFrame(
        cm,
        index=["Actual: Ham (1)", "Actual: Spam (0)"],
        columns=["Pred: Ham (1)", "Pred: Spam (0)"]
    ))

In [31]:
# Show results for all models
print_metrics("Logistic Regression", y_test, lr_pred)
show_confusion("Logistic Regression", y_test, lr_pred)

print_metrics("Random Forest", y_test, rf_pred)
show_confusion("Random Forest", y_test, rf_pred)

print_metrics("Naive Bayes", y_test, nb_pred)
show_confusion("Naive Bayes", y_test, nb_pred)



Logistic Regression Performance:
Accuracy : 0.967713004484305
Precision: 1.0
Recall   : 0.7583892617449665
F1-score : 0.8625954198473282

Logistic Regression Confusion Matrix:
                  Pred: Ham (1)  Pred: Spam (0)
Actual: Ham (1)             966               0
Actual: Spam (0)             36             113

Random Forest Performance:
Accuracy : 0.9829596412556054
Precision: 1.0
Recall   : 0.87248322147651
F1-score : 0.931899641577061

Random Forest Confusion Matrix:
                  Pred: Ham (1)  Pred: Spam (0)
Actual: Ham (1)             966               0
Actual: Spam (0)             19             130

Naive Bayes Performance:
Accuracy : 0.9766816143497757
Precision: 1.0
Recall   : 0.825503355704698
F1-score : 0.9044117647058824

Naive Bayes Confusion Matrix:
                  Pred: Ham (1)  Pred: Spam (0)
Actual: Ham (1)             966               0
Actual: Spam (0)             26             123


In [33]:
# Single-message sanity check (sample predictions)
samples = [
    "Free entry in 2 a weekly competition!",
    "I will meet you at the cafe tomorrow",
    "Congratulations, you won a free ticket",
    "You have been selected for a $1000 gift card. Claim now!"
]
def lab2str(v):  # readable output
    return "Spam (0)" if v == 0 else "Ham (1)"

print("\n=== SAMPLE MESSAGE PREDICTIONS ===")
for text in samples:
    print("Text snippet:", text)
    print("LR Pred     :", lab2str(int(lr.predict(tfidf.transform([text]))[0])))
    print("RF Pred     :", lab2str(int(rf.predict(tfidf.transform([text]).toarray())[0])))
    print("NB Pred     :", lab2str(int(nb.predict(tfidf.transform([text]))[0])))
    print("---")


=== SAMPLE MESSAGE PREDICTIONS ===
Text snippet: Free entry in 2 a weekly competition!
LR Pred     : Ham (1)
RF Pred     : Ham (1)
NB Pred     : Spam (0)
---
Text snippet: I will meet you at the cafe tomorrow
LR Pred     : Ham (1)
RF Pred     : Ham (1)
NB Pred     : Ham (1)
---
Text snippet: Congratulations, you won a free ticket
LR Pred     : Ham (1)
RF Pred     : Ham (1)
NB Pred     : Ham (1)
---
Text snippet: You have been selected for a $1000 gift card. Claim now!
LR Pred     : Spam (0)
RF Pred     : Spam (0)
NB Pred     : Spam (0)
---
