In [139]:
#  Imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
)

RANDOM_STATE = 42  # reproducibility



In [140]:
# --------------------------------
# 1) Load the dataset
# --------------------------------
# Expected columns: "Category" (ham/spam) "Message" (Text)
df = pd.read_csv("mail_l7_dataset.csv")

# Basic cleaning: replace NaNs with empty strings (text models can't handle NaN)
df = df.fillna("")
# Encode Labels: spam -> 0, ham -> 1
df.loc[df["Category"].str.lower().str.strip() == "spam", "Category"] = 0
df.loc[df["Category"].str.lower().str.strip() == "ham",  "Category"] = 1



In [141]:
# --------------------------------
# 2) Split features (X) and target (y)
# --------------------------------
X = df["Message"].astype(str)
y = df["Category"].astype(int)

print(df.head())

  Category                                            Message
0        1  Go until jurong point, crazy.. Available only ...
1        1                      Ok lar... Joking wif u oni...
2        0  Free entry in 2 a wkly comp to win FA Cup fina...
3        1  U dun say so early hor... U c already then say...
4        1  Nah I don't think he goes to usf, he lives aro...


In [128]:
# --------------------------------
# 3) Train/Test Split
# --------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)



In [129]:
# --------------------------------
# 4) Text → TF-IDF features 
# --------------------------------
tfidf = TfidfVectorizer(min_df=1, stop_words="english", lowercase=True)
X_train_features = tfidf.fit_transform(X_train)
X_test_features  = tfidf.transform(X_test)



In [130]:
# --------------------------------
# 5) Train Logistic Regression (baseline)
# --------------------------------
lr = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
lr.fit(X_train_features, y_train)
lr_pred = lr.predict(X_test_features)


In [142]:
# --------------------------------
# 6) Train Random Forest (convert TF-IDF to dense for tree models)
# --------------------------------
rf = RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE)
rf.fit(X_train_features.toarray(), y_train)  # convert to dense
rf_pred = rf.predict(X_test_features.toarray())



In [136]:
# --------------------------------
# 7) Train Naive Bayes
# --------------------------------
nb = MultinomialNB()
nb.fit(X_train_features, y_train)
nb_pred = nb.predict(X_test_features)



In [143]:
# --------------------------------
# 8) Helper functions: metrics + confusion matrix
# --------------------------------
def print_clf_metrics(name, y_true, y_pred, pos_label=0):
    acc  = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, pos_label=pos_label)
    rec  = recall_score(y_true, y_pred, pos_label=pos_label)
    f1   = f1_score(y_true, y_pred, pos_label=pos_label)
    print(f"\n{name} Performance:")
    print(f"  Accuracy : {acc:.3f}")
    print(f"  Precision: {prec:.3f}  (positive = spam=0)")
    print(f"  Recall   : {rec:.3f}  (positive = spam=0)")
    print(f"  F1-Score : {f1:.3f}  (positive = spam=0)")

def print_confmat(name, y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred, labels=[1,0])
    cm_df = pd.DataFrame(
        cm,
        index=["Actual: Ham (1)", "Actual: Spam (0)"],
        columns=["Pred: Ham (1)", "Pred: Spam (0)"]
    )
    print(f"\n{name} – Confusion Matrix:\n{cm_df}")

    



In [144]:
# --------------------------------
# 9) Show results for all models
# --------------------------------
for model_name, y_pred in [("Logistic Regression", lr_pred),
                           ("Random Forest", rf_pred),
                           ("Naive Bayes", nb_pred)]:
    print_clf_metrics(model_name, y_test, y_pred, pos_label=0)
    print_confmat(model_name, y_test, y_pred)




Logistic Regression Performance:
  Accuracy : 0.968
  Precision: 1.000  (positive = spam=0)
  Recall   : 0.758  (positive = spam=0)
  F1-Score : 0.863  (positive = spam=0)

Logistic Regression – Confusion Matrix:
                  Pred: Ham (1)  Pred: Spam (0)
Actual: Ham (1)             966               0
Actual: Spam (0)             36             113

Random Forest Performance:
  Accuracy : 0.983
  Precision: 1.000  (positive = spam=0)
  Recall   : 0.872  (positive = spam=0)
  F1-Score : 0.932  (positive = spam=0)

Random Forest – Confusion Matrix:
                  Pred: Ham (1)  Pred: Spam (0)
Actual: Ham (1)             966               0
Actual: Spam (0)             19             130

Naive Bayes Performance:
  Accuracy : 0.977
  Precision: 1.000  (positive = spam=0)
  Recall   : 0.826  (positive = spam=0)
  F1-Score : 0.904  (positive = spam=0)

Naive Bayes – Confusion Matrix:
                  Pred: Ham (1)  Pred: Spam (0)
Actual: Ham (1)             966               0
Ac

In [145]:
# --------------------------------
# 10) Sanity check – 3 sample messages
# --------------------------------
samples = [
    "Free entry in 2 a weekly competition!",
    "I will meet you at the cafe tomorrow",
    "Congratulations, you won a free ticket"
]



In [146]:
for text in samples:
    lr_p = lr.predict(tfidf.transform([text]))[0]
    rf_p = rf.predict(tfidf.transform([text]).toarray())[0]
    nb_p = nb.predict(tfidf.transform([text]))[0]
    print(f"\nText: {text}")
    print(f"LR Prediction : {'Spam' if lr_p==0 else 'Ham'}")
    print(f"RF Prediction : {'Spam' if rf_p==0 else 'Ham'}")
    print(f"NB Prediction : {'Spam' if nb_p==0 else 'Ham'}")




Text: Free entry in 2 a weekly competition!
LR Prediction : Ham
RF Prediction : Ham
NB Prediction : Spam

Text: I will meet you at the cafe tomorrow
LR Prediction : Ham
RF Prediction : Ham
NB Prediction : Ham

Text: Congratulations, you won a free ticket
LR Prediction : Ham
RF Prediction : Ham
NB Prediction : Ham
