In [53]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
)

In [3]:
# Load the labelled message dataset from the working directory.
df = pd.read_csv(filepath_or_buffer="mail_l7_dataset.csv")

In [5]:
# Swap every missing cell for an empty string so later string operations see no NaN.
df = df.mask(df.isna(), "")

In [7]:
# Encode the text labels as integers in a single pass: spam -> 0, ham -> 1.
# Going through astype(str) means the .str accessor never runs on a column
# that already holds integers, so the cell can be re-run safely; values that
# are already encoded ("0"/"1") map to themselves.
label_codes = (
    df["Category"]
    .astype(str)
    .str.strip()
    .str.lower()
    .map({"spam": 0, "ham": 1, "0": 0, "1": 1})
)
# Anything outside the known labels becomes NaN in the map above. Fail here,
# naming the offending values, instead of at a later integer cast.
if label_codes.isna().any():
    unknown_labels = sorted(
        df.loc[label_codes.isna(), "Category"].astype(str).unique()
    )
    raise ValueError(f"Unrecognised Category values: {unknown_labels}")
df["Category"] = label_codes.astype(int)

In [9]:
# Preview the first five rows to confirm the label encoding.
print(df.head(n=5))

  Category                                            Message
0        1  Go until jurong point, crazy.. Available only ...
1        1                      Ok lar... Joking wif u oni...
2        0  Free entry in 2 a wkly comp to win FA Cup fina...
3        1  U dun say so early hor... U c already then say...
4        1  Nah I don't think he goes to usf, he lives aro...


In [11]:
# Model input: the raw message text, forced to str.
X = df.loc[:, "Message"].astype(str)

In [13]:
# Target: the encoded label (0 = spam, 1 = ham) as integers.
y = df.loc[:, "Category"].astype(int)

In [15]:
# Hold out 20% of the messages for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [19]:
# Report how many messages landed in each partition.
print("Train:", len(X_train), " | Test:", len(X_test))

Train: 4457  | Test: 1115


In [21]:
# TF-IDF vectoriser: lower-case the text, drop English stop words,
# and keep every term that appears in at least one document.
tfidf = TfidfVectorizer(
    lowercase=True,
    stop_words="english",
    min_df=1,
)

In [23]:
# Fit the vectoriser on the training messages only, and vectorise them in the same step.
X_train_features = tfidf.fit_transform(raw_documents=X_train)

In [25]:
# Show the shape of the vectorised training matrix.
print("X_train:", X_train_features.shape)

X_train: (4457, 7440)


In [27]:
# Vectorise the held-out messages with the already-fitted vectoriser (no refit).
X_test_features = tfidf.transform(raw_documents=X_test)

In [29]:
# Show the shape of the vectorised test matrix.
print("X_test:", X_test_features.shape)

X_test: (1115, 7440)


In [39]:
# Logistic Regression: raised iteration cap, seeded for repeatability
logisticR = LogisticRegression(
    random_state=42,
    max_iter=1000,
)

In [43]:
# Train the logistic regression on the TF-IDF training matrix.
logisticR.fit(X=X_train_features, y=y_train)

In [45]:
# Predicted labels for the held-out messages.
logr_pred = logisticR.predict(X=X_test_features)

In [47]:
# Random Forest: 200 trees, seeded for repeatability
rf = RandomForestClassifier(
    random_state=42,
    n_estimators=200,
)

In [49]:
# Train the forest on the TF-IDF training matrix.
rf.fit(X=X_train_features, y=y_train)

In [51]:
# Predict straight from the sparse TF-IDF matrix. The forest was fitted on
# sparse input and predict() accepts it, so densifying with .toarray() only
# costs memory.
rf_pred = rf.predict(X_test_features)

In [55]:
# Naive Bayes (MultinomialNB), constructed with no arguments (library defaults)
nb = MultinomialNB()

In [57]:
# Train the Naive Bayes model on the TF-IDF training matrix.
nb.fit(X=X_train_features, y=y_train)

In [59]:
# Predicted labels for the held-out messages.
nb_pred = nb.predict(X=X_test_features)

In [65]:
def evaluate_model(name, y_true, y_pred, pos_label=1):
    """Print accuracy, precision, recall, F1 and the confusion matrix for one model.

    Args:
        name: Text shown in the header line of the report.
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.
        pos_label: Class treated as "positive" when computing precision,
            recall and F1. The default of 1 keeps the previous output; with
            this notebook's encoding (0 = spam, 1 = ham) that scores the ham
            class, so pass ``pos_label=0`` to score spam detection instead.

    Returns:
        None. The report is written to stdout.
    """
    print(f"--- {name} ---")
    print("Accuracy       :", accuracy_score(y_true, y_pred))
    # Precision, recall and F1 are per-class metrics; pos_label picks the class.
    print("Precision      :", precision_score(y_true, y_pred, pos_label=pos_label))
    print("Recall         :", recall_score(y_true, y_pred, pos_label=pos_label))
    print("F1-Score       :", f1_score(y_true, y_pred, pos_label=pos_label))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("\n")

In [67]:
# Test-set report for the logistic regression model.
evaluate_model(name="Logistic Regression", y_true=y_test, y_pred=logr_pred)

--- Logistic Regression ---
Accuracy       : 0.967713004484305
Precision      : 0.9640718562874252
Recall         : 1.0
F1-Score       : 0.9817073170731707
Confusion Matrix:
 [[113  36]
 [  0 966]]




In [69]:
# Test-set report for the random forest model.
evaluate_model(name="Random Forest", y_true=y_test, y_pred=rf_pred)

--- Random Forest ---
Accuracy       : 0.9829596412556054
Precision      : 0.9807106598984772
Recall         : 1.0
F1-Score       : 0.9902614044079959
Confusion Matrix:
 [[130  19]
 [  0 966]]




In [71]:
# Test-set report for the Naive Bayes model.
evaluate_model(name="Naive Bayes", y_true=y_test, y_pred=nb_pred)

--- Naive Bayes ---
Accuracy       : 0.9766816143497757
Precision      : 0.9737903225806451
Recall         : 1.0
F1-Score       : 0.9867211440245148
Confusion Matrix:
 [[123  26]
 [  0 966]]




In [104]:
# Hand-written messages used to sanity-check the trained models
examples = [
    "Free entry in 2 a weekly competition!",
    "I will meet you at the cafe tomorrow",
    "Congratulations, you won a free ticket",
]

In [106]:
# Vectorise the sanity-check messages with the fitted TF-IDF vocabulary.
example_features = tfidf.transform(raw_documents=examples)

In [108]:
# Logistic regression predictions for the sanity-check messages.
lr_preds = logisticR.predict(X=example_features)

In [110]:
# Random forest predictions for the sanity-check messages. The sparse matrix
# is passed as-is; predict() accepts sparse input, so .toarray() is not needed.
rf_preds = rf.predict(example_features)

In [112]:
# Naive Bayes predictions for the sanity-check messages.
nb_preds = nb.predict(X=example_features)

In [114]:
def lab2str(v):
    """Return the display text for a class label: 0 is spam, any other value is ham."""
    if v == 0:
        return "Spam (0)"
    return "Ham (1)"

In [116]:
print("\n=== SANITY CHECK ===")
for idx, text in enumerate(examples):
    # Resolve each model's verdict for this message, then print them together.
    lr_label = lab2str(lr_preds[idx])
    rf_label = lab2str(rf_preds[idx])
    nb_label = lab2str(nb_preds[idx])
    print(f"Message: {text}")
    print(f"  Logistic Regression → {lr_label}")
    print(f"  Random Forest       → {rf_label}")
    print(f"  Naive Bayes         → {nb_label}")
    print("-" * 60)


=== SANITY CHECK ===
Message: Free entry in 2 a weekly competition!
  Logistic Regression → Ham (1)
  Random Forest       → Ham (1)
  Naive Bayes         → Spam (0)
------------------------------------------------------------
Message: I will meet you at the cafe tomorrow
  Logistic Regression → Ham (1)
  Random Forest       → Ham (1)
  Naive Bayes         → Ham (1)
------------------------------------------------------------
Message: Congratulations, you won a free ticket
  Logistic Regression → Ham (1)
  Random Forest       → Ham (1)
  Naive Bayes         → Ham (1)
------------------------------------------------------------
