In [1]:
# --------------------------------
# 0) Imports
# --------------------------------
import numpy as np
import pandas as pd
import random

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
)

In [2]:
# --------------------------------
# 1) Load dataset
# --------------------------------
# Read the dataset from "mail.csv" (relative path, resolved against the
# current working directory).
df = pd.read_csv("mail.csv")
# Preview the first rows as a quick check that the file parsed into the
# expected "Category" / "Message" columns.
print(df.head())

  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...


In [3]:
# Check for missing values
# Per-column count of null cells.
print(df.isnull().sum())
# Single boolean: True if any cell anywhere in the frame is null.
print(df.isnull().values.any())

Category    0
Message     0
dtype: int64
False


In [4]:
# --------------------------------
# 2) Preprocess labels
# --------------------------------

In [5]:
# Encode the "spam" label as 0 (case- and whitespace-insensitive match).
# astype(str) keeps the .str accessor usable even when the column already
# holds only integers, so re-running this cell after the labels are encoded
# is a harmless no-op instead of raising AttributeError.
df.loc[df["Category"].astype(str).str.lower().str.strip() == "spam", "Category"] = 0

In [6]:
# Encode the "ham" label as 1 (case- and whitespace-insensitive match).
# astype(str) keeps the .str accessor usable even when the column already
# holds only integers, so re-running this cell after the labels are encoded
# is a harmless no-op instead of raising AttributeError.
df.loc[df["Category"].astype(str).str.lower().str.strip() == "ham", "Category"] = 1

In [7]:
# Model input: the message text, coerced to str so every value handed to the
# vectorizer later on is a string.
x = df["Message"].astype(str)

In [8]:
# Model target: integer labels (spam = 0, ham = 1, as assigned above).
# astype(int) raises if any label was left as a non-numeric string.
y = df["Category"].astype(int)

In [9]:
# --------------------------------
# 3) Split dataset into train/test
# --------------------------------


In [10]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
# NOTE(review): no stratify= argument is passed, so the spam/ham ratio of
# each split is left to the shuffle — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [11]:
# Report how many messages ended up in each split
print("Train:", len(X_train), " |  Test:", len(X_test))

Train: 4457  |  Test: 1115


In [12]:
# --------------------------------
# 4) Feature extraction (TF-IDF)
# --------------------------------

In [13]:
# TF-IDF vectorizer with English stop words removed and text lowercased.
tfidf = TfidfVectorizer(min_df=1, stop_words="english", lowercase=True)
# Fit on the training messages only, then reuse that fitted vectorizer for
# the test messages so no test-set information enters the features.
X_train_features = tfidf.fit_transform(X_train)
X_test_features  = tfidf.transform(X_test)

In [14]:
# --------------------------------
# 5) Train models
# --------------------------------

In [15]:
# Logistic Regression: construct and fit in one chained call (fit returns
# the estimator itself), then label the held-out messages.
lr = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_features, y_train)
lr_predict = lr.predict(X_test_features)

In [16]:
# Random Forest
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train_features, y_train)
# Predict on the sparse TF-IDF matrix directly, the same representation used
# for fitting; densifying it with .toarray() only costs memory.
rf_predict = rf.predict(X_test_features)

In [17]:
# Naive Bayes: fit on the TF-IDF training matrix in one chained call (fit
# returns the estimator itself), then label the held-out messages.
nb = MultinomialNB().fit(X_train_features, y_train)
nb_predict = nb.predict(X_test_features)

In [18]:
# --------------------------------
# 6) Define evaluation functions
# --------------------------------

In [19]:
# Function to print performance metrics
def print_metrics(name, y_true, y_predicted, pos_label=0):
    """Print accuracy, precision, recall and F1 for one model's predictions.

    Args:
        name: Label printed at the start of the header line.
        y_true: Ground-truth labels.
        y_predicted: Labels predicted by the model.
        pos_label: Class treated as "positive" for precision, recall and F1.
            Defaults to 0, the value this notebook assigns to spam.

    Returns:
        None. The four metrics are printed, formatted to three decimals.
    """
    # Accuracy: share of all predictions that are correct
    ac = accuracy_score(y_true, y_predicted)
    # Precision: share of predicted positives that are truly positive
    prec = precision_score(y_true, y_predicted, pos_label=pos_label)
    # Recall: share of actual positives that were found
    rec = recall_score(y_true, y_predicted, pos_label=pos_label)
    # F1 score: harmonic mean of precision and recall
    f1 = f1_score(y_true, y_predicted, pos_label=pos_label)
    # Print metrics
    print(f"{name} Performance:")
    print(f"Accuracy: {ac:.3f}")
    print(f"Precision: {prec:.3f}")
    print(f"Recall: {rec:.3f}")
    print(f"F1Score: {f1:.3f}")

In [20]:
# Function to print confusion matrix
def print_cm(name, y_true, y_predicted):
    """Print a labelled confusion matrix for one model's predictions.

    Rows are the actual classes and columns the predicted classes, both
    ordered ham (1) first, spam (0) second.

    Args:
        name: Label printed at the start of the header line.
        y_true: Ground-truth labels.
        y_predicted: Labels predicted by the model.
    """
    class_order = [1, 0]
    row_names = ["Actual Ham(1)", "Actual Spam(0)"]
    col_names = ["Pred Ham(1)", "Pred Spam(0)"]

    # Raw counts, laid out in the ham-then-spam order fixed above
    counts = confusion_matrix(y_true, y_predicted, labels=class_order)
    # Wrap in a DataFrame so the printed table carries row/column captions
    table = pd.DataFrame(counts, index=row_names, columns=col_names)
    print(f"{name} - Confusion Matrix:\n{table}")

In [21]:
# --------------------------------
# 7) Evaluate models
# --------------------------------

In [22]:
# Logistic Regression: metrics for the spam class (pos_label defaults to 0),
# followed by the confusion matrix.
print_metrics("Logistic Regression: ", y_test, lr_predict)
print_cm("Logistic Regression: ", y_test, lr_predict)

Logistic Regresion:  Perfomance:
Accuracy: 0.968
Precision: 1.000
Recall: 0.758
F1Score: 0.863
Logistic Regresion:  - Confusion Matrix:
                Pred Ham(1)  Pred Spam(0)
Actual Ham(1)           966             0
Actual Spam(0)           36           113


In [23]:
# Random Forest: metrics for the spam class (pos_label defaults to 0),
# followed by the confusion matrix.
print_metrics("Random Forest: ", y_test, rf_predict)
print_cm("Random Forest: ", y_test, rf_predict)

Random Forest:  Perfomance:
Accuracy: 0.983
Precision: 1.000
Recall: 0.872
F1Score: 0.932
Random Forest:  - Confusion Matrix:
                Pred Ham(1)  Pred Spam(0)
Actual Ham(1)           966             0
Actual Spam(0)           19           130


In [24]:
# Naive Bayes: metrics for the spam class (pos_label defaults to 0),
# followed by the confusion matrix.
print_metrics("Naive Bayes: ", y_test, nb_predict)
print_cm("Naive Bayes: ", y_test, nb_predict)

Naive Bayes:  Perfomance:
Accuracy: 0.977
Precision: 1.000
Recall: 0.826
F1Score: 0.904
Naive Bayes:  - Confusion Matrix:
                Pred Ham(1)  Pred Spam(0)
Actual Ham(1)           966             0
Actual Spam(0)           26           123


In [25]:
# --------------------------------
# 8) Sanity check on a fixed set of test samples
# --------------------------------

In [28]:

# Function to map label number -> string
def lab2str(r):
    """Return the display name for a numeric label: 0 is spam, anything else is ham."""
    if r == 0:
        return "Spam (0)"
    return "Ham (1)"

# Fixed positions into the test split (not a fresh random draw), so the same
# messages are shown on every run.
sample_indices = [986, 14, 119, 815, 260, 326]

print("Sanity Check for Random  Samples")
print("="*60)

for i in sample_indices:
    sample_text = X_test.iloc[i]
    true_label = y_test.iloc[i]

    # Truncate text if too long
    display_text = (sample_text[:160] + "...") if len(sample_text) > 160 else sample_text

    # Vectorize the message once and share the result across all three models
    sample_features = tfidf.transform([sample_text])

    # Predictions for 3 models
    lr_pred = int(lr.predict(sample_features)[0])
    rf_pred = int(rf.predict(sample_features)[0])
    nb_pred = int(nb.predict(sample_features)[0])

    # Print results
    print(f"Sample Index : {i}")
    print(f"Message      : {display_text}")
    print(f"Actual       : {lab2str(true_label)}")
    print(f"LR Predict   : {lab2str(lr_pred)}")
    print(f"RF Predict   : {lab2str(rf_pred)}")
    print(f"NB Predict   : {lab2str(nb_pred)}")
    print("-"*60)


Sanity Check for Random  Samples
Sample Index : 986
Message      : FREE2DAY sexy St George's Day pic of Jordan!Txt PIC to 89080 dont miss out, then every wk a saucy celeb!4 more pics c PocketBabe.co.uk 0870241182716 £3/wk
Actual       : Spam (0)
LR Predict   : Ham (1)
RF Predict   : Spam (0)
NB Predict   : Spam (0)
------------------------------------------------------------
Sample Index : 14
Message      : FREE RINGTONE text FIRST to 87131 for a poly or text GET to 87131 for a true tone! Help? 0845 2814032 16 after 1st free, tones are 3x£150pw to e£nd txt stop
Actual       : Spam (0)
LR Predict   : Spam (0)
RF Predict   : Spam (0)
NB Predict   : Spam (0)
------------------------------------------------------------
Sample Index : 119
Message      : Rock yr chik. Get 100's of filthy films &XXX pics on yr phone now. rply FILTH to 69669. Saristar Ltd, E14 9YT 08701752560. 450p per 5 days. Stop2 cancel
Actual       : Spam (0)
LR Predict   : Ham (1)
RF Predict   : Ham (1)
NB Predict   : Ham