In [1]:
# 0: Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


In [3]:
# 1: Load dataset
# Read the labelled spam/ham messages from CSV and preview the first rows.
csv_path = 'mail_l7_dataset.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# 2: Encode labels
# Map the text categories to integers in one vectorized pass: spam -> 0, ham -> 1.
# Writing ints into the text column through masked .loc assignments leaves a
# mixed-type column (and is rejected by pandas' dedicated string dtype), so the
# whole column is replaced at once instead.
# The '0' / '1' entries keep the cell idempotent: re-running it on an already
# encoded column yields the same result.
LABEL_CODES = {'spam': 0, 'ham': 1, '0': 0, '1': 1}

category_normalized = df['Category'].astype(str).str.lower().str.strip()
category_encoded = category_normalized.map(LABEL_CODES)

# Fail here, with the offending values, rather than later at `.astype(int)`.
unknown_labels = category_normalized[category_encoded.isna()].unique()
if len(unknown_labels) > 0:
    raise ValueError(f"Unexpected Category values: {list(unknown_labels)}")

df['Category'] = category_encoded.astype(int)

In [6]:
# 3: Split features (X) and target (y)
# Features are the message text; the target is the integer-encoded category.
y = df['Category'].astype(int)
X = df['Message'].astype(str)

# Split into train & test (20% held out, seeded so the split is repeatable)
RANDOM_STATE = 42
split_parts = train_test_split(
    X,
    y,
    random_state=RANDOM_STATE,
    test_size=0.2,
)
X_train, X_test, y_train, y_test = split_parts

print(f"Training samples: {X_train.shape[0]} | Testing samples: {X_test.shape[0]}")


Training samples: 4457 | Testing samples: 1115


In [7]:
# 4: Text vectorization using TF-IDF
# The vectorizer is fitted on the training messages only; the test messages
# are transformed with that already-fitted vectorizer.
tfidf = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)
X_train_features = tfidf.fit_transform(X_train)
X_test_features = tfidf.transform(X_test)

# Show (n_samples, n_features) for both matrices
(X_train_features.shape, X_test_features.shape)

((4457, 7440), (1115, 7440))

In [8]:
# 5: Train Logistic Regression
# Fit on the TF-IDF training matrix, then predict labels for the test matrix.
log_reg = LogisticRegression(random_state=RANDOM_STATE, max_iter=1000).fit(
    X_train_features, y_train
)
log_reg_pred = log_reg.predict(X_test_features)


In [9]:
# 6: Train Random Forest
# Fit on the sparse TF-IDF training matrix.
rf_clf = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
rf_clf.fit(X_train_features, y_train)
# Predict on the sparse test matrix directly, matching how the model was
# fitted and how the other classifiers are scored; `.toarray()` only built an
# unnecessary dense copy of the test matrix.
rf_clf_pred = rf_clf.predict(X_test_features)


In [10]:
# 7: Train Naive Bayes
# Fit on the TF-IDF training matrix, then predict labels for the test matrix.
nb_clf = MultinomialNB().fit(X_train_features, y_train)
nb_clf_pred = nb_clf.predict(X_test_features)

In [11]:
# 8: Helper functions for evaluation
def print_metrics(name, y_true, y_pred, pos_label=0):
    """Print accuracy, precision, recall and F1 for one model's predictions.

    Parameters
    ----------
    name : str
        Model name shown in the report header.
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    pos_label : int, default 0
        Label treated as the positive class for precision, recall and F1.
        In this notebook 0 encodes spam and 1 encodes ham.
    """
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, pos_label=pos_label)
    rec = recall_score(y_true, y_pred, pos_label=pos_label)
    f1 = f1_score(y_true, y_pred, pos_label=pos_label)

    # Build the annotation from `pos_label` so the printed text stays truthful
    # when a caller passes something other than the default.
    class_names = {0: 'spam', 1: 'ham'}
    if pos_label in class_names:
        positive_note = f"(positive = {class_names[pos_label]} = {pos_label})"
    else:
        positive_note = f"(positive = {pos_label})"

    print(f"\n{name} Model Performance:")
    print(f"Accuracy: {acc:.2f}")
    print(f"Precision: {prec:.2f} {positive_note}")
    print(f"Recall: {rec:.2f} {positive_note}")
    print(f"F1 Score: {f1:.2f} {positive_note}")


def print_confusion_matrix(name, y_true, y_pred):
    """Print a labelled 2x2 confusion matrix for one model's predictions.

    Rows are the actual classes and columns the predicted classes, both
    ordered ham (1) first, then spam (0).
    """
    row_names = ['Actual Ham (1)', 'Actual Spam (0)']
    col_names = ['Predicted Ham (1)', 'Predicted Spam (0)']

    # labels=[1, 0] fixes the row/column order to match the names above.
    counts = confusion_matrix(y_true, y_pred, labels=[1, 0])
    cm_df = pd.DataFrame(counts, index=row_names, columns=col_names)

    print(f"\n{name} Confusion Matrix:\n{cm_df}")


In [12]:
# 9: Evaluate all models
# Maps each model's display name to its test-set predictions.
models = dict([
    ("Logistic Regression", log_reg_pred),
    ("Random Forest", rf_clf_pred),
    ("Naive Bayes", nb_clf_pred),
])

# Report the scalar metrics, then the confusion matrix, for each model in turn.
for name in models:
    pred = models[name]
    print_metrics(name, y_test, pred)
    print_confusion_matrix(name, y_test, pred)



Logistic Regression Model Performance:
Accuracy: 0.97
Precision: 1.00 (positive = spam = 0)
Recall: 0.76 (positive = spam = 0)
F1 Score: 0.86 (positive = spam = 0)

Logistic Regression Confusion Matrix:
                 Predicted Ham (1)  Predicted Spam (0)
Actual Ham (1)                 966                   0
Actual Spam (0)                 36                 113

Random Forest Model Performance:
Accuracy: 0.98
Precision: 1.00 (positive = spam = 0)
Recall: 0.86 (positive = spam = 0)
F1 Score: 0.92 (positive = spam = 0)

Random Forest Confusion Matrix:
                 Predicted Ham (1)  Predicted Spam (0)
Actual Ham (1)                 966                   0
Actual Spam (0)                 21                 128

Naive Bayes Model Performance:
Accuracy: 0.98
Precision: 1.00 (positive = spam = 0)
Recall: 0.83 (positive = spam = 0)
F1 Score: 0.90 (positive = spam = 0)

Naive Bayes Confusion Matrix:
                 Predicted Ham (1)  Predicted Spam (0)
Actual Ham (1)                 

In [15]:
# 10: Three sample sanity check
# Positional offsets (used with .iloc) into X_test / y_test of the messages to inspect.
sample_indices = [1, 5, 14]  

def label2string(res):
    """Return the display text for an encoded label: 0 is spam, anything else is ham."""
    if res == 0:
        return 'Spam (0)'
    return 'Ham (1)'

for idx in sample_indices:
    sample_text = X_test.iloc[idx]
    true_label = y_test.iloc[idx]

    # Vectorize the message once and reuse the result for all three models.
    sample_features = tfidf.transform([sample_text])

    # Keep per-sample predictions under their own names: rebinding
    # log_reg_pred / rf_clf_pred / nb_clf_pred here would replace the test-set
    # prediction arrays with single ints and break a re-run of the evaluation cell.
    sample_preds = {
        "Logistic Regression": int(log_reg.predict(sample_features)[0]),
        "Random Forest": int(rf_clf.predict(sample_features)[0]),
        "Naive Bayes": int(nb_clf.predict(sample_features)[0]),
    }

    # Print snippet and results (long messages are cut to 160 characters)
    snippet = (sample_text[:160] + '.....') if len(sample_text) > 160 else sample_text
    print(f"\n--- Sample Index {idx} ---")
    print(f"Text Snippet: {snippet}")
    print(f"Actual Label: {label2string(true_label)}")
    for model_name, sample_pred in sample_preds.items():
        print(f"{model_name} Prediction: {label2string(sample_pred)}")



--- Sample Index 1 ---
Text Snippet: And also I've sorta blown him off a couple times recently so id rather not text him out of the blue looking for weed
Actual Label: Ham (1)
Logistic Regression Prediction: Ham (1)
Random Forest Prediction: Ham (1)
Naive Bayes Prediction: Ham (1)

--- Sample Index 5 ---
Text Snippet: Sary just need Tim in the bollox &it hurt him a lot so he tol me!
Actual Label: Ham (1)
Logistic Regression Prediction: Ham (1)
Random Forest Prediction: Ham (1)
Naive Bayes Prediction: Ham (1)

--- Sample Index 14 ---
Text Snippet: FREE RINGTONE text FIRST to 87131 for a poly or text GET to 87131 for a true tone! Help? 0845 2814032 16 after 1st free, tones are 3x£150pw to e£nd txt stop
Actual Label: Spam (0)
Logistic Regression Prediction: Spam (0)
Random Forest Prediction: Spam (0)
Naive Bayes Prediction: Spam (0)
