In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
)
RANDOM_STATE = 42  # reproducibility

In [4]:
# Lodad the dataset
df = pd.read_csv('mail_l7_dataset.csv')
print(df.info())

FileNotFoundError: [Errno 2] No such file or directory: 'mail_l7_dataset.csv'

In [None]:
# Basic cleaning: replace NaNs with empty strings (text models can't handle NaN)
df=df.where(pd.notnull(df), "") 

In [None]:
# Encode labels: spam -> 0, ham -> 1  (keep your original convention)
df.loc[df["Category"].str.lower().str.strip() == "spam", "Category"]=0
df.loc[df["Category"].str.lower().str.strip() == "ham", "Category"]=1
print(df.head())

  Category                                            Message
0        1  Go until jurong point, crazy.. Available only ...
1        1                      Ok lar... Joking wif u oni...
2        0  Free entry in 2 a wkly comp to win FA Cup fina...
3        1  U dun say so early hor... U c already then say...
4        1  Nah I don't think he goes to usf, he lives aro...


In [None]:
# Split features (X) and target (y)
X =df["Message"].astype(str)  # Ensure all entries are strings
y =df["Category"].astype(int)
print(X.dtype)   # should be object (strings)
print(y.dtype)   # should be int


object
int64


In [None]:
#  Train/test split (stratified)
X_train,X_test,y_train,y_test = train_test_split(
    X,y, test_size=0.2, random_state=RANDOM_STATE
)

print("=== SPLIT SIZES ===")
print("Train",X_train.shape[0], "| Test",X_test.shape[0])

=== SPLIT SIZES ===
Train 4457 | Test 1115


In [None]:
# Text → TF-IDF features
tfidf = TfidfVectorizer(min_df=1, stop_words="english", lowercase=True)
X_train_features =tfidf.fit_transform(X_train)
X_test_features =tfidf.transform(X_test)
print("=== TF-IDF FEATURES ===")
print("X_train:", X_train_features.shape, " | X_test:", X_test_features.shape)

=== TF-IDF FEATURES ===
X_train: (4457, 7440)  | X_test: (1115, 7440)


In [None]:
# Train Logistic Regression (baseline)
lr=LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
lr.fit(X_train_features, y_train)
y_pred_lr = lr.predict(X_test_features)

In [None]:
# Train Random Forest
#    (convert TF-IDF to dense for tree models)
rf = RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE)
rf.fit(X_train_features, y_train)
y_pred_rf = rf.predict(X_test_features.toarray())

# native bayes model
nb = MultinomialNB()
nb.fit(X_train_features, y_train)
y_pred_nb = nb.predict(X_test_features)

In [None]:
# Helper functions: metrics + confusion matrix print
def print_merics(name,y_actual , y_pred, pos_label=0):
    acc=accuracy_score(y_actual, y_pred)
    prec=precision_score(y_actual, y_pred, pos_label=pos_label)
    rec=recall_score(y_actual, y_pred, pos_label=pos_label)
    f1=f1_score(y_actual, y_pred, pos_label=pos_label)
    print(f"\n{name} performance:")
    print(f"  Accuracy : {acc:.3f}")
    print(f"  Precision: {prec:.3f}")   
    print(f"  Recall   : {rec:.3f}")
    print(f"  F1-score : {f1:.3f}")

def print_confusion_matrix(name,y_actual, y_pred, labels=[0,1]):
    cm=confusion_matrix(y_actual, y_pred, labels=labels)
    cm_df=pd.DataFrame(
        cm,                   
        index=["Actual: Ham(1)", "Actual: Spam(0)"],
        columns=["Predicted: Ham(1)", "Predicted: Spam(0)"])
    print("\nConfusion Matrix:")
    print(f'\n{name} confusion matrix:\n', cm_df)

In [None]:
# Show results for both models (same print style as L5)
# logistic regression results
print_merics("Logistic Regression", y_test, y_pred_lr, pos_label=0)
print_confusion_matrix("Logistic Regression", y_test, y_pred_lr)
# random forest results
print_merics("Random Forest", y_test, y_pred_rf, pos_label=0)
print_confusion_matrix("Random Forest", y_test, y_pred_rf)





Logistic Regression performance:
  Accuracy : 0.968
  Precision: 1.000
  Recall   : 0.758
  F1-score : 0.863

Confusion Matrix:

Logistic Regression confusion matrix:
                  Predicted: Ham(1)  Predicted: Spam(0)
Actual: Ham(1)                 113                  36
Actual: Spam(0)                  0                 966

Random Forest performance:
  Accuracy : 0.983
  Precision: 1.000
  Recall   : 0.872
  F1-score : 0.932

Confusion Matrix:

Random Forest confusion matrix:
                  Predicted: Ham(1)  Predicted: Spam(0)
Actual: Ham(1)                 130                  19
Actual: Spam(0)                  0                 966


In [None]:
# native bayes results
print_merics("Naive Bayes", y_test, y_pred_nb, pos_label=0)
print_confusion_matrix("Naive Bayes", y_test, y_pred_nb)



Naive Bayes performance:
  Accuracy : 0.977
  Precision: 1.000
  Recall   : 0.826
  F1-score : 0.904

Confusion Matrix:

Naive Bayes confusion matrix:
                  Predicted: Ham(1)  Predicted: Spam(0)
Actual: Ham(1)                 123                  26
Actual: Spam(0)                  0                 966


In [None]:
# ) Single-message sanity check (like L5 single-row check)
i = 5# change index to inspect different emails from X_test
sample_text = X_test.iloc[i]
true_label  = y_test.iloc[i]

#  Predict with three models models
lr_pred_one=int(lr.predict(tfidf.transform([sample_text]))[0])
rf_pred_one=int(rf.predict(tfidf.transform([sample_text]).toarray())[0])
nb_pred_one=int(nb.predict(tfidf.transform([sample_text]))[0])

#  Predict with both models
def label_to_str(label):
    return "Ham(1)" if label==1 else "Spam(0)"


print("\n=== SINGLE MESSAGE SANITY CHECK ===")

snippet = (sample_text[:90] + '...') if len(sample_text) > 90 else sample_text
print(f"Message snippet: '{snippet}'")
print(f"Actual    : {label_to_str(true_label)}")
print(f"LR Pred   : {label_to_str(lr_pred_one)}")
print(f"RF Pred   : {label_to_str(rf_pred_one)}")
print(f"NB Pred   : {label_to_str(nb_pred_one)}")



=== SINGLE MESSAGE SANITY CHECK ===
Message snippet: 'Sary just need Tim in the bollox &it hurt him a lot so he tol me!'
Actual    : Ham(1)
LR Pred   : Ham(1)
RF Pred   : Ham(1)
NB Pred   : Ham(1)


In [None]:
print(f"Message snippet: '{snippet}'")
print(f"Actual    : {label_to_str(true_label)}")
print(f"LR Pred   : {label_to_str(lr_pred_one)}")
print(f"RF Pred   : {label_to_str(rf_pred_one)}")
print(f"NB Pred   : {label_to_str(nb_pred_one)}")

Message snippet: 'So there's a ring that comes with the guys costumes. It's there so they can gift their fut...'
Actual    : Ham(1)
LR Pred   : Ham(1)
RF Pred   : Ham(1)
NB Pred   : Ham(1)


In [None]:
print(f"Message snippet: '{snippet}'")
print(f"Actual    : {label_to_str(true_label)}")
print(f"LR Pred   : {label_to_str(lr_pred_one)}")
print(f"RF Pred   : {label_to_str(rf_pred_one)}")
print(f"NB Pred   : {label_to_str(nb_pred_one)}")

Message snippet: 'So there's a ring that comes with the guys costumes. It's there so they can gift their fut...'
Actual    : Ham(1)
LR Pred   : Ham(1)
RF Pred   : Ham(1)
NB Pred   : Ham(1)


In [None]:
# 1) Load the dataset
# 3)Train/test split (stratified)
# 2)Split features (X) and target (y)
# 4) Text → TF-IDF features
# 5)Train Logistic Regression (baseline)
# 6) Train Random Forest
# 7) Train naive_bayes
# 8) Helper functions: metrics + confusion matrix print
# 9) Show results for both models (same print style as L5)
# 10) Single-message sanity check (like L5 single-row check)