In [54]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, confusion_matrix)


In [55]:
# Data Load
df = pd.read_csv("mail_l7_dataset.csv")
print(df.shape)
df.head()

(5572, 2)


Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [56]:
# encode labels spam -> 0, ham -> 1
df.loc[df["Category"].str.lower().str.strip() == "spam", "Category"] = 0
df.loc[df["Category"].str.lower().str.strip() == "ham",  "Category"] = 1
print(df.head(5))

  Category                                            Message
0        1  Go until jurong point, crazy.. Available only ...
1        1                      Ok lar... Joking wif u oni...
2        0  Free entry in 2 a wkly comp to win FA Cup fina...
3        1  U dun say so early hor... U c already then say...
4        1  Nah I don't think he goes to usf, he lives aro...


In [57]:
#  label and feature or x and y 
X = df["Message"].astype(str)
y = df["Category"].astype(int)

In [58]:
#  train test split
X_train, X_test, y_train, y_test = train_test_split(X,y ,test_size=0.2, random_state=42 )
print("=== SPLIT SIZES ===")
print("Train:", X_train.shape[0], " | Test:", X_test.shape[0])

=== SPLIT SIZES ===
Train: 4457  | Test: 1115


In [59]:
# tdf telling the spam words 
tfidf = TfidfVectorizer(min_df=1, stop_words="english", lowercase=True)
X_train_features = tfidf.fit_transform(X_train)
X_test_features  = tfidf.transform(X_test)
print("\n=== TF-IDF SHAPES ===")
print("X_train:", X_train_features.shape, " | X_test:", X_test_features.shape)


=== TF-IDF SHAPES ===
X_train: (4457, 7440)  | X_test: (1115, 7440)


In [60]:

# model training logistic regression
lr =  LogisticRegression(max_iter=1000, random_state=42)
lr.fit(X_train_features, y_train)
lr_predict = lr.predict(X_test_features)
print(lr_predict)

[1 1 1 ... 1 1 1]


In [62]:
#  random forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_features, y_train)
rf_predict = rf.predict(X_test_features)
print(rf_predict)

[1 1 1 ... 1 1 1]


In [65]:
nb = MultinomialNB()
nb.fit(X_train_features, y_train)
nb_predict = nb.predict(X_test_features)
print(nb_predict)

[1 1 1 ... 1 1 1]


In [67]:
# metrics and confusion matrix
def print_metrics(name, y_true, y_pred, pos_label=0):
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, pos_label=pos_label)
    rec = recall_score(y_true, y_pred , pos_label=pos_label)
    f1 = f1_score(y_true, y_pred)
    print(f"{name} performance:")
    print(f"  Accuracy:  {acc:.3f}")
    print(f"  Precision: {prec:.3f}")
    print(f"  Recall:    {rec:.3f}")
    print(f"  F1-Score:  {f1:.3f}")
print_metrics("Logistic Regression", y_test, lr_predict)
print_metrics("Random Forest", y_test, rf_predict)
print_metrics("Naive Bayes", y_test, nb_predict)



Logistic Regression performance:
  Accuracy:  0.968
  Precision: 1.000
  Recall:    0.758
  F1-Score:  0.982
Random Forest performance:
  Accuracy:  0.981
  Precision: 1.000
  Recall:    0.859
  F1-Score:  0.989
Naive Bayes performance:
  Accuracy:  0.977
  Precision: 1.000
  Recall:    0.826
  F1-Score:  0.987


In [68]:
# confusion matrix
def print_confmat(name,y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred, labels=[1,0])
    cm_df = pd.DataFrame(
        cm,
        index=["Actual Ham (1)", "Actual Spam (0)"],
        columns=["Predicted Ham (1)", "Predicted Spam (0)"]
    )
    
    print(f"\n{name} Confusion Matrix:\n{cm_df}")
    

print_metrics("Logistic Regression", y_test, lr_predict)
print_confmat("Logistic Regression", y_test, lr_predict)

print_metrics("Random Forest", y_test, rf_predict)
print_confmat("Random Forest", y_test, rf_predict)

print_metrics("Naive Bayes", y_test, nb_predict)
print_confmat("Naive Bayes", y_test, nb_predict)


Logistic Regression performance:
  Accuracy:  0.968
  Precision: 1.000
  Recall:    0.758
  F1-Score:  0.982

Logistic Regression Confusion Matrix:
                 Predicted Ham (1)  Predicted Spam (0)
Actual Ham (1)                 966                   0
Actual Spam (0)                 36                 113
Random Forest performance:
  Accuracy:  0.981
  Precision: 1.000
  Recall:    0.859
  F1-Score:  0.989

Random Forest Confusion Matrix:
                 Predicted Ham (1)  Predicted Spam (0)
Actual Ham (1)                 966                   0
Actual Spam (0)                 21                 128
Naive Bayes performance:
  Accuracy:  0.977
  Precision: 1.000
  Recall:    0.826
  F1-Score:  0.987

Naive Bayes Confusion Matrix:
                 Predicted Ham (1)  Predicted Spam (0)
Actual Ham (1)                 966                   0
Actual Spam (0)                 26                 123


In [69]:
# sanity check
i = 23
sample_text = X_test.iloc[i]
true_label = y_test.iloc[i]

lr_pred_one = int(lr.predict(tfidf.transform([sample_text]))[0])
rf_pred_one = int(rf.predict(tfidf.transform([sample_text]))[0])
nb_pred_one = int(nb.predict(tfidf.transform([sample_text]))[0])
def label_to_str(label):
    return "Ham (1)" if label == 1 else "Spam (0)"
print("\n=== SANITY CHECK ===")
print("Sample text:", sample_text)
print("Actual label:", label_to_str(true_label))
print("LR predicted:", label_to_str(lr_pred_one))
print("RF predicted:", label_to_str(rf_pred_one))
print("NB predicted:", label_to_str(nb_pred_one))



=== SANITY CHECK ===
Sample text: I guess that's why you re worried. You must know that there's a way the body repairs itself. And i'm quite sure you shouldn't worry. We'll take it slow. First the tests, they will guide when your ovulation is then just relax. Nothing you've said is a reason to worry but i.ll keep on followin you up.
Actual label: Ham (1)
LR predicted: Ham (1)
RF predicted: Ham (1)
NB predicted: Ham (1)


In [70]:
# -------------------------------
# SANITY CHECKS: 3 SINGLE MESSAGES
# -------------------------------

test_messages = [
    "Free entry in 2 a weekly competition!",  # Likely spam
    "I will meet you at the cafe tomorrow",  # Likely ham
    "Congratulations, you won a free ticket" # Likely spam
]

print("\n=== SANITY CHECKS: 3 MESSAGES ===")
for msg in test_messages:
    lr_pred = int(lr.predict(tfidf.transform([msg]))[0])
    rf_pred = int(rf.predict(tfidf.transform([msg]))[0])
    nb_pred = int(nb.predict(tfidf.transform([msg]))[0])
    
    print(f"\nMessage: {msg}")
    print(f"  Logistic Regression Prediction: {label_to_str(lr_pred)}")
    print(f"  Random Forest Prediction:       {label_to_str(rf_pred)}")
    print(f"  Naive Bayes Prediction:         {label_to_str(nb_pred)}")



=== SANITY CHECKS: 3 MESSAGES ===

Message: Free entry in 2 a weekly competition!
  Logistic Regression Prediction: Ham (1)
  Random Forest Prediction:       Ham (1)
  Naive Bayes Prediction:         Spam (0)

Message: I will meet you at the cafe tomorrow
  Logistic Regression Prediction: Ham (1)
  Random Forest Prediction:       Ham (1)
  Naive Bayes Prediction:         Ham (1)

Message: Congratulations, you won a free ticket
  Logistic Regression Prediction: Ham (1)
  Random Forest Prediction:       Ham (1)
  Naive Bayes Prediction:         Ham (1)
