In [77]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, confusion_matrix)
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
# Seed passed as `random_state` further down so the stochastic steps are reproducible.
RANDOM_STATE = 42 

In [3]:
# Read the Category/Message dataset from the working directory and preview its first ten rows.
df = pd.read_csv(filepath_or_buffer="mail_l7_dataset.csv")
preview_rows = df.head(n=10)
print(preview_rows)


  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
5     spam  FreeMsg Hey there darling it's been 3 week's n...
6      ham  Even my brother is not like to speak with me. ...
7      ham  As per your request 'Melle Melle (Oru Minnamin...
8     spam  WINNER!! As a valued network customer you have...
9     spam  Had your mobile 11 months or more? U R entitle...


In [9]:
# Normalise the raw labels and encode them as integers (spam -> 0, ham -> 1).
# The already-encoded values "0"/"1" are accepted as well, so re-running this
# cell on a frame that has been encoded once is harmless.
LABEL_CODES = {"spam": 0, "ham": 1, "0": 0, "1": 1}

label_text = df["Category"].astype(str).str.lower().str.strip()
# `map` yields NaN for anything that is not a known label (including missing
# values), which lets those rows be detected instead of failing later on int().
label_codes = label_text.map(LABEL_CODES)

unknown_label = label_codes.isna()
if unknown_label.any():
    print(f"Dropping {int(unknown_label.sum())} row(s) with a missing or unrecognised Category")

# Keep only rows with a valid label; the original index is preserved.
df = df.loc[~unknown_label].copy()
df["Category"] = label_codes[~unknown_label].astype(int)
print(df.head())


  Category                                            Message
0        1  Go until jurong point, crazy.. Available only ...
1        1                      Ok lar... Joking wif u oni...
2        0  Free entry in 2 a wkly comp to win FA Cup fina...
3        1  U dun say so early hor... U c already then say...
4        1  Nah I don't think he goes to usf, he lives aro...


In [55]:

# Features are the raw message texts; targets are the integer labels (0 = spam, 1 = ham).
# Missing messages become empty strings rather than the literal text "nan",
# which would otherwise be vectorised as a real token.
X = df["Message"].fillna("").astype(str)
y = df["Category"].astype(int)

# Hold out 20% of the rows for evaluation, seeded with the shared RANDOM_STATE
# constant instead of a repeated magic number.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

print("the X train data is: ", X_train.shape)
print("the X test data is: ", X_test.shape)
print("the Y train data is: ", y_train.shape)
print("the Y test data is: ", y_test.shape)

the X train data is:  (4457,)
the X test data is:  (1115,)
the Y train data is:  (4457,)
the Y test data is:  (1115,)


In [67]:
# Learn the TF-IDF vocabulary on the training messages only, then apply the
# same fitted vectoriser to the test messages (no refitting on test data).
tfidf = TfidfVectorizer(min_df=1, stop_words="english", lowercase=True)
X_train_features = tfidf.fit_transform(X_train)
X_test_features = tfidf.transform(X_test)

# Logistic regression, seeded with the shared RANDOM_STATE constant.
lr = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
lr.fit(X_train_features, y_train)
lr_predict = lr.predict(X_test_features)
print(lr_predict)

# Random forest. It is fitted on the sparse matrix, so it predicts on the
# sparse matrix as well; converting to a dense array only costs memory.
rf = RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE)
rf.fit(X_train_features, y_train)
rf_predict = rf.predict(X_test_features)
print(rf_predict)

# Multinomial naive Bayes.
nb = MultinomialNB()
nb.fit(X_train_features, y_train)
nb_predict = nb.predict(X_test_features)
print(nb_predict)

[1 1 1 ... 1 1 1]
[1 1 1 ... 1 1 1]
[1 1 1 ... 1 1 1]


In [97]:
def print_metrics(name, y_true, y_predict, pos_label=0):
    """Print accuracy, precision, recall and F1 for one set of predictions.

    Parameters
    ----------
    name : str
        Title printed after "Performance".
    y_true : array-like
        True labels.
    y_predict : array-like
        Predicted labels, aligned with ``y_true``.
    pos_label : int, default 0
        Label treated as the positive class by precision, recall and F1.

    Returns
    -------
    None
        The scores are printed, not returned.
    """
    acc = accuracy_score(y_true, y_predict)
    prec = precision_score(y_true, y_predict, pos_label=pos_label)
    rec = recall_score(y_true, y_predict, pos_label=pos_label)
    f1 = f1_score(y_true, y_predict, pos_label=pos_label)

    # Build the note from the actual pos_label so the printout stays truthful
    # when a caller overrides the default (notebook encoding: 0 = spam, 1 = ham).
    class_name = {0: "spam", 1: "ham"}.get(pos_label, "class")
    positive_note = f"(positive={class_name}={pos_label})"

    print(f" Performance {name}")
    print(f"accuracy : {acc:.2f}")
    print(f"precision : {prec:.2f} {positive_note}")
    print(f"recall : {rec:.2f} {positive_note}")
    print(f"F1 score : {f1:.2f} {positive_note}")
    
def print_confmat(name, y_true, y_predict):
    """Print a labelled confusion matrix for one set of predictions.

    Rows are the actual classes and columns the predicted classes, both
    ordered ham (1) first and spam (0) second.

    Parameters
    ----------
    name : str
        Model name shown in the heading, so that several matrices printed one
        after another can be told apart.
    y_true : array-like
        True labels.
    y_predict : array-like
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    None
        The matrix is printed, not returned.
    """
    # labels=[1, 0] fixes the row/column order to match the captions below.
    cm = confusion_matrix(y_true, y_predict, labels=[1, 0])
    cm_df = pd.DataFrame(
        cm,
        index=["Actual: Ham (1)", "Actual: Spam (0)"],
        columns=["Pred: Ham (1)", "Pred: Spam (0)"],
    )
    print(f"Confusion Matrix ({name}):\n{cm_df} \n")
    
# Report the scores and the confusion matrix of each fitted model, in the
# order the models were trained. Each entry: (metrics title, matrix title, predictions).
model_reports = [
    ("Logistic Regression : ", "Logistic Regression", lr_predict),
    ("Random Forest Classifier : ", "Random Forest", rf_predict),
    (
        "Naive Bayes(MultinomialNB) classifier :",
        "Naive Bayes(MultinomialNB) classifier",
        nb_predict,
    ),
]

for metrics_title, confmat_title, predictions in model_reports:
    print_metrics(metrics_title, y_test, predictions)
    print_confmat(confmat_title, y_test, predictions)
    

 Performance Logistic Regression : 
accurancy : 0.97
precision : 1.00 (Positive = Spam = 0)
recall : 0.76 (positive=spam=0)
F1 score : 0.86 (positive=spam=0)
Confusion Matrix:
                  Pred: Ham (1)  Pred: Spam (0)
Actual: Ham (1)             966               0
Actual: Spam (0)             36             113 

 Performance Random Forest Classifier : 
accurancy : 0.98
precision : 1.00 (Positive = Spam = 0)
recall : 0.87 (positive=spam=0)
F1 score : 0.93 (positive=spam=0)
Confusion Matrix:
                  Pred: Ham (1)  Pred: Spam (0)
Actual: Ham (1)             966               0
Actual: Spam (0)             19             130 

 Performance Naive Bayes(MultinomialNB) classifier :
accurancy : 0.98
precision : 1.00 (Positive = Spam = 0)
recall : 0.83 (positive=spam=0)
F1 score : 0.90 (positive=spam=0)
Confusion Matrix:
                  Pred: Ham (1)  Pred: Spam (0)
Actual: Ham (1)             966               0
Actual: Spam (0)             26             123 



In [107]:

# Row positions of the messages picked for a manual spot check.
indices = [5, 14, 27]


def lab2str(v):
    """Return the display text for an encoded label.

    A value equal to 0 gives "Spam (0)"; any other value gives "Ham (1)".
    """
    if v == 0:
        return "Spam (0)"
    return "Ham (1)"

# Spot-check a few individual test messages against all three models.
for i in indices:
    sample_text = X_test.iloc[i]
    true_label = y_test.iloc[i]

    # Vectorise the message once and reuse the same sparse row for every
    # model; none of them needs a dense copy of it.
    sample_features = tfidf.transform([sample_text])

    # Predictions from 3 models
    lr_pred_one = int(lr.predict(sample_features)[0])
    rf_pred_one = int(rf.predict(sample_features)[0])
    nb_pred_one = int(nb.predict(sample_features)[0])

    print("\n=== SINGLE MESSAGE CHECK (index =", i, ") ===")
    # Long messages are cut to 160 characters to keep the printout readable.
    snippet = (sample_text[:160] + "...") if len(sample_text) > 160 else sample_text
    print("Text snippet:", snippet)
    print("Actual      :", lab2str(true_label))
    print("LR Pred     :", lab2str(lr_pred_one))
    print("RF Pred     :", lab2str(rf_pred_one))
    print("NB Pred     :", lab2str(nb_pred_one))




=== SINGLE MESSAGE CHECK (index = 5 ) ===
Text snippet: Sary just need Tim in the bollox &it hurt him a lot so he tol me!
Actual      : Ham (1)
LR Pred     : Ham (1)
RF Pred     : Ham (1)
NB Pred     : Ham (1)

=== SINGLE MESSAGE CHECK (index = 14 ) ===
Text snippet: FREE RINGTONE text FIRST to 87131 for a poly or text GET to 87131 for a true tone! Help? 0845 2814032 16 after 1st free, tones are 3x£150pw to e£nd txt stop
Actual      : Spam (0)
LR Pred     : Spam (0)
RF Pred     : Spam (0)
NB Pred     : Spam (0)

=== SINGLE MESSAGE CHECK (index = 27 ) ===
Text snippet: Gud gud..k, chikku tke care.. sleep well gud nyt
Actual      : Ham (1)
LR Pred     : Ham (1)
RF Pred     : Ham (1)
NB Pred     : Ham (1)
