In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import(accuracy_score,precision_score,recall_score,f1_score,confusion_matrix)

In [12]:
# Read the labelled messages into a DataFrame; the preview printed further
# down shows two columns, "Category" and "Message".
df = pd.read_csv(filepath_or_buffer="mail_l7_dataset.csv")


In [13]:
# Encode the text labels as integers: spam -> 0, ham -> 1.
# The normalised labels are computed once, before anything is written back,
# so no class is ever matched against a column that already holds a mix of
# strings and integers.
label_codes = {"spam": 0, "ham": 1}
category_norm = df["Category"].astype(str).str.strip().str.lower()
has_known_label = category_norm.isin(list(label_codes))
# Hold the column as generic objects so integer codes can sit next to any
# value that is left untouched; a string-typed column may refuse integers.
df["Category"] = df["Category"].astype(object)
# Only recognised labels are rewritten, which also makes re-running this
# cell a no-op once the column has been encoded.
df.loc[has_known_label, "Category"] = category_norm[has_known_label].map(label_codes)

In [14]:
# Features are the raw message text; targets are the integer class codes
# (0 = spam, 1 = ham) written by the label-encoding cell.
X, y = df["Message"].astype(str), df["Category"].astype(int)

In [15]:
# Preview the first five rows to confirm the label encoding took effect.
print(df.head(n=5))

  Category                                            Message
0        1  Go until jurong point, crazy.. Available only ...
1        1                      Ok lar... Joking wif u oni...
2        0  Free entry in 2 a wkly comp to win FA Cup fina...
3        1  U dun say so early hor... U c already then say...
4        1  Nah I don't think he goes to usf, he lives aro...


In [16]:
# Hold out 20% of the messages for evaluation; the fixed seed keeps the
# split reproducible between runs.
# NOTE(review): the classes are imbalanced in the reported confusion
# matrices (966 ham vs 149 spam in the test split) - consider stratify=y.
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)


In [17]:
# Turn the messages into TF-IDF vectors. The vectorizer is fitted on the
# training split only and then reused, unchanged, for the test split.
tfidf = TfidfVectorizer(lowercase=True, stop_words="english", min_df=1)
x_train_features = tfidf.fit_transform(x_train)
x_test_features = tfidf.transform(x_test)

In [18]:
# Fit logistic regression on the TF-IDF training features, then score the
# held-out split exactly once and keep the result for the reporting cells.
lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(x_train_features, y_train)
lr_predict = lr.predict(x_test_features)
print(lr_predict)

[1 1 1 ... 1 1 1]


In [19]:
def print_metrics(name, y_true, y_pred, pos_label=0):
    """Print accuracy, precision, recall and F1 for one classifier.

    Parameters
    ----------
    name : str
        Heading printed above the scores.
    y_true, y_pred : array-like
        True and predicted class labels, passed straight to the
        scikit-learn metric functions.
    pos_label : int, default 0
        Label treated as the positive class for precision, recall and F1.
        The default, 0, is the spam class in this notebook's encoding.

    Returns
    -------
    None
        The scores are written to standard output only.
    """
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, pos_label=pos_label)
    rec = recall_score(y_true, y_pred, pos_label=pos_label)
    f1 = f1_score(y_true, y_pred, pos_label=pos_label)

    # Build the annotation from pos_label so it stays truthful when a caller
    # picks a different positive class.
    class_name = {0: "spam", 1: "ham"}.get(pos_label, "label")
    positive_note = f"(Positive={class_name}={pos_label})"

    print(f"{name} performance:")
    # Accuracy does not depend on the positive class, so it carries no note.
    print(f"Accuracy:{acc:.3f}")
    print(f"Precision:{prec:.3f}{positive_note}")
    print(f"Recall:{rec:.3f}{positive_note}")
    print(f"F1-Score:{f1:.3f}{positive_note}")


print_metrics("Logistic Regression", y_test, lr_predict)

Logistic Regression performance:
Accuracy:0.968(Postive=spam=0)
Precission:1.000(Postive=spam=0)
ReCall:0.758(Postive=spam=0)
F1-Score:0.863(Postive=spam=0)


In [20]:
def print_confmat(name, y_true, y_pred):
    """Print a labelled confusion matrix for a ham/spam classifier.

    Rows are the actual classes and columns the predicted classes; both are
    ordered ham (1) first, spam (0) second.

    Parameters
    ----------
    name : str
        Text printed in front of the "Confusion Matrix:" heading.
    y_true, y_pred : array-like
        True and predicted class labels.

    Returns
    -------
    None
        The table is written to standard output only.
    """
    cm = confusion_matrix(y_true, y_pred, labels=[1, 0])
    cm_df = pd.DataFrame(
        cm,
        index=["Actual Ham(1)", "Actual Spam(0)"],
        # Column captions spelled and formatted to mirror the row captions.
        columns=["Predicted Ham(1)", "Predicted Spam(0)"],
    )
    print(f"{name} Confusion Matrix:\n{cm_df}")


print_metrics("Logistic Regression", y_test, lr_predict)
print_confmat("LogisticRegression: ", y_test, lr_predict)

Logistic Regression performance:
Accuracy:0.968(Postive=spam=0)
Precission:1.000(Postive=spam=0)
ReCall:0.758(Postive=spam=0)
F1-Score:0.863(Postive=spam=0)
LogisticRegression:  Confusion Matrix:
                Predicted Hame(1)  pred Spam(0)
Actual Ham(1)                 966             0
Actual Spam(0)                 36           113


In [21]:
# Fit a 200-tree random forest on the TF-IDF training features, then score
# the held-out split exactly once and keep the result for reporting.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(x_train_features, y_train)
rf_predict = rf.predict(x_test_features)


In [22]:
def print_clf_metrics(name, y_true, y_pred, pos_label=0):
    """Print accuracy, precision, recall and F1 for one classifier.

    Parameters
    ----------
    name : str
        Heading printed above the scores.
    y_true, y_pred : array-like
        True and predicted class labels, passed straight to the
        scikit-learn metric functions.
    pos_label : int, default 0
        Label treated as the positive class for precision, recall and F1.
        The default, 0, is the spam class in this notebook's encoding.

    Returns
    -------
    None
        The scores are written to standard output only.
    """
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, pos_label=pos_label)
    rec = recall_score(y_true, y_pred, pos_label=pos_label)
    f1 = f1_score(y_true, y_pred, pos_label=pos_label)

    # Derive the annotation from pos_label instead of hard-coding spam=0, so
    # the printout stays correct when ham (1) is chosen as the positive class.
    class_name = {0: "spam", 1: "ham"}.get(pos_label, "label")
    positive_note = f"(positive = {class_name}={pos_label})"

    print(f"\n{name} Performance:")
    print(f"  Accuracy : {acc:.3f}")
    print(f"  Precision: {prec:.3f}  {positive_note}")
    print(f"  Recall   : {rec:.3f}  {positive_note}")
    print(f"  F1-Score : {f1:.3f}  {positive_note}")


print_clf_metrics("Random Forest", y_test, rf_predict, pos_label=0)


Random Forest Performance:
  Accuracy : 0.983
  Precision: 1.000  (positive = spam=0)
  Recall   : 0.872  (positive = spam=0)
  F1-Score : 0.932  (positive = spam=0)


In [23]:
def print_confmat(name, y_true, y_pred):
    """Print a labelled 2x2 confusion matrix for a ham/spam classifier.

    Rows are the actual classes and columns the predicted classes, both
    listed as ham (1) first and spam (0) second. Nothing is returned; the
    table is written to standard output under a heading built from ``name``.
    """
    class_order = [1, 0]
    row_names = ["Actual: Ham (1)", "Actual: Spam (0)"]
    col_names = ["Pred: Ham (1)", "Pred: Spam (0)"]
    counts = confusion_matrix(y_true, y_pred, labels=class_order)
    table = pd.DataFrame(counts, index=row_names, columns=col_names)
    print(f"\n{name} – Confusion Matrix:\n{table}")

In [34]:
# Report the random-forest scores, then its confusion matrix.
print_clf_metrics(name="Random Forest", y_true=y_test, y_pred=rf_predict, pos_label=0)
print_confmat(name="Random Forest", y_true=y_test, y_pred=rf_predict)


Random Forest Performance:
  Accuracy : 0.983
  Precision: 1.000  (positive = spam=0)
  Recall   : 0.872  (positive = spam=0)
  F1-Score : 0.932  (positive = spam=0)

Random Forest – Confusion Matrix:
                  Pred: Ham (1)  Pred: Spam (0)
Actual: Ham (1)             966               0
Actual: Spam (0)             19             130


In [24]:
# --- Naive Bayes ---
# Multinomial naive Bayes trained on the same TF-IDF features as the other
# models; fit() hands back the estimator, so construction and fitting chain.
nb = MultinomialNB().fit(x_train_features, y_train)
nb_pred = nb.predict(x_test_features)

# Metrics function for Naive Bayes
def print_nb_metrics(name, y_true, y_pred, pos_label=0):
    """Print accuracy, precision, recall and F1 for one classifier.

    Parameters
    ----------
    name : str
        Heading printed above the scores.
    y_true, y_pred : array-like
        True and predicted class labels, passed straight to the
        scikit-learn metric functions.
    pos_label : int, default 0
        Label treated as the positive class for precision, recall and F1.
        The default, 0, is the spam class in this notebook's encoding.

    Returns
    -------
    None
        The scores are written to standard output only.
    """
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, pos_label=pos_label)
    rec = recall_score(y_true, y_pred, pos_label=pos_label)
    f1 = f1_score(y_true, y_pred, pos_label=pos_label)

    # Derive the annotation from pos_label instead of hard-coding spam=0, so
    # the printout stays correct when ham (1) is chosen as the positive class.
    class_name = {0: "spam", 1: "ham"}.get(pos_label, "label")
    positive_note = f"(positive = {class_name}={pos_label})"

    print(f"\n{name} Performance:")
    print(f"  Accuracy : {acc:.3f}")
    print(f"  Precision: {prec:.3f}  {positive_note}")
    print(f"  Recall   : {rec:.3f}  {positive_note}")
    print(f"  F1-Score : {f1:.3f}  {positive_note}")

# Confusion Matrix function for Naive Bayes
def print_nb_confmat(name, y_true, y_pred):
    """Print a labelled 2x2 confusion matrix for a ham/spam classifier.

    Rows are the actual classes and columns the predicted classes, both
    listed as ham (1) first and spam (0) second. Nothing is returned.
    """
    class_order = [1, 0]
    row_names = ["Actual: Ham (1)", "Actual: Spam (0)"]
    col_names = ["Pred: Ham (1)", "Pred: Spam (0)"]
    counts = confusion_matrix(y_true, y_pred, labels=class_order)
    table = pd.DataFrame(counts, index=row_names, columns=col_names)
    print(f"\n{name} – Confusion Matrix:\n{table}")

# Report the naive Bayes scores, then its confusion matrix.
print_nb_metrics(name="Naive Bayes:", y_true=y_test, y_pred=nb_pred)
print_nb_confmat(name="Naive Bayes:", y_true=y_test, y_pred=nb_pred)


Naive Bayes: Performance:
  Accuracy : 0.977
  Precision: 1.000  (positive = spam=0)
  Recall   : 0.826  (positive = spam=0)
  F1-Score : 0.904  (positive = spam=0)

Naive Bayes: – Confusion Matrix:
                  Pred: Ham (1)  Pred: Spam (0)
Actual: Ham (1)             966               0
Actual: Spam (0)             26             123


In [34]:
# Sanity check, sample one: compare every model's prediction for a single
# held-out message with its true label.
i = 14
sample_text = x_test.iloc[i]
true_label = y_test.iloc[i]
# Vectorise the message once and reuse the result for all three models.
sample_features = tfidf.transform([sample_text])
lr_pred_one = int(lr.predict(sample_features)[0])
rf_pred_one = int(rf.predict(sample_features)[0])
nb_pred_one = int(nb.predict(sample_features)[0])


def lab2str(r):
    """Return the display name for a class code: 0 is spam, anything else ham."""
    if r == 0:
        return "spam(0)"
    return "Ham (1)"
# Show the message, then the true label followed by each model's verdict.
print("Sanity Check")
print("Sample Text: ", sample_text)
for caption, label in (
    ("Actual: ", true_label),
    ("LR Predict: ", lr_pred_one),
    ("RF Predict: ", rf_pred_one),
    ("NB Predict: ", nb_pred_one),
):
    print(caption, lab2str(label))

Sanity Check
Sample Text:  FREE RINGTONE text FIRST to 87131 for a poly or text GET to 87131 for a true tone! Help? 0845 2814032 16 after 1st free, tones are 3x£150pw to e£nd txt stop
Actual:  spam(0)
LR Predict:  spam(0)
RF Predict:  spam(0)
NB Predict:  spam(0)


In [31]:
# Sanity check, sample two: compare every model's prediction for a single
# held-out message with its true label.
i = 18
sample_text = x_test.iloc[i]
true_label = y_test.iloc[i]
# Vectorise the message once and reuse the result for all three models.
sample_features = tfidf.transform([sample_text])
lr_pred_one = int(lr.predict(sample_features)[0])
rf_pred_one = int(rf.predict(sample_features)[0])
nb_pred_one = int(nb.predict(sample_features)[0])


def lab2str(r):
    """Return the display name for a class code: 0 is spam, anything else ham."""
    if r == 0:
        return "spam(0)"
    return "Ham (1)"
# Show the message, then the true label followed by each model's verdict.
print("Sanity Check")
print("Sample Text: ", sample_text)
for caption, label in (
    ("Actual: ", true_label),
    ("LR Predict: ", lr_pred_one),
    ("RF Predict: ", rf_pred_one),
    ("NB Predict: ", nb_pred_one),
):
    print(caption, lab2str(label))

Sanity Check
Sample Text:  Fighting with the world is easy, u either win or lose bt fightng with some1 who is close to u is dificult if u lose - u lose if u win - u still lose.
Actual:  Ham (1)
LR Predict:  Ham (1)
RF Predict:  Ham (1)
NB Predict:  Ham (1)


In [32]:
# Sanity check, sample three: compare every model's prediction for a single
# held-out message with its true label.
i = 4
sample_text = x_test.iloc[i]
true_label = y_test.iloc[i]
# Vectorise the message once and reuse the result for all three models.
sample_features = tfidf.transform([sample_text])
lr_pred_one = int(lr.predict(sample_features)[0])
rf_pred_one = int(rf.predict(sample_features)[0])
nb_pred_one = int(nb.predict(sample_features)[0])


def lab2str(r):
    """Return the display name for a class code: 0 is spam, anything else ham."""
    if r == 0:
        return "spam(0)"
    return "Ham (1)"
# Show the message, then the true label followed by each model's verdict.
print("Sanity Check")
print("Sample Text: ", sample_text)
for caption, label in (
    ("Actual: ", true_label),
    ("LR Predict: ", lr_pred_one),
    ("RF Predict: ", rf_pred_one),
    ("NB Predict: ", nb_pred_one),
):
    print(caption, lab2str(label))

Sanity Check
Sample Text:  So there's a ring that comes with the guys costumes. It's there so they can gift their future yowifes. Hint hint
Actual:  Ham (1)
LR Predict:  Ham (1)
RF Predict:  Ham (1)
NB Predict:  Ham (1)
