In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix
from sklearn.naive_bayes import MultinomialNB

In [2]:

# Load the raw dataset; later cells read its "Category" and "Message" columns.
df = pd.read_csv("mail_l7_dataset.csv")

In [3]:
# Encode the target column as integers: spam -> 0, ham -> 1.
# The whole column is mapped in one step so it never holds a mix of strings
# and ints. Already-encoded values ("0"/"1" after the string cast) are accepted
# so that re-running this cell is a no-op instead of an error.
label_codes = {"spam": 0, "ham": 1, "0": 0, "1": 1}
category_normalized = df["Category"].astype(str).str.strip().str.lower()
category_encoded = category_normalized.map(label_codes)

# Fail here, with the offending values, rather than later at astype(int).
unknown_mask = category_encoded.isna()
if unknown_mask.any():
    unknown_values = sorted(category_normalized[unknown_mask].unique())
    raise ValueError(f"Unexpected Category values: {unknown_values}")

df["Category"] = category_encoded.astype(int)

In [4]:
# Target: the integer-coded category. Features: the message text as strings.
y = df["Category"].astype(int)
X = df["Message"].astype(str)

In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [6]:
# TF-IDF vectoriser shared by every model: lowercases the text and drops
# English stop words.
lfidf = TfidfVectorizer(
    lowercase=True,
    stop_words="english",
    min_df=1,
)

In [7]:
# Fit the vectoriser on the training messages only, then reuse the fitted
# vectoriser to transform the held-out messages.
X_train_features = lfidf.fit_transform(raw_documents=X_train)
X_test_features = lfidf.transform(raw_documents=X_test)

In [8]:
# Three candidate classifiers; the seed is fixed for the two that take one.
lr = LogisticRegression(max_iter=1000, random_state=42)
rf = RandomForestClassifier(random_state=42)
nb = MultinomialNB()

In [9]:
# Train every classifier on the same TF-IDF training matrix and labels.
lr.fit(X_train_features, y_train)
rf.fit(X_train_features, y_train)
nb.fit(X_train_features, y_train)

In [10]:
# Predicted labels for the held-out messages, one array per model.
lr_predict = lr.predict(X_test_features)
rf_predict = rf.predict(X_test_features)
nb_predict = nb.predict(X_test_features)

In [39]:
def print_metrics(name, y_true, y_pred, pos_label=0, label_names=None):
    """Print accuracy, precision, recall and F-score for one model's predictions.

    Parameters
    ----------
    name : str
        Model name shown in the report header.
    y_true, y_pred : array-like
        Ground-truth and predicted labels; passed unchanged to the
        scikit-learn metric functions.
    pos_label : int, default 0
        Label treated as the positive class for precision, recall and F-score.
    label_names : mapping, optional
        Maps a label value to a readable class name used in the report.
        Defaults to ``{0: "spam", 1: "ham"}``, the encoding used in this
        notebook.

    Returns
    -------
    None
        The report is written to stdout.
    """
    if label_names is None:
        label_names = {0: "spam", 1: "ham"}

    # Name the positive class after pos_label itself, so calling with
    # pos_label=1 is not reported as "spam".
    class_name = label_names.get(pos_label)
    if class_name is None:
        positive_note = f"(positive = {pos_label})"
    else:
        positive_note = f"(positive = {class_name} = {pos_label})"

    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, pos_label=pos_label)
    rec = recall_score(y_true, y_pred, pos_label=pos_label)
    f1 = f1_score(y_true, y_pred, pos_label=pos_label)

    print(f"{name} performance")
    print(f"Accuracy: {acc:.3f}")
    print(f"Precision: {prec:.3f} {positive_note}")
    print(f"Recall: {rec:.3f} {positive_note}")
    print(f"F-Score: {f1:.3f} {positive_note}")
    print("============================================")

In [40]:
# NOTE(review): confMat is defined in a later cell of this notebook, so on a
# fresh kernel that cell has to be executed before this one.
model_predictions = [
    ("Logistic Regression", lr_predict),
    ("Random Forest ", rf_predict),
    ("Naive Bayes ", nb_predict),
]

# First the metric summary of every model ...
for model_name, predictions in model_predictions:
    print_metrics(model_name, y_test, predictions)

# ... then one confusion matrix per model, each preceded by a separator line.
for model_name, predictions in model_predictions:
    print("============================================")
    confMat(model_name, y_test, predictions)

Logistic Regression performentce
Accurance: 0.968
Precision: 1.000 (positive = spam = 0)
Recall: 0.758 (positive = spam = 0)
F-Score: 0.863 (positive = spam = 0)
Random Forest  performentce
Accurance: 0.981
Precision: 1.000 (positive = spam = 0)
Recall: 0.859 (positive = spam = 0)
F-Score: 0.924 (positive = spam = 0)
Naive Bayes  performentce
Accurance: 0.977
Precision: 1.000 (positive = spam = 0)
Recall: 0.826 (positive = spam = 0)
F-Score: 0.904 (positive = spam = 0)
Logistic Regression - Confission Matrixs:
                  Pred Ham (1)  Pred Spam(0)
Accual Ham (1)            966             0
Accual Spam (0)            36           113
Random Forest  - Confission Matrixs:
                  Pred Ham (1)  Pred Spam(0)
Accual Ham (1)            966             0
Accual Spam (0)            21           128
Naive Bayes  - Confission Matrixs:
                  Pred Ham (1)  Pred Spam(0)
Accual Ham (1)            966             0
Accual Spam (0)            26           123


In [12]:
def confMat(name, y_true, y_pred):
    """Print a labelled 2x2 confusion matrix for ham (1) versus spam (0).

    Parameters
    ----------
    name : str
        Model name shown in the title line.
    y_true, y_pred : array-like
        Ground-truth and predicted labels; passed unchanged to
        ``confusion_matrix``.

    Returns
    -------
    None
        The title and the table are written to stdout.
    """
    # labels=[1, 0] fixes the order to ham first, spam second, matching the
    # row and column captions below.
    cm = confusion_matrix(y_true, y_pred, labels=[1, 0])
    cm_df = pd.DataFrame(
        cm,
        index=["Actual Ham (1)", "Actual Spam (0)"],
        columns=["Pred Ham (1)", "Pred Spam (0)"],
    )
    # Print the table without any prefix: a leading space on its first line
    # would push the column headers one character right of the values.
    print(f"{name} - Confusion Matrix:")
    print(cm_df.to_string())

In [25]:
# Hand-written message for a one-off prediction check; its expected label is
# spam (0). The commented lines are the alternative of taking row i from the
# held-out split instead.
i = 16
# sample_text = X_test.iloc[i]
# true_label = y_test.iloc[i]
sample_text = "Congratulations! You’ve won a $1,000 gift card please click this link and get your money"
true_label = 0

In [26]:
# Vectorise the sample once with the fitted vectoriser and reuse the result
# for all three models; each prediction is unwrapped to a plain int.
sample_features = lfidf.transform([sample_text])
lr_pred_one = int(lr.predict(sample_features)[0])
rf_pred_one = int(rf.predict(sample_features)[0])
nb_pred_one = int(nb.predict(sample_features)[0])

In [27]:
def label2str(r):
    """Return the display name for an encoded label.

    A value equal to 0 is reported as spam; every other value is reported
    as ham.
    """
    if r == 0:
        return "Spam (0)"
    return "Ham (1)"

In [28]:
# Show the sample, its expected label, and what each model predicted for it.
print("Sanity Check")
print("Sample text: " + sample_text)
print(f"Accual: {label2str(true_label)}")
for model_tag, predicted_label in (
    ("LR", lr_pred_one),
    ("RF", rf_pred_one),
    ("NB", nb_pred_one),
):
    print(f"{model_tag} Predict: {label2str(predicted_label)}")

Sanity Check
Sample text: Congratulations! You’ve won a $1,000 gift card please click this link and get your money
Accual: Spam (0)
LR Predict: Ham (1)
RF Predict: Spam (0)
NB Predict: Ham (1)
