In [18]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, confusion_matrix)
from sklearn.naive_bayes import MultinomialNB
RANDOM_STATE = 42 


#load the dataset
df = pd.read_csv("mail_l7_dataset.csv")

df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
df = df.where(pd.notnull(df), "")


In [6]:
#3encoding ham=1 spam=0;
df.loc[df["Category"].str.lower().str.strip() == "spam", "Category"] = 0
df.loc[df["Category"].str.lower().str.strip() == "ham",  "Category"] = 1

print(df.head())

  Category                                            Message
0        1  Go until jurong point, crazy.. Available only ...
1        1                      Ok lar... Joking wif u oni...
2        0  Free entry in 2 a wkly comp to win FA Cup fina...
3        1  U dun say so early hor... U c already then say...
4        1  Nah I don't think he goes to usf, he lives aro...


In [10]:
 #4 Split features (X) and target (y)
X = df["Message"].astype(str)
y = df["Category"].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

print(" the size ")
print("Train:", X_train.shape[0], " | Test:", X_test.shape[0])


 the size 
Train: 4457  | Test: 1115


In [12]:

tfidf = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

X_test_features = tfidf.fit_transform(X_test)
X_train_features = tfidf.transform(X_train)

print("X_train:", X_train_features.shape) 
print("X_test:", X_test_features.shape)

X_train: (4457, 3322)
X_test: (1115, 3322)


In [24]:
#Train Logistic Regression,random forest,MultinomialNB

lr = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
lr.fit(X_train_features, y_train)
lr_pred = lr.predict(X_test_features)

rf = RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE)
rf.fit(X_train_features, y_train)
rf_pred = rf.predict(X_test_features.toarray())

nb = MultinomialNB()
nb.fit(X_train_features, y_train)
nb_pred = nb.predict(X_test_features)

print(nb_pred)


[1 1 1 ... 1 1 1]


In [26]:

def print_clf_metrics(name, y_true, y_pred, pos_label=0):
    acc  = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, pos_label=pos_label)
    rec  = recall_score(y_true, y_pred, pos_label=pos_label)
    f1   = f1_score(y_true, y_pred, pos_label=pos_label)
    print(f"\n{name} Performance:")
    print(f"  Accuracy : {acc:.3f}")
    print(f"  Precision: {prec:.3f}  (positive = spam=0)")
    print(f"  Recall   : {rec:.3f}  (positive = spam=0)")
    print(f"  F1-Score : {f1:.3f}  (positive = spam=0)")

print_clf_metrics("Logistic Regression", y_test, lr_pred)
print_clf_metrics("Random Forest", y_test, rf_pred)
print_clf_metrics("Naive Bayes", y_test, nb_pred)



Logistic Regression Performance:
  Accuracy : 0.956
  Precision: 1.000  (positive = spam=0)
  Recall   : 0.671  (positive = spam=0)
  F1-Score : 0.803  (positive = spam=0)

Random Forest Performance:
  Accuracy : 0.984
  Precision: 1.000  (positive = spam=0)
  Recall   : 0.879  (positive = spam=0)
  F1-Score : 0.936  (positive = spam=0)

Naive Bayes Performance:
  Accuracy : 0.986
  Precision: 0.978  (positive = spam=0)
  Recall   : 0.913  (positive = spam=0)
  F1-Score : 0.944  (positive = spam=0)


In [27]:
def print_confmat (name, y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    cm_df = pd.DataFrame(cm,
        index   = ["Actual: Ham (1)",  "Actual: Spam (0)"],
        columns = ["Pred: Ham (1)",    "Pred: Spam (0)"]
    )
    print(f"\n{name} – Confusion Matrix:\n{cm_df}")

print_confmat("Logistic Regression", y_test, lr_pred)
print_confmat("Random Forest", y_test, rf_pred)
print_confmat("Naive Bayes", y_test, nb_pred)
    


Logistic Regression – Confusion Matrix:
                  Pred: Ham (1)  Pred: Spam (0)
Actual: Ham (1)             100              49
Actual: Spam (0)              0             966

Random Forest – Confusion Matrix:
                  Pred: Ham (1)  Pred: Spam (0)
Actual: Ham (1)             131              18
Actual: Spam (0)              0             966

Naive Bayes – Confusion Matrix:
                  Pred: Ham (1)  Pred: Spam (0)
Actual: Ham (1)             136              13
Actual: Spam (0)              3             963


In [30]:

sample_index = 14

sample_text = X_test.iloc[sample_index]
true_label  = y_test.iloc[sample_index]

lr_pred_one = int(lr.predict(tfidf.transform([sample_text]))[0])
rf_pred_one = int(rf.predict(tfidf.transform([sample_text]).toarray())[0])
nb_pred_one = int(nb.predict(tfidf.transform([sample_text]))[0])

def lab2str(v):  
    return "Spam (0)" if v == 0 else "Ham (1)"

snippet = (sample_text[:160] + "...") if len(sample_text) > 160 else sample_text

print("Text snippet:", snippet)
print("Actual      :", lab2str(true_label))
print("LR Pred     :", lab2str(lr_pred_one))
print("RF Pred     :", lab2str(rf_pred_one))
print("NB Pred     :", lab2str(nb_pred_one))


Text snippet: FREE RINGTONE text FIRST to 87131 for a poly or text GET to 87131 for a true tone! Help? 0845 2814032 16 after 1st free, tones are 3x£150pw to e£nd txt stop
Actual      : Spam (0)
LR Pred     : Spam (0)
RF Pred     : Spam (0)
NB Pred     : Spam (0)
