In [516]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix


In [517]:
# Load the mail dataset from disk and preview the first rows
DATASET_PATH = "mail_l7_dataset.csv"
df = pd.read_csv(DATASET_PATH)
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [518]:
# 2. Label Encoding
# Spam is encoded as 0 and ham as 1 after lower-casing and trimming the raw text.
LABEL_MAP = {"spam": 0, "ham": 1}  # ham rows = 4,825.  spam rows = 747.

# Only encode while the column is still textual, so re-running this cell is
# harmless (the `.str` accessor would otherwise fail on already-encoded integers).
if not pd.api.types.is_numeric_dtype(df["Category"]):
    encoded_category = df["Category"].str.lower().str.strip().map(LABEL_MAP)

    # `Series.map` yields NaN for anything missing from LABEL_MAP; fail here with
    # a clear message instead of at the later integer cast.
    unknown_mask = encoded_category.isna()
    if unknown_mask.any():
        unknown_labels = df.loc[unknown_mask, "Category"].unique().tolist()
        raise ValueError(f"Unexpected Category values: {unknown_labels}")

    df["Category"] = encoded_category.astype(int)

df.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [519]:
# 3. Separate the features (message text) from the labels (encoded category)
x, y = df["Message"].astype(str), df["Category"].astype(int)

In [520]:
# 4. Split the dataset into train (80%) and test (20%) partitions
# The fixed random_state keeps the partition reproducible across runs.
# NOTE(review): the classes are imbalanced; consider `stratify=y` — confirm before changing.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)
print("=== SPLIT SIZES ===")
print("Train Data:", len(x_train), " | Test Data:", len(x_test))

=== SPLIT SIZES ===
Train Data: 4457  | Test Data: 1115


In [521]:
# 5. TF-IDF features
# The vocabulary is fitted on the training messages only; the test messages are
# merely transformed with that fitted vocabulary.
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words="english",
    min_df=1,
)
x_train_Features = vectorizer.fit_transform(x_train)
x_test_Features = vectorizer.transform(x_test)

print("=== TF-IDF SHAPES ===")
print("X_train:", x_train_Features.shape, " | X_test:", x_test_Features.shape)

=== TF-IDF SHAPES ===
X_train: (4457, 7440)  | X_test: (1115, 7440)


In [522]:
# 6. Train the model by using LogisticRegression
# `fit` returns the estimator itself, so construction and fitting are chained.
lr = LogisticRegression(random_state=42, max_iter=1000).fit(x_train_Features, y_train)

# Predict labels for the held-out test features
lr_pred = lr.predict(x_test_Features)

# Wrap the predictions in a DataFrame and count how many rows fall in each class
lr_pred_df = pd.DataFrame({"LR prediction": lr_pred})
lr_pred_df.value_counts()

LR prediction
1                1002
0                 113
Name: count, dtype: int64

In [523]:
# 6. Train the model by using a RandomForest Classifier
# 200 trees with a fixed seed so repeated runs build the same forest.
rf = RandomForestClassifier(random_state=42, n_estimators=200)
rf.fit(x_train_Features, y_train)

# Predict labels for the held-out test features
rf_pred = rf.predict(x_test_Features)

# Wrap the predictions in a DataFrame and count how many rows fall in each class
rf_pred_df = pd.DataFrame({"RF prediction": rf_pred})
rf_pred_df.value_counts()


RF prediction
1                985
0                130
Name: count, dtype: int64

In [524]:
# 7. Train the model by using Naive Bayes
# `fit` returns the estimator itself, so construction and fitting are chained.
nb = MultinomialNB().fit(x_train_Features, y_train)

# Predict labels for the held-out test features
nb_pred = nb.predict(x_test_Features)

# Wrap the predictions in a DataFrame and count how many rows fall in each class
nb_pred_df = pd.DataFrame({"NB prediction": nb_pred})
nb_pred_df.value_counts()



NB prediction
1                992
0                123
Name: count, dtype: int64

In [525]:
# 8. Evaluating the performance of the models by using metrics
def display_metrics(model_name,y_actual,y_pred):
    """Print accuracy and the spam-class precision, recall and F1 for one model.

    Precision, recall and F1 are computed with ``pos_label=0`` because spam is
    the class encoded as 0; accuracy covers both classes.

    Args:
        model_name: Label shown in the report heading.
        y_actual: True labels.
        y_pred: Predicted labels, aligned with ``y_actual``.

    Returns:
        None. The report is written to standard output.
    """
    # Compute every score before printing anything.
    accuracy = accuracy_score(y_true=y_actual, y_pred=y_pred)
    precision = precision_score(y_true=y_actual, y_pred=y_pred, pos_label=0)
    recall = recall_score(y_true=y_actual, y_pred=y_pred, pos_label=0)
    f1 = f1_score(y_true=y_actual, y_pred=y_pred, pos_label=0)

    # Emit the report, one line per entry, rounded to two decimals.
    report_lines = (
        f"\n{model_name} Performance: ",
        f"Accuracy (Overall): {accuracy:.2f}",
        f"Precision (Spam=0): {precision:.2f}",
        f"Recall (Spam=0):    {recall:.2f}",
        f"F1-Score (Spam=0):  {f1:.2f}",
    )
    print(*report_lines, sep="\n")

# Displaying the confusion matrix
def show_confusion_matrix(model_name,y_actual,y_pred):
    """Print a labelled 2x2 confusion matrix for one model.

    The matrix is requested with ``labels=[1, 0]``, so ham (1) occupies the
    first row/column and spam (0) the second. Rows are labelled as actual
    classes and columns as predicted classes.

    Args:
        model_name: Label shown in the heading.
        y_actual: True labels.
        y_pred: Predicted labels, aligned with ``y_actual``.

    Returns:
        None. The table is written to standard output.
    """
    row_names = ["Actual Ham(1)", "Actual Spam(0)"]
    column_names = ["Pred Ham(1)", "Pred Spam(0)"]

    cm = confusion_matrix(y_actual, y_pred, labels=[1, 0])
    print(f"\n{model_name} Confusion matrix: ")
    cm_df = pd.DataFrame(cm, index=row_names, columns=column_names)
    print(cm_df)

# Print the evaluation metrics and confusion matrix for each model in turn
for model_name, model_pred in (
    ("Logistic Regression", lr_pred),
    ("Random Forest", rf_pred),
    ("Naive Bayes", nb_pred),
):
    display_metrics(model_name, y_test, model_pred)
    show_confusion_matrix(model_name, y_test, model_pred)




Logistic Regression Performance: 
Accuracy (Overall): 0.97
Precision (Spam=0): 1.00
Recall (Spam=0):    0.76
F1-Score (Spam=0):  0.86

Logistic Regression Confusion matrix: 
                Pred Ham(1)  Pred Spam(0)
Actual Ham(1)           966             0
Actual Spam(0)           36           113

Random Forest Performance: 
Accuracy (Overall): 0.98
Precision (Spam=0): 1.00
Recall (Spam=0):    0.87
F1-Score (Spam=0):  0.93

Random Forest Confusion matrix: 
                Pred Ham(1)  Pred Spam(0)
Actual Ham(1)           966             0
Actual Spam(0)           19           130

Naive Bayes Performance: 
Accuracy (Overall): 0.98
Precision (Spam=0): 1.00
Recall (Spam=0):    0.83
F1-Score (Spam=0):  0.90

Naive Bayes Confusion matrix: 
                Pred Ham(1)  Pred Spam(0)
Actual Ham(1)           966             0
Actual Spam(0)           26           123


In [526]:
# 9. Sanity check (for the test data)
# Converts a numeric label into a readable string: 1 -> ham, 0 -> spam
def label_to_str(r):
    """Return a readable name for an encoded label.

    Args:
        r: Encoded label; 0 means spam.

    Returns:
        "Spam(0)" when ``r`` equals 0, otherwise "Ham(1)".
    """
    # The spam text previously read "Spm(0)"; it now matches the
    # "Spam(0)" wording used for the confusion-matrix labels.
    return "Spam(0)" if r == 0 else "Ham(1)"

# Compare every model's prediction with the true label for a few test rows.
# The stop index is clamped so a shorter test set cannot raise IndexError.
for i in range(905, min(910, len(x_test))):
    sample_test = x_test.iloc[i]
    Actual_Label = y_test.iloc[i]

    # Vectorize the message once and reuse the features for all three models.
    sample_features = vectorizer.transform([sample_test])
    lr_pred_test = int(lr.predict(sample_features)[0])
    rf_pred_test = int(rf.predict(sample_features)[0])
    nb_pred_test = int(nb.predict(sample_features)[0])

    print(f"Actual:{label_to_str(Actual_Label)} |LR pred:{label_to_str(lr_pred_test)} |RF pred:{label_to_str(rf_pred_test)} |NB pred:{label_to_str(nb_pred_test)}")



Actual:Ham(1) |LR pred:Ham(1) |RF pred:Ham(1) |NB pred:Ham(1)
Actual:Spm(0) |LR pred:Ham(1) |RF pred:Spm(0) |NB pred:Ham(1)
Actual:Spm(0) |LR pred:Spm(0) |RF pred:Spm(0) |NB pred:Spm(0)
Actual:Spm(0) |LR pred:Ham(1) |RF pred:Spm(0) |NB pred:Spm(0)
Actual:Ham(1) |LR pred:Ham(1) |RF pred:Ham(1) |NB pred:Ham(1)


In [527]:
# 10. Sanity check (for the sample messages)
# Hand-written messages that are not part of the dataset split; each one is
# passed through the fitted vectorizer and all three models further down.
sample_message = [
    "Free entry in 2 a weekly competition!",
    "I will meet you at the cafe tomorrow",
    "Congratulations, you won a free ticket",
]

# Converts a numeric label into a readable string: 1 -> ham, 0 -> spam
def label_to_str(r):
    """Return "Spam (0)" when ``r`` equals 0, otherwise "Ham (1)"."""
    if r == 0:
        return "Spam (0)"
    return "Ham (1)"


print("\n=== SAMPLE MESSAGE PREDICTIONS ===")
for message in sample_message:
    # Vectorize the message once and reuse the features for all three models.
    message_features = vectorizer.transform([message])
    lr_pred_sample = int(lr.predict(message_features)[0])
    rf_pred_sample = int(rf.predict(message_features)[0])
    nb_pred_sample = int(nb.predict(message_features)[0])

    print(f"\nSample Message: {message}")
    print(f"LR pred: {label_to_str(lr_pred_sample)}")
    print(f"RF pred: {label_to_str(rf_pred_sample)}")
    print(f"NB pred: {label_to_str(nb_pred_sample)}\n")
 


=== SAMPLE MESSAGE PREDICTIONS ===

Sample Message: Free entry in 2 a weekly competition!
LR pred: Ham (1)
RF pred: Ham (1)
NB pred: Spam (0)


Sample Message: I will meet you at the cafe tomorrow
LR pred: Ham (1)
RF pred: Ham (1)
NB pred: Ham (1)


Sample Message: Congratulations, you won a free ticket
LR pred: Ham (1)
RF pred: Ham (1)
NB pred: Ham (1)

