In [1]:
import pandas as pd 
import numpy as np 

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
)


In [2]:
df = pd.read_csv("mail_l7_dataset.csv")
# print(df.head())
# print(df.shape)
# print(df.isnull().sum())

# Basic cleaning: replace NaNs with empty strings (text models can't handle NaN)
df = df.fillna('')
# print(df.isnull().sum())

# print(df['Category'].value_counts())

In [3]:
# Encode labels: spam -> 0, ham -> 1  (keep your original convention)
df.loc[df["Category"].str.lower().str.strip() == "spam", "Category"] = 0
df.loc[df["Category"].str.lower().str.strip() == "ham",  "Category"] = 1

df['Category'] = df['Category'].astype(int)
# print(df['Category'].value_counts())

# Split features (X) and target (y)
X = df['Message'].astype(str)
y = df['Category'].astype(int)
# print(X.shape, y.shape)

In [4]:
# Train/test split (stratified)
x_train, x_test,y_train,y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

In [5]:
# Text → TF-IDF features
tfidf = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)
x_train_tfidf = tfidf.fit_transform(x_train)
x_test_tfidf = tfidf.transform(x_test)

# print(x_train_tfidf.shape, x_test_tfidf.shape)

In [6]:
## Train Logistic Regression (baseline)
logreg = LogisticRegression(max_iter=1000, random_state=42)
logreg.fit(x_train_tfidf, y_train)

logreg_y_pred = logreg.predict(x_test_tfidf)

# Train Random Forest (advanced)
ranfor = RandomForestClassifier(n_estimators=100, random_state=42)
ranfor.fit(x_train_tfidf, y_train)

ranfor_y_pred = ranfor.predict(x_test_tfidf)

#Train Naive Bayes (advanced)
naBay = MultinomialNB()
naBay.fit(x_train_tfidf, y_train)

naBay_y_pred = naBay.predict(x_test_tfidf)

In [7]:
# helper function for evaluation metrics
def print_evaluation_metrics(name,y_true, y_pred,positive_label = 0):
    print(f"Evaluation metrics for {name}:")
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, pos_label=positive_label)
    rec = recall_score(y_true, y_pred, pos_label=positive_label)
    f1 = f1_score(y_true, y_pred, pos_label=positive_label)
    cd = confusion_matrix(y_true, y_pred, labels=[1, 0])

    class Result:
        def __init__(self, acc, prec, rec, f1, cd):
            self.acc = acc
            self.prec = prec
            self.rec = rec
            self.f1 = f1
            self.cd = cd
    result = Result(acc, prec, rec, f1, cd)
    print("Accuracy:", f"{result.acc:.2f}")
    print("Precision:", f"{result.prec:.2f}")
    print("Recall:", f"{result.rec:.2f}")
    print("F1 Score:", f"{result.f1:.2f}")
    cd = pd.DataFrame(cd, index=['Actual Ham', 'Actual Spam'], columns=['Predicted Ham', 'Predicted Spam'])

    print("Confusion Matrix:\n", cd)
    return result
    
# print evaluation metrics for Logistic Regression
lr = print_evaluation_metrics("Logistic Regression", y_test, logreg_y_pred, positive_label=0)
# print evaluation metrics for Random Forest
rl = print_evaluation_metrics("Random Forest", y_test, ranfor_y_pred, positive_label=0)
# print evaluation metrics for Naive Bayes
nb = print_evaluation_metrics("Naive Bayes", y_test, naBay_y_pred, positive_label=0)

Evaluation metrics for Logistic Regression:
Accuracy: 0.97
Precision: 1.00
Recall: 0.76
F1 Score: 0.86
Confusion Matrix:
              Predicted Ham  Predicted Spam
Actual Ham             966               0
Actual Spam             36             113
Evaluation metrics for Random Forest:
Accuracy: 0.98
Precision: 1.00
Recall: 0.86
F1 Score: 0.92
Confusion Matrix:
              Predicted Ham  Predicted Spam
Actual Ham             966               0
Actual Spam             21             128
Evaluation metrics for Naive Bayes:
Accuracy: 0.98
Precision: 1.00
Recall: 0.83
F1 Score: 0.90
Confusion Matrix:
              Predicted Ham  Predicted Spam
Actual Ham             966               0
Actual Spam             26             123


In [8]:
# deffer Naive Bayes model and random forest model And Logistic Regression for (acc, prec, rec, f1)
print(f"""Isbarbar dhig sedexdaModel \nLR at F1 Score : {lr.f1:.2f} vs RF at F1 Score : {rl.f1:.2f} vs NB at F1 Score : {nb.f1:.2f}\n
      LR at Accuracy : {lr.acc:.2f} vs RF at Accuracy : {rl.acc:.2f} vs NB at Accuracy : {nb.acc:.2f}\n
      LR at Precision : {lr.prec:.2f} vs RF at Precision : {rl.prec:.2f} vs NB at Precision : {nb.prec:.2f}\n
      LR at Recall : {lr.rec:.2f} vs RF at Recall : {rl.rec:.2f} vs NB at Recall : {nb.rec:.2f}\n
      LR at Confusion Matrix :\n{lr.cd}\n
      RF at Confusion Matrix :\n{rl.cd}\n
      NB at Confusion Matrix :\n{nb.cd}\n
      """)

Isbarbar dhig sedexdaModel 
LR at F1 Score : 0.86 vs RF at F1 Score : 0.92 vs NB at F1 Score : 0.90

      LR at Accuracy : 0.97 vs RF at Accuracy : 0.98 vs NB at Accuracy : 0.98

      LR at Precision : 1.00 vs RF at Precision : 1.00 vs NB at Precision : 1.00

      LR at Recall : 0.76 vs RF at Recall : 0.86 vs NB at Recall : 0.83

      LR at Confusion Matrix :
[[966   0]
 [ 36 113]]

      RF at Confusion Matrix :
[[966   0]
 [ 21 128]]

      NB at Confusion Matrix :
[[966   0]
 [ 26 123]]

      


In [9]:
# Single-message sanity check
print("\nSingle-message predictions for i = 14 :")
i = 14 # change index to inspect different emails from x_test
sample_text = x_test.iloc[i]
true_label  = y_test.iloc[i]

# Predict with both models
lr_pred_one = int(logreg.predict(tfidf.transform([sample_text]))[0])
rf_pred_one = int(ranfor.predict(tfidf.transform([sample_text]).toarray())[0])
nb_pred_one = int(naBay.predict(tfidf.transform([sample_text]).toarray())[0])

def lab2str(v):  # same readable output style
    return "Spam" if v == 0 else "Ham"
print(lab2str(true_label))


Single-message predictions for i = 14 :
Spam


In [10]:
print("\nSingle-message predictions for i = 21 :")
i = 21 # change index to inspect different emails from x_test
sample_text = x_test.iloc[i]
true_label  = y_test.iloc[i]

# Predict with both models
lr_pred_one = int(logreg.predict(tfidf.transform([sample_text]))[0])
rf_pred_one = int(ranfor.predict(tfidf.transform([sample_text]).toarray())[0])
nb_pred_one = int(naBay.predict(tfidf.transform([sample_text]).toarray())[0])

print(lab2str(true_label))

snippet = (sample_text[:80] + "...") if len(sample_text) > 80 else sample_text
print(f"Text: {snippet}")
print(f"True Label: {lab2str(true_label)}")
print(f"LR Prediction: {lab2str(lr_pred_one)}")
print(f"RF Prediction: {lab2str(rf_pred_one)}")
print(f"NB Prediction: {lab2str(nb_pred_one)}")


Single-message predictions for i = 21 :
Ham
Text: Customer place i will call you
True Label: Ham
LR Prediction: Ham
RF Prediction: Ham
NB Prediction: Ham


In [15]:
# Own message for prediction

def Own_message_pred(message):
    message_tfidf = tfidf.transform([message])
    logreg_pred = logreg.predict(message_tfidf)
    ranfor_pred = ranfor.predict(message_tfidf)
    naBay_pred = naBay.predict(message_tfidf)
    return f"""\nOwn message for prediction: {message}\nLogistic Regression Prediction: {'Spam' if logreg_pred[0] == 0 else 'Ham'}\nRandom Forest Prediction: {'Spam' if ranfor_pred[0] == 0 else 'Ham'}\nNaive Bayes Prediction: {'Spam' if naBay_pred[0] == 0 else 'Ham'}"""
# Example usage of Own_message_pred function
print(Own_message_pred("Asc Eng Message-ka wuu fiican yahay, mahadsanid!"))
print(Own_message_pred("Congratulations! You've won a free Courses on IA and ML for Omer Tood and Sharfu diin, claim now."))
# # Example 3
print(Own_message_pred("Free entry in 2 a weekly competition!"))

# # Example 4
print(Own_message_pred("I will meet you at the cafe tomorrow"))

# # Example 5
print(Own_message_pred("Congratulations, you won a free ticket to Bahamas! Click here."))


Own message for prediction: Asc Eng Message-ka wuu fiican yahay, mahadsanid!
Logistic Regression Prediction: Ham
Random Forest Prediction: Ham
Naive Bayes Prediction: Ham

Own message for prediction: Congratulations! You've won a free Courses on IA and ML for Omer Tood and Sharfu diin, claim now.
Logistic Regression Prediction: Spam
Random Forest Prediction: Spam
Naive Bayes Prediction: Spam

Own message for prediction: Free entry in 2 a weekly competition!
Logistic Regression Prediction: Ham
Random Forest Prediction: Ham
Naive Bayes Prediction: Spam

Own message for prediction: I will meet you at the cafe tomorrow
Logistic Regression Prediction: Ham
Random Forest Prediction: Ham
Naive Bayes Prediction: Ham

Own message for prediction: Congratulations, you won a free ticket to Bahamas! Click here.
Logistic Regression Prediction: Ham
Random Forest Prediction: Ham
Naive Bayes Prediction: Spam
