In [5]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
)

ModuleNotFoundError: No module named 'numpy'

In [None]:
RANDOM_STATE = 42

In [None]:
df = pd.read_csv("mail_l7_dataset.csv")
df = df.where(pd.notnull(df), "")

df.loc[df["Category"].str.lower().str.strip() == "spam", "Category"] = 0
df.loc[df["Category"].str.lower().str.strip() == "ham",  "Category"] = 1

print(df.head())

  Category                                            Message
0        1  Go until jurong point, crazy.. Available only ...
1        1                      Ok lar... Joking wif u oni...
2        0  Free entry in 2 a wkly comp to win FA Cup fina...
3        1  U dun say so early hor... U c already then say...
4        1  Nah I don't think he goes to usf, he lives aro...


In [None]:
X = df["Message"].astype(str)
y = df["Category"].astype(int)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

print("=== SPLIT SIZES ===")
print("Train:", X_train.shape[0], " | Test:", X_test.shape[0])

=== SPLIT SIZES ===
Train: 4457  | Test: 1115


In [None]:
tfidf = TfidfVectorizer(min_df=1, stop_words="english", lowercase=True)
X_train_features = tfidf.fit_transform(X_train)
X_test_features  = tfidf.transform(X_test)

print("\n=== TF-IDF SHAPES ===")
print("X_train:", X_train_features.shape, " | X_test:", X_test_features.shape)


=== TF-IDF SHAPES ===
X_train: (4457, 7473)  | X_test: (1115, 7473)


In [None]:

lr = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
lr.fit(X_train_features, y_train)
lr_pred = lr.predict(X_test_features)

rf = RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE)
rf.fit(X_train_features.toarray(), y_train)  # RF needs dense input
rf_pred = rf.predict(X_test_features.toarray())

nb = MultinomialNB()
nb.fit(X_train_features, y_train)
nb_pred = nb.predict(X_test_features)
print_clf_metrics("Naive Bayes", y_test, nb_pred, pos_label=0)
print_confmat("Naive Bayes", y_test, nb_pred)

NameError: name 'print_clf_metrics' is not defined

In [None]:
def print_model_report(name, y_true, y_pred, pos_label=0):
    """
    Print Accuracy, Precision, Recall, F1 + Confusion Matrix
    pos_label=0 → spam considered positive.
    """
    acc  = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, pos_label=pos_label)
    rec  = recall_score(y_true, y_pred, pos_label=pos_label)
    f1   = f1_score(y_true, y_pred, pos_label=pos_label)
    
    print(f"\n=== {name} Performance ===")
    print(f"Accuracy : {acc:.3f}")
    print(f"Precision: {prec:.3f}")
    print(f"Recall   : {rec:.3f}")
    print(f"F1 Score : {f1:.3f}")
    
    cm = confusion_matrix(y_true, y_pred, labels=[0,1])  # spam=0, ham=1
    cm_df = pd.DataFrame(
        cm,
        index   = ["Actual: Spam (0)", "Actual: Ham (1)"],
        columns = ["Pred: Spam (0)",  "Pred: Ham (1)"]
    )
    print(f"\n{name} Confusion Matrix:\n{cm_df}")

In [None]:
print_model_report("Logistic Regression", y_test, lr_pred, pos_label=0)
print_model_report("Random Forest", y_test, rf_pred, pos_label=0)
print_model_report("Naive Bayes", y_test, nb_pred, pos_label=0)

In [None]:
sample_messages = [
    "Free entry in 2 a weekly competition!",
    "I will meet you at the cafe tomorrow",
    "Congratulations, you won a free ticket"
]

def lab2str(v):
    return "Spam (0)" if v == 0 else "Ham (1)"

print("\n=== SANITY CHECK ===")
for text in sample_messages:
    print("\nMessage:", text)
    lr_out = lab2str(int(lr.predict(tfidf.transform([text]))[0]))
    rf_out = lab2str(int(rf.predict(tfidf.transform([text]).toarray())[0]))
    nb_out = lab2str(int(nb.predict(tfidf.transform([text]))[0]))
    print("Logistic Regression:", lr_out)
    print("Random Forest      :", rf_out)
    print("Naive Bayes        :", nb_out)