In [83]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,precision_score,f1_score,recall_score,confusion_matrix
from sklearn.naive_bayes import MultinomialNB



In [119]:
Path = "mail_l7_dataset.csv"
df=pd.read_csv(Path)
df=df.where(pd.notnull(df),"")
df.loc[df['Category'].str.lower().str.strip()=='spam', 'Category']=0
df.loc[df['Category'].str.lower().str.strip()=='ham', 'Category']=1
x=df["Message"].astype(str)
y=df["Category"].astype(int)

x_train,x_test,y_train,y_test=train_test_split(
    x,y,test_size=0.2 ,random_state=42
)
print("split-----------------------------------")
print("Training Data",x_train.shape[0],"|", "Testing Data" ,x_test.shape[0],)

tdidf=TfidfVectorizer(min_df=1, stop_words='english',lowercase=True)
X_train_features = tdidf.fit_transform(x_train)
X_test_features  = tdidf.transform(x_test)


model = MultinomialNB()
model.fit(X_train_features, y_train)
naive_pr = model.predict(X_test_features)


lr=LogisticRegression(max_iter=1000,random_state=42 )
lr.fit(X_train_features,y_train)
lr_Predict=lr.predict(X_test_features)
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train_features, y_train)
rf_pred = rf.predict(X_test_features.toarray())

def show_metrics(name, y_true, y_pred, positive_class=0):  # <-- renamed parameter
    acc  = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, pos_label=positive_class)  # pass it as pos_label
    rec  = recall_score(y_true, y_pred, pos_label=positive_class)
    f1   = f1_score(y_true, y_pred, pos_label=positive_class)

    print(f"\n{name} Performance:")
    print(f"  Accuracy  : {acc:.3f}")
    print(f"  Precision : {prec:.3f}")
    print(f"  Recall    : {rec:.3f}")
    print(f"  F1-Score  : {f1:.3f}")


def confussion(name, y_true, y_pred):
    
   
      cm = confusion_matrix(y_true, y_pred)
      cm_df = pd.DataFrame(
        cm,
        index   = ["Actual: Ham (1)",  "Actual: Spam (0)"],
        columns = ["Pred: Ham (1)",    "Pred: Spam (0)"]
    )
      print(f"\n{name} – Confusion Matrix:\n{cm_df}")

    
show_metrics("Logistic Regression", y_test,lr_Predict)

show_metrics("Random Forest", y_test,rf_pred)
show_metrics("naive_bayes", y_test,naive_pr)
confussion("Logistic Regression", y_test, lr_Predict)
confussion("Random Forest", y_test, rf_pred)
confussion("naive_bayes", y_test, naive_pr)




i =88
sample_text = x_test.iloc[i]
true_label  = y_test.iloc[i]
lr_pred_one = int(lr.predict(tdidf.transform([sample_text]))[0])
lr_pred_one = int(lr.predict(tdidf.transform([sample_text]))[0])
model_pred_one = int(model.predict(tdidf.transform([sample_text]).toarray())[0])

def value(v):  # same readable output style
    return "Spam (0)" if v == 0 else "Ham (1)"
print("\n=== SINGLE MESSAGE CHECK ===")
snippet = (sample_text[:160] + "...") if len(sample_text) > 160 else sample_text
print("Text snippet:", snippet)
print("Actual      :", value(true_label))
print("LR Pred     :", value(lr_pred_one))
print("RF Pred     :", value(rf_pred_one))
print("nv Pred     :", value(model_pred_one))
print(df.head(20))



split-----------------------------------
Training Data 4457 | Testing Data 1115

Logistic Regression Performance:
  Accuracy  : 0.968
  Precision : 1.000
  Recall    : 0.758
  F1-Score  : 0.863

Random Forest Performance:
  Accuracy  : 0.983
  Precision : 1.000
  Recall    : 0.872
  F1-Score  : 0.932

naive_bayes Performance:
  Accuracy  : 0.977
  Precision : 1.000
  Recall    : 0.826
  F1-Score  : 0.904

Logistic Regression – Confusion Matrix:
                  Pred: Ham (1)  Pred: Spam (0)
Actual: Ham (1)             113              36
Actual: Spam (0)              0             966

Random Forest – Confusion Matrix:
                  Pred: Ham (1)  Pred: Spam (0)
Actual: Ham (1)             130              19
Actual: Spam (0)              0             966

naive_bayes – Confusion Matrix:
                  Pred: Ham (1)  Pred: Spam (0)
Actual: Ham (1)             123              26
Actual: Spam (0)              0             966

=== SINGLE MESSAGE CHECK ===
Text snippet: No mana