**SPAM EMAIL DETECTOR**

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:

# Load the cleaned message corpus. Downstream cells read the `text` column as
# the feature and the `label` column as the target (class 1 is treated as spam).
df = pd.read_csv(
    "spam_clean.csv",
    encoding="latin-1",
)
df.head()


Unnamed: 0,text,label
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [None]:
from sklearn.model_selection import train_test_split

# Features are the raw message strings; targets are the integer class labels.
X = df["text"].astype(str).values
y = df["label"].astype(int).values

# Two-stage stratified split -> 70% train / 15% validation / 15% test.
# Stage 1: hold out 30% of the rows, preserving the class ratio.
X_train, X_tmp, y_train, y_tmp = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=42,
)
# Stage 2: cut the held-out 30% in half (validation vs. test), again stratified.
X_val, X_test, y_val, y_test = train_test_split(
    X_tmp,
    y_tmp,
    test_size=0.50,
    stratify=y_tmp,
    random_state=42,
)

# Sanity check: split sizes, then the positive-class rate overall and per split.
(
    len(X_train),
    len(X_val),
    len(X_test),
    y.mean(),
    y_train.mean(),
    y_val.mean(),
    y_test.mean(),
)


(3610,
 774,
 774,
 np.float64(0.12446684761535479),
 np.float64(0.12437673130193906),
 np.float64(0.12532299741602068),
 np.float64(0.12403100775193798))

In [None]:
# TF-IDF + Multinomial Naive Bayes (baseline on VALIDATION set)

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Baseline model: TF-IDF features feeding a Multinomial Naive Bayes classifier.
#   * unigrams and bigrams (ngram_range=(1, 2))
#   * terms that occur in fewer than 2 documents are dropped (min_df=2)
pipe_nb = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer(lowercase=True, ngram_range=(1, 2), min_df=2)),
        ("nb", MultinomialNB()),
    ]
)

# Fit on the TRAIN split only so validation and test stay unseen.
pipe_nb.fit(X_train, y_train)

# Probability of class 1 for every VALIDATION message (second predict_proba column).
proba_val = pipe_nb.predict_proba(X_val)[:, 1]

# Hard labels at the conventional 0.50 cut-off.
pred_val_default = (proba_val >= 0.5).astype(int)

print("Validation @ threshold = 0.50")
print(classification_report(y_val, pred_val_default, digits=4))
print("Confusion matrix:\n", confusion_matrix(y_val, pred_val_default))

# ROC-AUC is computed from the probabilities, so it does not depend on the cut-off.
print("ROC-AUC (validation):", roc_auc_score(y_val, proba_val))

# Number of distinct n-grams the vectorizer kept.
print("Vocabulary size:", len(pipe_nb["tfidf"].vocabulary_))


Validation @ threshold = 0.50
              precision    recall  f1-score   support

           0     0.9562    1.0000    0.9776       677
           1     1.0000    0.6804    0.8098        97

    accuracy                         0.9599       774
   macro avg     0.9781    0.8402    0.8937       774
weighted avg     0.9617    0.9599    0.9566       774

Confusion matrix:
 [[677   0]
 [ 31  66]]
ROC-AUC (validation): 0.9808433202881116
Vocabulary size: 8699


In [None]:
# Threshold tuning
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, precision_recall_curve

# --- 1) F1-optimal threshold on the validation split ------------------------
# Sweep a 0.00-1.00 grid in steps of 0.01 and keep the cut-off with the best F1.
# zero_division=0 keeps the sweep quiet at cut-offs where no message is
# predicted as spam (F1 is reported as 0 there, same value as the default).
thresholds = np.linspace(0, 1, 101)
f1s = [
    f1_score(y_val, (proba_val >= t).astype(int), zero_division=0)
    for t in thresholds
]
best_t = float(thresholds[int(np.argmax(f1s))])


def show_metrics(name, t):
    """Print precision, recall, F1 and the confusion matrix on the validation split.

    Parameters
    ----------
    name : str
        Label printed in the section header.
    t : float
        Decision threshold; a message is predicted as class 1 when its
        validation probability (``proba_val``) is >= ``t``.
    """
    pred = (proba_val >= t).astype(int)
    print(f"\n=== {name} (t={t:.2f}) ===")
    print(f"precision={precision_score(y_val, pred):.3f}  recall={recall_score(y_val, pred):.3f}  f1={f1_score(y_val, pred):.3f}")
    print("confusion matrix:\n", confusion_matrix(y_val, pred))


# --- 2) High-recall operating point (recall >= 0.90) ------------------------
# precision_recall_curve returns thresholds in increasing order, and recall is
# non-increasing along them, so rec[0] (lowest cut-off) is the maximum recall.
# Stopping at the FIRST index that meets the target therefore always selects
# the lowest threshold (flag almost everything as spam). What we want is the
# LAST index that still meets the target: the highest threshold, and hence the
# best precision, that keeps recall at or above 0.90.
TARGET_RECALL = 0.90
prec, rec, th = precision_recall_curve(y_val, proba_val)  # th aligns with rec[:-1]/prec[:-1]
meets_target = np.flatnonzero(rec[:-1] >= TARGET_RECALL)
# Fall back to the F1-optimal threshold if no cut-off reaches the target.
t_recall = float(th[meets_target[-1]]) if meets_target.size else best_t

# Show three useful operating points
show_metrics("default", 0.50)
show_metrics("best_F1", best_t)
show_metrics("recall≈0.90", t_recall)

best_t  # keep this for next



=== default (t=0.50) ===
precision=1.000  recall=0.680  f1=0.810
confusion matrix:
 [[677   0]
 [ 31  66]]

=== best_F1 (t=0.13) ===
precision=0.989  recall=0.887  f1=0.935
confusion matrix:
 [[676   1]
 [ 11  86]]

=== recall≈0.90 (t=0.00) ===
precision=0.125  recall=1.000  f1=0.223
confusion matrix:
 [[  0 677]
 [  0  97]]


0.13

In [None]:
# Final evaluation on TEST set (using t from Cell 4)
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score

# Operating threshold: reuse the F1-optimal value selected on the VALIDATION
# split (best_t from the threshold-tuning cell) instead of retyping the number,
# so a re-run with different data or model cannot leave a stale cut-off here.
t = best_t

# Probability of class 1 for every TEST message, then hard labels at threshold t.
proba_test = pipe_nb.predict_proba(X_test)[:, 1]
pred_test = (proba_test >= t).astype(int)

print(f"Test-set performance @ threshold={t:.2f}")
print(classification_report(y_test, pred_test, digits=4))
print("Confusion matrix:\n", confusion_matrix(y_test, pred_test))
# Threshold-independent summaries computed from the probabilities.
print("ROC-AUC (test):", roc_auc_score(y_test, proba_test))
print("PR-AUC  (test):", average_precision_score(y_test, proba_test))


Test-set performance @ threshold=0.13
              precision    recall  f1-score   support

           0     0.9797    0.9971    0.9883       678
           1     0.9762    0.8542    0.9111        96

    accuracy                         0.9793       774
   macro avg     0.9780    0.9256    0.9497       774
weighted avg     0.9793    0.9793    0.9787       774

Confusion matrix:
 [[676   2]
 [ 14  82]]
ROC-AUC (test): 0.9819167895771878
PR-AUC  (test): 0.9489446078831076


# **Cell 6 — Inspect false negatives / false positives**
**Review the spam messages the model missed (and any ham it flagged) on the test set from Cell 5**


In [None]:
import pandas as pd
import numpy as np

def _print_examples(heading, rows):
    """Print a section heading, then each row's spam probability and its text (first 220 chars)."""
    print(f"\n--- {heading} ---")
    for row in rows.itertuples(index=False):
        print(f"\nproba={row.proba_spam:.3f} | text: {row.text[:220]}")


# Review table for the test split: one row per message with its true label,
# spam probability, thresholded prediction and an outcome tag
# ("correct", "FN" = missed spam, "FP" = everything else that is wrong).
df_test = pd.DataFrame(
    {
        "text": X_test,
        "y_true": y_test,
        "proba_spam": proba_test,
    }
).assign(
    pred=lambda d: (d["proba_spam"] >= t).astype(int),
    correct=lambda d: d["pred"] == d["y_true"],
    type=lambda d: np.where(
        d["correct"],
        "correct",
        np.where((d["y_true"] == 1) & (d["pred"] == 0), "FN", "FP"),
    ),
)

print("Counts by type:\n", df_test["type"].value_counts())

# Up to 10 missed-spam rows, highest probability first (closest to the threshold).
fns = df_test[df_test["type"] == "FN"].sort_values("proba_spam", ascending=False).head(10)
_print_examples("Top False Negatives (missed spam)", fns)

# Up to 10 wrongly flagged rows, lowest probability first (closest to the threshold).
fps = df_test[df_test["type"] == "FP"].sort_values("proba_spam", ascending=True).head(10)
_print_examples("Top False Positives (ham flagged spam)", fps)


Counts by type:
 type
correct    758
FN          14
FP           2
Name: count, dtype: int64

--- Top False Negatives (missed spam) ---

proba=0.112 | text: Hi babe its Jordan, how r u? Im home from abroad and lonely, text me back if u wanna chat xxSP visionsms.com Text stop to stopCost 150p 08712400603

proba=0.103 | text: I want some cock! My hubby's away, I need a real man 2 satisfy me. Txt WIFE to 89938 for no strings action. (Txt STOP 2 end, txt rec Ã¥Â£1.50ea. OTBox 731 LA1 7WS. )

proba=0.095 | text: TheMob>Yo yo yo-Here comes a new selection of hot downloads for our members to get for FREE! Just click & open the next link sent to ur fone...

proba=0.088 | text: FreeMsg Hey U, i just got 1 of these video/pic fones, reply WILD to this txt & ill send U my pics, hurry up Im so bored at work xxx (18 150p/rcvd STOP2stop)

proba=0.070 | text: ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE MINS. INDIA CUST SERVs SED YES. L8ER GOT MEGA BILL. 3 DONT GIV A SHIT. BAILIFF DUE IN DAYS. I O Ã¥

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Report test-set metrics at a few cut-offs around the chosen threshold.
# NOTE(review): these numbers come from the TEST split. Treat them as a
# diagnostic only; choosing a threshold from them would leak test information
# into model selection.
candidate_thresholds = (0.13, 0.12, 0.11, 0.10)
for cutoff in candidate_thresholds:
    labels_at_cutoff = (proba_test >= cutoff).astype(int)
    print(f"\n=== TEST @ t={cutoff:.2f} ===")
    print(classification_report(y_test, labels_at_cutoff, digits=4))
    print("confusion:\n", confusion_matrix(y_test, labels_at_cutoff))



=== TEST @ t=0.13 ===
              precision    recall  f1-score   support

           0     0.9797    0.9971    0.9883       678
           1     0.9762    0.8542    0.9111        96

    accuracy                         0.9793       774
   macro avg     0.9780    0.9256    0.9497       774
weighted avg     0.9793    0.9793    0.9787       774

confusion:
 [[676   2]
 [ 14  82]]

=== TEST @ t=0.12 ===
              precision    recall  f1-score   support

           0     0.9796    0.9926    0.9861       678
           1     0.9425    0.8542    0.8962        96

    accuracy                         0.9755       774
   macro avg     0.9611    0.9234    0.9411       774
weighted avg     0.9750    0.9755    0.9749       774

confusion:
 [[673   5]
 [ 14  82]]

=== TEST @ t=0.11 ===
              precision    recall  f1-score   support

           0     0.9810    0.9897    0.9853       678
           1     0.9222    0.8646    0.8925        96

    accuracy                         0.9742

# **From Cell 7 I can see that t = 0.13 is still the best choice, so I will keep it**
**Save the pipeline together with the threshold 0.13 so the same decision rule is applied whenever the saved model is loaded later to make predictions**

In [None]:
import joblib, json

# Use the NB pipeline i already trained: pipe_nb
# And my chosen threshold:
t = 0.13

joblib.dump(pipe_nb, "spam_nb_tfidf.joblib")
with open("threshold.json","w") as f:
    json.dump({"threshold": float(t)}, f)

print("Saved files:")
!ls -lh spam_nb_tfidf.joblib threshold.json


Saved files:
-rw-r--r-- 1 root root 457K Aug 12 10:14 spam_nb_tfidf.joblib
-rw-r--r-- 1 root root   19 Aug 12 10:14 threshold.json


In [None]:
import joblib, json

# Round-trip check: reload the saved pipeline and threshold, then score one message.
# NOTE: joblib.load unpickles the file, so only load artifacts you trust.
pipe = joblib.load("spam_nb_tfidf.joblib")
with open("threshold.json") as fh:
    T = json.load(fh)["threshold"]

sample_msg = "Free entry in a weekly prize draw, claim now!"

print("Loaded threshold:", T)
print("Classes:", pipe.named_steps["nb"].classes_)
# Probability of class 1 for the single sample message.
print("Mini test:", pipe.predict_proba([sample_msg])[0, 1])


Loaded threshold: 0.13
Classes: [0 1]
Mini test: 0.9213202911398749


In [None]:
import gradio as gr, joblib, json

# Reload the persisted artifacts so this cell does not depend on earlier cells.
# NOTE: joblib.load unpickles the file, so only load artifacts you trust.
pipe = joblib.load("spam_nb_tfidf.joblib")
with open("threshold.json") as f:
    T = json.load(f)["threshold"]


def predict_sms(msg: str):
    """Return class probabilities for one message.

    Returns ``{"SPAM": p, "HAM": 1 - p}`` where ``p`` is the pipeline's
    probability for class 1, or ``{"—": 1.0}`` when the message is empty or
    whitespace-only. No threshold is applied here.
    """
    msg = (msg or "").strip()
    if not msg:
        return {"—": 1.0}
    p = float(pipe.predict_proba([msg])[0, 1])
    return {"SPAM": p, "HAM": 1 - p}


def classify_sms(msg: str):
    """Return ``(decision_text, probabilities)`` with the saved threshold ``T`` applied.

    The probability widget alone ranks classes by probability, which amounts
    to a 0.50 cut-off. The explicit decision string is what makes the tuned
    threshold take effect in the UI.
    """
    scores = predict_sms(msg)
    if "SPAM" not in scores:  # empty input: nothing to decide
        return "—", scores
    p_spam = scores["SPAM"]
    verdict = "SPAM" if p_spam >= T else "HAM"
    return f"{verdict}  (P(spam) = {p_spam:.3f}, threshold = {T:.2f})", scores


demo = gr.Interface(
    fn=classify_sms,
    inputs=gr.Textbox(lines=4, label="Message"),
    outputs=[
        gr.Textbox(label="Decision"),
        gr.Label(label="Class probabilities", num_top_classes=2),
    ],
    title="Spam Detector (TF-IDF + Naive Bayes)",
    description=f"Decision threshold t = {T:.2f} (used internally for decisions)",
    examples=[
        ["Free entry! Claim now by clicking the link"],
        ["Dinner at 8 tonight?"],
        ["Your OTP is 274913. Do not share it."]
    ]
)

# share=True asks Gradio for a public share URL; anyone with the link can reach the demo.
demo.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://e99ce7fdaeef081600.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


