# In [None]:

import os
import json
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
)


RANDOM_STATE = 42
CSV_PATH = "mail_l7_dataset.csv"

# -----------------------------
# Load & clean
# -----------------------------
if not os.path.exists(CSV_PATH):
    raise FileNotFoundError(f"CSV not found: {CSV_PATH}")

# Expecting columns: Category (spam/ham or 0/1), Message (text)
df = pd.read_csv(CSV_PATH)

# Fail early with a readable message instead of a bare KeyError further down.
missing_columns = [col for col in ("Category", "Message") if col not in df.columns]
if missing_columns:
    raise KeyError(f"CSV is missing required column(s): {missing_columns}")

# Replace NaNs so that missing messages become empty documents
df = df.where(pd.notnull(df), "")

# Encode labels => spam=0, ham=1 (handles already numeric too).
# The text comparison is done on a string view of the column rather than being
# gated on `dtype == "O"`, so it also works when pandas reads the column as a
# dedicated string dtype, and no ints are written into a text column.
raw_labels = df["Category"]
label_text = raw_labels.astype(str).str.strip().str.lower()
encoded = pd.to_numeric(raw_labels, errors="coerce")
encoded = encoded.mask(label_text == "spam", 0).mask(label_text == "ham", 1)
# Only 0 and 1 are valid classes; anything else (e.g. "2" or 0.5) is treated as
# unmapped instead of being truncated by the int cast below.
encoded = encoded.where(encoded.isin([0, 1]))
df["Category"] = encoded

before = len(df)
df = df.dropna(subset=["Category"]).copy()
df["Category"] = df["Category"].astype(int)
dropped = before - len(df)
if dropped:
    print(f"Dropped {dropped} rows due to unmapped Category labels.")

# -----------------------------
# Split X/y
# -----------------------------
X, y = df["Message"].astype(str), df["Category"].astype(int)

# Hold out 20% of the rows for evaluation, keeping the class ratio equal in
# both splits via stratification.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=RANDOM_STATE,
    stratify=y,
    test_size=0.2,
)

# -----------------------------
# TF-IDF (unigrams + bigrams)
# -----------------------------
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    stop_words="english",   # remove if your data isn't English
    lowercase=True,
)

# The vectorizer is fitted on the training split only; the test split is
# transformed with the already-fitted vectorizer.
X_train_features = tfidf.fit_transform(X_train)
X_test_features = tfidf.transform(X_test)

# -----------------------------
# Models
# -----------------------------
# Logistic Regression
lr = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
lr.fit(X_train_features, y_train)
y_pred_lr = lr.predict(X_test_features)

# Random Forest
# The estimator accepts sparse matrices directly, so the TF-IDF features are
# passed as-is. Calling .toarray() here would allocate a dense
# n_samples x n_features array for the whole unigram+bigram vocabulary.
rf = RandomForestClassifier(n_estimators=300, random_state=RANDOM_STATE, n_jobs=-1)
rf.fit(X_train_features, y_train)
y_pred_rf = rf.predict(X_test_features)

# Naive Bayes (Multinomial)
nb = MultinomialNB()
nb.fit(X_train_features, y_train)
y_pred_nb = nb.predict(X_test_features)

# -----------------------------
# output results
# -----------------------------
def report_block(name, y_true, y_pred, pos_label=0):
    """Print accuracy, precision, recall, F1 and the confusion matrix.

    Args:
        name: Heading printed (followed by a colon) above the metrics.
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        pos_label: Label treated as the positive class for precision,
            recall and F1.

    The confusion matrix is computed with ``labels=[0, 1]``: rows are the
    true labels 0 and 1, columns are the predicted labels 0 and 1. Which
    cell counts as a "true positive" therefore depends on ``pos_label``.
    """
    positive = {"pos_label": pos_label, "zero_division": 0}
    metric_rows = (
        ("Accuracy  ", accuracy_score(y_true, y_pred)),
        ("Precision ", precision_score(y_true, y_pred, **positive)),
        ("Recall    ", recall_score(y_true, y_pred, **positive)),
        ("F1-Score  ", f1_score(y_true, y_pred, **positive)),
    )
    matrix = confusion_matrix(y_true, y_pred, labels=[0, 1])

    # .tolist() yields plain Python ints, one inner list per true-label row.
    (n00, n01), (n10, n11) = matrix.tolist()

    print(f"{name}:")
    for caption, value in metric_rows:
        print(f"  {caption}: {value:.2f}")
    print("  Confusion Matrix:")
    print(f"    [[{n00:>3} {n01:>3}]")
    print(f"     [ {n10:>3} {n11:>3}]]")

# One report per model: a title line, then the metric block. A blank line
# separates consecutive reports (none before the first or after the last).
model_reports = (
    ("Logistic Regression Performance", y_pred_lr),
    ("Random Forest Performance", y_pred_rf),
    ("Naive Bayes Performance", y_pred_nb),
)
for position, (title, predictions) in enumerate(model_reports):
    if position:
        print()
    print(title)
    report_block(title, y_test, predictions, pos_label=0)

# -----------------------------
# metrics JSON output
# -----------------------------
def gather_metrics(y_true, y_pred, pos_label=0):
    """Collect evaluation metrics into a JSON-serializable dict.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        pos_label: Label treated as the positive class for precision,
            recall and F1.

    Returns:
        A dict with float entries ``accuracy``, ``precision``, ``recall``
        and ``f1``, plus ``confusion_matrix`` as a nested list computed
        with ``labels=[0, 1]``.
    """
    matrix = confusion_matrix(y_true, y_pred, labels=[0, 1])

    collected = {"accuracy": float(accuracy_score(y_true, y_pred))}
    positive_class_scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    for key, scorer in positive_class_scorers:
        collected[key] = float(
            scorer(y_true, y_pred, pos_label=pos_label, zero_division=0)
        )
    collected["confusion_matrix"] = matrix.tolist()
    return collected

# Metrics for every model, keyed by model name, written out as indented JSON.
predictions_by_model = {
    "logistic_regression": y_pred_lr,
    "random_forest": y_pred_rf,
    "naive_bayes": y_pred_nb,
}
results = {
    model_key: gather_metrics(y_test, predicted, pos_label=0)
    for model_key, predicted in predictions_by_model.items()
}
with open("spam_results.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(results, indent=2))

# -----------------------------
# Sanity-check messages (3 required)
# -----------------------------
def label_to_text(v):
    """Return the display name for a class label.

    The value is coerced with ``int()``; 1 maps to "Ham" and every other
    integer maps to "Spam".
    """
    if int(v) == 1:
        return "Ham"
    return "Spam"

print("\nPredictions for 3 sample test messages with labels: Ham or Spam.")
tests = [
    "Free entry in 2 a weekly competition!",
    "I will meet you at the cafe tomorrow",
    "Congratulations, you won a free ticket",
]
for t in tests:
    # Vectorize the single message with the already-fitted TF-IDF vocabulary.
    v = tfidf.transform([t])
    verdicts = (
        ("Logistic Regression", lr.predict(v)[0]),
        ("Random Forest", rf.predict(v.toarray())[0]),
        ("Naive Bayes", nb.predict(v)[0]),
    )
    print(f"\nText: {t!r}")
    for model_name, predicted in verdicts:
        # Pad names to the longest one (19 chars) so the colons line up.
        print(f"  {model_name:<19}: {label_to_text(predicted)}")
