In [None]:
import sys, os
# Make the project root (the notebook's parent directory) and its src/ folder
# importable, then work from the project root.
#
# The root is resolved only the first time this cell runs in a kernel:
# os.path.abspath("..") is relative to the *current* working directory, so
# recomputing it after the chdir below would climb one level higher on every
# re-run and register the wrong directories on sys.path.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = os.path.abspath("..")

for _import_dir in (PROJECT_ROOT, os.path.join(PROJECT_ROOT, "src")):
    # Skip entries that are already registered so re-runs do not pile up duplicates.
    if _import_dir not in sys.path:
        sys.path.append(_import_dir)

os.chdir(PROJECT_ROOT)


In [None]:
import pandas as pd
from load_emails import main as load_all

import load_emails

# Read each folder through the project loader, tagging it with its class label.
# NOTE(review): load_folder presumably returns a list of per-email records —
# confirm in load_emails; all this cell relies on is that the two results can
# be concatenated with `+` and handed to pd.DataFrame.
spam_records = load_emails.load_folder(load_emails.SPAM_DIR, "spam")
ham_records = load_emails.load_folder(load_emails.HAM_DIR, "ham")

# Spam rows first, then ham rows, in a single frame.
df = pd.DataFrame(spam_records + ham_records)

# Quick sanity check: first rows and overall (rows, columns).
print(df.head())
print(df.shape)

In [None]:
# Number of emails per class label.
df["label"].value_counts()

In [None]:
# Character count of every email body, stored as a new column.
body_char_counts = df["body"].str.len()
df["body_length"] = body_char_counts

# Summary statistics of the new column.
df["body_length"].describe()

In [None]:
def _pick_body(label):
    """Return the body of one email with the given label, drawn with a fixed seed."""
    matching = df[df["label"] == label]
    chosen = matching.sample(1, random_state=1)
    return chosen["body"].values[0]


# One reproducible example per class, to eyeball what the raw text looks like.
print("=== SPAM SAMPLE ===")
print(_pick_body("spam"))

print("\n=== HAM SAMPLE ===")
print(_pick_body("ham"))


In [None]:
from collections import Counter
import re

# Compiled once: a run of letters, optionally continued by single inner
# apostrophes ("don't", "o'clock"). Apostrophes may not start, end or make up a
# token, so quoted words ('free') are counted together with the bare word.
_WORD_RE = re.compile(r"[a-z]+(?:'[a-z]+)*")


def tokenize(text):
    """Split text into lowercase word tokens.

    Args:
        text: The string to tokenize. Any non-string value (None, NaN, ...)
            is treated as empty text.

    Returns:
        A list of lowercase tokens in order of appearance. Each token is made
        of ASCII letters, possibly with apostrophes between letters; digits,
        punctuation and surrounding quote marks are dropped.
    """
    if not isinstance(text, str):
        return []
    # Lowercase first so the pattern only needs to match a-z.
    return _WORD_RE.findall(text.lower())

def _word_counts(label):
    """Tally tokens over every non-null body of the emails carrying `label`."""
    bodies = df.loc[df["label"] == label, "body"].dropna()
    tally = Counter()
    for body in bodies:
        tally.update(tokenize(body))
    return tally


# Per-class word frequencies.
spam_words = _word_counts("spam")
ham_words = _word_counts("ham")

# The 20 most frequent tokens of each class.
print("Top spam words:", spam_words.most_common(20))
print("\nTop ham words:", ham_words.most_common(20))


In [None]:
# How many emails have a zero-length body (missing bodies do not count as zero).
df["body"].str.len().eq(0).sum()

In [None]:
# Model input: subject and body joined by a single space, with missing values
# replaced by empty strings so the concatenation never yields NaN.
subject_part = df["subject"].fillna("")
body_part = df["body"].fillna("")
df["text"] = subject_part + " " + body_part

# Length distribution of the combined text.
df["text"].str.len().describe()


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF settings gathered in one place so they are easy to tune.
tfidf_options = dict(
    lowercase=True,      # fold case before tokenizing
    stop_words=None,     # no stop-word list is applied for now
    ngram_range=(1, 2),  # single words and two-word phrases
    max_features=5000,   # cap on the vocabulary size (number of columns)
)
vectorizer = TfidfVectorizer(**tfidf_options)

# Learn the vocabulary from the combined text and vectorize every email.
X = vectorizer.fit_transform(df["text"])

# (number of emails, number of features)
X.shape


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Target labels, one per email.
y = df["label"]

# Split the raw text rather than the pre-computed matrix X: X was produced by
# a vectorizer fitted on *all* emails, so its vocabulary and IDF weights
# already contain information from the rows that end up in the test set.
# Stratifying on y keeps the spam/ham ratio equal in both parts; the fixed
# random_state makes the partition reproducible.
text_train, text_test, y_train, y_test = train_test_split(
    df["text"], y, test_size=0.2, random_state=42, stratify=y
)

# Learn the vocabulary and IDF weights from the training emails only, then
# apply that fitted transform unchanged to the held-out emails.
# Note: this re-fits `vectorizer`, so from here on it matches X_train/X_test
# and the models trained on them, not the full-corpus matrix X built earlier.
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Baseline model: multinomial naive Bayes on the TF-IDF features.
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Predict the held-out emails and report per-class precision/recall/F1.
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Second model on the same split: logistic regression, with the iteration
# limit set to 2000. fit() returns the estimator itself, so construction and
# training can be chained.
lr = LogisticRegression(max_iter=2000).fit(X_train, y_train)

# Held-out predictions and the per-class report.
y_pred_lr = lr.predict(X_test)
print(classification_report(y_test, y_pred_lr))


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Fix the class order once so the matrix rows/columns and the tick labels agree.
class_order = ["ham", "spam"]

# Confusion matrix of the logistic-regression predictions on the test set.
cm = confusion_matrix(y_test, y_pred_lr, labels=class_order)

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_order)
disp.plot(cmap="Blues")
plt.show()
