**Load the dataset and normalize the labels**

In [5]:
import pandas as pd

# Read the raw Enron spam/ham CSV into a DataFrame.
df = pd.read_csv("data/enron_spam_data.csv")

# Build one free-text column: subject, a single space, then the message body.
# Missing subjects/messages are replaced by "" first so the concatenation
# never produces NaN.
df["text"] = (
    df["Subject"].fillna("")
    .add(" ")
    .add(df["Message"].fillna(""))
)

# Target column: lower-cased copy of the "Spam/Ham" column.
df["label"] = df["Spam/Ham"].str.lower()

# Cell output: number of rows per label.
df["label"].value_counts()


label
spam    17171
ham     16545
Name: count, dtype: int64

**Clean text (lowercase; remove HTML tags, URLs, digits, and punctuation; collapse whitespace)**

In [6]:
import re
import string

# Translation table that deletes every ASCII punctuation character.
# Built once here instead of on every call to clean_text.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalize one raw e-mail text into a lower-case, space-separated string.

    Steps, in order: lower-case, replace HTML-like tags with a space, replace
    URLs (anything starting with ``http`` or ``www``) with a space, replace
    digit runs with a space, delete ASCII punctuation, then collapse runs of
    whitespace and strip the ends.

    Parameters
    ----------
    text : object
        The raw text. Non-string values are converted with ``str()``.
        ``None`` and float NaN are treated as missing.

    Returns
    -------
    str
        The cleaned text; ``""`` for missing or empty input.
    """
    # Missing values would otherwise be stringified into the literal tokens
    # "none" / "nan" and end up in the vocabulary. NaN is the only float that
    # is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    text = re.sub(r'<[^>]+>', ' ', text)         # remove HTML
    text = re.sub(r'http\S+|www\S+', ' ', text)  # remove URLs
    text = re.sub(r'\d+', ' ', text)             # remove numbers
    text = text.translate(_PUNCT_TABLE)          # delete punctuation
    text = re.sub(r'\s+', ' ', text).strip()     # collapse whitespace
    return text

# Run clean_text on every combined subject+message string and keep the
# result in its own column, leaving the raw "text" column untouched.
df["clean_text"] = df["text"].map(clean_text)


**Train-test split**

In [7]:
from sklearn.model_selection import train_test_split

# Features are the cleaned text; the target is the lower-cased spam/ham label.
X, y = df["clean_text"], df["label"]

# Hold out 20% of the rows for evaluation. The split is stratified on the
# label and uses a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


**TF-IDF feature extraction**

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.naive_bayes import MultinomialNB

# Vectorizer configuration: unigrams and bigrams, English stop words removed,
# vocabulary capped at 5000 entries (a starting value that can be tuned).
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
    stop_words="english",
)

# The vectorizer is fitted on the training texts only; the test texts are
# transformed with that already-fitted vectorizer.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Report the (rows, features) shape of each matrix.
print("Train TF-IDF shape:", X_train_tfidf.shape)
print("Test TF-IDF shape:", X_test_tfidf.shape)



Train TF-IDF shape: (26972, 5000)
Test TF-IDF shape: (6744, 5000)


**Naive Bayes**

In [12]:
# Create the multinomial Naive Bayes classifier and fit it on the training
# TF-IDF matrix in one step (fit returns the fitted estimator).
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)

# Predicted labels for the held-out test matrix.
y_pred_nb = nb_model.predict(X_test_tfidf)

# Evaluation: overall accuracy, per-class precision/recall/F1, and the
# confusion matrix of true vs. predicted labels.
print("Naïve Bayes")
print("Accuracy:", accuracy_score(y_test, y_pred_nb))
print("\nClassification report:\n", classification_report(y_test, y_pred_nb))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_nb))



Naïve Bayes
Accuracy: 0.9819098457888493

Classification report:
               precision    recall  f1-score   support

         ham       1.00      0.96      0.98      3309
        spam       0.97      1.00      0.98      3435

    accuracy                           0.98      6744
   macro avg       0.98      0.98      0.98      6744
weighted avg       0.98      0.98      0.98      6744

Confusion matrix:
 [[3187  122]
 [   0 3435]]


**Logistic Regression**

In [14]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the TF-IDF features.
# max_iter is raised to 2000 to give the solver room to converge.
# n_jobs is intentionally not passed: it only parallelizes over per-class
# sub-problems, and with two labels (spam/ham) there is a single problem to
# fit, so it had no effect here.
# NOTE(review): recent scikit-learn releases deprecate n_jobs on
# LogisticRegression; confirm against the installed version.
lr_model = LogisticRegression(max_iter=2000)

lr_model.fit(X_train_tfidf, y_train)

# Predicted labels for the held-out test matrix.
y_pred_lr = lr_model.predict(X_test_tfidf)

# Evaluation: overall accuracy, per-class precision/recall/F1, and the
# confusion matrix of true vs. predicted labels.
print("=== Logistic Regression ===")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("\nClassification report:\n", classification_report(y_test, y_pred_lr))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_lr))


=== Logistic Regression ===
Accuracy: 0.9985172004744959

Classification report:
               precision    recall  f1-score   support

         ham       1.00      1.00      1.00      3309
        spam       1.00      1.00      1.00      3435

    accuracy                           1.00      6744
   macro avg       1.00      1.00      1.00      6744
weighted avg       1.00      1.00      1.00      6744

Confusion matrix:
 [[3299   10]
 [   0 3435]]


**SVM (LinearSVC)**

In [15]:
from sklearn.svm import LinearSVC

# Linear support vector classifier on the TF-IDF features.
# random_state is fixed (same seed as the train/test split and the random
# forest) so the run is repeatable: the dual coordinate-descent solver
# shuffles the data, and which solver is chosen by default depends on the
# scikit-learn version.
svm_model = LinearSVC(random_state=42)

svm_model.fit(X_train_tfidf, y_train)

# Predicted labels for the held-out test matrix.
y_pred_svm = svm_model.predict(X_test_tfidf)

# Evaluation: overall accuracy, per-class precision/recall/F1, and the
# confusion matrix of true vs. predicted labels.
print("=== SVM (LinearSVC) ===")
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("\nClassification report:\n", classification_report(y_test, y_pred_svm))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_svm))


=== SVM (LinearSVC) ===
Accuracy: 0.9992586002372479

Classification report:
               precision    recall  f1-score   support

         ham       1.00      1.00      1.00      3309
        spam       1.00      1.00      1.00      3435

    accuracy                           1.00      6744
   macro avg       1.00      1.00      1.00      6744
weighted avg       1.00      1.00      1.00      6744

Confusion matrix:
 [[3304    5]
 [   0 3435]]


**Random Forest**

In [16]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the TF-IDF features.
rf_model = RandomForestClassifier(
    n_estimators=200,     # number of trees
    max_depth=None,       # no depth limit (can be tuned later)
    n_jobs=-1,            # use all CPU cores
    random_state=42       # fixed seed so the forest is repeatable
)

rf_model.fit(X_train_tfidf, y_train)

# Predicted labels for the held-out test matrix.
y_pred_rf = rf_model.predict(X_test_tfidf)

# Evaluation: overall accuracy, per-class precision/recall/F1, and the
# confusion matrix of true vs. predicted labels.
# The header uses the same "=== ... ===" banner as the Logistic Regression
# and SVM cells, which is also what this cell's recorded output shows.
print("=== Random Forest ===")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nClassification report:\n", classification_report(y_test, y_pred_rf))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_rf))


=== Random Forest ===
Accuracy: 0.9994068801897983

Classification report:
               precision    recall  f1-score   support

         ham       1.00      1.00      1.00      3309
        spam       1.00      1.00      1.00      3435

    accuracy                           1.00      6744
   macro avg       1.00      1.00      1.00      6744
weighted avg       1.00      1.00      1.00      6744

Confusion matrix:
 [[3305    4]
 [   0 3435]]
