In [18]:
import pandas as pd
import numpy as np

In [217]:
df = pd.read_csv("enron_spam_data.csv")
df.head()

Unnamed: 0,Message ID,Subject,Message,Spam/Ham,Date
0,0,christmas tree farm pictures,,ham,1999-12-10
1,1,"vastar resources , inc .","gary , production from the high island larger ...",ham,1999-12-13
2,2,calpine daily gas nomination,- calpine daily gas nomination 1 . doc,ham,1999-12-14
3,3,re : issue,fyi - see note below - already done .\nstella\...,ham,1999-12-14
4,4,meter 7268 nov allocation,fyi .\n- - - - - - - - - - - - - - - - - - - -...,ham,1999-12-14


In [219]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33716 entries, 0 to 33715
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Message ID  33716 non-null  int64 
 1   Subject     33427 non-null  object
 2   Message     33345 non-null  object
 3   Spam/Ham    33716 non-null  object
 4   Date        33716 non-null  object
dtypes: int64(1), object(4)
memory usage: 1.3+ MB


In [224]:
df['email'] = df['Subject'].str.cat(df['Message'], sep = ' ', na_rep = '')
df = df.rename(columns={'Spam/Ham':'label'})

## Vectorising the Label

Encoding the output labels to either 0 or 1.

0 -> ham (not spam email) 
1 -> spam (spam email)

In [227]:
df['label_num'] = df.label.map({'ham':0, 'spam':1})

## Vectorising the emails using TF-IDF method 

In [296]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

tfidf_vectorizer = TfidfVectorizer(
    max_features=100000,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.95
)

count_vectorizer = CountVectorizer(max_features = 100000, ngram_range = (1, 2))
X_counts = count_vectorizer.fit_transform(df['email'])

# X_tfidf = tfidf_vectorizer.fit_transform(df['email'])
print("Shape of TF-IDF matrix:", X_tfidf.shape)

Shape of TF-IDF matrix: (33716, 100000)


# Splitting the data into train and test datasets

In [300]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    # X_tfidf,
    X_counts,
    df['label_num'],
    test_size = 0.2,
    random_state = 42,
    stratify = df['label_num']
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

Train shape: (26972, 100000)
Test shape: (6744, 100000)
y_train shape: (26972,)
y_test shape: (6744,)


## Building the Naive Bayes model

Multinomial Naive Bayes is an excellent model for text classification.

The multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work.

In [324]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
model = MultinomialNB()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Get spam probabilities
y_probs = model.predict_proba(X_test)[:, 1]

# Set custom threshold
import numpy as np

threshold = 0.756  # Try higher thresholds to increase precision
y_pred_custom = (y_probs >= threshold).astype(int)

# Evaluate
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_custom, target_names=["spam", "ham"]))

              precision    recall  f1-score   support

        spam       0.99      0.99      0.99      3309
         ham       0.99      0.99      0.99      3435

    accuracy                           0.99      6744
   macro avg       0.99      0.99      0.99      6744
weighted avg       0.99      0.99      0.99      6744



In [304]:
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("\nClassification Report: ", classification_report(y_test, y_pred))
print("\nConfusion_matrix: ", confusion_matrix(y_test, y_pred))

Accuracy:  0.9903618030842231

Classification Report:                precision    recall  f1-score   support

           0       0.99      0.99      0.99      3309
           1       0.99      0.99      0.99      3435

    accuracy                           0.99      6744
   macro avg       0.99      0.99      0.99      6744
weighted avg       0.99      0.99      0.99      6744


Confusion_matrix:  [[3280   29]
 [  36 3399]]


In [306]:
import joblib

In [310]:
joblib.dump(model, 'spam_classifier_model.pkl')
# joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(count_vectorizer, 'count_vectorizer.pkl')

['count_vectorizer.pkl']