# Email Spam Detection

# Importing Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
df = pd.read_csv("Dataset I.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [4]:
df['spam']=df['Category'].apply(lambda x: 1 if x=='spam' else 0)
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [5]:
X_train, X_test, y_train, y_test = train_test_split(df.Message,df.spam,test_size=0.25)

In [6]:
vectorizer = CountVectorizer()
X_train_count = vectorizer.fit_transform(X_train.values)
X_test_count = vectorizer.transform(X_test)
X_train_count.toarray()[:2]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

# Naive Bayes Model

In [7]:
nb_model = MultinomialNB()
nb_model.fit(X_train_count,y_train)

In [8]:
nb_model.score(X_test_count, y_test)

0.9820531227566404

In [9]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]
emails_count = vectorizer.transform(emails)
nb_model.predict(emails_count)

array([0, 1], dtype=int64)

# Random Forest Model

In [10]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_count, y_train)

In [11]:
rf_model.score(X_test_count, y_test)

0.9741564967695621

In [12]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]
emails_count = vectorizer.transform(emails)
rf_model.predict(emails_count)

array([0, 0], dtype=int64)

# Naive Bayes vs Random Forest Evaluation

In [13]:
# Predictions
nb_preds = nb_model.predict(X_test_count)
rf_preds = rf_model.predict(X_test_count)

In [14]:
# Evaluation
print("Naïve Bayes Accuracy:", accuracy_score(y_test, nb_preds))
print(classification_report(y_test, nb_preds))

print("Random Forest Accuracy:", accuracy_score(y_test, rf_preds))
print(classification_report(y_test, rf_preds))

Naïve Bayes Accuracy: 0.9820531227566404
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1199
           1       0.98      0.89      0.93       194

    accuracy                           0.98      1393
   macro avg       0.98      0.94      0.96      1393
weighted avg       0.98      0.98      0.98      1393

Random Forest Accuracy: 0.9741564967695621
              precision    recall  f1-score   support

           0       0.97      1.00      0.99      1199
           1       1.00      0.81      0.90       194

    accuracy                           0.97      1393
   macro avg       0.99      0.91      0.94      1393
weighted avg       0.97      0.97      0.97      1393



# SKLearn Pipeline

In [15]:
from sklearn.pipeline import Pipeline
clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [16]:
clf.fit(X_train, y_train)

In [17]:
clf.score(X_test,y_test)

0.9820531227566404

In [18]:
clf.predict(emails)

array([0, 1], dtype=int64)

# Flask API

In [19]:
import pickle

In [20]:
# Save Spam Detection Model
with open("spam_model.pkl", "wb") as f:
    pickle.dump(nb_model, f)

In [21]:
# Save Count Vectorizer
with open("spam_vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)