# Naive Bayes classifer

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from fontTools.varLib import load_designspace
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
# from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score


In [2]:
df = pd.read_csv('emails.csv')

In [3]:
df.head()

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0


In [4]:
len(df.columns)

3002

In [5]:
df.shape[0]

5172

In [6]:
target = df.iloc[:,-1]
target.head() # so the name for tha target column is Predictin 1 for spam and 0 for not spam

0    0
1    0
2    0
3    0
4    0
Name: Prediction, dtype: int64

In [7]:
df['Prediction'].value_counts() # 0 is not spam and 1 is spam

Prediction
0    3672
1    1500
Name: count, dtype: int64

In [8]:
total_spam = df['Prediction'].value_counts()[1]
total_ham = df['Prediction'].value_counts()[0]

In [9]:
print(f"Total spam emails: {total_spam}")
print(f"Total ham emails: {total_ham}")

Total spam emails: 1500
Total ham emails: 3672


In [10]:
df.columns

Index(['Email No.', 'the', 'to', 'ect', 'and', 'for', 'of', 'a', 'you', 'hou',
       ...
       'connevey', 'jay', 'valued', 'lay', 'infrastructure', 'military',
       'allowing', 'ff', 'dry', 'Prediction'],
      dtype='object', length=3002)

In [11]:
tot_emails = len(df.index)
tot_emails

5172

In [12]:
p_of_spam = round(df['Prediction'].value_counts()[1] / tot_emails,2)
p_of_ham = round(df['Prediction'].value_counts()[0] / tot_emails,2)

In [13]:
print(f"Probability of spam: {p_of_spam}")
print(f"Probability of ham: {p_of_ham}")

Probability of spam: 0.29
Probability of ham: 0.71


In [14]:
spam_row = df[df['Prediction'] == 1]
spam_row.head()


Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
5,Email 6,4,5,1,4,2,3,45,1,0,...,0,0,0,0,0,0,0,0,0,1
7,Email 8,0,2,2,3,1,2,21,6,0,...,0,0,0,0,0,0,0,1,0,1
16,Email 17,3,1,2,2,0,1,17,0,0,...,0,0,0,0,0,0,0,1,0,1
17,Email 18,36,21,6,14,7,17,194,25,5,...,0,0,0,0,0,0,0,3,0,1
25,Email 26,12,53,2,14,18,14,287,0,2,...,0,0,0,0,0,0,0,6,0,1


In [15]:
X = df.drop(columns=['Email No.','Prediction'])
X

Unnamed: 0,the,to,ect,and,for,of,a,you,hou,in,...,enhancements,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry
0,0,0,1,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,8,13,24,6,6,2,102,1,27,18,...,0,0,0,0,0,0,0,0,1,0
2,0,0,1,0,0,0,8,0,0,4,...,0,0,0,0,0,0,0,0,0,0
3,0,5,22,0,5,1,51,2,10,1,...,0,0,0,0,0,0,0,0,0,0
4,7,6,17,1,5,2,57,0,9,3,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5167,2,2,2,3,0,0,32,0,0,5,...,0,0,0,0,0,0,0,0,0,0
5168,35,27,11,2,6,5,151,4,3,23,...,0,0,0,0,0,0,0,0,1,0
5169,0,0,1,1,0,0,11,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5170,2,7,1,0,2,1,28,2,0,8,...,0,0,0,0,0,0,0,0,1,0


In [16]:
y = df['Prediction']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [38]:
X_train


Unnamed: 0,the,to,ect,and,for,of,a,you,hou,in,...,enhancements,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry
3164,6,4,2,1,2,8,39,5,2,5,...,0,0,0,0,0,0,0,0,1,0
2067,0,0,1,0,1,0,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4717,40,28,4,21,12,12,260,24,4,78,...,0,0,0,0,0,0,0,0,3,0
2505,5,1,1,0,1,1,12,0,0,3,...,0,0,0,0,0,0,0,0,0,0
2268,6,6,3,1,1,0,64,0,1,7,...,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4426,0,3,2,0,2,0,16,0,0,1,...,0,0,0,0,0,0,0,0,0,0
466,4,5,1,0,5,1,28,1,0,1,...,0,0,0,0,0,0,0,0,0,0
3092,0,0,1,0,1,0,5,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3772,2,11,1,6,4,5,58,10,0,12,...,0,0,0,0,0,0,0,0,3,0


In [132]:
from sklearn.naive_bayes import MultinomialNB
classifer = MultinomialNB()
classifer.fit(X_train, y_train)

In [133]:
y_pred = classifer.predict(X_test)

In [134]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.95      0.97       739
           1       0.89      0.96      0.92       296

    accuracy                           0.95      1035
   macro avg       0.94      0.96      0.95      1035
weighted avg       0.96      0.95      0.96      1035



In [135]:
print(accuracy_score(y_test, y_pred))

0.9545893719806763


In [4]:
import joblib
from sklearn.feature_extraction.text import CountVectorizer

In [138]:
vocab = list(X.columns)
vocab

['the',
 'to',
 'ect',
 'and',
 'for',
 'of',
 'a',
 'you',
 'hou',
 'in',
 'on',
 'is',
 'this',
 'enron',
 'i',
 'be',
 'that',
 'will',
 'have',
 'with',
 'your',
 'at',
 'we',
 's',
 'are',
 'it',
 'by',
 'com',
 'as',
 'from',
 'gas',
 'or',
 'not',
 'me',
 'deal',
 'if',
 'meter',
 'hpl',
 'please',
 're',
 'e',
 'any',
 'our',
 'corp',
 'can',
 'd',
 'all',
 'has',
 'was',
 'know',
 'need',
 'an',
 'forwarded',
 'new',
 't',
 'may',
 'up',
 'j',
 'mmbtu',
 'should',
 'do',
 'am',
 'get',
 'out',
 'see',
 'no',
 'there',
 'price',
 'daren',
 'but',
 'been',
 'company',
 'l',
 'these',
 'let',
 'so',
 'would',
 'm',
 'into',
 'xls',
 'farmer',
 'attached',
 'us',
 'information',
 'they',
 'message',
 'day',
 'time',
 'my',
 'one',
 'what',
 'only',
 'http',
 'th',
 'volume',
 'mail',
 'contract',
 'which',
 'month',
 'more',
 'robert',
 'sitara',
 'about',
 'texas',
 'nom',
 'energy',
 'pec',
 'questions',
 'www',
 'deals',
 'volumes',
 'pm',
 'ena',
 'now',
 'their',
 'file',
 's

In [6]:
from sklearn.base import TransformerMixin,BaseEstimator
class ToDataFrameTransformer(TransformerMixin, BaseEstimator):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return pd.DataFrame(X.toarray(), columns=self.columns)

In [155]:
vectorizer = CountVectorizer(vocabulary=vocab)
to_df = ToDataFrameTransformer(columns=vocab)

In [149]:
X_new = vectorizer.transform([sample_email])
X_new

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 11 stored elements and shape (1, 3000)>

In [152]:
X_new_df = pd.DataFrame(X_new.toarray(), columns=vocab)
X_new_df

Unnamed: 0,the,to,ect,and,for,of,a,you,hou,in,...,enhancements,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry
0,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [156]:
from sklearn.pipeline import make_pipeline

pipeline = make_pipeline(vectorizer,to_df,classifer)
joblib.dump(pipeline, 'pipeline.pkl')

['pipeline.pkl']

In [49]:
loaded_pipeline = joblib.load('pipeline.pkl')

In [43]:
def predict_email(email):
    prediction = loaded_pipeline.predict([email])
    if prediction[0] == 1:
        return "The email is spam."
    else:
        return "The email is not spam."

In [52]:
spam_sample_email = """ Subject: 🎁 Congratulations! You've won a $1000 Gift Card!

Dear User,

You have been selected as the winner of a $1000 Walmart gift card!
To claim your prize, simply click the link below and complete the short survey.

👉 [Click here to claim your reward](https://giveawaywinner.com)

Hurry! This offer is valid for the next 24 hours only.

Best regards,
Rewards Team
 """

In [51]:
predict_email(spam_sample_email)

'The email is spam.'

In [46]:
non_spam_sample_email="""Subject: Meeting Reminder

Hi Nadin,

This is a friendly reminder about our scheduled meeting tomorrow at 10:00 AM in the conference room. Please let me know if you have any questions or need to reschedule.

Looking forward to seeing you there!

Best regards,
[Your Name]"""

In [47]:
predict_email(non_spam_sample_email)

'The email is not spam.'