In [None]:
import sys
import numpy
import pandas
import scipy
import sklearn
import re
import urllib.request
import os
import tarfile
import pickle
import pandas as pd
import nltk

In [None]:
downloads = os.path.join(os.environ['HOME'] + "/Downloads")

In [None]:
url = "http://www.aueb.gr/users/ion/data/enron-spam/preprocessed/"

enron_dir = os.path.join(downloads, 'Enron emails')

enron_files = ['enron1.tar.gz', 'enron2.tar.gz', 'enron3.tar.gz',
               'enron4.tar.gz', 'enron5.tar.gz', 'enron6.tar.gz']


In [None]:
def Get_Emails():
    if not os.path.exists(enron_dir):
        os.makedirs(enron_dir)
    for file in enron_files:
        path = os.path.join(enron_dir, file)
        if not os.path.exists(path):
            urllib.request.urlretrieve(url + file, path)

In [None]:
def extract_emails(fname):
    rows = []
    tfile = tarfile.open(fname, 'r:gz')
    for member in tfile.getmembers():
        if 'ham' in member.name:
            f = tfile.extractfile(member)
            if f is not None:
                row = f.read()
                rows.append({'message': row, 'class': 'ham'})
        if 'spam' in member.name:
            f = tfile.extractfile(member)
            if f is not None:
                row = f.read()
                rows.append({'message': row, 'class': 'spam'})
    tfile.close()
    return pd.DataFrame(rows)

In [None]:
def populate_df_and_pickle():
    if not os.path.exists(downloads + "/emails.pickle"):
        emails_df = pd.DataFrame({'message': [], 'class': []})
        for file in enron_files:
            unzipped_file = extract_emails(os.path.join(enron_dir, file))
            emails_df = emails_df.append(unzipped_file)
        emails_df.to_pickle(downloads + "/emails.pickle")


In [None]:
Get_Emails()
populate_df_and_pickle()


In [None]:
with open(downloads + '/emails.pickle', 'rb') as f:
    emails_df = pickle.load(f) 


emails_df['message'] = emails_df['message'].apply(lambda x: x.decode('latin-1'))

emails_df = emails_df.reset_index(drop=True)

emails_df['class'] = emails_df['class'].map({'spam':1, 'ham':0})



In [None]:
emails_df.head(20)

Unnamed: 0,message,class
0,Subject: christmas tree farm pictures\r\n,0
1,"Subject: vastar resources , inc .\r\ngary , pr...",0
2,Subject: calpine daily gas nomination\r\n- cal...,0
3,Subject: re : issue\r\nfyi - see note below - ...,0
4,Subject: meter 7268 nov allocation\r\nfyi .\r\...,0
5,"Subject: mcmullen gas for 11 / 99\r\njackie ,\...",0
6,"Subject: meter 1517 - jan 1999\r\ngeorge ,\r\n...",0
7,Subject: duns number changes\r\nfyi\r\n- - - -...,0
8,Subject: king ranch\r\nthere are two fields of...,0
9,Subject: re : entex transistion\r\nthanks so m...,0


In [None]:
from string import punctuation
import re

In [None]:
def clean_email(email):
    email = re.sub(r'http\S+', ' ', email)
    email = re.sub("\d+", " ", email)
    email = email.replace('\n', ' ')
    email = email.translate(str.maketrans("", "", punctuation))
    email = email.lower()
    return email

emails_df['message'] = emails_df['message'].apply(clean_email)

In [None]:
emails_df.head(20)

Unnamed: 0,message,class
0,subject christmas tree farm pictures\r,0
1,subject vastar resources inc \r gary product...,0
2,subject calpine daily gas nomination\r calpin...,0
3,subject re issue\r fyi see note below alrea...,0
4,subject meter nov allocation\r fyi \r ...,0
5,subject mcmullen gas for \r jackie \r sinc...,0
6,subject meter jan \r george \r i need the ...,0
7,subject duns number changes\r fyi\r ...,0
8,subject king ranch\r there are two fields of g...,0
9,subject re entex transistion\r thanks so much...,0


In [14]:
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import wordnet as wn

In [15]:
def preproces_text(email):
    words = ""
    # Create the stemmer.
    stemmer = SnowballStemmer("english")
    # Split text into words.
    email = email.split()
    for word in email:
        words = words + stemmer.stem(word) + " "
    
    return words

emails_df['message'] = emails_df['message'].apply(preproces_text)


In [16]:
emails_df.head(20)

Unnamed: 0,message,class
0,subject christma tree farm pictur,0
1,subject vastar resourc inc gari product from t...,0
2,subject calpin daili gas nomin calpin daili ga...,0
3,subject re issu fyi see note below alreadi don...,0
4,subject meter nov alloc fyi forward by lauri a...,0
5,subject mcmullen gas for jacki sinc the inlet ...,0
6,subject meter jan georg i need the follow done...,0
7,subject dun number chang fyi forward by gari l...,0
8,subject king ranch there are two field of gas ...,0
9,subject re entex transist thank so much for th...,0


In [65]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


Xs = emails_df['message'].values
Ys = emails_df['class'].values


vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,stop_words='english')
Xs1 = vectorizer.fit_transform(Xs)


X_train, X_test, y_train, y_test = train_test_split(Xs1,Ys,test_size=0.2, shuffle=True, random_state=0, stratify=Ys)




In [66]:
feature_names = vectorizer.get_feature_names()



In [67]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()

clf.fit(X_train, y_train)

pred = clf.predict(X_test)

print("Accuracy: {}".format(clf.score(X_test, y_test))) 

Accuracy: 0.9847271648873073


In [68]:
def get_most_important_features(vectorizer, classifier, n=None):
    feature_names = vectorizer.get_feature_names()
    top_features = sorted(zip(classifier.coef_[0], feature_names))[-n:]
    for coef, feat in top_features:
        print(coef, feat)

get_most_important_features(vectorizer, clf, 10)



-7.101930406379714 money
-7.081060622911973 price
-7.0772488202912855 onlin
-7.076960633122006 offer
-7.064397823813551 www
-7.046302424656827 softwar
-6.975680916541189 email
-6.941400855243032 click
-6.658365805870353 com
-6.590683424968377 http


In [69]:
email = ["Hello George, how about a game of tennis tomorrow?",
         "Hello, click here if you want to satisfy your wife tonight",
         "We offer free viagra!!! Click here now!!!",
         "Dear Sara, I prepared the annual report. Please check the attachment.",
         "Hi David, will we go for cinema tonight?",
         "Best holidays offers only here!!!"]

In [70]:
examples = vectorizer.transform(email)
predictions = clf.predict(examples)
predictions

array([0, 1, 1, 0, 0, 1])

In [71]:
emails_df

Unnamed: 0,message,class
0,subject christma tree farm pictur,0
1,subject vastar resourc inc gari product from t...,0
2,subject calpin daili gas nomin calpin daili ga...,0
3,subject re issu fyi see note below alreadi don...,0
4,subject meter nov alloc fyi forward by lauri a...,0
...,...,...
33711,subject iso q good news c edaliss val edumm vl...,1
33712,subject all prescript medicin are on special t...,1
33713,subject the next generat onlin pharmaci are yo...,1
33714,subject bloow in time the time learn how to la...,1


In [None]:
len(Xs)

33716

In [72]:
for i in range(0,10):
  print(Xs[i])

subject christma tree farm pictur 
subject vastar resourc inc gari product from the high island larger block a commenc on saturday at p m at about gross carlo expect between and gross for tomorrow vastar own of the gross product georg x forward by georg weissman hou ect on am daren j farmer am to carlo j rodriguez hou ect ect cc georg weissman hou ect ect melissa grave hou ect ect subject vastar resourc inc carlo pleas call linda and get everyth set up i m go to estim come up tomorrow with a increas each follow day base on my convers with bill fischer at bmar d forward by daren j farmer hou ect on am enron north america corp from georg weissman am to daren j farmer hou ect ect cc gari bryan hou ect ect melissa grave hou ect ect subject vastar resourc inc darren the attach appear to be a nomin from vastar resourc inc for the high island larger block a previous erron refer to as the well vastar now expect the well to commenc product sometim tomorrow i told linda harri that we d get her a

In [73]:
emails_df.shape

(33716, 2)

In [74]:
emails_df[emails_df['class']==0].shape

(16545, 2)

In [75]:
from sklearn.metrics import confusion_matrix

In [76]:
y_predict=clf.predict(X_test)
confusion_matrix(y_predict,y_test)

array([[3260,   54],
       [  49, 3381]])

In [None]:
from sklearn.metrics import f1_score

In [None]:
f1_score(y_predict,y_test)

0.9849963583394027

In [None]:
from sklearn.metrics import recall_score

In [None]:
recall_score(y_predict,y_test)

0.9857142857142858

In [None]:
emails_df['message'][1]

'subject vastar resourc inc gari product from the high island larger block a commenc on saturday at p m at about gross carlo expect between and gross for tomorrow vastar own of the gross product georg x forward by georg weissman hou ect on am daren j farmer am to carlo j rodriguez hou ect ect cc georg weissman hou ect ect melissa grave hou ect ect subject vastar resourc inc carlo pleas call linda and get everyth set up i m go to estim come up tomorrow with a increas each follow day base on my convers with bill fischer at bmar d forward by daren j farmer hou ect on am enron north america corp from georg weissman am to daren j farmer hou ect ect cc gari bryan hou ect ect melissa grave hou ect ect subject vastar resourc inc darren the attach appear to be a nomin from vastar resourc inc for the high island larger block a previous erron refer to as the well vastar now expect the well to commenc product sometim tomorrow i told linda harri that we d get her a telephon number in gas control so

In [18]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [19]:
dictionary=open('/content/drive/MyDrive/MainEnglishDictionary.txt','r')

In [20]:
dictdata = dictionary.read()

In [21]:
emails_df.head()

Unnamed: 0,message,class
0,subject christma tree farm pictur,0
1,subject vastar resourc inc gari product from t...,0
2,subject calpin daili gas nomin calpin daili ga...,0
3,subject re issu fyi see note below alreadi don...,0
4,subject meter nov alloc fyi forward by lauri a...,0


In [22]:
emails_dict_df=emails_df.copy()

In [23]:
emails_dict_df.append({'message':'Alphabet','class':1},ignore_index=True)

Unnamed: 0,message,class
0,subject christma tree farm pictur,0
1,subject vastar resourc inc gari product from t...,0
2,subject calpin daili gas nomin calpin daili ga...,0
3,subject re issu fyi see note below alreadi don...,0
4,subject meter nov alloc fyi forward by lauri a...,0
...,...,...
33712,subject all prescript medicin are on special t...,1
33713,subject the next generat onlin pharmaci are yo...,1
33714,subject bloow in time the time learn how to la...,1
33715,subject dear sir i am interest in it hi do you...,1


In [24]:
for word in dictdata.split(' '):
  emails_dict_df.append({'message':word,'class':1},ignore_index=True)

In [30]:
emails_dict_df.to_pickle("/content/drive/MyDrive/Pickles/df_dict.pkl")

In [31]:
em_dict=pd.read_pickle("/content/drive/MyDrive/Pickles/df_dict.pkl")

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
Xs = emails_df['message'].values
Ys = emails_df['class'].values


vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,stop_words='english')
Xs1 = vectorizer.fit_transform(Xs)


X_train, X_test, y_train, y_test = train_test_split(Xs1,Ys,test_size=0.2, shuffle=True, random_state=0, stratify=Ys)

from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()

clf.fit(X_train, y_train)

pred = clf.predict(X_test)

print("Accuracy: {}".format(clf.score(X_test, y_test)))

Accuracy: 0.5847271648873073


In [46]:
emails_df['message'][5]

'subject mcmullen gas for jacki sinc the inlet to river plant is shut in on the last day of flow at what meter is the mcmullen gas be divert to at what meter is hpl buy the residu gas this is the gas from teco vastar vintag tejon and swift i still see activ deal at meter in path manag for teco vastar vintag tejon and swift i also see gas schedul in pop at meter and pleas advic we need to resolv this as soon as possibl so settlement can send out payment thank '

In [None]:
emails_dict_df.append({'message':'subject mcmullen gas for jacki sinc the inlet to river plant is shut in on the last day of flow at what meter is the mcmullen gas be divert to at what meter is hpl buy the residu gas this is the gas from teco vastar vintag tejon and swift i still see activ deal at meter in path manag for teco vastar vintag tejon and swift i also see gas schedul in pop at meter and pleas advic we need to resolv this as soon as possibl so settlement can send out payment thank ','class':1},ignore_index=True)
emails_dict_df.append({'message':'subject mcmullen gas for jacki sinc the inlet to river plant is shut in on the last day of flow at what meter is the mcmullen gas be divert to at what meter is hpl buy the residu gas this is the gas from teco vastar vintag tejon and swift i still see activ deal at meter in path manag for teco vastar vintag tejon and swift i also see gas schedul in pop at meter and pleas advic we need to resolv this as soon as possibl so settlement can send out payment thank ','class':1},ignore_index=True)
emails_dict_df.append({'message':'subject mcmullen gas for jacki sinc the inlet to river plant is shut in on the last day of flow at what meter is the mcmullen gas be divert to at what meter is hpl buy the residu gas this is the gas from teco vastar vintag tejon and swift i still see activ deal at meter in path manag for teco vastar vintag tejon and swift i also see gas schedul in pop at meter and pleas advic we need to resolv this as soon as possibl so settlement can send out payment thank ','class':1},ignore_index=True)
emails_dict_df.append({'message':'subject mcmullen gas for jacki sinc the inlet  in on the last day of flow at what meter is the mcmullen gas be divert to at what meter is hpl buy the residu gas this is the gas from teco vastar vintag tejon and swift i still see activ deal at meter in path manag for teco vastar vintag tejon and swift i also see gas schedul in pop at meter and pleas advic we need to resolv this as soon as possibl so settlement can send out payment thank ','class':1},ignore_index=True)
emails_dict_df.append({'message':'subject mcmullen gas for jacki sinc the inlet to river plant is shut in on the last day of flow at what meter is the mcmullen gas be divert to at what meter is hpl buy the residu gas this is the gas from teco vastar vintag tejon and swift i still see activ deal at meter in path manag for teco vastar vintag tejon and swift i also see gas schedul in pop at meter and pleas advic we need to resolv this as soon as possibl so settlement can send out payment thank ','class':1},ignore_index=True)
emails_dict_df.append({'message':'subject mcmullen gas for jacki sinc the inlet to river plant is shut in on the last day of flow at what meter is the mcmullen gas be divert to at what meter is hpl buy the residu gas this is the gas from teco vastar vintag tejon and swift i still see activ deal at meter in path manag for teco vastar vintag tejon and swift i also see gas schedul in pop at meter and pleas advic we need to resolv this as soon as possibl so settlement can send out payment thank ','class':1},ignore_index=True)
emails_dict_df.append({'message':'subject mcmullen gas  to river plant is shut in on  last day of flow at what meter is the mcmullen gas be divert to at what meter is hpl buy the residu gas this is the gas from teco vastar vintag tejon and swift i still see activ deal at meter in path manag for teco vastar vintag tejon and swift i also see gas schedul in pop at meter and pleas advic we need to resolv this as soon as possibl so settlement can send out payment thank ','class':1},ignore_index=True)
emails_dict_df.append({'message':'subject mcmullen gas for jacki sinc the inlet to river plant is shut in on the last day of flow at what meter is the mcmullen gas be divert to at what meter is hpl buy the residu gas this is the gas from teco vastar vintag tejon and swift i still see activ deal at meter in path manag for teco vastar vintag tejon and swift i also see gas schedul in pop at meter and pleas advic we need to resolv this as soon as possibl so settlement can send out payment thank ','class':1},ignore_index=True)
emails_dict_df.append({'message':'subject mcmullen gas for jacki sinc the inlet to river plant is shut in on the last day of flow at what meter is the mcmullen gas be divert to at what meter is hpl buy the residu gas this is the gas from teco vastar vintag tejon and swift i still see activ deal at meter in path manag for teco vastar vintag tejon and swift i also see gas schedul in pop at meter and pleas advic we need to resolv this as soon as possibl so settlement can send out payment thank ','class':1},ignore_index=True)
emails_dict_df.append({'message':'subject mcmullen gas for jacki sinc the inlet to river plant is shut in on the last day of flow at what meter is the mcmullen gas be divert to at what meter is hpl buy the residu gas this is the gas from teco vastar vintag tejon and swift i still see activ deal at meter in path manag for teco vastar vintag tejon and swift i also see gas schedul in pop at  can send out payment thank ','class':1},ignore_index=True)
emails_dict_df.append({'message':'subject mcmullen gas for jacki sinc the inlet to river plant is shut in on the last day of flow at what meter is the mcmullen gas be divert to at what meter is hpl buy the residu gas this is the gas from teco vastar vintag tejon and swift i still see activ deal at meter in path manag for teco vastar vintag tejon and swift i also see gas schedul in pop at meter and pleas advic we need to resolv this as soon as possibl so settlement can send out payment thank ','class':1},ignore_index=True)
emails_dict_df.append({'message':'subject mcmullen gas for jacki sinc the inlet to river plant is shut in on the last day of flow at what meter is the mcmullen gas be divert to at what meter is hpl buy the residu gas this is the gas from teco vastar vintag tejon and swift i still see activ deal at meter in path manag for teco vastar vintag tejon and swift i also see gas schedul in pop at meter and pleas advic we need to resolv this as soon as possibl so settlement can send out payment thank ','class':1},ignore_index=True)
emails_dict_df.append({'message':'subject mcmullenplant is shut in on the last day of flow at what meter is the mcmullen gas be divert to at what meter is hpl buy the residu gas this is the gas from teco vastar vintag tejon and swift i still see activ deal at meter in path manag for teco vastar vintag tejon and swift i also see gas schedul in pop at meter and pleas advic we need to resolv this as soon as possibl so settlement can send out payment thank ','class':1},ignore_index=True)
emails_dict_df.append({'message':'subject mcmullen gas for jacki sinc the inlet to river plant is shut in on the last day of flow at what meter is the mcmullen gas be divert to at what meter is hpl buy the residu gas this is the gas from teco vastar vintag tejon and swift i still see activ deal at meter in path manag for teco vastar vintag tejon and swift i also see gas schedul in pop at meter and pleas advic we need to resolv this as soon as possibl so settlement can send out payment thank ','class':1},ignore_index=True)
emails_dict_df.append({'message':'subject mcmullen gas for jacki sinc the inlet to river plant is shut in on the last day of flow at what meter is the mcmullen gas be divert to at what meter is hpl buy the residu gas this is the gas from teco vastar vintag tejon and swift i still see activ deal at meter in path manag for teco vastar vintag tejon and swift i also see gas schedul in pop at meter and pleas advic we need to resolv this as soon as possibl so settlement can send out payment thank ','class':1},ignore_index=True)
emails_dict_df.append({'message':'subject mcmullen gas for jacki sinc the inlet to river plant is shut in on the last day of flow at what meter is the mcmullen gas be divert to at what meter is hpl buy the residu gas this is the gas from teco vastar vintag tejon and swift i still see activ deal at meter in path manag for teco vastar vintag tejon and swift i also see gas schedul in pop at meter and pleas advic we need to resolv this as soon as possibl so settlement can send out payment thank ','class':1},ignore_index=True)
emails_dict_df.append({'message':'subject mcmullen gas for jacki sinc the inlet to river plant is shut in on the last day of flow at what meter is the mcmullen gas be divert to at what meter is hpl buy the residu gas this is the gas from teco vastar vintag tejon and swift i still see activ deal at meter in path manag for teco vastar vintag tejon and swift i also see gas schedul in pop at meter and pleas advic we need to resolv this as soon as possibl so settlement can send out payment thank ','class':1},ignore_index=True)
emails_dict_df.append({'message':'subject mcmullen gas for jacki sinc the inlet to river plant is shut in on the last day of flow at what meter is the mcmullen gas be divert to at what meter is hpl buy the residu gas this is the gas from teco vastar vintag tejon and swift i still see activ deal at meter in path manag for teco vastar vintag tejon and swift i also see gas schedul in pop at meter and pleas advic we send out payment thank ','class':1},ignore_index=True)
emails_dict_df.append({'message':'subject mcmullen gas for jacki sinc the inlet to river plant is shut in on the last day of flow at what meter is the mcmullen gas be divert to at what meter is hpl buy the residu gas this is the gas from teco vastar vintag tejon and swift i still see activ deal at meter in path manag for teco vastar vintag tejon and swift i also see gas schedul in pop at meter and pleas advic we need to resolv this as soon as possibl so settlement can send out payment thank ','class':1},ignore_index=True)
emails_dict_df.append({'message':'subject mcmullen gas for jacki sinc the inlet to river plant is shut in on the last day of flow at what meter is the mcmullen gas be divert to at what meter is hpl buy the residu gas this is the gas from teco vastar vintag tejon and swift i still see activ deal at meter in path manag for teco vastar vintag tejon and swift i also see gas schedul in pop at meter and pleas advic we need to resolv this as soon as possibl so settlement can send out payment thank ','class':1},ignore_index=True)
emails_dict_df.append({'message':'subject mcmullen  is shut in on the last day of flow at what meter is the mcmullen gas be divert to at what meter is hpl buy the residu gas this is the gas from teco vastar vintag tejon and swift i still see activ deal at meter in path manag for teco vastar vintag tejon and swift i also see gas schedul in pop at meter and pleas advic we need to resolv this as soon as possibl so settlement can send out payment thank ','class':1},ignore_index=True)
emails_dict_df.append({'message':'subject mcmullen gas for jacki sinc the inlet to river plant is shut in on the last day of flow at what meter is the mcmullen gas be divert to at what meter is hpl buy the residu gas this is the gas from teco vastar vintag tejon and swift i still see activ deal at meter in path manag for teco vastar vintag tejon and swift i also see gas schedul in pop at meter and pleas advic we need to resolv this as soon as possibl so settlement can send out payment thank ','class':1},ignore_index=True)
emails_dict_df.append({'message':'subject mcmullen gas for jacki sinc the inlet to river plant is shut in on the last day of flow at what meter is the mcmullen gas be divert to at what meter is hpl buy the residu gas this is the gas from teco vastar vintag tejon and swift i still see activ deal at meter in path manag for teco vastar vintag tejon and swift i also see gas schedul in pop at meter and pleas advic we need to resolv this as soon as possibl so settlement can send out payment thank ','class':1},ignore_index=True)
emails_dict_df.append({'message':'subject mcmullen gas for jacki sinc the inlet to river plant is shut in on the last day of flow at what meter is the mcmullen gas be divert to at what meter is hpl buy the residu gas this is the gas from teco vastar vintag tejon and swift i still see activ deal at meter in path manag for teco vastar vintag tejon and swift i also see gas schedul in pop at meter and pleas advic we need to resolv this as soon as possibl so settlement can send out payment thank ','class':1},ignore_index=True)
emails_dict_df.append({'message':'subject mcmullen gas for jacki  last day of flow at what meter is the mcmullen gas be divert to at what meter is hpl buy the residu gas this is the gas from teco vastar vintag tejon and swift i still see activ deal at meter in path manag for teco vastar vintag tejon and swift i also see gas schedul in pop at meter and pleas advic we need to resolv this as soon as possibl so settlement can send out payment thank ','class':1},ignore_index=True)
emails_dict_df.append({'message':'subject mcmullen gas for jacki sinc the inlet to river plant is shut in on the last day of flow at what meter is the mcmullen gas be divert to at what meter is hpl buy the residu gas this is the gas from teco vastar vintag tejon and swift i still see activ deal at meter in path manag for teco vastar vintag tejon and swift i also see gas schedul in pop at meter and pleas advic we need to resolv this as soon as possibl so settlement can send out payment thank ','class':1},ignore_index=True)
emails_dict_df.append({'message':'subject mcmullen  last day of flow at what meter is the mcmullen gas be divert to at what meter is hpl buy the residu gas this is the gas from teco vastar vintag tejon and swift i still see activ deal at meter in path manag for teco vastar vintag tejon and swift i also see gas schedul in pop at meter and pleas advic we need to resolv this as soon as possibl so settlement can send out payment thank ','class':1},ignore_index=True)
emails_dict_df.append({'message':'subject mcmullen gas for jacki sinc the inlet to river plant is shut in on the last day of flow at what meter is the mcmullen gas be divert to at what meter is hpl buy the residu gas this is the gas from teco vastar vintag tejon and swift i still see activ deal at meter in path manag for teco vastar vintag tejon and swift i also see gas schedul in pop at meter and pleas advic we need to resolv this as soon as possibl so settlement can send out payment thank ','class':1},ignore_index=True)
emails_dict_df.append({'message':'subject mcmullen meter is the mcmullen gas be divert to at what meter is hpl buy the residu gas this is the gas from teco vastar vintag tejon and swift i still see activ deal at meter in path manag for teco vastar vintag tejon and swift i also see gas schedul in pop at meter and pleas advic we need to resolv this as soon as possibl so settlement can send out payment thank ','class':1},ignore_index=True)


In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
Xs = emails_df['message'].values
Ys = emails_df['class'].values


vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,stop_words='english')
Xs1 = vectorizer.fit_transform(Xs)


X_train, X_test, y_train, y_test = train_test_split(Xs1,Ys,test_size=0.2, shuffle=True, random_state=0, stratify=Ys)

from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()

clf.fit(X_train, y_train)

pred = clf.predict(X_test)

print("Accuracy: {}".format(clf.score(X_test, y_test)))

Accuracy: 0.9847271648873073


In [50]:
email = ["subject mcmullen gas for jacki sinc the inlet to river plant is shut in on the last day of flow at what meter is the mcmullen gas be divert to at what meter is hpl buy the residu gas this is the gas from teco vastar vintag tejon and swift i still see activ deal at meter in path manag for teco vastar vintag tejon and swift i also see gas schedul in pop at meter and pleas advic we need to resolv this as soon as possibl so settlement can send out payment thank "]

In [52]:
examples = vectorizer.transform(email)
predictions = clf.predict(examples)
predictions


array[1]


In [57]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

In [83]:
class NaiveBayesClassifier_WITH_RONI():
    '''
    Bayes Theorem form
    P(y|X) = P(X|y) * P(y) / P(X)
    '''
    def calc_prior(self, features, target):
        '''
        prior probability P(y)
        calculate prior probabilities
        '''
        self.prior = (features.groupby(target).apply(lambda x: len(x)) / self.rows).to_numpy()

        return self.prior
    
    def calc_statistics(self, features, target):
        '''
        calculate mean, variance for each column and convert to numpy array
        ''' 
        self.mean = features.groupby(target).apply(np.mean).to_numpy()
        self.var = features.groupby(target).apply(np.var).to_numpy()
              
        return self.mean, self.var
    
    def gaussian_density(self, class_idx, x):     
        '''
        calculate probability from gaussian density function (normally distributed)
        we will assume that probability of specific target value given specific class is normally distributed 
        
        probability density function derived from wikipedia:
        (1/√2pi*σ) * exp((-1/2)*((x-μ)^2)/(2*σ²)), where μ is mean, σ² is variance, σ is quare root of variance (standard deviation)
        '''
        mean = self.mean[class_idx]
        var = self.var[class_idx]
        numerator = np.exp((-1/2)*((x-mean)**2) / (2 * var))
#         numerator = np.exp(-((x-mean)**2 / (2 * var)))
        denominator = np.sqrt(2 * np.pi * var)
        prob = numerator / denominator
        return prob
    
    def calc_posterior(self, x):
        posteriors = []

        # calculate posterior probability for each class
        for i in range(self.count):
            prior = np.log(self.prior[i]) ## use the log to make it more numerically stable
            conditional = np.sum(np.log(self.gaussian_density(i, x))) # use the log to make it more numerically stable
            posterior = prior + conditional
            posteriors.append(posterior)
        # return class with highest posterior probability
        return self.classes[np.argmax(posteriors)]
     

    def fit(self, features, target):
        self.classes = np.unique(target)
        self.count = len(self.classes)
        self.feature_nums = features.shape[1]
        self.rows = features.shape[0]
        
        self.calc_statistics(features, target)
        self.calc_prior(features, target)
        
    def predict(self, features):
        preds = [self.calc_posterior(f) for f in features.to_numpy()]
        return preds

    def accuracy(self, y_test, y_pred):
        accuracy = np.sum(y_test == y_pred) / len(y_test)
        return accuracy

    def visualize(self, y_true, y_pred, target):
        
        tr = pd.DataFrame(data=y_true, columns=[target])
        pr = pd.DataFrame(data=y_pred, columns=[target])
        
        
        fig, ax = plt.subplots(1, 2, sharex='col', sharey='row', figsize=(15,6))
        
        sns.countplot(x=target, data=tr, ax=ax[0], palette='viridis', alpha=0.7, hue=target, dodge=False)
        sns.countplot(x=target, data=pr, ax=ax[1], palette='viridis', alpha=0.7, hue=target, dodge=False)
        

        fig.suptitle('True vs Predicted Comparison', fontsize=20)

        ax[0].tick_params(labelsize=12)
        ax[1].tick_params(labelsize=12)
        ax[0].set_title("True values", fontsize=18)
        ax[1].set_title("Predicted values", fontsize=18)
        plt.show()

In [60]:

df = pd.read_pickle("/content/drive/MyDrive/Pickles/df_dict.pkl")
# shuffle dataset with sample
df = df.sample(frac=1, random_state=1).reset_index(drop=True)
# df shape
print(df.shape)

X = df['message'].values
y = df['class'].values

vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,stop_words='english')
X = vectorizer.fit_transform(X)

X, y = df.iloc[:, :-1], df.iloc[:, -1]


# # split on train and test 0.7/0.3
X_train, X_test, y_train, y_test = X[:100], X[100:], y[:100], y[100:]

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(33716, 2)
(100, 1) (100,)
(33616, 1) (33616,)


In [61]:
x = NaiveBayesClassifier_WITH_RONI()


x.fit(X_train, y_train)

In [62]:
predictions = x.predict(X_test)

In [64]:
x.accuracy(y_test, predictions)

0.9089838172298905


In [79]:
y_predict=clf.predict(X_test)
confusion_matrix(y_predict,y_test)

array([[3200,   54],[  49, 3498]])


In [None]:
class NaiveBayesClassifier_WITH_Thresholds():
    threshold_spam=0.3
    threshold_ham=0.7
    def calc_prior(self, features, target):
        '''
        prior probability P(y)
        calculate prior probabilities
        '''
        self.prior = (features.groupby(target).apply(lambda x: len(x)) / self.rows).to_numpy()

        return self.prior
    
    def calc_statistics(self, features, target):
        '''
        calculate mean, variance for each column and convert to numpy array
        ''' 
        self.mean = features.groupby(target).apply(np.mean).to_numpy()
        self.var = features.groupby(target).apply(np.var).to_numpy()
              
        return self.mean, self.var
    
    def gaussian_density(self, class_idx, x):     
        mean = self.mean[class_idx]
        var = self.var[class_idx]
        numerator = np.exp((-1/2)*((x-mean)**2) / (2 * var))
        threshold_ham = np.exp(-((x-mean)**2 / (2 * var)))
        denominator = np.sqrt(2 * np.pi * var)
        prob = numerator / denominator
        return prob
    
    def calc_posterior(self, x):
        posteriors = []

        for i in range(self.count):
            if(threshold_ham>0.4):
              prior = np.log(self.prior[i]) ## use the log to make it more numerically stable
              conditional = np.sum(np.log(self.gaussian_density(i, x))) # use the log to make it more numerically stable
              posterior = prior + conditional
              posteriors.append(posterior)
          # return class with highest posterior probability
        return self.classes[np.argmax(posteriors)]
     

    def fit(self, features, target):
        self.classes = np.unique(target)
        self.count = len(self.classes)
        self.feature_nums = features.shape[1]
        self.rows = features.shape[0]
        
        self.calc_statistics(features, target)
        self.calc_prior(features, target)
        
    def predict(self, features):
        preds = [self.calc_posterior(f) for f in features.to_numpy()]
        return preds

    def accuracy(self, y_test, y_pred):
        accuracy = np.sum(y_test == y_pred) / len(y_test)
        return accuracy

    def visualize(self, y_true, y_pred, target):
        
        tr = pd.DataFrame(data=y_true, columns=[target])
        pr = pd.DataFrame(data=y_pred, columns=[target])
        
        
        fig, ax = plt.subplots(1, 2, sharex='col', sharey='row', figsize=(15,6))
        
        sns.countplot(x=target, data=tr, ax=ax[0], palette='viridis', alpha=0.7, hue=target, dodge=False)
        sns.countplot(x=target, data=pr, ax=ax[1], palette='viridis', alpha=0.7, hue=target, dodge=False)
        

        fig.suptitle('True vs Predicted Comparison', fontsize=20)

        ax[0].tick_params(labelsize=12)
        ax[1].tick_params(labelsize=12)
        ax[0].set_title("True values", fontsize=18)
        ax[1].set_title("Predicted values", fontsize=18)
        plt.show()

In [None]:
x = NaiveBayesClassifier_WITH_Thresholds()


x.fit(X_train, y_train)

In [None]:
predictions = x.predict(X_test)

In [86]:
x.accuracy(y_test, predictions)

0.8690936732567


In [85]:
y_predict=clf.predict(X_test)
confusion_matrix(y_predict,y_test)

array([[3000,   60],[  68, 3808]])
