In [204]:
import numpy as np
import turicreate as tc
np.random.seed(0)

In [205]:
emails = tc.SFrame('./emails.csv')

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,int]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [206]:
emails

text,spam
Subject: naturally irresistible your ...,1
Subject: the stock trading gunslinger f ...,1
Subject: unbelievable new homes made easy im ...,1
Subject: 4 color printing special request ...,1
"Subject: do not have money , get software cds ...",1
"Subject: great nnews hello , welcome to ...",1
Subject: here ' s a hot play in motion homeland ...,1
Subject: save your money buy getting this thing ...,1
Subject: undeliverable : home based business for ...,1
Subject: save your money buy getting this thing ...,1


In [207]:
emails['words'] = tc.text_analytics.count_words(emails['text']).apply(lambda x:x.keys())

In [208]:
emails

text,spam,words
Subject: naturally irresistible your ...,1,"[interested, portfolio, this, result, extra, no, ..."
Subject: the stock trading gunslinger f ...,1,"[albeit, diffusion, optima, attire, namea ..."
Subject: unbelievable new homes made easy im ...,1,"[pittman, foward, form, 1, the, website, visit, ..."
Subject: 4 color printing special request ...,1,"[and, advertisement, is, fax, 91706, 626, ..."
"Subject: do not have money , get software cds ...",1,"[death, by, d, finish, are, be, to, yet, from, ..."
"Subject: great nnews hello , welcome to ...",1,"[day, devitalize, have, in, customers, 5, er, ..."
Subject: here ' s a hot play in motion homeland ...,1,"[constitutes, accuracy, websites, sources, ..."
Subject: save your money buy getting this thing ...,1,"[get, aicohol, with, mix, minutes, just, right, ..."
Subject: undeliverable : home based business for ...,1,"[unknown, 6, co, 7059, msexch, 5, 8, i, 4, 000, ..."
Subject: save your money buy getting this thing ...,1,"[get, with, mix, minutes, just, right, start, ..."


In [209]:
print("Total number of emails:", len(emails))
print("Total number of spam emails:", sum(emails['spam']))
print()
print("Prior probability that an email is spam:", sum(emails['spam'])/len(emails))

Total number of emails: 5728
Total number of spam emails: 1368

Prior probability that an email is spam: 0.2388268156424581


In [210]:
model = {}
words = 0

for email in emails:
    for word in email['words']:
        is_spam = email['spam']
        if word not in model: #If the word is not in the model
            model[word] = {'spam':1, 'ham':1}
        else: #If the word is in the model
            if is_spam: #If the email is spam
                model[word]['spam'] += 1
            else: #If the email is ham
                model[word]['ham'] += 1

In [211]:
print('lottery:', model['lottery'])
print('sale:', model['sale'])
print('buy:', model['buy'])
print('hello:', model['hello'])

lottery: {'spam': 8, 'ham': 1}
sale: {'spam': 38, 'ham': 42}
buy: {'spam': 119, 'ham': 132}
hello: {'spam': 139, 'ham': 265}


In [212]:
def predict_bayes(word):
    num_spam_with_word = model[word]['spam']
    num_ham_with_word = model[word]['ham']
    return 1.0*num_spam_with_word/(num_spam_with_word + num_ham_with_word)

In [213]:
predict_bayes('lottery')

0.8888888888888888

In [214]:
predict_bayes('sale')

0.475

In [215]:
predict_bayes('buy')

0.47410358565737054

In [216]:
predict_bayes('hello')

0.34405940594059403

In [217]:
def predict_naive_bayes(email):
    words = set(email.split())
    spams = []
    hams = []
    for word in words:
        if word in model:
            spams.append(model[word]['spam'])
            hams.append(model[word]['ham'])
    prod_spams = np.long(np.prod(spams))
    prod_hams = np.long(np.prod(hams))
    return prod_spams/(prod_spams + prod_hams)

In [218]:
predict_naive_bayes('hello mom how are you')

0.004409778009986974

In [219]:
predict_naive_bayes('buy cheap lottery easy money now')

0.9898455158916786

# In Sklearn

In [220]:
from sklearn.naive_bayes import GaussianNB

In [221]:
emails

text,spam,words
Subject: naturally irresistible your ...,1,"[interested, portfolio, this, result, extra, no, ..."
Subject: the stock trading gunslinger f ...,1,"[albeit, diffusion, optima, attire, namea ..."
Subject: unbelievable new homes made easy im ...,1,"[pittman, foward, form, 1, the, website, visit, ..."
Subject: 4 color printing special request ...,1,"[and, advertisement, is, fax, 91706, 626, ..."
"Subject: do not have money , get software cds ...",1,"[death, by, d, finish, are, be, to, yet, from, ..."
"Subject: great nnews hello , welcome to ...",1,"[day, devitalize, have, in, customers, 5, er, ..."
Subject: here ' s a hot play in motion homeland ...,1,"[constitutes, accuracy, websites, sources, ..."
Subject: save your money buy getting this thing ...,1,"[get, aicohol, with, mix, minutes, just, right, ..."
Subject: undeliverable : home based business for ...,1,"[unknown, 6, co, 7059, msexch, 5, 8, i, 4, 000, ..."
Subject: save your money buy getting this thing ...,1,"[get, with, mix, minutes, just, right, start, ..."


In [223]:
# Finding the 100 most popular words in the dataset
words_dict = {}
for email in emails:
    for word in email['words']:
        if word in words_dict.keys():
            words_dict[word] += 1
        else:
            words_dict[word] = 1

word_tuples = sorted(words_dict.items(), key=lambda item: item[1], reverse=True)[:200]
popular_words = [t[0] for t in word_tuples]

popular_words[:10]

['subject', 'to', 'the', 'and', 'you', 'a', 'for', 'of', 'in', 'is']

In [None]:
for word in popular_words:
    emails[word] = emails.apply(lambda x: word in x['words'])

In [None]:
emails