# Coding a spam classifier with naive Bayes

### 1. Imports and pre-processing data

We load the data into a pandas DataFrame (the original Turi Create SFrame loader is left commented out), and then preprocess it by adding a column with the list of distinct (non-repeated) words in each email.

In [1]:
from fractions import Fraction

import numpy as np
import turicreate

In [2]:
import pandas as pd
emails = pd.read_csv('./emails.csv')

In [3]:
#emails = turicreate.SFrame('./emails.csv')

In [5]:
emails[:10]

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
5,"Subject: great nnews hello , welcome to medzo...",1
6,Subject: here ' s a hot play in motion homela...,1
7,Subject: save your money buy getting this thin...,1
8,Subject: undeliverable : home based business f...,1
9,Subject: save your money buy getting this thin...,1


In [6]:
def process_email(text):
    """Return the distinct whitespace-separated tokens of ``text``.

    Args:
        text: Raw email text.

    Returns:
        list: Each token that occurs in ``text`` exactly once. The order of
        the tokens is whatever the intermediate set yields, i.e. unspecified.
    """
    tokens = text.split()
    distinct_tokens = set(tokens)
    return list(distinct_tokens)

# Add a 'words' column holding, for every email, the list of distinct tokens
# produced by process_email from its 'text' entry.
emails['words'] = emails['text'].apply(process_email)

In [7]:
#emails['word_count'] = turicreate.text_analytics.count_words(emails['text'])

In [9]:
emails[:10]

Unnamed: 0,text,spam,words
0,Subject: naturally irresistible your corporate...,1,"[all, through, portfolio, its, guaranteed, ,, ..."
1,Subject: the stock trading gunslinger fanny i...,1,"[and, merrill, is, nameable, clockwork, libret..."
2,Subject: unbelievable new homes made easy im ...,1,"[pre, and, all, show, being, visit, loan, 454,..."
3,Subject: 4 color printing special request add...,1,"[and, golden, 5110, 626, color, ca, an, canyon..."
4,"Subject: do not have money , get software cds ...",1,"[comedies, all, old, tradgedies, be, money, is..."
5,"Subject: great nnews hello , welcome to medzo...",1,"[va, groundsel, allusion, ag, tosher, confide,..."
6,Subject: here ' s a hot play in motion homela...,1,"[precise, all, chain, limited, indicating, ena..."
7,Subject: save your money buy getting this thin...,1,"[right, want, just, money, is, within, it, rea..."
8,Subject: undeliverable : home based business f...,1,"[unknown, grownups, co, telecom, is, mts, 000,..."
9,Subject: save your money buy getting this thin...,1,"[right, want, just, money, is, within, it, rea..."


In [64]:
# Prior probability of spam: the fraction of all emails whose 'spam' label is 1.
# The leading 1.0* forces float division even where `/` on integers would floor.
1.0*sum(emails['spam']==1)/len(emails)

0.2388268156424581

### 2. Coding Naive Bayes

We start by counting how many spam and ham emails contain a given word.

We check for the words 'money' and 'easy'.

In [15]:
def count_spam_ham(word):
    """Count the spam and ham emails whose word list contains ``word``.

    Reads the module-level ``emails`` table, using its ``words`` column
    (collection of tokens per email) and its ``spam`` column (truthy for spam).

    Args:
        word: Token to look for in each email's ``words`` entry.

    Returns:
        dict: ``{'spam': n_spam, 'ham': n_ham}``, the number of emails of each
        class that contain ``word``. Both are 0 for a word that never occurs.
    """
    email_count = {'spam': 0, 'ham': 0}
    # Walk the two columns in lockstep. Iterating a pandas DataFrame directly
    # yields its column labels rather than its rows, so `for email in emails`
    # would fail on `email['words']`; column-wise iteration works for a
    # DataFrame as well as for any other table exposing iterable columns.
    for words, is_spam in zip(emails['words'], emails['spam']):
        if word in words:
            email_count['spam' if is_spam else 'ham'] += 1
    return email_count

# Disabled alternative, kept inside a string literal so that it never runs:
# the same counting function for the case where each email carries a
# 'word_count' dictionary instead of a 'words' list.
# NOTE: being the last expression of the cell, the literal is also echoed as
# the cell's output.
'''
def count_spam_ham(word):
    email_count = {'spam': 0, 'ham': 0}
    for email in emails:
        if word in email['word_count']:
            if email['spam']:
                email_count['spam'] += 1
            else:
                email_count['ham'] += 1
    return email_count
'''

"\ndef count_spam_ham(word):\n    email_count = {'spam': 0, 'ham': 0}\n    for email in emails:\n        if word in email['word_count']:\n            if email['spam']:\n                email_count['spam'] += 1\n            else:\n                email_count['ham'] += 1\n    return email_count\n"

In [16]:
# Show the spam/ham counts for two sample words.
# print(...) with a single argument behaves identically on Python 2 and
# Python 3, whereas the bare `print x` statement is a SyntaxError on Python 3.
print(count_spam_ham('money'))
print(count_spam_ham('easy'))

{'ham': 87, 'spam': 280}
{'ham': 61, 'spam': 110}


Now we make a function that takes a list of words. For each word, it looks up how many spam emails and how many ham emails contain it, multiplies the spam counts together and the ham counts together, and returns the spam product divided by the sum of both products as the probability that the email is spam.

In [17]:
def prob_spam_bayes(word):
    """Return the probability that an email is spam given that it contains ``word``.

    Args:
        word: Token whose spam/ham counts are obtained from ``count_spam_ham``.

    Returns:
        float: ``spam / (spam + ham)`` for the word's counts, or 0.5 when the
        word occurs in no email at all (no evidence either way).
    """
    # count_spam_ham returns a dict; unpacking it directly (`spam, ham = ...`)
    # would bind the key strings instead of the counts, so index by key.
    counts = count_spam_ham(word)
    spam = counts['spam']
    ham = counts['ham']
    if spam == 0 and ham == 0:
        return 0.5
    return 1.0 * spam / (spam + ham)

In [18]:
def prob_spam_naive_bayes(words):
    """Estimate the probability that an email containing ``words`` is spam.

    For every word the spam and ham counts are fetched with
    ``count_spam_ham``; the spam counts are multiplied together, the ham
    counts are multiplied together, and the spam product is divided by the
    sum of both products.

    NOTE(review): the score multiplies raw per-class counts and does not
    factor in the overall number of spam and ham emails; confirm that this
    simplification is intended.

    Args:
        words: Iterable of tokens; each one contributes once per occurrence.

    Returns:
        float: Value in [0, 1]. 0.5 is returned when both products are zero
        (no usable evidence) and also for an empty ``words``.
    """
    # Accumulate with plain Python integers, which never overflow. np.prod
    # works in fixed-width integers and silently wraps around once the
    # product exceeds that width, which a handful of common words can do.
    spam = 1
    ham = 1
    for word in words:
        count = count_spam_ham(word)
        spam *= int(count['spam'])
        ham *= int(count['ham'])
    if spam == 0 and ham == 0:
        return 0.5
    # Fraction keeps the ratio exact even when the products are too large to
    # be converted to float on their own.
    return float(Fraction(spam, spam + ham))

# In case the email comes as a string
def prob_spam_naive_bayes_string(email):
    """Score a raw email string with ``prob_spam_naive_bayes``.

    The string is split on whitespace and repeated tokens are dropped
    (first occurrence kept, order preserved), matching ``process_email``,
    which records each word at most once per email. Without this a repeated
    word would be multiplied into the score once per repetition.

    Args:
        email: Raw email text.

    Returns:
        Whatever ``prob_spam_naive_bayes`` returns for the distinct tokens.
    """
    words = []
    seen = set()
    for word in email.split():
        if word not in seen:
            seen.add(word)
            words.append(word)
    # Echo the tokens that are actually scored. print(...) with a single
    # argument works the same on Python 2 and Python 3.
    print(words)
    return prob_spam_naive_bayes(words)

### Testing with some sample emails
We verify that for non-spammy words, the classifier gives us small probabilities, and for spammy words it gives us large probabilities.

In [19]:
prob_spam_naive_bayes(['money', 'easy'])

0.8530201899908605

In [20]:
prob_spam_naive_bayes(['mom','friend','school'])

0.008857887217413228

In [21]:
prob_spam_naive_bayes(['prince','viagra'])

1.0

In [22]:
prob_spam_naive_bayes_string('hi mom how are you please buy apples')

['hi', 'mom', 'how', 'are', 'you', 'please', 'buy', 'apples']


0.0

In [23]:
prob_spam_naive_bayes_string('buy cheap viagra get lottery')

['buy', 'cheap', 'viagra', 'get', 'lottery']


1.0

In [24]:
prob_spam_naive_bayes_string('enter in the lottery now win three million dollars')

['enter', 'in', 'the', 'lottery', 'now', 'win', 'three', 'million', 'dollars']


1.0

In [25]:
prob_spam_naive_bayes_string('lets meet at the hotel lobby at nine am tomorrow')

['lets', 'meet', 'at', 'the', 'hotel', 'lobby', 'at', 'nine', 'am', 'tomorrow']


0.0

In [26]:
prob_spam_naive_bayes_string('hi mom make easy money')

['hi', 'mom', 'make', 'easy', 'money']


0.08279582746750283

In [27]:
prob_spam_naive_bayes_string('hi mom')

['hi', 'mom']


0.03860711582134747

In [28]:
prob_spam_naive_bayes_string('make easy money')

['make', 'easy', 'money']


0.6921082499793675

In [29]:
prob_spam_naive_bayes_string('subject')

['subject']


0.06958657388456815

In [30]:
prob_spam_naive_bayes_string('wadlidoo hi mom')

['wadlidoo', 'hi', 'mom']


0.5

### 3. Training an actual model (for efficiency)

Our plan is to build a dictionary that records, for every word, a pair of counts: the number of spam emails and the number of ham emails that contain it. Both counts start at 1, so no word ever has a zero count.

In [31]:
# Maps each word to {'spam': count, 'ham': count}.
model = {}

# Training process
# Pair every email's word list with its label by walking the two columns in
# lockstep. Iterating a pandas DataFrame directly yields column labels, not
# rows, so `for email in emails` would fail on `email['words']`.
for words, is_spam in zip(emails['words'], emails['spam']):
    label = 'spam' if is_spam else 'ham'
    for word in words:
        if word not in model:
            # Both counts start at 1, so a word seen in only one class never
            # contributes a zero factor to a product of counts.
            model[word] = {'spam': 1, 'ham': 1}
        model[word][label] += 1

In [66]:
model

{'50088': {'ham': 2, 'spam': 1},
 '017201846': {'ham': 1, 'spam': 2},
 'woods': {'ham': 4, 'spam': 2},
 'spiders': {'ham': 1, 'spam': 3},
 'hanging': {'ham': 9, 'spam': 2},
 'woody': {'ham': 5, 'spam': 1},
 'suzana': {'ham': 2, 'spam': 1},
 'localized': {'ham': 2, 'spam': 29},
 '5988': {'ham': 2, 'spam': 1},
 '5989': {'ham': 6, 'spam': 1},
 'hermans': {'ham': 2, 'spam': 1},
 '5982': {'ham': 2, 'spam': 1},
 '5984': {'ham': 4, 'spam': 1},
 'gaa': {'ham': 2, 'spam': 1},
 'hermann': {'ham': 3, 'spam': 1},
 'kimmorrell': {'ham': 3, 'spam': 1},
 'rawhide': {'ham': 1, 'spam': 2},
 'taj': {'ham': 2, 'spam': 1},
 'politician': {'ham': 1, 'spam': 2},
 'bringing': {'ham': 40, 'spam': 9},
 'liaisons': {'ham': 5, 'spam': 1},
 'grueling': {'ham': 1, 'spam': 2},
 'supportsenron': {'ham': 3, 'spam': 1},
 'wednesday': {'ham': 336, 'spam': 7},
 'cyberopps': {'ham': 1, 'spam': 2},
 'rebuilding': {'ham': 2, 'spam': 1},
 '0052': {'ham': 1, 'spam': 2},
 'fructiferous': {'ham': 1, 'spam': 2},
 '0057': {'ham'

In [69]:
model['lottery']

{'ham': 1, 'spam': 9}

In [70]:
model['sale']

{'ham': 42, 'spam': 39}

In [72]:
def predict_bayes(word):
    """Return the probability that an email containing ``word`` is spam.

    Uses the counts stored for the word in the module-level ``model``.

    Args:
        word: Token to look up.

    Returns:
        float: ``spam / (spam + ham)`` for the stored counts. A word that is
        not in ``model`` yields 0.5 instead of raising ``KeyError``; that is
        the value given by the 1/1 counts every word starts with in training.
    """
    counts = model.get(word)
    if counts is None:
        return 0.5
    num_spam_with_word = counts['spam']
    num_ham_with_word = counts['ham']
    return 1.0 * num_spam_with_word / (num_spam_with_word + num_ham_with_word)

In [73]:
predict_bayes('lottery')

0.9

In [74]:
predict_bayes('sale')

0.48148148148148145

In [92]:
def predict_naive_bayes(email):
    """Estimate the probability that ``email`` is spam from the trained ``model``.

    The email is split on whitespace and each distinct word found in
    ``model`` contributes its spam count to one product and its ham count to
    another; words missing from ``model`` are ignored. The result is the
    spam product divided by the sum of both products.

    NOTE(review): the score multiplies raw per-class counts and does not
    factor in the overall number of spam and ham emails; confirm that this
    simplification is intended.

    Args:
        email: Raw email text.

    Returns:
        float: Value in [0, 1]; 0.5 when no word of the email is in ``model``.
    """
    words = set(email.split())
    # Multiply with plain Python integers, which never overflow. np.prod
    # works in fixed-width integers and wraps around silently before any
    # later conversion, and `long` does not exist on Python 3.
    prod_spams = 1
    prod_hams = 1
    for word in words:
        if word in model:
            prod_spams *= int(model[word]['spam'])
            prod_hams *= int(model[word]['ham'])
    total = prod_spams + prod_hams
    if total == 0:
        # Only reachable if the model holds zero counts for both classes.
        return 0.5
    # Fraction keeps the ratio exact even when the products are too large to
    # be converted to float on their own.
    return float(Fraction(prod_spams, total))

In [93]:
predict_naive_bayes('hi mom how are you')

0.0013894756610580057

In [94]:
predict_naive_bayes('enter the lottery to win three million dollars')

0.38569290647197135

In [95]:
predict_naive_bayes('meet me at the lobby of the hotel at nine am')

0.02490194297492509

In [96]:
predict_naive_bayes('buy cheap lottery easy money now')

0.9913514898646872