# Coding a spam classifier with naive Bayes

### 1. Imports and pre-processing data

We load the data into a Pandas DataFrame, and then preprocess it by adding a string with the (non-repeated) words in the email.

In [26]:
import numpy as np

In [27]:
import pandas as pd
emails = pd.read_csv('./emails.csv')

In [28]:
emails[:10]

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
5,"Subject: great nnews hello , welcome to medzo...",1
6,Subject: here ' s a hot play in motion homela...,1
7,Subject: save your money buy getting this thin...,1
8,Subject: undeliverable : home based business f...,1
9,Subject: save your money buy getting this thin...,1


In [29]:
def process_email(text):
    text = text.lower()
    return list(set(text.split()))

emails['words'] = emails['text'].apply(process_email)

In [30]:
emails[:10]

Unnamed: 0,text,spam,words
0,Subject: naturally irresistible your corporate...,1,"[hotat, :, website, full, havinq, products, gu..."
1,Subject: the stock trading gunslinger fanny i...,1,"[edt, waterway, inflexible, the, hepburn, pers..."
2,Subject: unbelievable new homes made easy im ...,1,"[72, $, take, foward, hearing, the, a, that, w..."
3,Subject: 4 color printing special request add...,1,"[request, 4, canyon, a, :, advertisement, form..."
4,"Subject: do not have money , get software cds ...",1,"[the, money, finish, not, !, by, tradgedies, a..."
5,"Subject: great nnews hello , welcome to medzo...",1,"[hlpplng, helter, the, allusion, a, total, day..."
6,Subject: here ' s a hot play in motion homela...,1,"[ensuring, stated, chemica, :, vita, economy, ..."
7,Subject: save your money buy getting this thin...,1,"[exactiy, viagra, the, minutes, money, a, that..."
8,Subject: undeliverable : home based business f...,1,"[:, kpn, reach, fjt, based, subject:, com, s, ..."
9,Subject: save your money buy getting this thin...,1,"[exactiy, viagra, the, minutes, money, a, that..."


In [31]:
# Calculating the probability that an email is spam
1.0*sum(emails['spam']==1)/len(emails)

0.2388268156424581

### 3. Training a naive Bayes model

Our plan is to write a dictionary, and in this dictionary record every word, and its pair of occurrences in spam and ham

In [32]:
model = {}

# Training process
for index, email in emails.iterrows():
    for word in email['words']:
        if word not in model:
            model[word] = {'spam': 1, 'ham': 1}
        if word in model:
            if email['spam']:
                model[word]['spam'] += 1
            else:
                model[word]['ham'] += 1

In [33]:
model

{'hotat': {'spam': 24, 'ham': 1},
 ':': {'spam': 798, 'ham': 3639},
 'website': {'spam': 204, 'ham': 135},
 'full': {'spam': 117, 'ham': 248},
 'havinq': {'spam': 10, 'ham': 1},
 'products': {'spam': 103, 'ham': 203},
 'guaranteed': {'spam': 96, 'ham': 15},
 '_': {'spam': 190, 'ham': 512},
 'shouldn': {'spam': 28, 'ham': 9},
 'naturally': {'spam': 9, 'ham': 8},
 'lt': {'spam': 28, 'ham': 1},
 'irresistible': {'spam': 5, 'ham': 1},
 'make': {'spam': 276, 'ham': 711},
 'budget': {'spam': 34, 'ham': 55},
 'stylish': {'spam': 15, 'ham': 1},
 'no': {'spam': 395, 'ham': 624},
 'at': {'spam': 450, 'ham': 2364},
 'subject:': {'spam': 1369, 'ham': 4361},
 'this': {'spam': 681, 'ham': 2672},
 'creativeness': {'spam': 25, 'ham': 1},
 'marketing': {'spam': 148, 'ham': 99},
 'will': {'spam': 484, 'ham': 2459},
 'stationery': {'spam': 112, 'ham': 1},
 'provided': {'spam': 109, 'ham': 113},
 'management': {'spam': 78, 'ham': 695},
 'break': {'spam': 30, 'ham': 71},
 'surethat': {'spam': 24, 'ham': 1}

In [34]:
model['lottery']

{'spam': 9, 'ham': 1}

In [35]:
model['sale']

{'spam': 39, 'ham': 42}

In [36]:
def predict_bayes(word):
    word = word.lower()
    num_spam_with_word = model[word]['spam']
    num_ham_with_word = model[word]['ham']
    return 1.0*num_spam_with_word/(num_spam_with_word + num_ham_with_word)

In [37]:
predict_bayes('lottery')

0.9

In [38]:
predict_bayes('sale')

0.48148148148148145

In [39]:
def predict_naive_bayes(email):
    email = email.lower()
    words = set(email.split())
    spams = []
    hams = []
    for word in words:
        if word in model:
            spams.append(model[word]['spam'])
            hams.append(model[word]['ham'])
    prod_spams = np.long(np.prod(spams))
    prod_hams = np.long(np.prod(hams))
    return 1.0*prod_spams/(prod_spams + prod_hams)

In [40]:
predict_naive_bayes('hi mom how are you')

0.0013894756610580057

In [41]:
predict_naive_bayes('Hi MOM how aRe yoU')

0.0013894756610580057

In [42]:
predict_naive_bayes('enter the lottery to win three million dollars')

0.38569290647197135

In [43]:
predict_naive_bayes('meet me at the lobby of the hotel at nine am')

0.02490194297492509

In [44]:
predict_naive_bayes('buy cheap lottery easy money now')

0.9913514898646872