First, let's get the data cleaned up and binarized.  Not too hard to do this by hand. 

In [1]:
# Read the raw CSV as a list of text lines (newline characters are kept on each line).
with open('spam.csv', encoding="latin1") as sc:
    slines = list(sc)

In [2]:
slines[1]

'ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...",,,\n'

In [3]:
# Split each raw line once, at its first comma, into [label, rest-of-line].
slines = [raw_line.split(',', 1) for raw_line in slines]

In [4]:
import string
# Python 3 idiom: str.maketrans('', '', chars) builds a translation table that
# deletes every character in chars (here: all ASCII punctuation).
translator = str.maketrans('', '', string.punctuation)
# Strip punctuation from the text field of every row. slines[1:] skips the first
# row (presumably the CSV header line -- confirm against the file).
# NOTE(review): because of that slice, re-running this cell drops one more row each time.
slines = [[x[0], x[1].translate(translator)] for x in slines[1:]]

In [5]:
# Normalise the message text: remove newline characters and lower-case it.
# The label field is carried over unchanged.
slines = [[label, body.replace("\n", "").lower()] for label, body in slines]

In [6]:
slines[1:3]

[['ham', 'ok lar joking wif u oni'],
 ['spam',
  'free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005 text fa to 87121 to receive entry questionstd txt ratetcs apply 08452810075over18s']]

In [7]:
from functools import reduce
# Vocabulary: every distinct token obtained by splitting each message on single
# spaces. Built in one pass with a set comprehension; the previous
# reduce-with-list-concatenation copied the growing list on every step (quadratic).
dictionary = {token for line in slines for token in line[1].split(" ")}

In [8]:
print(list(dictionary)[0:10])

['', 'del', 'corrupt', 'jamstercouk', 'peeps', 'cricket', 'les', 'okay', 'onlinewhy', 'occupy']


In [9]:
# Drop the empty-string token and turn the vocabulary into a list, so it can be
# sliced and concatenated with the header label later on.
dictionary = list(filter(None, dictionary))

In [10]:
dictionary[0:10]

['del',
 'corrupt',
 'jamstercouk',
 'peeps',
 'cricket',
 'les',
 'okay',
 'onlinewhy',
 'occupy',
 '09061213237']

In [11]:
len(dictionary)

9663

In [12]:
def make_binary_features(sline, d):
    """Turn one [label, text] pair into a binary feature row.

    Args:
        sline: Two-element sequence; sline[0] is the label ("spam" or "ham"),
            sline[1] is the message text.
        d: Iterable of vocabulary entries, in the order the features should appear.

    Returns:
        A list whose first element is the label (1 for "spam", 0 for "ham"),
        followed by one 0/1 flag per entry of ``d`` telling whether that entry
        occurs among the tokens of the text split on single spaces.

    Raises:
        KeyError: if the label is neither "spam" nor "ham".
    """
    label = {"spam": 1, "ham": 0}[sline[0]]
    # A set makes each membership test O(1); the vocabulary is far larger than
    # any single message, so scanning a token list per entry is wasteful.
    tokens_present = set(sline[1].split(" "))
    return [label] + [1 if entry in tokens_present else 0 for entry in d]

In [13]:
# Strip punctuation from the label field too (presumably so every label is
# exactly "spam" or "ham", the keys make_binary_features looks up -- confirm).
slines = [[label.translate(translator), text] for label, text in slines]
# One binary feature row per message: [label, word-present flags...].
dataset = [make_binary_features(message, dictionary) for message in slines]

In [14]:
sum(dataset[0])

20

In [15]:
len(set(slines[0][1].split(" ")))

20

In [16]:
def idiot_check_counter(index):
    """Sanity check: does the feature row at ``index`` have as many ones as the message has distinct tokens?

    Compares the sum of the whole row ``dataset[index]`` with the number of
    distinct tokens in ``slines[index][1]`` split on single spaces.

    Caveats (the check can return False for a correctly built row):
      * the row sum includes the leading label element, which is 1 for spam;
      * consecutive spaces produce an empty-string token, which is counted here
        but was filtered out of the vocabulary.
    """
    ones_in_row = sum(dataset[index])
    distinct_tokens = set(slines[index][1].split(" "))
    return ones_in_row == len(distinct_tokens)

In [17]:
idiot_check_counter(20)


True

In [18]:
idiot_check_counter(100)

True

In [19]:
idiot_check_counter(200)

False

In [20]:
sum(dataset[200])

7

In [21]:
len(set(slines[200][1].split(" ")))

8

In [22]:
slines[200]

['ham', 'found it enc  ltgt  where you at']

In [23]:
##  is this an empty space problem? 
slines[200][1].split(" ")

['found', 'it', 'enc', '', 'ltgt', '', 'where', 'you', 'at']

In [24]:
# ok, that's fine.  add labels and we're solid.
headers = ["LABEL"] + dictionary

In [25]:
todf = [headers] + dataset

In [26]:
import pandas as pd
spamdf = pd.DataFrame(todf[1:],columns=todf[0])

In [27]:
spamdf.head()

Unnamed: 0,LABEL,del,corrupt,jamstercouk,peeps,cricket,les,okay,onlinewhy,occupy,...,organizer,erything,top,priscillas,intrude,th,fondly,seeno,luck,247mp
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
spamdf = spamdf.rename(columns = {'fit': 'fit_feature'})
# Apparently sklearn can't handle a column being named "fit": https://stackoverflow.com/questions/39745807/typeerror-expected-sequence-or-array-like-got-estimator

OK, now that I know the problem behaves well with Naive Bayes (because of course it does), it's time to implement my own: first in Python, then in Haskell.

In [29]:
prob_y=spamdf["LABEL"].sum() / len(spamdf)

In [30]:
prob_y

0.13401506996770721

In [31]:
spams = spamdf[spamdf["LABEL"] == 1]

In [32]:
len(spams)

747

In [33]:
notspams = spamdf[spamdf["LABEL"] == 0]

In [34]:
len(list(spamdf))

9664

In [35]:
def conditional_probability_dict(column_label, condition_df):
    """Return ``{column_label: p}`` with an add-one smoothed estimate of the column's rate.

    ``p`` is (sum of the column + 1) / (number of rows + 2), so a column that is
    all zeros (or all ones) in ``condition_df`` never yields exactly 0 or 1.

    Args:
        column_label: Name of the column to estimate.
        condition_df: DataFrame already restricted to the conditioning class.

    Returns:
        A single-entry dict mapping ``column_label`` to the smoothed estimate.
    """
    hits = condition_df[column_label].sum()
    n_rows = len(condition_df)
    smoothed = (hits + 1) / (n_rows + 2)
    return {column_label: smoothed}

In [36]:
import toolz

# Smoothed P(word present | spam) for every word column; column 0 is LABEL, so skip it.
x_probs_conditional_on_spam = [
    conditional_probability_dict(word, spams) for word in spams.columns[1:]
]
x_spam_lookup = toolz.merge(x_probs_conditional_on_spam)

# The same table, estimated from the non-spam messages.
x_probs_conditional_on_notspam = [
    conditional_probability_dict(word, notspams) for word in notspams.columns[1:]
]
x_notspam_lookup = toolz.merge(x_probs_conditional_on_notspam)


In [37]:
x_notspam_lookup["pleasure"]

0.0016566576931041624

In [38]:
x_spam_lookup["pleasure"]

0.0013351134846461949

In [39]:
x_notspam_lookup["hadya"]

0.00041416442327604059

In [40]:
x_spam_lookup["hadya"]

0.0013351134846461949

In [41]:
# SUBSET
from math import log # to keep the numbers from being too small.  summing rather than multiplying because of log transform: https://stats.stackexchange.com/questions/163088/how-to-use-log-probabilities-for-gaussian-naive-bayes  http://aritter.github.io/courses/5525_slides/probability_nb.pdf
features = list(spamdf)[1:]
def my_predict(row, prior=None, spam_lookup=None, notspam_lookup=None, feature_names=None):
    """Classify one binary feature row as spam (1) or not spam (0).

    Each class score is a sum of log-probabilities, which avoids underflow from
    multiplying many small numbers. Only features whose value in ``row`` equals 1
    contribute; features that are 0 add nothing to either score.

    Args:
        row: Object indexable by feature name (e.g. a DataFrame row or a dict).
        prior: P(spam). Defaults to the notebook-level ``prob_y``.
        spam_lookup: ``{feature: P(feature present | spam)}``.
            Defaults to the notebook-level ``x_spam_lookup``.
        notspam_lookup: ``{feature: P(feature present | not spam)}``.
            Defaults to the notebook-level ``x_notspam_lookup``.
        feature_names: Feature names to score. Defaults to the notebook-level ``features``.

    Returns:
        1 if the spam score is greater than or equal to the not-spam score, else 0.

    Raises:
        ValueError: if ``prior`` is 0 or 1, or a looked-up probability is 0
            (``math.log`` domain error).
    """
    if prior is None:
        prior = prob_y
    if spam_lookup is None:
        spam_lookup = x_spam_lookup
    if notspam_lookup is None:
        notspam_lookup = x_notspam_lookup
    if feature_names is None:
        feature_names = features

    # Class priors in log space. P(not spam) = 1 - P(spam), so its log is
    # log(1 - prior) -- not 1 - log(prior), which is not a log-probability at all
    # and skewed every decision toward "not spam".
    prob_spam = log(prior)
    prob_notspam = log(1 - prior)
    for feat in feature_names:
        if row[feat] == 1:
            prob_spam += log(spam_lookup[feat])
            prob_notspam += log(notspam_lookup[feat])
    return 1 if prob_spam >= prob_notspam else 0

In [42]:
my_preds = spamdf.head(20).apply(my_predict, axis = 1)

In [43]:
import cProfile

In [44]:
cProfile.run("spamdf.head(20).apply(my_predict, axis = 1)", "profileme")

In [45]:
my_preds

0     0
1     0
2     1
3     0
4     0
5     1
6     0
7     1
8     1
9     1
10    0
11    1
12    1
13    0
14    0
15    1
16    0
17    0
18    0
19    1
dtype: int64

In [46]:
spamdf.head(20)

Unnamed: 0,LABEL,del,corrupt,jamstercouk,peeps,cricket,les,okay,onlinewhy,occupy,...,organizer,erything,top,priscillas,intrude,th,fondly,seeno,luck,247mp
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [47]:
# Print each prediction next to the true label of the same row.
# zip pairs the two positionally, so the 20-row slice is taken once rather than
# on every iteration, and no label-based integer lookup is needed.
for item, actual in zip(my_preds, spamdf.head(20)["LABEL"]):
    print(item, actual)

0 0
0 0
1 1
0 0
0 0
1 1
0 0
1 0
1 1
1 1
0 0
1 1
1 1
0 0
0 0
1 1
0 0
0 0
0 0
1 1
