In [177]:
import pandas as pd
from nltk import word_tokenize
import math
import operator
from random import shuffle

In [38]:
full_df = pd.read_table('spam.csv', encoding='latin-1', header=0, delimiter=',')

In [39]:
df = full_df[['v1','v2']]
df.columns = ['class', 'text']

In [40]:
print(df.head())

  class                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [64]:

sents = df.head()['text'].tolist()

words = [ word_tokenize(sent) for sent in sents ]
collapse = [ word.lower() for sent in words for word in sent ]

print(sents)
print(words)
print(collapse)
print(set(collapse))

['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', 'Ok lar... Joking wif u oni...', "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", 'U dun say so early hor... U c already then say...', "Nah I don't think he goes to usf, he lives around here though"]
[['Go', 'until', 'jurong', 'point', ',', 'crazy..', 'Available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', '...', 'Cine', 'there', 'got', 'amore', 'wat', '...'], ['Ok', 'lar', '...', 'Joking', 'wif', 'u', 'oni', '...'], ['Free', 'entry', 'in', '2', 'a', 'wkly', 'comp', 'to', 'win', 'FA', 'Cup', 'final', 'tkts', '21st', 'May', '2005', '.', 'Text', 'FA', 'to', '87121', 'to', 'receive', 'entry', 'question', '(', 'std', 'txt', 'rate', ')', 'T', '&', 'C', "'s", 'apply', '08452810075over18', "'s"], ['U', 'dun', 'say', 'so', 'early', 'hor', '...', 'U

In [178]:
def prep_data(df):
    
    spam_sents = df.loc[df['class'] == 'spam']['text'].tolist()
    ham_sents = df.loc[df['class'] == 'ham']['text'].tolist()
    shuffle(spam_sents)
    shuffle(ham_sents)
    
    spam_words = [ word_tokenize(sent) for sent in spam_sents ]
    spam_collapse = [ word.lower() for sent in spam_words for word in sent ]
    ham_words = [ word_tokenize(sent) for sent in ham_sents ]
    ham_collapse = [ word.lower() for sent in ham_words for word in sent ]
    
    labels = df['class'].tolist()
    sents = df['text'].tolist()
    
    words = spam_collapse + ham_collapse
    
    n = int(0.8*len(labels))
    
    return labels[:n], labels[n:], sents[:n], sents[n:], set(words), spam_collapse, ham_collapse

In [179]:
y_train, y_test, X_train, X_test, vocab, spam_words, ham_words = prep_data(df)

In [None]:
# prior probability * likelyhood

In [180]:
def priors(labels):
    return { cl: math.log(float(labels.count(cl)) / len(labels)) for cl in labels}

In [181]:
priors = priors(y_train)
print(priors)

{'ham': -0.1451048869491259, 'spam': -2.0019737276377345}


In [182]:
def likelihood(vocab, spam_words, ham_words):
    ALL = spam_words + ham_words
    mles = {'spam': {}, 'ham': {}}
    
    for word in vocab:
        mles['spam'][word] = math.log((spam_words.count(word) + 1) / float((ALL.count(word) + len(vocab))))
        mles['ham'][word] = math.log((ham_words.count(word) + 1) / float((ALL.count(word) + len(vocab))))
    
    return mles

In [183]:
mles = likelihood(vocab, spam_words, ham_words)

In [184]:
def NaiveBayes(priors, mles, sent):
    preds = {}
    for cl in mles.keys():
        pred = priors[cl]
        words = word_tokenize(sent)
        for word in words:
            if word in mles[cl].keys():
                pred += mles[cl][word]
        preds[cl] = pred
    return max(preds.items(), key=operator.itemgetter(1))[0]

In [144]:
NaiveBayes(priors, mles,)

'ham'

In [185]:
def counts(x, label):
    tp = 0
    fp = 0
    fn = 0
    tn = 0
    for ind in range(len(x)):
        pred_l = NaiveBayes(priors, mles, x[ind])
        #print(sent)
        #print(pred_l)
        if pred_l == label:
            tp += 1
        else:
            fp += 1
    return tp, fp

spam = []
ham = []
for ind in range(len(X_test)):
    if y_test[ind] == 'spam':
        spam.append(X_test[ind])
    else:
        ham.append(X_test[ind])

sp_tp, sp_fp = counts(spam, 'spam')
sp_tn, sp_fn = counts(ham, 'ham')

#print(len(X_test))
# accuracy = float(tp)/len(y_test)

sp_precision = float(sp_tp)/(sp_tp+sp_fp)
sp_accuracy = float(sp_tp+sp_tn)/len(y_test)
sp_recall = float(sp_tp)/(sp_tp+sp_fn)
f1 = ((sp_precision * sp_recall)/(sp_precision + sp_recall)) * 2

print(f1)

0.6448598130841121
