### Naïve Bayes spam classifier

In [50]:
# my imports:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [26]:
spam_mail_df = pd.read_csv('data/mail_data.csv', names = ['category', 'message'], skiprows = 1)

spam_train = spam_mail_df.head(4458)
spam_test = spam_mail_df.tail(1114)


In [27]:
# Paretto principle
print(f'{len(spam_mail_df) * 0.8:.0f}')
print(f'{len(spam_mail_df) * 0.2:.0f}')

4458
1114


In [28]:
spam_train

Unnamed: 0,category,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
4453,ham,I've told you everything will stop. Just dont ...
4454,ham,Or I guess &lt;#&gt; min
4455,ham,I'm home. Ard wat time will u reach?
4456,ham,"Storming msg: Wen u lift d phne, u say ""HELLO""..."


In [29]:
spam_test

Unnamed: 0,category,message
4458,ham,Aight should I just plan to come up later toni...
4459,ham,Die... I accidentally deleted e msg i suppose ...
4460,spam,Welcome to UK-mobile-date this msg is FREE giv...
4461,ham,This is wishing you a great day. Moji told me ...
4462,ham,Thanks again for your reply today. When is ur ...
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [30]:
spam_test.message.sample()

4881    alright tyler's got a minor crisis and has to ...
Name: message, dtype: object

In [31]:
spam_train.category.value_counts()

category
ham     3856
spam     602
Name: count, dtype: int64

In [40]:
print(spam_train.loc[0].message)
print(spam_train.loc[0].category)

Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
ham


In [44]:
Counter(spam_train.loc[0].message.lower().split(' '))

Counter({'go': 1,
         'until': 1,
         'jurong': 1,
         'point,': 1,
         'crazy..': 1,
         'available': 1,
         'only': 1,
         'in': 1,
         'bugis': 1,
         'n': 1,
         'great': 1,
         'world': 1,
         'la': 1,
         'e': 1,
         'buffet...': 1,
         'cine': 1,
         'there': 1,
         'got': 1,
         'amore': 1,
         'wat...': 1})

In [46]:
counter = CountVectorizer(strip_accents = 'unicode')

In [47]:
counter.fit(spam_train.message)

In [48]:
counter.vocabulary_

{'go': 3166,
 'until': 7204,
 'jurong': 3889,
 'point': 5306,
 'crazy': 2050,
 'available': 1140,
 'only': 4964,
 'in': 3647,
 'bugis': 1547,
 'great': 3238,
 'world': 7624,
 'la': 4003,
 'buffet': 1545,
 'cine': 1805,
 'there': 6850,
 'got': 3205,
 'amore': 927,
 'wat': 7416,
 'ok': 4934,
 'lar': 4034,
 'joking': 3858,
 'wif': 7531,
 'oni': 4960,
 'free': 2987,
 'entry': 2616,
 'wkly': 7586,
 'comp': 1907,
 'to': 6956,
 'win': 7544,
 'fa': 2742,
 'cup': 2104,
 'final': 2852,
 'tkts': 6944,
 '21st': 355,
 'may': 4414,
 '2005': 347,
 'text': 6805,
 '87121': 679,
 'receive': 5644,
 'question': 5549,
 'std': 6486,
 'txt': 7116,
 'rate': 5597,
 'apply': 1008,
 '08452810075over18': 68,
 'dun': 2479,
 'say': 5945,
 'so': 6301,
 'early': 2497,
 'hor': 3494,
 'already': 901,
 'then': 6845,
 'nah': 4692,
 'don': 2394,
 'think': 6865,
 'he': 3369,
 'goes': 3173,
 'usf': 7245,
 'lives': 4174,
 'around': 1054,
 'here': 3409,
 'though': 6883,
 'freemsg': 2993,
 'hey': 3419,
 'darling': 2156,
 'it':

In [53]:
word_counts = counter.transform(spam_train.message)

In [59]:
word_counts_test = counter.transform(spam_test.message)

In [55]:
model = MultinomialNB()

In [57]:
model.fit(word_counts, spam_train.category)

In [58]:
model.score(word_counts, spam_train.category)

0.9934948407357559

In [60]:
model.score(word_counts_test, spam_test.category)

0.9856373429084381

In [61]:
test_sample = spam_test.sample(10)

In [69]:
predictions = model.predict(counter.transform(test_sample.message))
predictions

array(['ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam',
       'ham'], dtype='<U4')

In [68]:
sample = np.array(test_sample.category)
sample

array(['ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'spam',
       'ham'], dtype=object)

In [67]:
predictions == sample

array([ True,  True,  True,  True,  True,  True, False,  True,  True,
        True])

In [71]:
(predictions == sample).sum() / len(sample)

0.9