# Spam Filter with Naive Bayes

... some description ...

## Setup

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [2]:
%config InlineBackend.figure_format = 'retina'
sns.set_style('darkgrid')

In [3]:
msg = pd.read_csv('SMSSpamCollection', sep='\t', header=None, names=['Label', 'SMS'])

## Dataset

### distribution non-spam / spam

In [4]:
msg.Label.value_counts(normalize=True) * 100

ham     86.593683
spam    13.406317
Name: Label, dtype: float64

### word count

In [5]:
msg.SMS.apply(lambda x: len(x.split(' '))).describe()

count    5572.000000
mean       15.709440
std        11.493753
min         1.000000
25%         7.000000
50%        12.000000
75%        23.000000
max       171.000000
Name: SMS, dtype: float64

### Splitting Training / Testing Data

In [6]:
msg_rand = msg.sample(frac=1, random_state=1)
bd_80 = round(len(msg_rand) * 0.8)

msg_training = msg_rand.iloc[:bd_80].reset_index(drop=True)
msg_testing = msg_rand.iloc[bd_80:].reset_index(drop=True)

print('\nTraining data ({} rows)'.format(len(msg_training)))
print(msg_training.Label.value_counts(normalize=True))
print('\nTesting data ({} rows)'.format(len(msg_testing)))
print(msg_testing.Label.value_counts(normalize=True))


Training data (4458 rows)
ham     0.86541
spam    0.13459
Name: Label, dtype: float64

Testing data (1114 rows)
ham     0.868043
spam    0.131957
Name: Label, dtype: float64


## Data Cleaning and Preparation

In [7]:
msg_training['SMS'] = msg_training['SMS']\
    .str.replace('\W', ' ')\
    .str.lower()\
    .str.split()
msg_training['SMS'].head()

  msg_training['SMS'] = msg_training['SMS']\


0                    [yep, by, the, pretty, sculpture]
1    [yes, princess, are, you, going, to, make, me,...
2                      [welp, apparently, he, retired]
3                                             [havent]
4    [i, forgot, 2, ask, ü, all, smth, there, s, a,...
Name: SMS, dtype: object

### Vocabulary

In [8]:
voc_set = set()
for words in list(msg_training.SMS):
    voc_set.update(words)
vocabulary = list(voc_set)
print('Words in Vocabulary:', len(vocabulary))

Words in Vocabulary: 7783


### Generate word count table

In [9]:
word_counts_per_sms = {unique_word: [0] * len(msg_training['SMS']) for unique_word in vocabulary}

for index, sms in enumerate(msg_training['SMS']):
    for word in sms:
        word_counts_per_sms[word][index] += 1

In [10]:
word_counts = pd.DataFrame(word_counts_per_sms)

In [11]:
msg_training_wc = pd.concat([
    word_counts,
    msg_training
])

In [12]:
word_counts.shape

(4458, 7783)

## Spam Filter

### multinomial Naive Beyes

In [13]:
p_spam = msg_training.Label.value_counts(normalize=True)['ham']
p_ham = 1 - p_spam
spam = msg_training.Label == 'spam'
ham = msg_training.Label == 'ham'
N_spam = msg_training[spam]['SMS'].apply(len).sum()
N_ham = msg_training[ham]['SMS'].apply(len).sum()
N_vocabulary = len(vocabulary)
alpha = 1

print('P(Spam):', p_spam)
print('P(Ham):', p_ham)
print('N_Spam:', N_spam)
print('N_Ham:', N_ham)
print('N_Vocabulary:', N_vocabulary)
print('Alpha:', alpha)

P(Spam): 0.8654104979811574
P(Ham): 0.13458950201884257
N_Spam: 15190
N_Ham: 57237
N_Vocabulary: 7783
Alpha: 1


In [14]:
init_voc_dict = { word:0 for word in vocabulary }
spam_parameters = init_voc_dict.copy()
ham_parameters = init_voc_dict.copy()

def smooth(n, d):
    return (n + alpha) / (d + alpha * N_vocabulary)

for word in vocabulary:
    wc = msg_training_wc[word]
    spam_parameters[word] = smooth(wc[spam].sum(), N_spam)
    ham_parameters[word] = smooth(wc[ham].sum(), N_ham)

### Classification

In [28]:
def classify(message):
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham
    
    cleaned_msg = re.sub(r'\W', ' ', message).lower()
    words = cleaned_msg.split()
    
    for word in words:
        if word in spam_parameters:
            # print(word, 'in spam')
            p_spam_given_message *= spam_parameters[word]
            
        if word in ham_parameters:
            # print(word, 'in ham')
            p_ham_given_message *= ham_parameters[word] 
            
    #print(p_spam_given_message)
    #print(p_ham_given_message)
    
    label = ('spam' if p_spam_given_message > p_ham_given_message
            else 'ham' if p_spam_given_message < p_ham_given_message
            else 'equal probability')
    return label

## Tests

In [29]:
msg_testing['predicted'] = msg_testing.SMS.apply(classify)
msg_testing.head()

Unnamed: 0,Label,SMS,predicted
0,ham,Later i guess. I needa do mcat study too.,ham
1,ham,But i haf enuff space got like 4 mb...,ham
2,spam,Had your mobile 10 mths? Update to latest Oran...,spam
3,ham,All sounds good. Fingers . Makes it difficult ...,ham
4,ham,"All done, all handed in. Don't know if mega sh...",ham


### Accuracy

In [30]:
correct = 0
total = len(msg_testing)

for idx, row in msg_testing.iterrows():
    if (row.Label == row.predicted):
        correct += 1

accuracy = correct / total

print('\nAccuracy:', accuracy)


Accuracy: 0.952423698384201


In [31]:
classify('WINNER!! This is the secret code to unlock the money: C3421.')

'spam'