# Building a Spam Filter with Naive Bayes

The purpose of this project is to differentiate between spam and non spam messages using various statistic techniques including the Naive Bayes Algorithm. I begin by exploring the data through pandas.

In [15]:
import pandas as pd

sms_spam_ds = pd.read_csv('SMSSpamCollection', sep='\t',header = None, names=['Label','SMS'])
print(sms_spam_ds.shape) #understand size of dataframe
sms_spam_ds.head(10) #idea of amount of spam versus ham mail

(5572, 2)


Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [16]:
spam_vs_nonspam = sms_spam_ds["Label"].value_counts(normalize=True)
print(spam_vs_nonspam)

ham     0.865937
spam    0.134063
Name: Label, dtype: float64


From the above code box, it is clear that there are more ham messages than spam messages percentage wise. In this section, I am splitting the data into a training and testing set. I split it 80-20 between training vs. testing.

In [17]:
#frac to randomize dataset & random state to reproduce results
randomized_ds = sms_spam_ds.sample(frac=1, random_state=1) 

#splitting the dataset 80-20 for training versus testing set
training_index = round(len(randomized_ds) * 0.8)

#Splitting the data into the training and testing datasets
training_ds = randomized_ds[:training_index].reset_index(drop=True)
test_ds = randomized_ds[training_index:].reset_index(drop=True)

#printing the shape to make sure proper 80-20 split
print(training_ds.shape)
print(test_ds.shape)

(4458, 2)
(1114, 2)


In order to check that the break down in the testing and training set is equivalent to that of all the data, I calculated the normalized percentages of these 2 datasets.

In [18]:
#checking to see if percentage of spam and ham messages in training & testing are similar to the percentages in full data set
training_ds["Label"].value_counts(normalize=True) * 100
test_ds["Label"].value_counts(normalize=True) * 100
#output shows similiar percentage breakdown

ham     86.804309
spam    13.195691
Name: Label, dtype: float64

Below, I begin cleaning the data by removing punctuation and lower case letters in order to prepare to apply the Naive Bayes Theorem.

In [19]:
#checking to make sure dataset is outputting accurately
training_ds.head()

Unnamed: 0,Label,SMS
0,ham,"Yep, by the pretty sculpture"
1,ham,"Yes, princess. Are you going to make me moan?"
2,ham,Welp apparently he retired
3,ham,Havent.
4,ham,I forgot 2 ask ü all smth.. There's a card on ...


In [20]:
#Removing punctuation from the SMS messages
training_ds['SMS'] = training_ds['SMS'].str.replace('\W', ' ')
#Lowercase letters
training_ds['SMS'] = training_ds['SMS'].str.lower()
#Review changes
training_ds.head(5)

Unnamed: 0,Label,SMS
0,ham,yep by the pretty sculpture
1,ham,yes princess are you going to make me moan
2,ham,welp apparently he retired
3,ham,havent
4,ham,i forgot 2 ask ü all smth there s a card on ...


In this section, I am creating a unique list of all the words in the messages in the training set.

In [21]:
training_ds['SMS'] = training_ds['SMS'].str.split()

vocab = []
for sms in training_ds['SMS']:
    for word in sms:
        vocab.append(word)
        
vocab = list(set(vocab))

In [22]:
len(vocab)

7783

In this section, I begin to create a specific dictionary for the training set.

In [23]:
word_counts_per_sms = {unique_word: [0] * len(training_ds['SMS']) for unique_word in vocab}

for index, sms in enumerate(training_ds['SMS']):
    for word in sms:
        word_counts_per_sms[word][index] += 1

In [None]:
word_counts = pd.DataFrame(word_counts_per_sms)
word_counts.head()

Unnamed: 0,0,00,000,000pes,008704050406,0089,01223585334,02,0207,02072069400,...,zindgi,zoe,zogtorius,zouk,zyada,é,ú1,ü,〨ud,鈥
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0


In [None]:
training_set_clean = pd.concat([training_ds, word_counts], axis=1)
training_set_clean.head()

In order to utilize the Naive Bayes Algorithm, I first need to calculate the probability of spam and ham in order to apply the formula. 

In [None]:
# Isolating spam and ham messages first
spam_messages = training_set_clean[training_set_clean['Label'] == 'spam']
ham_messages = training_set_clean[training_set_clean['Label'] == 'ham']

# P(Spam) and P(Ham)
p_spam = len(spam_messages) / len(training_set_clean)
p_ham = len(ham_messages) / len(training_set_clean)

# N_Spam
n_words_per_spam_message = spam_messages['SMS'].apply(len)
n_spam = n_words_per_spam_message.sum()

# N_Ham
n_words_per_ham_message = ham_messages['SMS'].apply(len)
n_ham = n_words_per_ham_message.sum()

# N_Vocabulary
n_vocabulary = len(vocab)

# Laplace smoothing
alpha = 1

In [None]:
# Initiate parameters
parameters_spam = {unique_word:0 for unique_word in vocab}
parameters_ham = {unique_word:0 for unique_word in vocab}

In [None]:
# Calculate parameters
for word in vocab:
    n_word_given_spam = spam_messages[word].sum()  
    p_word_given_spam = (n_word_given_spam + alpha) / (n_spam + alpha*n_vocabulary)
    parameters_spam[word] = p_word_given_spam
    
    n_word_given_ham = ham_messages[word].sum() 
    p_word_given_ham = (n_word_given_ham + alpha) / (n_ham + alpha*n_vocabulary)
    parameters_ham[word] = p_word_given_ham

In [None]:
import re

def classify(message):
    
    message = re.sub('\W', ' ', message)
    message = message.lower().split()
    
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham

    for word in message:
        if word in parameters_spam:
            p_spam_given_message *= parameters_spam[word]
            
        if word in parameters_ham:
            p_ham_given_message *= parameters_ham[word]
            
    print('P(Spam|message):', p_spam_given_message)
    print('P(Ham|message):', p_ham_given_message)
    
    if p_ham_given_message > p_spam_given_message:
        print('Label: Ham')
    elif p_ham_given_message < p_spam_given_message:
        print('Label: Spam')
    else:
        print('Equal proabilities! Must be classificed by hand')

In the cells below, I am testing whether the classify function works as expected.

In [None]:

classify('WINNER!! This is the secret code to unlock the money: C3421.')

In [None]:
classify("Sounds good, Tom, then see u there")

In [None]:
def classify_test_set(message):    
    '''
    message: a string
    '''
    
    message = re.sub('\W', ' ', message)
    message = message.lower().split()
    
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham

    for word in message:
        if word in parameters_spam:
            p_spam_given_message *= parameters_spam[word]
            
        if word in parameters_ham:
            p_ham_given_message *= parameters_ham[word]
    
    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    elif p_spam_given_message > p_ham_given_message:
        return 'spam'
    else:
        return 'needs human classification'

In [None]:
test_ds['predicted'] = test_ds['SMS'].apply(classify_test_set)
test_ds.head()

Calculating the number of correctly and incorrectly labeled messages.

In [None]:
correct = 0
total = test_ds.shape[0]
    
for row in test_ds.iterrows():
    row = row[1]
    if row['Label'] == row['predicted']:
        correct += 1
        
print('Correct:', correct)
print('Incorrect:', total - correct)
print('Accuracy:', correct/total)

From the previous cells output, it is evident that majority of the messages were marked as their category correctly. For a future project, I plan to investigate the messages that were grouped incorrectly.