# Classification of messages as spam or not spam using the Naive Bayes algorithm

In [1]:
import math

import pandas as pd

In [2]:
# Load the SMS corpus: a tab-separated file without a header row, so the two
# column names are supplied here.
# NOTE(review): with the default quoting, lines containing unbalanced double
# quotes may be merged by the parser; if the row count looks short, compare it
# against the raw line count and consider quoting=csv.QUOTE_NONE — confirm.
df = pd.read_table(
   'SMS',
   sep='\t',
   header=None,
   names=['label', 'sms_message'],
)
df

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
# Encode the text label as a numeric target: 'ham' -> 0, 'spam' -> 1.
label_codes = {'ham': 0, 'spam': 1}
df['is_spam'] = df['label'].map(label_codes)

In [4]:
# Data cleaning: normalise the raw text before it is tokenised.
#
# `regex=True` is required here. Since pandas 2.0, `Series.str.replace` treats
# the pattern as a literal string by default, so without it the literal text
# "\W" would be searched for and punctuation would be left in place.
# The raw string (r'...') avoids Python's invalid-escape-sequence warning.
df['sms_message'] = df['sms_message'].str.replace(r'\W', ' ', regex=True)  # every non-word character -> space
df['sms_message'] = df['sms_message'].str.lower()  # fold case so "Free" and "free" become the same token
df

  


Unnamed: 0,label,sms_message,is_spam
0,ham,go until jurong point crazy available only ...,0
1,ham,ok lar joking wif u oni,0
2,spam,free entry in 2 a wkly comp to win fa cup fina...,1
3,ham,u dun say so early hor u c already then say,0
4,ham,nah i don t think he goes to usf he lives aro...,0
...,...,...,...
5567,spam,this is the 2nd time we have tried 2 contact u...,1
5568,ham,will ü b going to esplanade fr home,0
5569,ham,pity was in mood for that so any other s...,0
5570,ham,the guy did some bitching but i acted like i d...,0


In [5]:
# Tokenise every message on whitespace, then collect the distinct tokens.
# The vocabulary is built from the whole dataset (the train/test split happens later).
df['sms_message'] = df['sms_message'].str.split()
vocabulary = list({token for tokens in df['sms_message'] for token in tokens})  # the set keeps one copy of each word
print(len(vocabulary))

8753


In [6]:
# Build the feature table: one column per vocabulary word, one row per message;
# cell (i, w) is the number of times word w occurs in message i.
n_messages = len(df['sms_message'])
word_counts_per_sms = {token: [0] * n_messages for token in vocabulary}

for row, tokens in enumerate(df['sms_message']):
   for token in tokens:
      word_counts_per_sms[token][row] += 1

word_counts = pd.DataFrame(word_counts_per_sms)
word_counts

Unnamed: 0,gotany,adults,joy,ideal,childporn,bugis,wow,carry,thedailydraw,onwards,...,inshah,simple,boo,woozles,soc,0825,broadband,text,torch,oh
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5568,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5569,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Place the per-word count columns to the right of the label/message columns.
df_new = pd.concat([df, word_counts], axis='columns')
df_new.head()

Unnamed: 0,label,sms_message,is_spam,gotany,adults,joy,ideal,childporn,bugis,wow,...,inshah,simple,boo,woozles,soc,0825,broadband,text,torch,oh
0,ham,"[go, until, jurong, point, crazy, available, o...",0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[ok, lar, joking, wif, u, oni]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,spam,"[free, entry, in, 2, a, wkly, comp, to, win, f...",1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,ham,"[u, dun, say, so, early, hor, u, c, already, t...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[nah, i, don, t, think, he, goes, to, usf, he,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Shuffle the rows: sampling the full fraction returns every row in random
# order, and the fixed seed makes that order repeatable.
# Caveat: this reassigns df_new from itself, so running the cell a second time
# shuffles the already-shuffled frame and yields a different order.
df_new = df_new.sample(
   frac=1,
   random_state=1,
)
df_new

Unnamed: 0,label,sms_message,is_spam,gotany,adults,joy,ideal,childporn,bugis,wow,...,inshah,simple,boo,woozles,soc,0825,broadband,text,torch,oh
1078,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4028,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
958,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4642,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4674,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
905,ham,"[we, re, all, getting, worried, over, here, de...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5192,ham,"[oh, oh, den, muz, change, plan, liao, go, bac...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
3980,ham,"[ceri, u, rebel, sweet, dreamz, me, little, bu...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
235,spam,"[text, meet, someone, sexy, today, u, can, fin...",1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [9]:
# Split the shuffled rows positionally: the first 80% become the training set,
# the remaining 20% the test set. Both get a fresh 0..n-1 index.
training_test_index = round(0.8 * len(df_new))
training = df_new.iloc[:training_test_index].reset_index(drop=True)
test = df_new.iloc[training_test_index:].reset_index(drop=True)

In [10]:
# Sanity check: (rows, columns) of the training set, then of the test set.
print(training.shape, test.shape, sep='\n')

(4458, 8756)
(1114, 8756)


In [11]:
# Laplace smoothing strength.
# alpha is added to every per-word count (and alpha * vocabulary size to the
# denominator) when the p(w|class) estimates are computed further down, so a
# word that never occurs in one class still gets a non-zero probability.
alpha = 1

In [12]:
# Preview the first rows of the training split.
training.head()

Unnamed: 0,label,sms_message,is_spam,gotany,adults,joy,ideal,childporn,bugis,wow,...,inshah,simple,boo,woozles,soc,0825,broadband,text,torch,oh
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Preview the first rows of the test split.
test.head()

Unnamed: 0,label,sms_message,is_spam,gotany,adults,joy,ideal,childporn,bugis,wow,...,inshah,simple,boo,woozles,soc,0825,broadband,text,torch,oh
0,ham,"[later, i, guess, i, needa, do, mcat, study, too]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[but, i, haf, enuff, space, got, like, 4, mb]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,spam,"[had, your, mobile, 10, mths, update, to, late...",1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,ham,"[all, sounds, good, fingers, makes, it, diffic...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[all, done, all, handed, in, don, t, know, if,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# You will start from here.

In [14]:
# Hints:
# Step 1: calculate P(Spam) and P(Ham)
# Step 2: count N_Spam, N_Ham, N_Vocabulary
# Step 3: count the number of times the word w occurs in spam/ham messages: N_w_spam, N_w_ham
# Step 4: P(w|Spam) = (N_w_spam + alpha) / (N_Spam + alpha * N_Vocabulary)
#         P(w|Ham)  = (N_w_ham + alpha) / (N_Ham + alpha * N_Vocabulary)

In [15]:
# Step 1: calculate P(Spam) and P(Ham)
# Separate the training rows by label; each prior is that class's share of
# the training messages.
spam_messages = training.loc[training['label'].eq('spam')]
ham_messages = training.loc[training['label'].eq('ham')]

n_training = len(training)
p_spam = len(spam_messages) / n_training
p_ham = len(ham_messages) / n_training

# Show P(Spam), then P(Ham)
print(p_spam, p_ham, sep='\n')

0.13458950201884254
0.8654104979811574


In [16]:
# Step 2: count N_Spam, N_Ham, N_Vocabulary

# Token count of each spam message, and their total (N_Spam)
n_wpm_spam = spam_messages['sms_message'].map(len)
N_Spam = n_wpm_spam.sum()

# Token count of each ham message, and their total (N_Ham)
n_wpm_ham = ham_messages['sms_message'].map(len)
N_Ham = n_wpm_ham.sum()

# Number of distinct words in the vocabulary list
N_Vocabulary = len(vocabulary)

print(N_Spam, N_Ham, N_Vocabulary, sep='\n')

15190
57237
8753


In [17]:
# Step 3: for every vocabulary word w, count its occurrences in spam messages
#         (N_w_spam) and in ham messages (N_w_ham) by summing its count column.
# Step 4: turn the counts into Laplace-smoothed likelihoods:
#         P(w|Spam) = (N_w_spam + alpha) / (N_Spam + alpha * N_Vocabulary)
#         P(w|Ham)  = (N_w_ham + alpha) / (N_Ham + alpha * N_Vocabulary)

# The denominators do not depend on the word, so compute them once.
spam_denominator = N_Spam + alpha * N_Vocabulary
ham_denominator = N_Ham + alpha * N_Vocabulary

# word -> P(word | Spam)
parameters_spam = {
   token: (spam_messages[token].sum() + alpha) / spam_denominator
   for token in vocabulary
}

# word -> P(word | Ham)
parameters_ham = {
   token: (ham_messages[token].sum() + alpha) / ham_denominator
   for token in vocabulary
}

# Calculate accuracy, precision, recall and F1 score.

In [18]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [19]:
def classify_test(message):
   """Classify one tokenised SMS as ham (0) or spam (1) with Naive Bayes.

   The score of each class is the class prior (`p_spam` / `p_ham`) combined
   with the likelihood of every word of the message that has an entry in
   `parameters_spam` / `parameters_ham`. Words without an entry are ignored.

   Scores are accumulated as sums of logarithms rather than products of
   probabilities: multiplying many small probabilities underflows to 0.0 for
   long messages, which would make both scores equal and force a "ham" verdict.

   Parameters
   ----------
   message : iterable of str
      The words of the message (the `sms_message` list of one row).

   Returns
   -------
   int
      1 if the spam score is strictly greater than the ham score, otherwise 0
      (ties go to ham).
   """

   def _log(probability):
      # log(0) is undefined; treat a zero probability as -infinity so the
      # comparison below still works instead of raising ValueError.
      return math.log(probability) if probability > 0 else float('-inf')

   log_spam_score = _log(p_spam)  # start from the prior of each class
   log_ham_score = _log(p_ham)

   for word in message:
      if word in parameters_spam:
         log_spam_score += _log(parameters_spam[word])

      if word in parameters_ham:
         log_ham_score += _log(parameters_ham[word])

   return 1 if log_spam_score > log_ham_score else 0

In [20]:
# Run the hand-written classifier on every test message and keep the 0/1
# verdicts in a new `predicted` column.
test['predicted'] = test['sms_message'].map(classify_test)
test.head()

Unnamed: 0,label,sms_message,is_spam,gotany,adults,joy,ideal,childporn,bugis,wow,...,simple,boo,woozles,soc,0825,broadband,text,torch,oh,predicted
0,ham,"[later, i, guess, i, needa, do, mcat, study, too]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[but, i, haf, enuff, space, got, like, 4, mb]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,spam,"[had, your, mobile, 10, mths, update, to, late...",1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
3,ham,"[all, sounds, good, fingers, makes, it, diffic...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[all, done, all, handed, in, don, t, know, if,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Model Evaluation
# Compare the hand-written classifier's predictions with the true labels.
actual_labels = test['is_spam']
predicted_labels = test['predicted']

score_reports = (
   ('Accuracy score: {}', accuracy_score),
   ('Precision score: {}', precision_score),
   ('Recall score: {}', recall_score),
   ('F1 score: {}', f1_score),
)
for template, scorer in score_reports:
   print(template.format(scorer(actual_labels, predicted_labels)))

Accuracy score: 0.9829443447037702
Precision score: 0.9102564102564102
Recall score: 0.9659863945578231
F1 score: 0.9372937293729372


# Use scikit-learn's MultinomialNB to validate your results, and print the classification performance.

In [22]:
# Look at the training split again before building the scikit-learn inputs.
training.head()

Unnamed: 0,label,sms_message,is_spam,gotany,adults,joy,ideal,childporn,bugis,wow,...,inshah,simple,boo,woozles,soc,0825,broadband,text,torch,oh
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Look at the test split again; it now also carries the `predicted` column.
test.head()

Unnamed: 0,label,sms_message,is_spam,gotany,adults,joy,ideal,childporn,bugis,wow,...,simple,boo,woozles,soc,0825,broadband,text,torch,oh,predicted
0,ham,"[later, i, guess, i, needa, do, mcat, study, too]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[but, i, haf, enuff, space, got, like, 4, mb]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,spam,"[had, your, mobile, 10, mths, update, to, late...",1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
3,ham,"[all, sounds, good, fingers, makes, it, diffic...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[all, done, all, handed, in, don, t, know, if,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
# X_train holds the predictors for the training set: the per-word count columns.
# Those columns were appended after the label/message columns, so they are the
# last len(vocabulary) columns. Deriving the width from the vocabulary, instead
# of hard-coding it, keeps this cell correct if the vocabulary size changes.
X_train = training.iloc[:, -len(vocabulary):]
X_train.head()

Unnamed: 0,gotany,adults,joy,ideal,childporn,bugis,wow,carry,thedailydraw,onwards,...,inshah,simple,boo,woozles,soc,0825,broadband,text,torch,oh
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
# X_test holds the predictors for the test set: the per-word count columns.
# The `predicted` column added by the hand-written classifier is not a feature,
# so it is dropped first (errors='ignore' lets the cell run even if that column
# has not been created). The remaining last len(vocabulary) columns are the
# word counts. This builds a new frame instead of deleting from a slice of `test`.
X_test = test.drop(columns=['predicted'], errors='ignore').iloc[:, -len(vocabulary):]
X_test.head()

Unnamed: 0,gotany,adults,joy,ideal,childporn,bugis,wow,carry,thedailydraw,onwards,...,inshah,simple,boo,woozles,soc,0825,broadband,text,torch,oh
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
# Outcomes: the 0/1 `is_spam` target for the training rows and for the test rows.
y_train, y_test = training['is_spam'], test['is_spam']

In [31]:
# Validate the hand-written classifier against scikit-learn's MultinomialNB
# and report its classification performance in the next cell.

# Create the model with the same smoothing strength as the hand-written
# classifier, so the two stay comparable if `alpha` is changed.
mnb = MultinomialNB(alpha=alpha)

# Fit on the training set only.
mnb.fit(X_train, y_train)

# Predict on the test set: performance is measured on messages the model was
# not fitted on.
y_pred = mnb.predict(X_test)

In [32]:
# Model Evaluation
# Score the MultinomialNB predictions against the true test labels.
score_reports = (
   ('Accuracy score: {}', accuracy_score),
   ('Precision score: {}', precision_score),
   ('Recall score: {}', recall_score),
   ('F1 score: {}', f1_score),
)
for template, scorer in score_reports:
   print(template.format(scorer(y_test, y_pred)))

# These scores can be compared with those of the hand-written classifier above
# to check that the two implementations agree.

Accuracy score: 0.9829443447037702
Precision score: 0.9102564102564102
Recall score: 0.9659863945578231
F1 score: 0.9372937293729372
