# Classification of messages as spam or not spam using Naive Bayes algorithm

**Import Dataset - upload the SMS text file to the content folder on the left panel before running**

In [139]:
import pandas as pd
import numpy as np

# Import Dataset - upload the SMS text file to the content folder on the left panel before running
df = pd.read_table('SMS.txt', sep='\t', header=None, names=['label', 'sms_message'])
df

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [140]:
# map the 'ham' value to 0 and the 'spam' value to 1.
df['label_binary'] = df.label.map({'ham':0,'spam':1})
df.head()

Unnamed: 0,label,sms_message,label_binary
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [44]:
# Get stats
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
ham,4825
spam,747


In [141]:
#  data cleaning
df['sms_message'] = df['sms_message'].str.replace(r'[\W_]+', ' ', regex=True).str.strip() # Removes punctuation and leading/trailing spaces
df['sms_message'] = df['sms_message'].str.lower() ### making all the words lowercase
df.head(10)

Unnamed: 0,label,sms_message,label_binary
0,ham,go until jurong point crazy available only in ...,0
1,ham,ok lar joking wif u oni,0
2,spam,free entry in 2 a wkly comp to win fa cup fina...,1
3,ham,u dun say so early hor u c already then say,0
4,ham,nah i don t think he goes to usf he lives arou...,0
5,spam,freemsg hey there darling it s been 3 week s n...,1
6,ham,even my brother is not like to speak with me t...,0
7,ham,as per your request melle melle oru minnaminun...,0
8,spam,winner as a valued network customer you have b...,1
9,spam,had your mobile 11 months or more u r entitled...,1


In [46]:
# Randomly shuffle the records in the dataset to avoid bias
df = df.sample(frac=1, random_state=1)
df.head(10)

Unnamed: 0,label,sms_message,label_binary
1078,ham,yep by the pretty sculpture,0
4028,ham,yes princess are you going to make me moan,0
958,ham,welp apparently he retired,0
4642,ham,havent,0
4674,ham,i forgot 2 ask ü all smth there s a card on da...,0
5461,ham,ok i thk i got it then u wan me 2 come now or wat,0
4210,ham,i want kfc its tuesday only buy 2 meals only 2...,0
4216,ham,no dear i was sleeping p,0
1603,ham,ok pa nothing problem,0
1504,ham,ill be there on lt gt ok,0


In [47]:
# Split into training and test sets
training_test_index = round(len(df) * 0.8)

training = df[:training_test_index].reset_index(drop=True)
test = df[training_test_index:].reset_index(drop=True)

print('-- Training set stats --')
print(training.shape)
print(training['label_binary'].value_counts())
print('-- Test set stats --')
print(test.shape)
print(test['label_binary'].value_counts())

-- Training set stats --
(4458, 3)
label_binary
0    3858
1     600
Name: count, dtype: int64
-- Test set stats --
(1114, 3)
label_binary
0    967
1    147
Name: count, dtype: int64


In [48]:
### creating vocabulary from training data
training['sms_message'] = training['sms_message'].str.split()
vocabulary = []
for sms in training['sms_message']:
   for word in sms:
      vocabulary.append(word)
vocabulary = list(set(vocabulary))  ### only count the number of unique words
print(len(vocabulary))
vocabulary[0:9]

7780


['statement',
 'smith',
 'jungle',
 'violence',
 'itz',
 'rd',
 '01223585334',
 'margin',
 '1680']

In [49]:
word_counts_per_sms = {unique_word: [0] * len(training['sms_message']) for unique_word in vocabulary}

for index, sms in enumerate(training['sms_message']):
   for word in sms:
      word_counts_per_sms[word][index] += 1
word_counts = pd.DataFrame(word_counts_per_sms)
word_counts

Unnamed: 0,statement,smith,jungle,violence,itz,rd,01223585334,margin,1680,pounded,...,pobox75ldns7,citylink,favor,tenants,box95qu,lololo,test,stalk,orchard,3ss
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4453,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4454,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4455,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4456,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [50]:
training_new = pd.concat([training, word_counts], axis=1)
training_new.head()

Unnamed: 0,label,sms_message,label_binary,statement,smith,jungle,violence,itz,rd,01223585334,...,pobox75ldns7,citylink,favor,tenants,box95qu,lololo,test,stalk,orchard,3ss
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [51]:
# Run a baseline model evaluation
# Set all 'predicted to 0 or 1 randomly to get a baseline (coin-flip)
test['predicted'] = np.random.randint(0, 2, size=len(test))
test['predicted'].value_counts()

Unnamed: 0_level_0,count
predicted,Unnamed: 1_level_1
0,557
1,557


In [52]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Accuracy score: {}'.format(accuracy_score(test['label_binary'], test['predicted'])))
print('Precision score: {}'.format(precision_score(test['label_binary'], test['predicted'])))
print('Recall score: {}'.format(recall_score(test['label_binary'], test['predicted'])))
print('F1 score: {}'.format(f1_score(test['label_binary'], test['predicted'])))

Accuracy score: 0.5008976660682226
Precision score: 0.13285457809694792
Recall score: 0.5034013605442177
F1 score: 0.21022727272727273


## **Your implementation starts here**.  Make sure your prediction result is saved into the column `test['predicted']` for the evaludation to run automatically.  
**50 points** for successful execution of your code and producing the confusion matrix correctly

In [53]:
# Laplace smoothing
alpha = 1

In [None]:
# Hints:
# Step 1: you need to caculate P(Spam) and P(Ham)
# Step 2: you need to count N_Spam, N_Ham
# Step 3: you need to count the number of times the word w occurs in spam/ham message: N_w_spam, N_w_ham
# Step 4: then you can calculate the prob of occurance of each word:
#         p(w|spam)=(N_w_spam+alpha)/(N_Spam+alpha*N_Vocabulary)
#         p(w|Ham)=(N_w_ham+alpha)/(N_Ham+alpha*N_Vocabulary)
# Step 5: Now perform the prediction on the test dataset messages using the Naiive Bayes method. Store your prediction results (1=spam or 0=ham ) to test['predicted']
# Step 6: Summarize the results in a confusion matrix and print out the four values of the confusion matrix
#         Verify that your printout is consistent with the output from test['label_binary'].value_counts() and test['predicted'].value_counts()

# Your code goes here

In [54]:
# calculate p(ham) p(spam)
p_ham = training_new['label_binary'].value_counts()[0] / len(training_new)
p_spam = training_new['label_binary'].value_counts()[1] / len(training_new)
print('P(Ham): ', p_ham)
print('P(Spam): ', p_spam)

P(Ham):  0.8654104979811574
P(Spam):  0.13458950201884254


In [55]:
# get values counts for ham and spam
n_ham = training_new['label_binary'].value_counts()[0]
n_spam = training_new['label_binary'].value_counts()[1]
print('N(Ham): ', n_ham)
print('N(Spam): ', n_spam)

N(Ham):  3858
N(Spam):  600


In [56]:
# number of occurances for each word
def N_w_spam(word):
  return training_new[training_new['label_binary'] == 1][word].sum()

def N_w_ham(word):
  return training_new[training_new['label_binary'] == 0][word].sum()

In [57]:
# probability of each word
def p_w_spam(word):
  return (N_w_spam(word) + alpha) / (n_spam + alpha * len(vocabulary))

def p_w_ham(word):
  return (N_w_ham(word) + alpha) / (n_ham + alpha * len(vocabulary))

In [61]:
# use prediction on the dataset
for index, row in test.iterrows():
  message = row['sms_message'].split()
  for word in message:
    val_spam = 1
    val_ham = 1
    if word not in vocabulary:
      continue
    elif (N_w_spam(word) + alpha) / (n_spam + len(vocabulary)) == 0 and (N_w_ham(word) + alpha) / (n_ham + len(vocabulary)) == 0:
      continue
    elif (N_w_spam(word) + alpha) / (n_spam + len(vocabulary)) == 0:
      val_ham *= (N_w_ham(word) + alpha) / (n_ham + len(vocabulary))
    elif (N_w_ham(word) + alpha) / (n_ham + len(vocabulary)) == 0:
      val_spam *= (N_w_spam(word) + alpha) / (n_spam + len(vocabulary))
    else:
      val_spam *= (N_w_spam(word) + alpha) / (n_spam + len(vocabulary))
      val_ham *= (N_w_ham(word) + alpha) / (n_ham + len(vocabulary))
  val_spam *= p_spam
  #print(val_spam)
  val_ham *= p_ham
  #print(val_ham)
  #print()
  if val_spam > val_ham:
    test.loc[index, 'predicted'] = 1
  else:
    test.loc[index, 'predicted'] = 0

**Evaluate your implementation** for accuracy, precision, recall and F1_score.  The performance points of your implementation will be calculated automatically.  However, it is only awarded if the predictions are made by a Naive Bayes implementation.

**30 points** for how well your implementation predicts spam.  A correct implementation should achieve an F1 score above 0.90.  
## **DO NOT modify this cell below.**

In [62]:
# Model Evaluation
print('Accuracy score: {}'.format(accuracy_score(test['label_binary'], test['predicted'])))
print('Precision score: {}'.format(precision_score(test['label_binary'], test['predicted'])))
print('Recall score: {}'.format(recall_score(test['label_binary'], test['predicted'])))
my_f1_score = f1_score(test['label_binary'], test['predicted'])
print('F1 score: {}'.format(my_f1_score))
performance_point = round(np.clip((my_f1_score - 0.20) / (0.9-0.20) * 30, 0, 30))
print('Your perforamnce point: {}'.format(performance_point))

Accuracy score: 0.90754039497307
Precision score: 0.94
Recall score: 0.3197278911564626
F1 score: 0.47715736040609136
Your perforamnce point: 12


**Analyze your implementation of the Naive Bayes algorithm:** select an entry from each quadrant of the confusion matrix and show the details of the prediction, i.e., the probability of being a spam or a ham, and all the contributing probabilities.  Discuss why mis-classification ocurrs for the FP and FN examples.

**20 points** for a correct and clear presentation.

In [126]:
# your code goes here
# TP
row = test.loc[(test['label_binary'] == 1) & (test['predicted'] == 1)]
row = pd.DataFrame(row)
message = row.iloc[0]['sms_message'].split()
print('TRUE POSITIVE:')
print(" -Individual Word Weights: ")
for word in message:
  val_spam = 1
  val_ham = 1
  if word not in vocabulary:
    continue
  elif (N_w_spam(word) + alpha) / (n_spam + len(vocabulary)) == 0 and (N_w_ham(word) + alpha) / (n_ham + len(vocabulary)) == 0:
    continue
  elif (N_w_spam(word) + alpha) / (n_spam + len(vocabulary)) == 0:
    val_ham *= (N_w_ham(word) + alpha) / (n_ham + len(vocabulary))
    print(' - ', word, " (ham): ", (N_w_ham(word) + alpha) / (n_ham + len(vocabulary)))
  elif (N_w_ham(word) + alpha) / (n_ham + len(vocabulary)) == 0:
    val_spam *= (N_w_spam(word) + alpha) / (n_spam + len(vocabulary))
    print(' - ', word, " (spam): ", (N_w_spam(word) + alpha) / (n_spam + len(vocabulary)))
  else:
    val_spam *= (N_w_spam(word) + alpha) / (n_spam + len(vocabulary))
    print(' - ', word, " (spam): ", (N_w_spam(word) + alpha) / (n_spam + len(vocabulary)))
    val_ham *= (N_w_ham(word) + alpha) / (n_ham + len(vocabulary))
    print(' - ', word, " (ham): ", (N_w_ham(word) + alpha) / (n_ham + len(vocabulary)))
val_spam *= p_spam
print(" P(spam): ", val_spam)
val_ham *= p_ham
print(" P(ham): ", val_ham)
print("Choice -> spam")


 -Individual Word Weights: 
 -  dear  (spam):  0.0013126491646778043
 -  dear  (ham):  0.007475511256229593
 -  dave  (spam):  0.00023866348448687351
 -  dave  (ham):  8.592541673827119e-05
 -  this  (spam):  0.00847255369928401
 -  this  (ham):  0.017700635848083863
 -  is  (spam):  0.016348448687350834
 -  is  (ham):  0.04983674170819728
 -  your  (spam):  0.025298329355608593
 -  your  (ham):  0.029214641691012202
 -  final  (spam):  0.0011933174224343676
 -  final  (ham):  0.00025777625021481353
 -  notice  (spam):  0.00035799522673031026
 -  notice  (ham):  0.00034370166695308474
 -  to  (spam):  0.0652744630071599
 -  to  (ham):  0.10946898092455748
 -  collect  (spam):  0.002386634844868735
 -  collect  (ham):  0.00025777625021481353
 -  your  (spam):  0.025298329355608593
 -  your  (ham):  0.029214641691012202
 -  4  (spam):  0.012529832935560859
 -  4  (ham):  0.012716961677264134
 -  tenerife  (spam):  0.00047732696897374703
 -  tenerife  (ham):  8.592541673827119e-05
 -  hol

In [127]:
# TN
row = test.loc[(test['label_binary'] == 0) & (test['predicted'] == 0)]
row = pd.DataFrame(row)
message = row.iloc[0]['sms_message'].split()
print('TRUE NEGATIVE:')
print(" -Individual Word Weights: ")
for word in message:
  val_spam = 1
  val_ham = 1
  if word not in vocabulary:
    continue
  elif (N_w_spam(word) + alpha) / (n_spam + len(vocabulary)) == 0 and (N_w_ham(word) + alpha) / (n_ham + len(vocabulary)) == 0:
    continue
  elif (N_w_spam(word) + alpha) / (n_spam + len(vocabulary)) == 0:
    val_ham *= (N_w_ham(word) + alpha) / (n_ham + len(vocabulary))
    print(' - ', word, " (ham): ", (N_w_ham(word) + alpha) / (n_ham + len(vocabulary)))
  elif (N_w_ham(word) + alpha) / (n_ham + len(vocabulary)) == 0:
    val_spam *= (N_w_spam(word) + alpha) / (n_spam + len(vocabulary))
    print(' - ', word, " (spam): ", (N_w_spam(word) + alpha) / (n_spam + len(vocabulary)))
  else:
    val_spam *= (N_w_spam(word) + alpha) / (n_spam + len(vocabulary))
    print(' - ', word, " (spam): ", (N_w_spam(word) + alpha) / (n_spam + len(vocabulary)))
    val_ham *= (N_w_ham(word) + alpha) / (n_ham + len(vocabulary))
    print(' - ', word, " (ham): ", (N_w_ham(word) + alpha) / (n_ham + len(vocabulary)))
val_spam *= p_spam
print(" P(spam): ", val_spam)
val_ham *= p_ham
print(" P(ham): ", val_ham)
print("Choice -> ham")

TRUE NEGATIVE:
 -Individual Word Weights: 
 -  later  (spam):  0.00011933174224343676
 -  later  (ham):  0.008506616257088847
 -  i  (spam):  0.006085918854415275
 -  i  (ham):  0.20639285100532737
 -  guess  (spam):  0.0008353221957040572
 -  guess  (ham):  0.0018044337515036948
 -  i  (spam):  0.006085918854415275
 -  i  (ham):  0.20639285100532737
 -  do  (spam):  0.002863961813842482
 -  do  (ham):  0.027152431689293693
 -  study  (spam):  0.00011933174224343676
 -  study  (ham):  0.0006014779171678983
 -  too  (spam):  0.00023866348448687351
 -  too  (ham):  0.007819212923182677
 P(spam):  3.2121599527170055e-05
 P(ham):  0.006766828949672222
Choice -> ham


In [128]:
# FP
row = test.loc[(test['label_binary'] == 0) & (test['predicted'] == 1)]
row = pd.DataFrame(row)
message = row.iloc[0]['sms_message'].split()
print('FALSE POSITIVE:')
print(" -Individual Word Weights: ")
for word in message:
  val_spam = 1
  val_ham = 1
  if word not in vocabulary:
    continue
  elif (N_w_spam(word) + alpha) / (n_spam + len(vocabulary)) == 0 and (N_w_ham(word) + alpha) / (n_ham + len(vocabulary)) == 0:
    continue
  elif (N_w_spam(word) + alpha) / (n_spam + len(vocabulary)) == 0:
    val_ham *= (N_w_ham(word) + alpha) / (n_ham + len(vocabulary))
    print(' - ', word, " (ham): ", (N_w_ham(word) + alpha) / (n_ham + len(vocabulary)))
  elif (N_w_ham(word) + alpha) / (n_ham + len(vocabulary)) == 0:
    val_spam *= (N_w_spam(word) + alpha) / (n_spam + len(vocabulary))
    print(' - ', word, " (spam): ", (N_w_spam(word) + alpha) / (n_spam + len(vocabulary)))
  else:
    val_spam *= (N_w_spam(word) + alpha) / (n_spam + len(vocabulary))
    print(' - ', word, " (spam): ", (N_w_spam(word) + alpha) / (n_spam + len(vocabulary)))
    val_ham *= (N_w_ham(word) + alpha) / (n_ham + len(vocabulary))
    print(' - ', word, " (ham): ", (N_w_ham(word) + alpha) / (n_ham + len(vocabulary)))
val_spam *= p_spam

print(" P(spam): ", val_spam)
val_ham *= p_ham
print(" P(ham): ", val_ham)
print("Choice -> spam")

FALSE POSITIVE:
 -Individual Word Weights: 
 -  i  (spam):  0.006085918854415275
 -  i  (ham):  0.20639285100532737
 -  liked  (spam):  0.00011933174224343676
 -  liked  (ham):  0.0004296270836913559
 -  the  (spam):  0.018854415274463007
 -  the  (ham):  0.07913730881594776
 -  new  (spam):  0.007398568019093079
 -  new  (ham):  0.004811823337343186
 -  mobile  (spam):  0.011694510739856803
 -  mobile  (ham):  0.0012029558343357965
 P(spam):  0.001573958376831333
 P(ham):  0.0010410506076418804
Choice -> spam


In [129]:
# FN
row = test.loc[(test['label_binary'] == 1) & (test['predicted'] == 0)]
row = pd.DataFrame(row)
message = row.iloc[0]['sms_message'].split()
print('FALSE NEGATIVE:')
print(" -Individual Word Weights: ")
for word in message:
  val_spam = 1
  val_ham = 1
  if word not in vocabulary:
    continue
  elif (N_w_spam(word) + alpha) / (n_spam + len(vocabulary)) == 0 and (N_w_ham(word) + alpha) / (n_ham + len(vocabulary)) == 0:
    continue
  elif (N_w_spam(word) + alpha) / (n_spam + len(vocabulary)) == 0:
    val_ham *= (N_w_ham(word) + alpha) / (n_ham + len(vocabulary))
    print(' - ', word, " (ham): ", (N_w_ham(word) + alpha) / (n_ham + len(vocabulary)))
  elif (N_w_ham(word) + alpha) / (n_ham + len(vocabulary)) == 0:
    val_spam *= (N_w_spam(word) + alpha) / (n_spam + len(vocabulary))
    print(' - ', word, " (spam): ", (N_w_spam(word) + alpha) / (n_spam + len(vocabulary)))
  else:
    val_spam *= (N_w_spam(word) + alpha) / (n_spam + len(vocabulary))
    print(' - ', word, " (spam): ", (N_w_spam(word) + alpha) / (n_spam + len(vocabulary)))
    val_ham *= (N_w_ham(word) + alpha) / (n_ham + len(vocabulary))
    print(' - ', word, " (ham): ", (N_w_ham(word) + alpha) / (n_ham + len(vocabulary)))
val_spam *= p_spam
print(" P(spam): ", val_spam)
val_ham *= p_ham
print(" P(ham): ", val_ham)
print(" Choice -> ham")

FALSE NEGATIVE:
 -Individual Word Weights: 
 -  had  (spam):  0.0013126491646778043
 -  had  (ham):  0.006444406255370338
 -  your  (spam):  0.025298329355608593
 -  your  (ham):  0.029214641691012202
 -  mobile  (spam):  0.011694510739856803
 -  mobile  (ham):  0.0012029558343357965
 -  10  (spam):  0.0026252983293556086
 -  10  (ham):  0.0010311050008592541
 -  mths  (spam):  0.00035799522673031026
 -  mths  (ham):  0.00017185083347654237
 -  update  (spam):  0.0013126491646778043
 -  update  (ham):  0.0004296270836913559
 -  to  (spam):  0.0652744630071599
 -  to  (ham):  0.10946898092455748
 -  latest  (spam):  0.003341288782816229
 -  latest  (ham):  0.00025777625021481353
 -  orange  (spam):  0.0027446300715990453
 -  orange  (ham):  0.0004296270836913559
 -  camera  (spam):  0.002863961813842482
 -  camera  (ham):  0.00025777625021481353
 -  video  (spam):  0.003460620525059666
 -  video  (ham):  0.00025777625021481353
 -  phones  (spam):  0.0009546539379474941
 -  phones  (ham)

For the False Negative and False Positive examples, either there are multiple uses of the same word in the message or the probability for spam vs ham is very close.  Also because the vocabulary is trained only on the training set, there is a lot of missing values in the testing set so this could be a reason for the poor F1 score.

**Your discussion goes here --**

**Bonus for 20 points:**  Use function MultinomialNB (from sklearn.naive_bayes import MultinomialNB) to perform the same classification and evaludate its results.

In [130]:
# your code goes here
from sklearn.naive_bayes import MultinomialNB

In [142]:
### creating vocabulary from ALL data
df['sms_message'] = df['sms_message'].str.split()
vocabulary_full = []
for sms in df['sms_message']:
   for word in sms:
      vocabulary_full.append(word)
vocabulary_full = list(set(vocabulary_full))  ### only count the number of unique words
print(len(vocabulary_full))
vocabulary_full[0:9]

# edit test values
word_counts_per_sms = {unique_word: [0] * len(test['sms_message']) for unique_word in vocabulary_full}

for index, sms in enumerate(test['sms_message']):
   for word in sms:
      word_counts_per_sms[word][index] += 1
word_counts = pd.DataFrame(word_counts_per_sms)
test_new = pd.concat([test, word_counts], axis=1)
test_new.head()

# edit training values
word_counts_per_sms = {unique_word: [0] * len(training['sms_message']) for unique_word in vocabulary_full}

for index, sms in enumerate(training['sms_message']):
   for word in sms:
      word_counts_per_sms[word][index] += 1
word_counts = pd.DataFrame(word_counts_per_sms)
training_new = pd.concat([training, word_counts], axis=1)
training_new.head()


8750


Unnamed: 0,label,sms_message,label_binary,statement,smith,jungle,violence,strokes,itz,rd,...,citylink,tenants,favor,box95qu,lololo,consistently,test,stalk,orchard,3ss
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [147]:
X_train = training_new.drop(['label', 'label_binary', 'sms_message'], axis=1)
y_train = training_new['label_binary']
X_test = test_new.drop(['label', 'label_binary', 'sms_message', 'predicted'], axis=1)
y_test = test_new['label_binary']

model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [148]:
# Model Evaluation
print('Accuracy score: {}'.format(accuracy_score(y_test, y_pred)))
print('Precision score: {}'.format(precision_score(y_test, y_pred)))
print('Recall score: {}'.format(recall_score(y_test, y_pred)))
my_f1_score = f1_score(y_test, y_pred)
print('F1 score: {}'.format(my_f1_score))
performance_point = round(np.clip((my_f1_score - 0.20) / (0.9-0.20) * 30, 0, 30))
print('Your perforamnce point: {}'.format(performance_point))

Accuracy score: 0.9829443447037702
Precision score: 0.9102564102564102
Recall score: 0.9659863945578231
F1 score: 0.9372937293729373
Your perforamnce point: 30


This model performed much better than my own, I made usre to include the entire vocabulary from both the training and testing set.  This might have helped a lot with the F1 score.