In [1]:
import numpy as np
import pandas as pd
import string
from collections import Counter
import plotly.graph_objs as go ;from plotly import tools ;
from plotly.offline import iplot,init_notebook_mode
import matplotlib.pyplot as plt
init_notebook_mode(connected=True)

# Read Data

In [2]:
df=pd.read_table("smsspamcollection/SMSSpamCollection",names=["Target","Message"])
df['Message']=df['Message'].str.decode("ascii","ignore")
df['Message']=df['Message'].str.encode("ascii")

df.head(5)

Unnamed: 0,Target,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Split Data to Train and Test

In [3]:
train=df.sample(frac=0.8,random_state=0)
test=df.drop(train.index)

# Pre-Process the message, lower case and punctuation removal

In [4]:
def process_message(message):
    return [x.lower().translate( None,string.punctuation) for x in message.split()] #Return list of processed words

# Construct Bag of Words that counts occurances of each word

In [5]:
word_bag={}
word_bag['Spam']=Counter(reduce(lambda x,y: x+y,train[train['Target']=='spam']['Message'].apply(process_message)))
word_bag['Ham']=Counter(reduce(lambda x,y: x+y,train[train['Target']!='spam']['Message'].apply(process_message)))


# Calculate the probability P(xi|Y)

In [6]:
def word_prob(word,target,alpha):
    prob_word = word_bag[target][word] if word in word_bag[target] else 0
    
    return (float)(prob_word + alpha)/(len(word_bag[target])+20000*alpha)


# Classifiy the message

In [7]:
def classifiy(message,alpha):
    
    message = process_message(message) #process message
    
    Total=(float)(len(word_bag['Spam'])+len(word_bag['Ham'])) #total number of words in bag
    
    #P(w1)*P(w2)*P(w3)...P(w.n)
    Normalizer=np.product([(word_bag['Spam'][word]+word_bag['Ham'][word])/Total for word in message]) 
    
    #P(x1|Y=spam)*P(x2|Y=spam)*P(x3|Y=spam)...P(xn|Y=spam)
    Product_SpamWords_Probs=np.product([word_prob(word,'Spam',alpha) for word in message])
    
    #P(x1|Y=ham)*P(x2|Y=ham)*P(x3|Y=ham)...P(xn|Y=ham)
    Product_HamWords_Probs=np.product([word_prob(word,'Ham',alpha) for word in message])
    
    #P(Y=spam)
    Spam_Prob = len(word_bag['Spam'])/Total
    
    #P(Y=ham)
    Ham_Prob = 1 - Spam_Prob
    
    #P(M=spam) = ( (x1|Y=spam)*P(x2|Y=spam)*P(x3|Y=spam)...P(xn|Y=spam) * P(Y=spam) ) / P(w1)*P(w2)*P(w3)...P(w.n)
    message_spam_prob = (Product_SpamWords_Probs * Spam_Prob)/Normalizer
    
    #P(M=ham) = ( P(x1|Y=ham)*P(x2|Y=ham)*P(x3|Y=ham)...P(xn|Y=ham) * #P(Y=ham) ) / P(w1)*P(w2)*P(w3)...P(w.n)
    message_ham_prob = (Product_HamWords_Probs * Ham_Prob)/Normalizer
    
    return "spam" if message_spam_prob > message_ham_prob else "ham" #Return the class of the greater probability 

# Classifier Performance Measures

In [8]:
def classifier_performance(data):
    
    measure={}
    Positives = data[data['Predict']=='spam']
    Negatives = data[data['Predict']!='spam']
    
    measure['True_Positives'] = (Positives['Target'] == Positives['Predict']).sum()
    measure['False_Positives'] = (Positives['Target'] != Positives['Predict']).sum()
    measure['True_Negatives'] = (Negatives['Target'] == Negatives['Predict']).sum()
    measure['False_Negatives'] = (Negatives['Target'] != Negatives['Predict']).sum()
    
    measure['Precision'] = float(measure['True_Positives'])/(measure['True_Positives']+measure['False_Positives'])
    measure['Recall'] = float(measure['True_Positives'])/(measure['True_Positives']+measure['False_Negatives']) 
    measure['F_Score'] = 2 * (measure['Precision']*measure['Recall']/(measure['Precision'] + measure['Recall']))
    measure['Accuracy'] = (data['Target']==data['Predict']).sum()/float(len(data))
    
    return measure

# Predict the test data using our Naive Bayes classifier

In [9]:
test['Predict']=test['Message'].apply(lambda x: classifiy(x,0.1))
test.head(10)

Unnamed: 0,Target,Message,Predict
0,ham,"Go until jurong point, crazy.. Available only ...",ham
7,ham,As per your request 'Melle Melle (Oru Minnamin...,ham
21,ham,Im going to try for 2 months ha ha only joking,ham
24,ham,Ffffffffff. Alright no way I can meet up with ...,ham
25,ham,Just forced myself to eat a slice. I'm really ...,ham
28,ham,"I'm back &amp; we're packing the car now, I'll...",ham
35,ham,Yup... Ok i go home look at the timings then i...,ham
56,spam,Congrats! 1 year special cinema pass for 2 is ...,spam
60,ham,Your gonna have to pick up a $1 burger for you...,ham
62,ham,Its a part of checking IQ,ham


# Confusion Matrix and Accuracy, Precision, Recall and F-Score Measure

In [10]:
from tabulate import tabulate

measures = classifier_performance(test)

print tabulate([["Positive",measures["True_Positives"], measures["False_Positives"]],
                 ["Negative",measures["False_Negatives"], measures["True_Negatives"]]],
               ["Positive", "Negative"], tablefmt="grid")


+----------+------------+------------+
|          |   Positive |   Negative |
| Positive |         55 |          1 |
+----------+------------+------------+
| Negative |         84 |        974 |
+----------+------------+------------+


In [11]:
print tabulate([[str(round(measures["Accuracy"],2)*100)+"%",
                 str(round(measures["Precision"],2)*100)+"%",
                 str(round(measures["Recall"],2)*100)+"%",
                 str(round(measures["F_Score"],2)*100)+"%"]],
               ["Accuracy","Precision","Recall","F_Score"],tablefmt="grid")

+------------+-------------+----------+-----------+
| Accuracy   | Precision   | Recall   | F_Score   |
| 92.0%      | 98.0%       | 40.0%    | 56.0%     |
+------------+-------------+----------+-----------+


# Plot the Accuracy & F-Score using different values of alpha

In [12]:
alphas=[2**i for i in range(-5,1)] #Values of Alpha



Train_Fscores=[]; Test_Fscores=[];  Train_Accuracy=[]; Test_Accuracy=[];

for alpha in alphas:
    
    test['Predict']=test['Message'].apply(lambda x: classifiy(x,alpha)) # Classifiy the Test Data 
    
    measures = classifier_performance(test) # Get the performance measures
    
    #Append Accuracy and Fscore to Test List
    Test_Fscores.append(measures['F_Score'])
    Test_Accuracy.append(measures['Accuracy'])
    
    #Do the same for Train Data
    train['Predict']=train['Message'].apply(lambda x: classifiy(x,alpha))
    
    measures = classifier_performance(train)
    
    Train_Fscores.append(measures['F_Score'])
    Train_Accuracy.append(measures['Accuracy'])
    


trace1 = go.Scatter( x = alphas, y = Train_Fscores, name = "F-Score") #Line Plot for F-Score in Train Data

trace2 = go.Scatter( x = alphas, y = Train_Accuracy, name = "Accuracy") #Line Plot for Accuracy in Train Data

# Customize Layout
layout = go.Layout( title="Train Data Performance",
                    xaxis=dict( title="Alphas", type= "category"),
                    yaxis=dict(range=[0.7,1],tickformat=".2%"))

fig=dict(data=[trace1,trace2],layout=layout)
iplot(fig)

In [13]:
trace1 = go.Scatter( x = alphas, y = Test_Fscores, name = "F-Score") #Line Plot for F-Score in Test Data

trace2 = go.Scatter( x = alphas, y = Test_Accuracy, name = "Accuracy") #Line Plot for Accuracy in Test Data

# Customize Layout
layout = go.Layout( title="Test Data Performance",
                    xaxis=dict( title="Alphas", type= "category"),
                    yaxis=dict(range=[0.3,1],tickformat=".2%"))

fig=dict(data=[trace1,trace2],layout=layout)
iplot(fig)