In [65]:
import pandas as pd
import numpy as np
import re

"""Importing dataset and randomising"""
# Load the tab-separated SMS corpus. The file has no header row, so the two
# columns are named explicitly: the label ('Class') and the message text ('SMS').
dataset = pd.read_csv('messages', sep='\t', header=None, names=['Class', 'SMS'])
# Shuffle every row (frac=1). The fixed seed keeps the train/test split, and
# therefore the reported accuracy, identical across kernel restarts.
dataset = dataset.sample(frac=1, random_state=42)

"""Divide Dataset"""
def divideTable(dataset):
    """Separate a two-column table into its text and label columns.

    The columns are selected by position: column 0 holds the labels and
    column 1 holds the messages.

    Returns
    -------
    tuple
        ``(messages, labels)`` -- note the order is swapped relative to the
        column order of ``dataset``.
    """
    labels = dataset.iloc[:, 0]
    texts = dataset.iloc[:, 1]
    return texts, labels

"""Split function"""
def split(dataset,trainSize):
    """Cut ``dataset`` positionally into a leading and a trailing part.

    Parameters
    ----------
    dataset : pandas.Series or pandas.DataFrame
        Object supporting ``.iloc`` slicing.
    trainSize : int
        Number of leading rows that go into the first part.

    Returns
    -------
    tuple
        ``(first trainSize rows, remaining rows)``; the original index labels
        are kept on both parts.
    """
    head = dataset.iloc[:trainSize]
    tail = dataset.iloc[trainSize:]
    return head, tail

"""Removing Punctuations and making letters lowercase"""
def remPuncAndLower(messages):
    """Replace every non-word character with a space and lowercase the text.

    Parameters
    ----------
    messages : pandas.Series of str
        Raw message texts.

    Returns
    -------
    pandas.Series
        A new Series of the same length; each non-word character (regex
        ``\\W``) becomes a single space and all letters are lowercased.
    """
    # regex=True is required: pandas >= 2.0 treats the pattern as a literal
    # string by default, which would leave all punctuation untouched. The raw
    # string avoids the invalid '\W' escape sequence warning.
    cleaned = messages.str.replace(r'\W', ' ', regex=True)
    return cleaned.str.lower()

"""Splitting the messages and making a set of unigrams"""
def makeUnigrams(messages):
    """Tokenise each message on whitespace and collect the vocabulary.

    Parameters
    ----------
    messages : pandas.Series of str
        Message texts (expected to be already cleaned).

    Returns
    -------
    tuple
        ``(tokenised, unigramList)`` where ``tokenised`` is a Series holding
        one list of tokens per message and ``unigramList`` is the list of
        distinct tokens seen across all messages.
    """
    tokenised = messages.str.split()
    vocabulary = set()
    for tokens in tokenised:
        vocabulary.update(tokens)
    # Sorted so the vocabulary (and the column order of anything built from
    # it) is the same on every run; plain set iteration order for strings
    # varies between interpreter sessions.
    return tokenised, sorted(vocabulary)

"""Calculating frequencies of each unigram"""
def makeFreqUni(unigramList,messages,trainSize):
    """Build a per-message word-count table.

    Parameters
    ----------
    unigramList : list of str
        Vocabulary; each entry becomes one column of the result.
    messages : pandas.Series
        One list of tokens per message, read positionally.
    trainSize : int
        Number of rows allocated for every column.

    Returns
    -------
    pandas.DataFrame
        ``trainSize`` rows by one column per vocabulary word; cell ``[r, w]``
        is how many times word ``w`` occurs in message ``r``.
    """
    # One zero-filled counter list per vocabulary word.
    counts = {token: [0] * trainSize for token in unigramList}

    for row in range(len(messages)):
        for token in messages.iloc[row]:
            counts[token][row] += 1

    return pd.DataFrame(counts)

"""Calculating conditional Probabilities"""
def calcProb(data,unigramList):
    """Compute, for every vocabulary word, the fraction of rows containing it.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose columns from position 2 onward line up, in order, with
        ``unigramList`` (the first two columns are skipped).
    unigramList : list of str
        Vocabulary words, in the same order as the indicator columns.

    Returns
    -------
    dict
        Maps each word to ``column sum / number of rows in data``.
    """
    totalRows = data.shape[0]
    condProb = dict.fromkeys(unigramList, 0)

    # Word columns start at position 2, after the two leading columns.
    for column, token in enumerate(unigramList, start=2):
        condProb[token] = data.iloc[:, column].sum() / totalRows

    return condProb
    

"""Classifying test message"""
def classify(testMsg,probSpam,probHam,probCondSpam,probCondHam):
    """Label a single message as ``'spam'`` or ``'ham'``.

    The message is cleaned the same way as the training text (non-word
    characters replaced by spaces, lowercased, split on whitespace). Each
    class score starts at its prior and is multiplied by the conditional
    probability of every token found in that class's table.

    Parameters
    ----------
    testMsg : str
        Raw message text.
    probSpam, probHam : float
        Prior probabilities of the two classes.
    probCondSpam, probCondHam : dict
        Word -> conditional probability for the respective class.

    Returns
    -------
    str
        ``'spam'`` if the spam score is strictly greater than the ham score,
        otherwise ``'ham'`` (ties go to ``'ham'``).
    """
    tokens = re.sub(r'\W', ' ', testMsg).lower().split()

    # Use the tables passed in as arguments; the previous version read the
    # notebook globals condProbSpam/condProbHam and ignored its parameters.
    for word in tokens:
        # Words missing from a table are skipped for that class. A stored
        # probability of 0 collapses the class score to 0 (no smoothing).
        if word in probCondSpam:
            probSpam *= probCondSpam[word]
        if word in probCondHam:
            probHam *= probCondHam[word]

    return 'spam' if probSpam > probHam else 'ham'

In [66]:
"""Splitting into training and test set"""
# The first 80% of the shuffled rows are used for training, the rest for testing.
trainSize = int(0.8 * len(dataset))
testSize = len(dataset) - trainSize
x, y = divideTable(dataset)
x_train, x_test = split(x, trainSize)
y_train, y_test = split(y, trainSize)

# Clean the training text, then tokenise it and collect the vocabulary.
x_train, unigramList = makeUnigrams(remPuncAndLower(x_train))

# Renumber both training Series from 0 before building the per-message
# word-count table, which addresses messages by position.
x_train, y_train = (part.reset_index(drop=True) for part in (x_train, y_train))
freqUnigram = makeFreqUni(unigramList, x_train, trainSize)

In [67]:
# Turn word counts into presence flags: 1 if the word occurs in the message
# at least once, otherwise 0. Converting through a plain array drops the
# column labels, so the rebuilt frame has default integer column names.
freqUnigram = np.asarray(freqUnigram)
freqUnigram = pd.DataFrame((freqUnigram >= 1).astype(freqUnigram.dtype))

In [68]:
"""Re-Joining dataset to unigram"""
# Put labels, tokenised messages and the presence flags side by side, so the
# first two columns are 'Class' and 'SMS' and the word flags start at position 2.
x_train, y_train = x_train.reset_index(drop=True), y_train.reset_index(drop=True)
freqUnigram = pd.concat((y_train, x_train, freqUnigram), axis=1)

# Partition the combined table by label.
classColumn = freqUnigram['Class']
spam = freqUnigram[classColumn == 'spam']
ham = freqUnigram[classColumn == 'ham']
del classColumn

In [69]:
"""Calculating vals"""
# Renumber each class subset from 0.
spam, ham = spam.reset_index(drop=True), ham.reset_index(drop=True)

# Class priors: share of training rows carrying each label.
probSpam, probHam = len(spam) / len(freqUnigram), len(ham) / len(freqUnigram)

# Per-class fraction of messages containing each vocabulary word.
condProbSpam, condProbHam = calcProb(spam, unigramList), calcProb(ham, unigramList)


In [70]:
# Classify every held-out message, in test-set order.
y_pred = [
    classify(message, probSpam, probHam, condProbSpam, condProbHam)
    for message in x_test
]

In [71]:
# Count the predictions that match the held-out labels, pairing them by
# position, then report the hit rate as a percentage of the test-set size.
accuracy = sum(
    1 for predicted, actual in zip(y_pred, y_test) if predicted == actual
)
print("The accuracy of the model is : ",accuracy/testSize * 100,"%")

The accuracy of the model is :  95.02712477396021 %
