In [64]:
import pandas as pd
import numpy as np
import re

"""Importing dataset and randomising"""
dataset = pd.read_csv('messages', sep='\t', header=None, names=['Class', 'SMS'])
dataset = dataset.sample(frac=1)

In [65]:
"""Divide Dataset"""
def divideTable(dataset):
    return dataset.iloc[:,1],dataset.iloc[:,0]

"""Split function"""
def split(dataset,trainSize):
    return dataset.iloc[:trainSize],dataset.iloc[trainSize:]

"""Removing Punctuations and making letters lowercase"""
def remPuncAndLower(messages):
    messages = messages.str.replace('\W', ' ')
    messages = messages.str.lower()
    return messages

"""Splitting the messages and making a set of unigrams"""
def makeUnigrams(messages):
    messages = messages.str.split()
    unigramSet = set()
    for i in messages:
        for word in i:
            unigramSet.add(word)
    unigramList = list(set(list(unigramSet)))
    return messages, unigramList

"""Calculating frequencies of each unigram"""
def makeFreqUni(unigramList,messages,trainSize):
    freqUnigram = dict()
    for i in range(len(unigramList)):
        freqUnigram[unigramList[i]] = ([0]*trainSize)
    
    for i in range(len(messages)):
        msgList = messages.iloc[i]
        for j in msgList:
            freqUnigram[j][i] += 1

    return pd.DataFrame(freqUnigram)

"""Calculating phi"""
def calcPhi(messages,output,label):
    count = 0
    for i in range(len(messages)):
        if output.iloc[i] == label:
            count += 1
    return count/len(messages)

"""calculating mean of all features for a class"""
def calcMean(data):
    meanList = list()
    for i in range(2,data.shape[1]):
        arr = np.array(data.iloc[:,i])
        meanList.append(np.mean(arr))
    return meanList
    
"""Calculatinf standard deviation of all features of a class"""    
def calcStd(data):
    stdList = list()
    for i in range(2,data.shape[1]):
        arr = np.array(data.iloc[:,i])
        stdList.append(np.std(arr))
    return stdList


In [66]:
"""Splitting into training and test set"""
trainSize = int(0.8*dataset.shape[0])
testSize = dataset.shape[0] - trainSize
x,y = divideTable(dataset)
x_train, x_test = split(x,trainSize)
y_train, y_test = split(y,trainSize)

In [67]:
"""Cleaning training set"""
x_train = remPuncAndLower(x_train)
x_train, unigramList = makeUnigrams(x_train)

In [68]:
"""Creating dataset of frequency of each unigram"""
x_train = x_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
freqUnigram = makeFreqUni(unigramList,x_train,trainSize)

In [69]:
"""Seprating sets"""
freqUnigram = pd.concat([y_train,x_train,freqUnigram],axis = 1)
spam = freqUnigram[freqUnigram['Class'] == 'spam']
ham = freqUnigram[freqUnigram['Class'] == 'ham']

In [70]:
"""Obtaining phi values, and mean and standrard deviation of each feature"""
phiSpam = len(spam)/len(y_train)
phiHam = len(ham)/len(y_train)
meanSpam = calcMean(spam)
stdSpam = calcStd(spam)
meanHam = calcMean(ham)
stdHam = calcStd(ham)

In [71]:
"""Calculating conditional probability"""
def prob(unigramList,freqTemp,meanList,stdList):
    condProb = 1
    nfeatures = len(unigramList)
    for i in range(nfeatures):
        numerator = np.exp(-(((freqTemp[i] - meanList[i])**2)/(2*(stdList[i]**2))+0.1))
        denominator = np.sqrt(2*np.pi*(stdList[i]**2)+0.1)
        condProb *= (numerator/denominator)
    return condProb
    

"""Classifying test message"""
def classify(testMsg,unigramList,meanHam,meanSpam,stdHam,stdSpam,phiHam,phiSpam):
    testMsg = re.sub('\W', ' ', testMsg)
    testMsg = testMsg.lower().split()
    
    freqTemp = np.zeros(len(unigramList))
    j = 0
    for i in unigramList:
        if i in testMsg:
            freqTemp[j] += 1
        j += 1
    
    probSpam = (prob(unigramList,freqTemp,meanSpam,stdSpam))*phiSpam
    probHam = (prob(unigramList,freqTemp,meanHam,stdHam))*phiHam
    
    if probSpam > probHam:
        return 'spam'
    else:
        return 'ham'
    


In [72]:
y_pred = list()
for i in range(len(x_test)):
    y_pred.append(classify(x_test.iloc[i],unigramList,meanHam,meanSpam,stdHam,stdSpam,phiHam,phiSpam))

  
  


In [73]:
accuracy = 0
for i in range(len(y_pred)):
    if y_pred[i] == y_test.iloc[i]:
        accuracy += 1
print("The accuracy of the model is : ",accuracy/testSize * 100,"%")

The accuracy of the model is :  85.80470162748644 %
