In [3]:
import pandas as pd
import numpy as np
import re

"""Importing dataset and randomising"""
dataset = pd.read_csv('messages', sep='\t', header=None, names=['Class', 'SMS'])
dataset = dataset.sample(frac=1)

In [4]:
"""Divide Dataset"""
def divideTable(dataset):
    return dataset.iloc[:,1],dataset.iloc[:,0]

"""Split function"""
def split(dataset,trainSize):
    return dataset.iloc[:trainSize],dataset.iloc[trainSize:]

"""Removing Punctuations and making letters lowercase"""
def remPuncAndLower(messages):
    messages = messages.str.replace('\W', ' ')
    messages = messages.str.lower()
    return messages

"""Splitting the messages and making a set of unigrams"""
def makeUnigrams(messages):
    messages = messages.str.split()
    unigramSet = set()
    for i in messages:
        for word in i:
            unigramSet.add(word)
    unigramList = list(set(list(unigramSet)))
    return messages, unigramList

"""Calculating frequencies of each unigram"""
def makeFreqUni(unigramList,messages,trainSize):
    freqUnigram = dict()
    for i in range(len(unigramList)):
        freqUnigram[unigramList[i]] = ([0]*trainSize)
    
    for i in range(len(messages)):
        msgList = messages.iloc[i]
        for j in msgList:
            freqUnigram[j][i] += 1

    return pd.DataFrame(freqUnigram)

"""Calculating phi"""
def calcPhi(messages,output,label):
    count = 0
    for i in range(len(messages)):
        if output.iloc[i] == label:
            count += 1
    return count/len(messages)

"""calculating nos of words in messages of each class"""
def calcNos(x_train,y_train,freqUnigram):
    dataset2 = pd.concat([y_train,x_train],axis = 1)
    newSet = pd.concat([dataset2,freqUnigram],axis = 1)
    
    spam = newSet[newSet['Class'] == 'spam']
    ham = newSet[newSet['Class'] == 'ham']
    n_words_per_spam_message = spam['SMS'].apply(len)
    n_spam = n_words_per_spam_message.sum()
    n_words_per_ham_message = ham['SMS'].apply(len)
    n_ham = n_words_per_ham_message.sum()
    
    return n_spam,n_ham,spam,ham

"""Classifying test message"""
def classify(testMsg,phiSpam,phiHam,parSpam,parHam):
    testMsg = re.sub('\W', ' ', testMsg)
    testMsg = testMsg.lower().split()
    
    probSpam = phiSpam
    probHam = phiHam
    
    for word in testMsg:
        if word in parSpam:
            probSpam *= parSpam[word]
        if word in parHam:
            probHam *= parHam[word]
    
    if probSpam > probHam:
        return 'spam'
    else:
        return 'ham'

In [5]:
"""Splitting into training and test set"""
trainSize = int(0.8*dataset.shape[0])
testSize = dataset.shape[0] - trainSize
x,y = divideTable(dataset)
x_train, x_test = split(x,trainSize)
y_train, y_test = split(y,trainSize)

In [6]:
"""Cleaning training set"""
x_train = remPuncAndLower(x_train)
x_train, unigramList = makeUnigrams(x_train)

In [7]:
"""Creating dataset of frequency of each unigram"""
x_train = x_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
freqUnigram = makeFreqUni(unigramList,x_train,trainSize)

In [8]:
"""Calculating phi and Nos of words in spam and ham messages"""
phiHam = calcPhi(x_train,y_train,'ham')
phiSpam = calcPhi(x_train,y_train,'spam')
nSpam,nHam,spam,ham = calcNos(x_train,y_train,freqUnigram)
unigramSize = len(unigramList)

In [9]:
"""Calculating Parameters"""
parSpam = {word:0 for word in unigramList}
for word in unigramList:
    nWSpam = spam[word].sum() 
    pWSpam = (nWSpam + 1) / (nSpam + 1*unigramSize)
    parSpam[word] = pWSpam
    
parHam = {word:0 for word in unigramList}
for word in unigramList:
    nWHam = ham[word].sum() 
    pWHam = (nWHam + 1) / (nHam + 1*unigramSize)
    parHam[word] = pWHam

In [10]:
y_pred = list()
for i in range(len(x_test)):
    y_pred.append(classify(x_test.iloc[i],phiSpam,phiHam,parSpam,parHam))

In [11]:
accuracy = 0
for i in range(len(y_pred)):
    if y_pred[i] == y_test.iloc[i]:
        accuracy += 1
print("The accuracy of the model is : ",accuracy/testSize * 100,"%")

The accuracy of the model is :  98.55334538878843 %
