In [1]:
#We are building up a spam filter to classify messages as spam and non spam

#dataset link:https://archive.ics.uci.edu/ml/datasets/sms+spam+collection

In [2]:
#importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
df = pd.read_csv("SMSSpamCollection", sep='\t', header=None, names=['Label','SMS'])

In [4]:
df.shape

(5572, 2)

In [5]:
df.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
df.tail()

Unnamed: 0,Label,SMS
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [7]:
df["Label"].value_counts(normalize=True)

ham     0.865937
spam    0.134063
Name: Label, dtype: float64

In [8]:
df=df.sample(frac=1, random_state=1)
train = df.iloc[:4457,:].reset_index()
test = df.iloc[4457:,:].reset_index()

In [9]:
train.head()

Unnamed: 0,index,Label,SMS
0,1078,ham,"Yep, by the pretty sculpture"
1,4028,ham,"Yes, princess. Are you going to make me moan?"
2,958,ham,Welp apparently he retired
3,4642,ham,Havent.
4,4674,ham,I forgot 2 ask ü all smth.. There's a card on ...


In [10]:
train["Label"].value_counts(normalize=True)

ham     0.86538
spam    0.13462
Name: Label, dtype: float64

In [11]:
test["Label"].value_counts(normalize=True)

ham     0.868161
spam    0.131839
Name: Label, dtype: float64

In [12]:
train["SMS"] = train["SMS"].str.replace('\W', ' ')


In [13]:
train["SMS"] = train["SMS"].str.lower()

In [14]:
train

Unnamed: 0,index,Label,SMS
0,1078,ham,yep by the pretty sculpture
1,4028,ham,yes princess are you going to make me moan
2,958,ham,welp apparently he retired
3,4642,ham,havent
4,4674,ham,i forgot 2 ask ü all smth there s a card on ...
5,5461,ham,ok i thk i got it then u wan me 2 come now or...
6,4210,ham,i want kfc its tuesday only buy 2 meals only ...
7,4216,ham,no dear i was sleeping p
8,1603,ham,ok pa nothing problem
9,1504,ham,ill be there on lt gt ok


In [15]:
train["SMS"]=train["SMS"].str.split()



In [16]:
train["SMS"].head()

0                    [yep, by, the, pretty, sculpture]
1    [yes, princess, are, you, going, to, make, me,...
2                      [welp, apparently, he, retired]
3                                             [havent]
4    [i, forgot, 2, ask, ü, all, smth, there, s, a,...
Name: SMS, dtype: object

In [17]:
vocabulary = []
for i in train["SMS"].values:
    for j in i:
        vocabulary.append(j)
        
vocabulary=set(vocabulary)

In [18]:
vocabulary=list(vocabulary)

In [19]:
word_counts_per_sms = {unique_word: [0] * len(train['SMS']) for unique_word in vocabulary}

for index, sms in enumerate(train['SMS']):
    for word in sms:
        word_counts_per_sms[word][index] += 1

In [20]:
word_count=pd.DataFrame(word_counts_per_sms)

In [21]:
word_count.head()

Unnamed: 0,0,00,000,000pes,008704050406,0089,01223585334,02,0207,02072069400,...,zindgi,zoe,zogtorius,zouk,zyada,é,ú1,ü,〨ud,鈥
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0


In [22]:
final_train=pd.concat([train, word_count])

In [23]:
p_ham=final_train["Label"].value_counts(normalize=True)["ham"]
p_spam=1-p_ham


In [24]:
n_vocabulary = len(vocabulary)
alpha = 1

In [None]:
temp = final_train.loc[final_train["Label"]=='spam','SMS']
k= temp.apply(lambda x: len(x))
n_spam = k.sum()
# https://app.dataquest.io/jupyter/notebooks/notebook/Basics.ipynb?dqMachineIp=10.142.15.202&dqPort=52692&dqToken=b25d3006dc4e2d92f0f12c0aad67be35&dqUsername=865b8080cfa3cf40a32fbe49f6673cbd#

In [None]:
temp_ham = final_train.loc[final_train["Label"]=='ham']
temp_spam = final_train.loc[final_train["Label"]=='spam']
k= temp_ham['SMS'].apply(lambda x: len(x))
n_ham = k.sum()


In [None]:
spam = {unique_word:0 for unique_word in vocabulary}
ham  = {unique_word:0 for unique_word in vocabulary}


# ham_train=final_train[final_train['Label']=='Ham']

In [None]:
# list_sms_spam = spam_train["SMS"].to_list()
# flatten_sms_spam = lambda t: [item for sublist in list_sms_spam for item in sublist]

In [None]:
for word in vocabulary:
    n_word_given_spam=temp_spam[word].sum()
    n_word_given_ham=temp_ham[word].sum()
    
    p_word_given_spam=(n_word_given_spam+alpha)/(n_spam+alpha*n_vocabulary)
    p_word_given_ham=(n_word_given_ham+alpha)/(n_ham+alpha*n_vocabulary)
    spam[word]=p_word_given_spam
    ham[word]=p_word_given_ham

In [None]:
import re

def classify(message):

    message = re.sub('\W', ' ', message)
    message = message.lower()
    message = message.split()
    
    p_words_product_spam=1
    p_words_product_ham=1
    
    
    for word in message:
        if(word in vocabulary):
        
            p_words_product_spam=spam[word]*p_words_product_spam
            p_words_product_ham=ham[word]*p_words_product_ham
        
   

    p_spam_given_message = p_words_product_spam*p_spam
    p_ham_given_message = p_words_product_ham*p_ham
    

    print('P(Spam|message):', p_spam_given_message)
    print('P(Ham|message):', p_ham_given_message)

    if p_ham_given_message >= p_spam_given_message:
        print('Label: Ham')
    elif p_ham_given_message < p_spam_given_message:
        print('Label: Spam')
    

In [None]:
classify('WINNER!! This is the secret code to unlock the money: C3421.')

In [None]:
classify('"Sounds good, Tom, then see u there"')

In [None]:
test['predicted'] = test['SMS'].apply(classify_test_set)
test.head()

In [None]:
correct=0
total=test.shape[0]