In [1]:
# In this kernel, we gonna build up a tool to classify the SMS messages into 2 categories: spam or ham!!!
# We're gonna use the Naive Bayesian classifier first, and try to apply the alternatives later.

# Firstly, we import the data using 'pandas', 'numpy', and 'nltk.corpus' packages
import pandas as pd
import numpy as np
from nltk.corpus import stopwords as sw

data = pd.read_csv('spam.csv', encoding='latin-1')

# Since the 3 last column of the data set is empty, we will drop them off
data = data.iloc[:,:2]

# Check the shape of the data
print(data.head())

     v1                                                 v2
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [2]:
# We will do some pre-analysing process, like eliminating the stopwords, eliminating the punctuation, etc.
# Firstly, we need to transform all of the words in every messages to the lower case form. This action helps to avoid the same words
# appearing many times, e.g. 'Analysis', 'analysis', 'analYsis', etc.
data['v2'] = data['v2'].apply(lambda x: ' '.join(i.lower() for i in x.split()))

# Next, remove the punctuation
data['v2'] = data['v2'].str.replace('[^\w\s]', '')

# Lastly, we remove stopwords in each message. To do that, create a list of stop words, which is available in the 'stopwords' package
stop = sw.words('english')
data['v2'] = data['v2'].apply(lambda x: ' '.join(i for i in x.split() if i not in stop))

# So, stop is a list containing stop words in english
print(data.head())

     v1                                                 v2
0   ham  go jurong point crazy available bugis n great ...
1   ham                            ok lar joking wif u oni
2  spam  free entry 2 wkly comp win fa cup final tkts 2...
3   ham                u dun say early hor u c already say
4   ham        nah dont think goes usf lives around though


In [3]:
# Since we have totally 5572 messages, the training set will include around 80% of the data set, which is 4457 messages.
train = data.iloc[:4458]
test = data.iloc[4458:]
print(train.head())

# Then, we compute the number of ham and spam SMS correspondingly
ham = train.loc[:,'v1'][train.loc[:,'v1'] == 'ham'].count()
spam = train.loc[:,'v1'][train.loc[:,'v1'] == 'spam'].count()

     v1                                                 v2
0   ham  go jurong point crazy available bugis n great ...
1   ham                            ok lar joking wif u oni
2  spam  free entry 2 wkly comp win fa cup final tkts 2...
3   ham                u dun say early hor u c already say
4   ham        nah dont think goes usf lives around though


In [4]:
# First analysis, count the number of words in each message
train.loc[:,'count_word'] = train.loc[:,'v2'].apply(lambda x: len(str(x).split(" ")))
print(train.head())

     v1                                                 v2  count_word
0   ham  go jurong point crazy available bugis n great ...          16
1   ham                            ok lar joking wif u oni           6
2  spam  free entry 2 wkly comp win fa cup final tkts 2...          23
3   ham                u dun say early hor u c already say           9
4   ham        nah dont think goes usf lives around though           8


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [5]:
# Then, we count the number of characters
train.loc[:,'count_char'] = train.loc[:,'v2'].apply(lambda x: len(x))
print(train.head())

     v1                                                 v2  count_word  \
0   ham  go jurong point crazy available bugis n great ...          16   
1   ham                            ok lar joking wif u oni           6   
2  spam  free entry 2 wkly comp win fa cup final tkts 2...          23   
3   ham                u dun say early hor u c already say           9   
4   ham        nah dont think goes usf lives around though           8   

   count_char  
0          82  
1          23  
2         135  
3          35  
4          43  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [6]:
# Our target is extracting words in messages and count the frequency of these words appearing in spam and ham SMS.
dic_words_ham = {}
dic_words_spam = {}

# Idea is to loop over rows of the training set, each row is a single message, and we just add the words to the dictionary
for index, i in train.iterrows():
    # To do it, split each word in the message
    sep_words = i['v2'].split(" ")
    for j in sep_words:
        if i['v1'] == 'ham':
            if j in dic_words_ham:
                dic_words_ham[j] += 1
            else:
                dic_words_ham[j] = 1
        else:
            if j in dic_words_spam:
                dic_words_spam[j] += 1
            else:
                dic_words_spam[j] = 1

# Now, we have two dictionaries, one contains the words in spam SMS and the other consists of words in ham SMS.
# Then, we try to combine 2 dictionaries into a sole table. Firstly, we try to create a list of words in both types of SMS
words = list(dic_words_ham.keys());
for i in dic_words_spam.keys():
    if i not in words:
        words.append(i)
        
# Next, we count the frequency of each word in both spam and ham SMS, returna data frame.
spam_freq = []
ham_freq = []

for i in words:
    if i in dic_words_spam.keys():
        spam_freq.append(dic_words_spam[i])
    else:
        spam_freq.append(0)

for i in words:
    if i in dic_words_ham.keys():
        ham_freq.append(dic_words_ham[i])
    else:
        ham_freq.append(0)

In [7]:
# From a data frame by three columns 'words', 'spam_freq' and 'ham_freq'. First, create a dictionary
dic_freq_words = {'words': words, 'spam_freq': spam_freq, 'ham_freq': ham_freq}
freq_words = pd.DataFrame.from_dict(dic_freq_words)

# We want to have one more column that is the total frequency of these words, in both spam and ham SMS
freq_words.loc[:,'total_freq'] = freq_words.loc[:,'spam_freq'] + freq_words.loc[:,'ham_freq']

print(freq_words)

               words  spam_freq  ham_freq  total_freq
0                 go         25       193         218
1             jurong          0         1           1
2              point          0        11          11
3              crazy          3         8          11
4          available          3        12          15
5              bugis          0         6           6
6                  n          6       112         118
7              great          6        79          85
8              world          1        26          27
9                 la          0         5           5
10                 e          6        63          69
11            buffet          0         2           2
12              cine          0         7           7
13               got          3       189         192
14             amore          0         1           1
15               wat          0        76          76
16                ok          5       227         232
17               lar        

In [8]:
# After all materials are prepared, we execute the Naive Bayesian classifier to classify the SMS
# We will compute the posterior probability of ham and spam SMS. To be more concrete, we will compute P(ham|data) and P(spam|data)

# The priors P(ham) and P(spam). These probabilities are straight-forward, since they are just the proportion of ham and spam SMS
prior_ham = ham/(ham+spam)
prior_spam = spam/(ham+spam)

# Compute the likelihoods P(bodytext|ham) and p(bodytext|spam).
# Firstly, number of words in ham and spam category
count_word_ham = 0
count_word_spam = 0

for index, i in enumerate(freq_words.loc[:,'words']):
    if freq_words.loc[index,'ham_freq'] != 0:
        count_word_ham += 1
    elif freq_words.loc[index,'spam_freq'] != 0:
        count_word_spam += 1

# The likelihood P(bodytext|ham) = PRODUCT(P(each_word|ham)). However, there are some words that appear in spam SMS, 
# but not in ham SMS, then P(that_word) = 0. This will make PRODUCT(P(each_word|ham)) = 0. To solve it, we compute P(each_word|ham)
# as (frequency of that word in ham + 1)/(total words in ham + number of distinct words in both categories)

# Number of distinct words: we can easily count them by looking at the 'total_freq' column
count_dist_words = freq_words.loc[:,'total_freq'][freq_words.loc[:,'total_freq'] == 1].count()

# Now, we come back to the original data set to compute the likelihood of each SMS. We have to notice one more thing before computing
# Since we have so many words, and the frequency of a single word is negilible compared to the number of all words, the P(each_word|ham)
# or P(each_word|spam) is prohibitive close to 0, and when the products are taken over each SMS, they could return 0 for long messages.

# To solve this problem, we take logarithm for all probabilities.
posteriors_ham = []
posteriors_spam = []

for i in list(train.loc[:,'v2']):
    aa = str(i).split(' ')
    bb = set(aa)
    freq_word_ham = []
    freq_word_spam = []
    for j in bb:
        index_j = list(freq_words.loc[:,'words']).index(j)
        freq_word_ham.append(np.log((freq_words.loc[index_j, 'ham_freq'] + 1)/(count_word_ham + count_dist_words)))
        freq_word_spam.append(np.log((freq_words.loc[index_j, 'spam_freq'] + 1)/(count_word_spam + count_dist_words)))
    ham = sum(freq_word_ham) + np.log(prior_ham)
    spam = sum(freq_word_spam) + np.log(prior_spam)
    posteriors_ham.append(ham)
    posteriors_spam.append(spam)

In [9]:
# Now, stick these two columns into the original data set (just for the easy observation)
train.loc[:,'post_ham'] = posteriors_ham
train.loc[:,'post_spam'] = posteriors_spam

# It is the last step when we will decide whether a message is spam or ham by compare there posteriors in ham and spam categories
# If posterior_ham > posterior_spam, that message is classified as ham and vice versa.
prediction = []
for index, i in enumerate(train.loc[:,'post_ham']):
    if i > train.loc[index,'post_spam']:
        prediction.append('ham')
    else:
        prediction.append('spam')

train['prediction'] = prediction

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [10]:
# Assess the classifier
# 1. Using the training set
precision = sum(train.loc[:,'v1'] == train.loc[:,'prediction'])/len(train.loc[:,'prediction'])
print(precision)

# It is 99.39% correct, impressive performance of Naive Bayesian classifier!!!

0.993943472409152
