In [1]:
import pandas as pd
# Load the tab-separated SMS corpus. The file has no header row, so the two
# columns are named explicitly.
sms_spam = pd.read_csv(
    "SMSSpamCollection",
    sep='\t',
    header=None,
    names=['Label', 'SMS'],
)
sms_spam.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# List the distinct values found in the 'Label' column.
sms_spam['Label'].unique()

array(['ham', 'spam'], dtype=object)

In [3]:
# Share of each label in the full dataset (fractions rather than raw counts).
sms_spam['Label'].value_counts(normalize=True)

ham     0.865937
spam    0.134063
Name: Label, dtype: float64

Split the data: 80% for training, 20% for testing.

In [4]:
# Row position at which the data is cut: the first 80% of rows go to
# training, the remainder to testing.
train_index = round(0.8 * len(sms_spam))

Randomize the dataset, then split it into a training set and a test set.

In [7]:
# Shuffle every row (frac=1) with a fixed seed so the split is reproducible.
set_random = sms_spam.sample(frac=1, random_state=1)

# The first `train_index` shuffled rows form the training set; the index is
# rebuilt so it starts at 0 again.
training_set = set_random.iloc[:train_index].reset_index(drop=True)
training_set['Label'].value_counts(normalize=True)


ham     0.86541
spam    0.13459
Name: Label, dtype: float64

In [8]:
# Everything after the first `train_index` shuffled rows becomes the test set,
# again with a fresh 0-based index.
test_set = set_random.iloc[train_index:].reset_index(drop=True)
test_set['Label'].value_counts(normalize=True)

ham     0.868043
spam    0.131957
Name: Label, dtype: float64

In [9]:
# Normalise the training messages: replace every non-word character with a
# space, then lower-case the text.
# `regex=True` is passed explicitly because pandas >= 2.0 treats the pattern
# as a literal string by default, which would leave punctuation untouched.
# The raw string avoids the invalid-escape warning that '\W' triggers.
training_set['SMS'] = (
    training_set['SMS']
    .str.replace(r'\W', ' ', regex=True)
    .str.lower()
)
training_set.head()

Unnamed: 0,Label,SMS
0,ham,yep by the pretty sculpture
1,ham,yes princess are you going to make me moan
2,ham,welp apparently he retired
3,ham,havent
4,ham,i forgot 2 ask ü all smth there s a card on ...


In [10]:
# Normalise the test messages the same way as the training messages: replace
# every non-word character with a space, then lower-case the text.
# `regex=True` is passed explicitly because pandas >= 2.0 treats the pattern
# as a literal string by default, which would leave punctuation untouched.
# The raw string avoids the invalid-escape warning that '\W' triggers.
test_set['SMS'] = (
    test_set['SMS']
    .str.replace(r'\W', ' ', regex=True)
    .str.lower()
)
test_set.head()

Unnamed: 0,Label,SMS
0,ham,later i guess i needa do mcat study too
1,ham,but i haf enuff space got like 4 mb
2,spam,had your mobile 10 mths update to latest oran...
3,ham,all sounds good fingers makes it difficult ...
4,ham,all done all handed in don t know if mega sh...


work with training set

In [11]:
# Tokenise each cleaned message into a list of words (split on whitespace).
training_set['SMS'] = training_set['SMS'].str.split()

# Collect the unique words in order of first appearance.
# A companion set gives O(1) membership checks; testing `word not in list`
# rescans the whole list for every token and is quadratic overall.
vocabulary = []
seen_words = set()
for text in training_set['SMS']:
    for word in text:
        if word not in seen_words:
            seen_words.add(word)
            vocabulary.append(word)

Build a dict that maps each unique word to a list holding its frequency in each text.

In [12]:
# One zero-initialised counter list per vocabulary word, with one slot per
# training message.
n_messages = len(training_set['SMS'])
word_count_per_text = {unique_word: [0] * n_messages for unique_word in vocabulary}

# Tally how often each word occurs in each message.
for row_number, tokens in enumerate(training_set['SMS']):
    for token in tokens:
        word_count_per_text[token][row_number] += 1

In [14]:
# Turn the per-word count lists into a table: one column per word, one row
# per training message.
word_set = pd.DataFrame(data=word_count_per_text)
word_set.head()

Unnamed: 0,0,00,000,000pes,008704050406,0089,01223585334,02,0207,02072069400,...,zindgi,zoe,zogtorius,zouk,zyada,é,ú1,ü,〨ud,鈥
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0


In [15]:
# Re-attach the 'Label' and 'SMS' columns by placing the two frames side by side.
training_clean = pd.concat([training_set, word_set], axis='columns')
training_clean.head()

Unnamed: 0,Label,SMS,0,00,000,000pes,008704050406,0089,01223585334,02,...,zindgi,zoe,zogtorius,zouk,zyada,é,ú1,ü,〨ud,鈥
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0


In [16]:
# Split the training rows by class label.
label_column = training_clean["Label"]
spam_text = training_clean[label_column == "spam"]
no_spam_text = training_clean[label_column == "ham"]

In [17]:
# Class priors: P(spam) is the share of training messages labelled spam,
# and P(not spam) is its complement.
prob_spam = spam_text.shape[0] / training_clean.shape[0]
prob_no_spam = 1 - prob_spam

N_Spam is equal to the number of words in all the spam messages 

Laplace smoothing is used with alpha = 1.

In [18]:
# Smoothing constant and vocabulary size used in the probability estimates.
alpha = 1
n_vocabulary = len(vocabulary)

# Total number of words per class: the sum of the token-list lengths.
n_spam = spam_text['SMS'].map(len).sum()
n_no_spam = no_spam_text['SMS'].map(len).sum()

In [19]:
# Smoothed conditional probabilities P(word | class) for every vocabulary word:
#   (count of word in class + alpha) / (words in class + alpha * vocabulary size)
# The denominators do not depend on the word, so they are computed once.
spam_denominator = n_spam + alpha * n_vocabulary
nospam_denominator = n_no_spam + alpha * n_vocabulary

# P(word | spam)
prob_word_given_spam_dict = {
    unique_word: (spam_text[unique_word].sum() + alpha) / spam_denominator
    for unique_word in vocabulary
}

# P(word | not spam)
prob_word_given_nospam_dict = {
    unique_word: (no_spam_text[unique_word].sum() + alpha) / nospam_denominator
    for unique_word in vocabulary
}

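Create a function that calculates the probability of spam and of not spam given the words of a message, then compares the two:
- if P(Spam|words) > P(Not Spam|words) -> spam
- if P(Not Spam|words) > P(Spam|words) -> not spam (ham)
- if the two probabilities are equal, the function reports a tie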


In [32]:
import re

def classify(text):
    """Label a message as 'spam' or 'ham' using the trained Naive Bayes tables.

    The message is cleaned the same way as the training data (non-word
    characters replaced by spaces, lower-cased, split on whitespace). Words
    missing from the probability tables are ignored.

    Scores are accumulated as sums of log-probabilities rather than products
    of probabilities: multiplying many small numbers underflows to 0.0 for
    long messages, which made both scores compare equal.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        'ham' if the not-spam score is higher, 'spam' if the spam score is
        higher, and 'equal proabilities' if the two scores are equal.
    """
    # Imported locally so the function does not depend on another cell's imports.
    from math import log

    def _log(probability):
        # log(0) is undefined; a zero probability maps to -inf so it still
        # loses against any positive probability, as 0.0 did in a product.
        return log(probability) if probability > 0 else float('-inf')

    words = re.sub(r'\W', ' ', text).lower().split()

    # Start from the class priors.
    log_spam_given_words = _log(prob_spam)
    log_nospam_given_words = _log(prob_no_spam)

    for word in words:
        if word in prob_word_given_spam_dict:
            log_spam_given_words += _log(prob_word_given_spam_dict[word])

        if word in prob_word_given_nospam_dict:
            log_nospam_given_words += _log(prob_word_given_nospam_dict[word])

    if log_nospam_given_words > log_spam_given_words:
        return 'ham'
    elif log_nospam_given_words < log_spam_given_words:
        return 'spam'
    else:
        return 'equal proabilities'

now check

In [33]:
# Sanity-check the classifier on a single hand-written message.
classify("Sounds good, then see u there")

'ham'

apply to test set

In [34]:
# Run the classifier over every test message and store the verdicts in a
# new 'Predicted' column.
predictions = test_set['SMS'].apply(classify)
test_set['Predicted'] = predictions
test_set.head(3)

Unnamed: 0,Label,SMS,Predicted
0,ham,later i guess i needa do mcat study too,ham
1,ham,but i haf enuff space got like 4 mb,ham
2,spam,had your mobile 10 mths update to latest oran...,spam


accuracy = number of correctly classified texts / total texts

In [35]:
# Accuracy = correctly classified messages / total messages.
# The two columns are compared element-wise in one vectorised operation
# instead of walking the frame row by row with iterrows().
total = test_set.shape[0]
correct = int((test_set['Label'] == test_set['Predicted']).sum())

accuracy = correct/total
print(accuracy)

0.9874326750448833
