In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import string
import re

# Don't collapse Pandas Dataframes:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [2]:
## Data Collection & Preparation:
df = pd.read_csv('spam-labels.csv', encoding='ISO-8859-1')
df = df[['spam', 'text']] # spam = label
df['spam'] = df['spam'].apply(lambda row: True if row == 'spam' else False) # Convert Text Labeled as 'Spam' to True, else False
df['text'] = df['text'].apply(lambda row: row.lower().translate(str.maketrans('', '', string.punctuation))) # Clean Data: lowercase, remove punctuation
df.head()

Unnamed: 0,spam,text
0,False,go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat
1,False,ok lar joking wif u oni
2,True,free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005 text fa to 87121 to receive entry questionstd txt ratetcs apply 08452810075over18s
3,False,u dun say so early hor u c already then say
4,False,nah i dont think he goes to usf he lives around here though


In [3]:
## EDA:
## Spam Examples:
print('Spam Examples:')
for text in df[df['spam'] == True].iloc[:5]['text']:
    print(text)

print('\n')
## Non-Spam Examples:
print('Non-Spam Examples:')
for text in df[df['spam'] == False].iloc[:5]['text']:
    print(text)

Spam Examples:
free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005 text fa to 87121 to receive entry questionstd txt ratetcs apply 08452810075over18s
freemsg hey there darling its been 3 weeks now and no word back id like some fun you up for it still tb ok xxx std chgs to send å£150 to rcv
winner as a valued network customer you have been selected to receivea å£900 prize reward to claim call 09061701461 claim code kl341 valid 12 hours only
had your mobile 11 months or more u r entitled to update to the latest colour mobiles with camera for free call the mobile update co free on 08002986030
six chances to win cash from 100 to 20000 pounds txt csh11 and send to 87575 cost 150pday 6days 16 tsandcs apply reply hl 4 info


Non-Spam Examples:
go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat
ok lar joking wif u oni
u dun say so early hor u c already then say
nah i dont think he goes to usf he lives around here though
even my

In [4]:
#######################################################
## Split Dataset into Training and Validation Datasets:
from sklearn.model_selection import train_test_split

X = df.loc[:, 'text'].values
print('\nFeature Dataset Shape: (instances, features)')
print('Total Dataset shape (X): {0}'.format(X.shape))
print(X)
y = df.loc[:, 'spam'].values.astype(np.float32).ravel() # Outcome = 1/0; Success/Failure
print('\nTrue Outcomes (Examples) shape (y): {0}'.format(y.shape))
print(y)
print('\n')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42) # From Module: sklearn.model_selection; test_size = % allocated to Validation Dataset

print('Training Feature shape (X_train): {0}'.format(X_train.shape))
X_train_df = pd.DataFrame(X_train, columns = ['text'])
print('Training Outcome shape (y_train): {0}'.format(y_train.shape))
y_train_df = pd.DataFrame(y_train, columns = ['spam'])
print('Validation Feature shape (X_test): {0}'.format(X_test.shape))
X_test_df = pd.DataFrame(X_test, columns = ['text'])
print('Validation Outcome shape (y_test): {0}'.format(y_test.shape))
y_test_df = pd.DataFrame(y_test, columns = ['spam'])

training_data = pd.concat([y_train_df, X_train_df], axis=1)

training_data_spam_true = training_data[training_data['spam'] == 1.0]
training_data_spam_false = training_data[training_data['spam'] == 0.0]
# training_data_spam_true.head()
# training_data_spam_false.head()

training_data.head()


Feature Dataset Shape: (instances, features)
Total Dataset shape (X): (5572,)
['go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat'
 'ok lar joking wif u oni'
 'free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005 text fa to 87121 to receive entry questionstd txt ratetcs apply 08452810075over18s'
 ... 'pity  was in mood for that soany other suggestions'
 'the guy did some bitching but i acted like id be interested in buying something else next week and he gave it to us for free'
 'rofl its true to its name']

True Outcomes (Examples) shape (y): (5572,)
[0. 0. 1. ... 0. 0. 0.]


Training Feature shape (X_train): (4736,)
Training Outcome shape (y_train): (4736,)
Validation Feature shape (X_test): (836,)
Validation Outcome shape (y_test): (836,)


Unnamed: 0,spam,text
0,0.0,we regret to inform u that the nhs has made a mistakeu were never actually bornplease report 2 yor local hospital 2b terminatedwe r sorry 4 the inconvenience
1,0.0,4 tacos 1 rajas burrito right
2,0.0,alright babe
3,0.0,dear umma she called me now
4,0.0,well theres a pattern emerging of my friends telling me to drive up and come smoke with them and then telling me that im a weed fiendmake them smoke too muchimpede their doing other things so you see how im hesitant


In [5]:
y_train_df = pd.DataFrame(y_train, columns = ['spam'])
percent_training_data_text_labeled_as_spam_true = y_train_df['spam'].mean()
print('% of text in training outcome data that are labeled as "spam": {0}%'.format(percent_training_data_text_labeled_as_spam_true * 100))


% of text in training outcome data that are labeled as "spam": 13.492397964000702%


In [6]:
# training_data.head()

In [7]:
#######################
## Clean Training Data: (Spam = True)
training_data_spam_true_list = training_data_spam_true['text'].astype(str).tolist() # Turn DataFrame to List
training_data_spam_true_string = ''.join(training_data_spam_true_list) # Combine List Values into a Single String
training_data_spam_true_string = training_data_spam_true_string.translate(str.maketrans('', '', string.punctuation)) ## Remove Punctuation Characters
training_data_spam_true_string = training_data_spam_true_string.replace('\n','') ## Remove Line Breaks
training_data_spam_true_string = re.sub(r'[0-9]+', '', training_data_spam_true_string) ## Remove Numerical Characters
training_data_spam_true_string = training_data_spam_true_string.lower() ## Lowercase Characters

#######################
## Clean Training Data: (Spam = False)
training_data_spam_false_list = training_data_spam_false['text'].astype(str).tolist() # Turn DataFrame to List
training_data_spam_false_string = ''.join(training_data_spam_false_list) # Combine List Values into a Single String
training_data_spam_false_string = training_data_spam_false_string.translate(str.maketrans('', '', string.punctuation)) ## Remove Punctuation Characters
training_data_spam_false_string = training_data_spam_false_string.replace('\n','') ## Remove Line Breaks
training_data_spam_false_string = re.sub(r'[0-9]+', '', training_data_spam_false_string) ## Remove Numerical Characters
training_data_spam_false_string = training_data_spam_false_string.lower() ## Lowercase Characters

In [8]:
############################################
## Get a List of All Text where spam == True
training_data_spam_true_word_list = training_data_spam_true_string.split()
# print(training_data_spam_true_word_list)

#############################################
## Get a List of All Text where spam == False
training_data_spam_false_word_list = training_data_spam_false_string.split()
# print(training_data_spam_false_word_list)

#############################
## Get a List of Common Words where spam == True AND spam == False
training_data_common_word_list = set(training_data_spam_true_word_list).intersection(set(training_data_spam_false_word_list))
# print(training_data_common_word_list)

In [9]:
####################################################
## Create "Bag of Words" (via creating dictionaries)
probability_spam_true_list = []
for common_word in training_data_common_word_list:
    probability_spam_true = training_data_spam_true_word_list.count(common_word) / len(training_data_spam_true_word_list)
    probability_spam_true_list.append(probability_spam_true)

####################################################
## Create "Bag of Words" (via creating dictionaries)
probability_spam_false_list = []
for common_word in training_data_common_word_list:
    probability_spam_false = training_data_spam_false_word_list.count(common_word) / len(training_data_spam_false_word_list)
    probability_spam_false_list.append(probability_spam_false)


In [10]:
word_df = pd.DataFrame(training_data_common_word_list, columns=['word'])
prob_spam_true_df = pd.DataFrame(probability_spam_true_list, columns=['prob spam true'])
prob_spam_false_df = pd.DataFrame(probability_spam_false_list, columns=['prob spam false'])
spam_probabilities_df = pd.concat([word_df, prob_spam_true_df, prob_spam_false_df], axis=1)
spam_probabilities_df.head()

Unnamed: 0,word,prob spam true,prob spam false
0,class,0.000151,0.000531
1,believe,0.000151,0.000266
2,best,0.000602,0.000493
3,same,7.5e-05,0.000607
4,many,0.000151,0.000873


In [11]:
def predict_text_label_via_bag_of_words(list_of_text, df, percent_training_data_text_labeled_as_spam_true):
    valid_words_list = []
    for word in list_of_text:
        # print('word: {0}'.format(word))
        if word in training_data_common_word_list:
            valid_words_list.append(word)
        else:
            pass
            # print("'{0}' is not a valid word".format(word))
    # print('valid words: {0}'.format(valid_words_list))
    
    probability_spam_false_list = []
    probability_spam_true_list = []
    for valid_word in valid_words_list:
        for index, row in df.iterrows():
            word = row['word']
            probability_spam_true = row['prob spam true']
            probability_spam_false = row['prob spam false']
            
            if valid_word == word:
                # print('valid word matched: {0}'.format(valid_word))
                probability_spam_false_list.append(probability_spam_false)
                probability_spam_true_list.append(probability_spam_true)
            else:
                pass
    
    word_df = pd.DataFrame(valid_words_list, columns=['word'])
    probability_spam_true_df = pd.DataFrame(probability_spam_true_list, columns=['prob spam true'])
    probability_spam_false_df = pd.DataFrame(probability_spam_false_list, columns=['prob spam false'])
    predicted_labels_per_word_df = pd.concat([word_df, probability_spam_true_df, probability_spam_false_df], axis=1)
    # print(predicted_labels_per_word_df)
    
    # Calculate spam True/False scores as sum of logs for all probabilities:
    text_spam_true_score = sum([np.log(p) for p in probability_spam_true_list]) + np.log(percent_training_data_text_labeled_as_spam_true)
    text_spam_false_score = sum([np.log(p) for p in probability_spam_false_list]) + np.log(1-percent_training_data_text_labeled_as_spam_true)
    # Label as spam = True if text_spam_true_score >= text_spam_false_score
    spam_score = (text_spam_true_score >= text_spam_false_score)
    # print('Spam = True Score: {0}'.format(text_spam_true_score))
    text_spam_true_score_list = [text_spam_true_score]
    # print('Spam = False Score: {0}'.format(text_spam_false_score))
    text_spam_false_score_list = [text_spam_false_score]
    text = ' '.join(list_of_text) # Combine List Values into a Single String
    # print('Text "{0}" is Spam: {1}'.format(text, spam_score))
    text_spam_score_list = [spam_score]
    text_list = [text]
    predictions_df = pd.DataFrame(text_list, columns=['text'])
    text_spam_score_df = pd.DataFrame(text_spam_score_list, columns=['spam'])
    spam_true_score_df = pd.DataFrame(text_spam_true_score_list, columns=['spam true score'])
    spam_false_score_df = pd.DataFrame(text_spam_false_score_list, columns=['spam false score'])
    predictions_df = pd.concat([predictions_df, text_spam_score_df, spam_true_score_df, spam_false_score_df], axis=1)
    # print(predictions_df)
    return predictions_df


In [12]:
text = 'urgent call this number'
predictions_df = predict_text_label_via_bag_of_words(text.split(), spam_probabilities_df, percent_training_data_text_labeled_as_spam_true)
predictions_df

Unnamed: 0,text,spam,spam true score,spam false score
0,urgent call this number,True,-23.716381,-28.810789


In [13]:
text = 'hey do you want to go a movie tonight'
predictions_df = predict_text_label_via_bag_of_words(text.split(), spam_probabilities_df, percent_training_data_text_labeled_as_spam_true)
predictions_df

Unnamed: 0,text,spam,spam true score,spam false score
0,hey do you want to go a movie tonight,False,-59.250921,-52.449595


In [14]:
text = 'offer for unlimited money call now'
predictions_df = predict_text_label_via_bag_of_words(text.split(), spam_probabilities_df, percent_training_data_text_labeled_as_spam_true)
predictions_df

Unnamed: 0,text,spam,spam true score,spam false score
0,offer for unlimited money call now,True,-36.895945,-42.869068


In [15]:
text = 'are you at class yet'
predictions_df = predict_text_label_via_bag_of_words(text.split(), spam_probabilities_df, percent_training_data_text_labeled_as_spam_true)
predictions_df

Unnamed: 0,text,spam,spam true score,spam false score
0,are you at class yet,False,-35.530141,-29.151471


In [16]:
## Validate
X_test_df = pd.DataFrame(X_test, columns=['text'])
X_test_list = X_test_df['text'].to_list()

validation_predictions_df = pd.DataFrame()
for text in X_test_list:
    predictions_df = predict_text_label_via_bag_of_words(text.split(), spam_probabilities_df, percent_training_data_text_labeled_as_spam_true)
    validation_predictions_df = validation_predictions_df.append(predictions_df)



In [17]:
validation_predictions_df = validation_predictions_df.drop(columns=['spam true score','spam false score'])
validation_predictions_df.reset_index(drop=True, inplace=True)
validation_predictions_df.head()

Unnamed: 0,text,spam
0,funny fact nobody teaches volcanoes 2 erupt tsunamis 2 arise hurricanes 2 sway aroundn no 1 teaches hw 2 choose a wife natural disasters just happens,True
1,i sent my scores to sophas and i had to do secondary application for a few schools i think if you are thinking of applying do a research on cost also contact joke ogunrinde her school is one me the less expensive ones,False
2,we know someone who you know that fancies you call 09058097218 to find out who pobox 6 ls15hb 150p,False
3,only if you promise your getting out as soon as you can and youll text me in the morning to let me know you made it in ok,False
4,congratulations ur awarded either å£500 of cd gift vouchers free entry 2 our å£100 weekly draw txt music to 87066 tncs wwwldewcom1win150ppmx3age16,True


In [18]:
validation_data = pd.concat([X_test_df, y_test_df], axis=1)
spam_boolean_list = []
for index, row in validation_data.iterrows():
    spam = row['spam']
    if spam == 1.0:
        spam_boolean_list.append(True)
    else:
        spam_boolean_list.append(False)

validation_data = validation_data.drop(columns=['spam'])
spam_boolean_df = pd.DataFrame(spam_boolean_list, columns=['spam'])
validation_data = pd.concat([validation_data, spam_boolean_df], axis=1)

In [19]:
spam_prediction_true_df = validation_predictions_df[(validation_predictions_df['spam'] == True)]
spam_prediction_false_df = validation_predictions_df[(validation_predictions_df['spam'] == False)]
validation_data_true_df = validation_data[(validation_data['spam'] == True)]
validation_data_false_df = validation_data[(validation_data['spam'] == False)]

validation_predictions_df.compare(validation_data, keep_equal=False)


true_positives_df = pd.merge(validation_data_true_df, spam_prediction_true_df, how='left', on='text') # text = spam; prediction = spam
false_positives_df = pd.merge(spam_prediction_true_df, validation_data_false_df, how='left', on='text') # text = spam; prediction = spam
true_negatives_df = pd.merge(validation_data_false_df, spam_prediction_false_df, how='left', on='text') # text = spam; prediction = spam
false_negatives_df = pd.merge(spam_prediction_false_df, validation_data_true_df, how='left', on='text') # text = spam; prediction = spam

# true_positives_count = true_positives_df['text'].count()
# false_positives_count = false_positives_df['text'].count()
# true_negatives_count = true_negatives_df['text'].count()
# false_negatives_count = false_negatives_df['text'].count()

true_positives_count = true_positives_df[(true_positives_df['spam_y'] == True)]['spam_y'].count()
false_positives_count = false_positives_df[(false_positives_df['spam_y'] == False)]['spam_y'].count()
true_negatives_count = true_negatives_df[(true_negatives_df['spam_y'] == False)]['spam_y'].count()
false_negatives_count = false_negatives_df[(false_negatives_df['spam_y'] == True)]['spam_y'].count()

print('TP: {0}'.format(true_positives_count))
print('FP: {0}'.format(false_positives_count))
print('TN: {0}'.format(true_negatives_count))
print('FP: {0}'.format(false_negatives_count))


TP: 76
FP: 22
TN: 648
FP: 6


In [20]:
recall = true_positives_count / (true_positives_count + false_negatives_count)
precision = true_positives_count / (true_positives_count + false_positives_count)

In [21]:
print('Spam Detection Recall: {0}%'.format(recall)) # Recall = TP / (TP + FN)
print('Spam Detection Precision: {0}%'.format(precision)) # Precision = TP / (TP + FP)

Spam Detection Recall: 0.926829268292683%
Spam Detection Precision: 0.7755102040816326%
