In [1]:
## Import Modules:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import string
import re

# Configure Modules:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [2]:
# Functions:
def predict_text_label_via_bag_of_words(list_of_text, df, percent_training_dataset_text_labeled_as_label_true, testing):
    valid_words_list = []
    for word in list_of_text:
        if word in training_dataset_common_word_list:
            valid_words_list.append(word)
        else:
            pass
    
    probability_label_false_list = []
    probability_label_true_list = []
    for valid_word in valid_words_list:
        for index, row in df.iterrows():
            word = row['word']
            probability_label_true = row['prob spam = true']
            probability_label_false = row['prob spam = false']
            
            if valid_word == word:
                probability_label_false_list.append(probability_label_false)
                probability_label_true_list.append(probability_label_true)
            else:
                pass
    
    word_df = pd.DataFrame(valid_words_list, columns=['word'])
    probability_label_true_df = pd.DataFrame(probability_label_true_list, columns=['prob spam = true'])
    probability_label_false_df = pd.DataFrame(probability_label_false_list, columns=['prob spam = false'])
    predicted_labels_per_word_df = pd.concat([word_df, probability_label_true_df, probability_label_false_df], axis=1)
    if testing == True:
        print(predicted_labels_per_word_df)
    else:
        pass
    
    # Calculate spam True/False scores as sum of logs for all probabilities:
    text_label_true_score = sum([np.log(p) for p in probability_label_true_list]) + np.log(percent_training_dataset_text_labeled_as_label_true)
    text_label_false_score = sum([np.log(p) for p in probability_label_false_list]) + np.log(1-percent_training_dataset_text_labeled_as_label_true)
    # Label as spam = True if text_label_true_score >= text_label_false_score
    spam_score = (text_label_true_score >= text_label_false_score)
    if testing == True:
        print('Spam = True Score: {0}'.format(text_label_true_score))
    else:
        pass
    text_label_true_score_list = [text_label_true_score]
    if testing == True:
        print('Spam = False Score: {0}'.format(text_label_false_score))
    else:
        pass
    text_label_false_score_list = [text_label_false_score]
    text = ' '.join(list_of_text) # Combine List Values into a Single String
    if testing == True:
        print('Text "{0}" is Spam: {1}'.format(text, spam_score))
    else:
        pass
    text_label_score_list = [spam_score]
    text_list = [text]
    predictions_df = pd.DataFrame(text_list, columns=['text'])
    text_label_score_df = pd.DataFrame(text_label_score_list, columns=['spam'])
    spam_true_score_df = pd.DataFrame(text_label_true_score_list, columns=['spam true score'])
    spam_false_score_df = pd.DataFrame(text_label_false_score_list, columns=['spam false score'])
    predictions_df = pd.concat([predictions_df, text_label_score_df, spam_true_score_df, spam_false_score_df], axis=1)
    return predictions_df


In [3]:
## Step 1: Define Problem
##########################
## Problem: ...
## Goal: Generate ... Label
## Labels Examples: Binary (True/False), Multiclass (X, Y, Z, etc...), Regression (Numerical Value)
###################################################################################################
## Outputs for Measuring Quality of Model Validation:
### TRUE Positive: Validation Data Label = TRUE;  Machine Learning Label Output = TRUE 
## FALSE Positive: Validation Data Label = FALSE; Machine Learning Label Output = TRUE 
### TRUE Negative: Validation Data Label = FALSE; Machine Learning Label Output = FALSE 
## FALSE Negative: Validation Data Label = TRUE;  Machine Learning Label Output = FALSE 
## ^^^used for measuring the Model's Precision, Accuracy, Recall while validating the Model quality

In [4]:
## Step 2: Collect & Split Collected Dataset; Clean & Normalize Exploratory Dataset
#####################################################
## Feature: Text
## Instance: label
## Label: True/False
df = pd.read_csv('spam-labels.csv', encoding='ISO-8859-1')
df = df[['spam', 'text']] # label = label
df['spam'] = df['spam'].apply(lambda row: True if row == 'spam' else False)
df['text'] = df['text'].apply(lambda row: row.lower().translate(str.maketrans('', '', string.punctuation)))
##############################
## Split the Collected Dataset:
X = df.loc[:, 'text'].values
print('Collected Dataset shape (X): {0} (instances, features)'.format(X.shape))
y = df.loc[:, 'spam'].values.astype(np.float32).ravel() # Outcome = 1/0; Success/Failure
print('Collected Dataset Labels shape (y): {0} (labels)'.format(y.shape))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42) # From Module: sklearn.model_selection; test_size = % allocated to Validation Dataset
# #####################################################
# #### 85-90% of Collected Dataset: Exploratory Dataset
# #####################################################
# ###### Clean and Normalize Exploratory Dataset
########################################################
######## 85-90% of Exploratory Dataset: Training Dataset
print('Exploratory Dataset: Training Dataset Features shape (X_train): {0} (instances, features)'.format(X_train.shape))
X_train_df = pd.DataFrame(X_train, columns = ['text'])
print('Exploratory Dataset: Training Dataset Labels shape (y_train): {0} (labels)'.format(y_train.shape))
y_train_df = pd.DataFrame(y_train, columns = ['spam'])
training_dataset = pd.concat([y_train_df, X_train_df], axis=1)
training_dataset_label_true = training_dataset[training_dataset['spam'] == 1.0]
training_dataset_label_false = training_dataset[training_dataset['spam'] == 0.0]
#######################################################
## Clean & Normalize Exploratory Dataset: (label = TRUE)
training_dataset_label_true_list = training_dataset_label_true['text'].astype(str).tolist() # Turn DataFrame to List
training_dataset_label_true_string = ''.join(training_dataset_label_true_list) # Combine List Values into a Single String
# print(training_dataset_label_true_string)
training_dataset_label_true_string = training_dataset_label_true_string.translate(str.maketrans('', '', string.punctuation)) ## Remove Punctuation Characters
training_dataset_label_true_string = training_dataset_label_true_string.replace('\n','') ## Remove Line Breaks
training_dataset_label_true_string = re.sub(r'[0-9]+', '', training_dataset_label_true_string) ## Remove Numerical Characters
training_dataset_label_true_string = training_dataset_label_true_string.lower() ## Lowercase Characters
stop_words = set(stopwords.words('english'))
training_dataset_label_true_string = word_tokenize(training_dataset_label_true_string)
training_dataset_label_true_string = [word for word in training_dataset_label_true_string if word not in stop_words] ## Remove Stop words
training_dataset_label_true_string = ' '.join(training_dataset_label_true_string)
########################################################
## Clean & Normalize Exploratory Dataset: (label = FALSE)
training_dataset_label_false_list = training_dataset_label_false['text'].astype(str).tolist() # Turn DataFrame to List
training_dataset_label_false_string = ''.join(training_dataset_label_false_list) # Combine List Values into a Single String
training_dataset_label_false_string = training_dataset_label_false_string.translate(str.maketrans('', '', string.punctuation)) ## Remove Punctuation Characters
training_dataset_label_false_string = training_dataset_label_false_string.replace('\n','') ## Remove Line Breaks
training_dataset_label_false_string = re.sub(r'[0-9]+', '', training_dataset_label_false_string) ## Remove Numerical Characters
training_dataset_label_false_string = training_dataset_label_false_string.lower() ## Lowercase Characters
stop_words = set(stopwords.words('english'))
training_dataset_label_false_string = word_tokenize(training_dataset_label_false_string)
training_dataset_label_false_string = [word for word in training_dataset_label_false_string if word not in stop_words] ## Remove Stop words
training_dataset_label_false_string = ' '.join(training_dataset_label_false_string)
############################################
## Get a List of All Text where spam == True
training_dataset_label_true_word_list = training_dataset_label_true_string.split()
#############################################
## Get a List of All Text where spam == False
training_dataset_label_false_word_list = training_dataset_label_false_string.split()
#############################
## Get a List of Common Words where spam == True AND spam == False
training_dataset_common_word_list = set(training_dataset_label_true_word_list).intersection(set(training_dataset_label_false_word_list))
#########################################################
######## 10-15% of Exploratory Dataset: Debugging Dataset
#########################################################
######## 10-15% of Exploratory Dataset: Validation Dataset
print('Exploratory Dataset: Validation Dataset Features shape (X_test): {0} (instances, features)'.format(X_test.shape))
X_test_df = pd.DataFrame(X_test, columns = ['text'])
print('Exploratory Dataset: Validation Dataset Labels shape (y_test): {0} (labels)'.format(y_test.shape))
y_test_df = pd.DataFrame(y_test, columns = ['spam'])
#########################################################
###### 10-15% of Collected Dataset: Final Testing Dataset
#########################################################
print('\nTraining Dataset: (Exploratory Dataset)')
y_train_df = pd.DataFrame(y_train, columns = ['spam'])
percent_training_dataset_text_labeled_as_label_true = y_train_df['spam'].mean()
print('% of Training Dataset Instances Labeled as "label": {0}%'.format(percent_training_dataset_text_labeled_as_label_true * 100))
training_dataset.head(10)

Collected Dataset shape (X): (5572,) (instances, features)
Collected Dataset Labels shape (y): (5572,) (labels)
Exploratory Dataset: Training Dataset Features shape (X_train): (4736,) (instances, features)
Exploratory Dataset: Training Dataset Labels shape (y_train): (4736,) (labels)
Exploratory Dataset: Validation Dataset Features shape (X_test): (836,) (instances, features)
Exploratory Dataset: Validation Dataset Labels shape (y_test): (836,) (labels)

Training Dataset: (Exploratory Dataset)
% of Training Dataset Instances Labeled as "label": 13.492397964000702%


Unnamed: 0,spam,text
0,0.0,we regret to inform u that the nhs has made a mistakeu were never actually bornplease report 2 yor local hospital 2b terminatedwe r sorry 4 the inconvenience
1,0.0,4 tacos 1 rajas burrito right
2,0.0,alright babe
3,0.0,dear umma she called me now
4,0.0,well theres a pattern emerging of my friends telling me to drive up and come smoke with them and then telling me that im a weed fiendmake them smoke too muchimpede their doing other things so you see how im hesitant
5,0.0,after completed degree there is no use in joining finance
6,0.0,k ill take care of it
7,1.0,interflora åòits not too late to order interflora flowers for christmas call 0800 505060 to place your order before midnight tomorrow
8,0.0,sfine anytime all the best with it
9,0.0,will purchase d stuff today and mail to you do you have a po box number


In [5]:
## Step 3: Train Model using the Exploratory Dataset: Training Dataset
#################################
####### Epoch: total number of Instances in the Training Dataset
## Batch Size: total number of Instances from the Training Dataset per Batch
## Iterations: total number of Batches needed to iterate through the Training Dataset (Epoch)
#############################################################################################
## Train Model using the Training Dataset (Exploratory Dataset) (Epoch)
####################################################
## Create "Bag of Words" Dictionary
probability_label_true_list = []
for common_word in training_dataset_common_word_list:
    probability_label_true = training_dataset_label_true_word_list.count(common_word) / len(training_dataset_label_true_word_list)
    probability_label_true_list.append(probability_label_true)
####################################################
## Create "Bag of Words" Dictionary
probability_label_false_list = []
for common_word in training_dataset_common_word_list:
    probability_label_false = training_dataset_label_false_word_list.count(common_word) / len(training_dataset_label_false_word_list)
    probability_label_false_list.append(probability_label_false)
word_df = pd.DataFrame(training_dataset_common_word_list, columns=['word'])
prob_label_true_df = pd.DataFrame(probability_label_true_list, columns=['prob spam = true'])
prob_label_false_df = pd.DataFrame(probability_label_false_list, columns=['prob spam = false'])
label_probabilities_df = pd.concat([word_df, prob_label_true_df, prob_label_false_df], axis=1)
label_probabilities_df.sort_values(['prob spam = true'], ascending=False).head(10)

Unnamed: 0,word,prob spam = true,prob spam = false
352,call,0.031575,0.005776
741,å£,0.024374,0.000128
629,free,0.017948,0.001276
298,txt,0.01407,0.000383
216,u,0.013073,0.025176
159,ur,0.012076,0.006733
382,mobile,0.011522,0.000415
685,text,0.010747,0.001787
626,stop,0.00975,0.000862
470,reply,0.009417,0.00083


In [6]:
## Step 4: Debug & Tune Model
## Debug & Tune Model: Validate Model using the Debugging Dataset
#### Review Machine Learning Label Output vs Debugging Dataset Label
#### IF inspired THEN fix issues (dataset, hyperparameters, etc.)
testing = True
text = 'youre so close to winning free internet for a year'
predictions_df = predict_text_label_via_bag_of_words(text.split(), label_probabilities_df, percent_training_dataset_text_labeled_as_label_true, testing)
predictions_df

    word  prob spam = true  prob spam = false
0  youre          0.000222           0.001276
1  close          0.000443           0.000255
2   free          0.017948           0.001276
3   year          0.000554           0.001053
Spam = True Score: -29.65802642220085
Spam = False Score: -28.601704308913586
Text "youre so close to winning free internet for a year" is Spam: False


Unnamed: 0,text,spam,spam true score,spam false score
0,youre so close to winning free internet for a year,False,-29.658026,-28.601704


In [7]:
## Step 5: Validate Model
## Validate Model using the Validation Dataset
## Review Accuracy, Precision, Recall Metrics
## Do NOT review Review Machine Learning Label Output vs Debugging Dataset Label
testing = False
X_test_df = pd.DataFrame(X_test, columns=['text'])
X_test_list = X_test_df['text'].to_list()

validation_predictions_df = pd.DataFrame()
for text in X_test_list:
    predictions_df = predict_text_label_via_bag_of_words(text.split(), label_probabilities_df, percent_training_dataset_text_labeled_as_label_true, testing)
    validation_predictions_df = validation_predictions_df.append(predictions_df)

validation_predictions_df = validation_predictions_df.drop(columns=['spam true score','spam false score'])
validation_predictions_df.reset_index(drop=True, inplace=True)
validation_predictions_df.head()
    
validation_dataset = pd.concat([X_test_df, y_test_df], axis=1)
label_boolean_list = []
for index, row in validation_dataset.iterrows():
    spam = row['spam']
    if spam == 1.0:
        label_boolean_list.append(True)
    else:
        label_boolean_list.append(False)

validation_dataset = validation_dataset.drop(columns=['spam'])
label_boolean_df = pd.DataFrame(label_boolean_list, columns=['spam'])
validation_dataset = pd.concat([validation_dataset, label_boolean_df], axis=1)

label_prediction_true_df = validation_predictions_df[(validation_predictions_df['spam'] == True)]
label_prediction_false_df = validation_predictions_df[(validation_predictions_df['spam'] == False)]
validation_dataset_true_df = validation_dataset[(validation_dataset['spam'] == True)]
validation_dataset_false_df = validation_dataset[(validation_dataset['spam'] == False)]

validation_predictions_df.compare(validation_dataset, keep_equal=False)

true_positives_df = pd.merge(validation_dataset_true_df, label_prediction_true_df, how='left', on='text') # text = spam; prediction = spam
false_positives_df = pd.merge(label_prediction_true_df, validation_dataset_false_df, how='left', on='text') # text = spam; prediction = spam
true_negatives_df = pd.merge(validation_dataset_false_df, label_prediction_false_df, how='left', on='text') # text = spam; prediction = spam
false_negatives_df = pd.merge(label_prediction_false_df, validation_dataset_true_df, how='left', on='text') # text = spam; prediction = spam

true_positives_count = true_positives_df[(true_positives_df['spam_y'] == True)]['spam_y'].count()
false_positives_count = false_positives_df[(false_positives_df['spam_y'] == False)]['spam_y'].count()
true_negatives_count = true_negatives_df[(true_negatives_df['spam_y'] == False)]['spam_y'].count()
false_negatives_count = false_negatives_df[(false_negatives_df['spam_y'] == True)]['spam_y'].count()

print('TP: {0}'.format(true_positives_count))
print('FP: {0}'.format(false_positives_count))
print('TN: {0}'.format(true_negatives_count))
print('FP: {0}'.format(false_negatives_count))

accuracy = true_positives_count / (true_positives_count + false_positives_count + true_negatives_count + false_negatives_count)
precision = true_positives_count / (true_positives_count + false_positives_count)
recall = true_positives_count / (true_positives_count + false_negatives_count)
print('\nSpam Detection Accuracy: {0}%'.format(accuracy)) # Accuracy = TP / P
print('Spam Detection Precision: {0}%'.format(precision)) # Precision = TP / (TP + FP)
print('Spam Detection Recall: {0}%'.format(recall)) # Recall = TP / (TP + FN)

TP: 77
FP: 31
TN: 639
FP: 5

Spam Detection Accuracy: 0.1023936170212766%
Spam Detection Precision: 0.7129629629629629%
Spam Detection Recall: 0.9390243902439024%


In [8]:
## Step 6: Final Test on Model
## Test Model using the Final Testing Dataset
## Review Machine Learning Label Output vs Debugging Dataset Label
## Review Accuracy, Precision, Recall Metrics
## If the Model Fails, start over

In [9]:
## Step 7: Implementation & User Testing
## Does this model positively impact key metrics?

In [10]:
## Step 8: Tech Debt
## Continue to retest the Model
## (if applicable) Update the Model