In [2]:

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import numpy as np

# Dataset from - https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection
df = pd.read_table('smsspamcollection/SMSSpamCollection',
                   sep='\t', 
                   header=None, 
                   names=['label', 'sms_message'])
# Normalizing
df['label'] = df.label.map({'ham':0,'spam':1})

# Dataset from - https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection
df_test = pd.read_table('smsspamcollection/SMSSpamCollection_test',
                   sep='\t', 
                   header=None, 
                   names=['label', 'sms_message'])

# Output printing out first 5 rows
print(df_test.head())
df_test['label'] = df_test.label.map({'ham':0, 'spam':1})
print("Normalizing testing data: SPAM=1, HAM=0")
print(df_test.head())
print("****************************************")
# Create custom test data
# data_custom= np.array(['WINNER As a valued network customer','How are you today?']) 
# X_test_custom = pd.Series(data_custom, index =[100, 101]) 
# data_custom_y= np.array([1, 1])   
# Y_test_custom = pd.Series(data_custom_y, index =[100, 101]) 
X_test_custom = df_test["sms_message"]
Y_test_custom = df_test["label"]
# split into training and testing sets
# USE from sklearn.model_selection import train_test_split to avoid seeing deprecation warning.

X_train, X_test, y_train, y_test = train_test_split(df['sms_message'], 
                                                    df['label'],
                                                    random_state=0)


# print('Number of rows in the total set: {}'.format(df.shape[0]))
print('Number of rows in the training set: {}'.format(X_train.shape[0]))
# print('Number of rows in the test set: {}'.format(X_test.shape[0]))


# Instantiate the CountVectorizer method
count_vector = CountVectorizer()

# Fit the training data and then return the matrix - creating the vocabulary
count_vector.fit(X_train)
# Creating the occurences matrix from of training data
training_data = count_vector.transform(X_train);
# Logging the occurences matrix 
frequency_matrix = pd.DataFrame(training_data.toarray(), columns = count_vector.get_feature_names())
print('********************** Training Data Occurences Matrix *****************************')
print(frequency_matrix)
print('***************************************************')
print('')

# Transform testing data and return the matrix. Note we are not fitting the testing data into the CountVectorizer()
testing_data = count_vector.transform(X_test_custom)
frequency_matrix = pd.DataFrame(testing_data.toarray(), columns = count_vector.get_feature_names())

# Logging the testing data
print('************************ Testing Data ***************************')
print(X_test_custom)
print('***************************************************')

print('*************************Testing data initial assumption: 1 is SPAM, 0 is Not SPAM **************************')
print(Y_test_custom)
print('***************************************************')
print('')

# Creating the instance of naive bayes
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data, y_train)
# Predict the testing data with the current training data
predictions = naive_bayes.predict(testing_data)

print('*********************** Predictions: 1 is SPAM, 0 is Not SPAM****************************')
print(predictions)
for i, val in enumerate(predictions): 
    print('SPAM' if val == 1 else 'NOT SPAM')
    print(X_test_custom[i])
    print('----------------------------------------')
print('***************************************************')
print('')



print('*********************** Accuracy Model ****************************')
print('Accuracy score: ', format(accuracy_score(Y_test_custom, predictions)))
print('Precision score: ', format(precision_score(Y_test_custom, predictions)))
print('Recall score: ', format(recall_score(Y_test_custom, predictions)))
print('F1 score: ', format(f1_score(Y_test_custom, predictions)))



  label                                        sms_message
0   ham                      Ok lar... Joking wif u oni...
1  spam  Free entry in 2 a wkly comp to win FA Cup fina...
2   ham  U dun say so early hor... U c already then say...
3   ham  Nah I don't think he goes to usf, he lives aro...
4  spam  FreeMsg Hey there darling it's been 3 week's n...
Normalizing testing data: SPAM=1, HAM=0
   label                                        sms_message
0      0                      Ok lar... Joking wif u oni...
1      1  Free entry in 2 a wkly comp to win FA Cup fina...
2      0  U dun say so early hor... U c already then say...
3      0  Nah I don't think he goes to usf, he lives aro...
4      1  FreeMsg Hey there darling it's been 3 week's n...
****************************************
Number of rows in the training set: 4179
********************** Training Data Occurences Matrix *****************************
      00  000  000pes  008704050406  0089  0121  01223585334  0125698789  02  \