In [3]:
#Import section 

import pandas as pd

In [4]:
#1 Data import & Data Preprocessing

#  The data file is expected in the same folder as this notebook; take a look at the raw file before starting the preprocessing steps.
#  **read_csv** with sep='\t' is used here in place of **read_table**
#  (noted as deprecated when this notebook was written — TODO confirm against your pandas version).
#  The file is tab-delimited and has no header row, so **names** supplies the column names.
#  **skipinitialspace** skips spaces that directly follow the delimiter.
#  NOTE(review): default quote handling is in effect, so a stray double quote inside a message could merge rows —
#  confirm the row count against the raw file (quoting=csv.QUOTE_NONE would disable quote handling).

df = pd.read_csv("SMSSpamCollection", 
                 sep='\t',
                 skipinitialspace=True,
                 names=['label','sms_message'])



In [6]:
#Dimensions of the dataset as (rows, columns)
print("The dim of dataset is ",  df.shape)

#Display the first 6 rows of the dataset
df.head(6)

The dim of dataset is  (5572, 2)


Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...


In [7]:
#For ease of computation, convert the labels to binary values, i.e.,
# ham -> 0
# spam -> 1

#Series.map() turns every value that is missing from the mapping into NaN.
#The original mapping only knew 'ham' and 'spam', so running this cell a second
#time (when the column already holds 0/1) wiped every label to NaN.
#Including the already-encoded values makes the cell safe to re-run.
#If the labels still show up as 'NaN' on the first run, check that
#"skipinitialspace=True" is passed to *read_csv* so the raw labels match the keys exactly.

label_codes = {'ham': 0, 'spam': 1, 0: 0, 1: 1}
df['label'] = df['label'].map(label_codes)

#Display the first 5 rows to check that the labels were replaced
df.head()

Unnamed: 0,label,sms_message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
#2. [Exercise] BoW (Bag of words) - Without scikit-learn

# Steps mirrored from CountVectorizer: lower-casing, punctuation handling, tokenization

# 2.1. Convert every document to lower case

documents = ['Hello, how are you!',
             'Win money, win from home.',
             'Call me now.',
             'Hello, Call hello you tomorrow?']

# One lower-cased copy per document, in the same order as `documents`
lower_case_documents = [text.lower() for text in documents]

print(lower_case_documents)


['hello, how are you!', 'win money, win from home.', 'call me now.', 'hello, call hello you tomorrow?']


In [9]:
# 2.2. Strip all punctuation characters

import string

# Translation table that deletes every character listed in string.punctuation;
# built once and reused for each document.
punctuation_remover = str.maketrans('', '', string.punctuation)

sans_case_documents = [text.translate(punctuation_remover) for text in lower_case_documents]

# Show the documents before and after punctuation removal
print(lower_case_documents)
print(sans_case_documents)



['hello, how are you!', 'win money, win from home.', 'call me now.', 'hello, call hello you tomorrow?']
['hello how are you', 'win money win from home', 'call me now', 'hello call hello you tomorrow']


In [10]:
# 2.3. Tokenization

# Split each cleaned document on whitespace into a list of word tokens
preprocessed_documents = [text.split() for text in sans_case_documents]
print(preprocessed_documents)

[['hello', 'how', 'are', 'you'], ['win', 'money', 'win', 'from', 'home'], ['call', 'me', 'now'], ['hello', 'call', 'hello', 'you', 'tomorrow']]


In [11]:
# 2.4. Count how often each word occurs in each document of the document set.

# `Counter` from the Python `collections` library builds a word -> count mapping per token list.

import pprint
from collections import Counter

# One Counter per tokenized document, in document order
frequency_list = [Counter(tokens) for tokens in preprocessed_documents]

pprint.pprint(frequency_list)


[Counter({'hello': 1, 'how': 1, 'are': 1, 'you': 1}),
 Counter({'win': 2, 'money': 1, 'from': 1, 'home': 1}),
 Counter({'call': 1, 'me': 1, 'now': 1}),
 Counter({'hello': 2, 'call': 1, 'you': 1, 'tomorrow': 1})]


In [12]:
#2. BoW (Bag of words) - Using scikit-learn

from sklearn.feature_extraction.text import CountVectorizer 

#Create an instance of CountVectorizer with its default settings.
#As the printed repr shows, the defaults lower-case the text (lowercase=True) and use a
#token pattern that keeps only word-character runs of two or more characters, which drops punctuation.
#English stop words are NOT removed by default (stop_words=None); pass stop_words='english' to enable that.

count_vector_forDoc = CountVectorizer()


print(count_vector_forDoc)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)


In [13]:
# Learn the vocabulary from the example documents
count_vector_forDoc.fit(documents)

print(documents)

# The feature names are the set of words that make up our vocabulary for 'documents'.
# Newer scikit-learn releases expose them through `get_feature_names_out()` and no longer
# provide `get_feature_names()`; older releases only have the latter, so support both.
if hasattr(count_vector_forDoc, 'get_feature_names_out'):
    feature_names = list(count_vector_forDoc.get_feature_names_out())
else:
    feature_names = count_vector_forDoc.get_feature_names()

feature_names

['Hello, how are you!', 'Win money, win from home.', 'Call me now.', 'Hello, Call hello you tomorrow?']


['are',
 'call',
 'from',
 'hello',
 'home',
 'how',
 'me',
 'money',
 'now',
 'tomorrow',
 'win',
 'you']

In [14]:
#Create a matrix with one row per document (4 here) and one column per vocabulary word.
#The value at (row, column) is how many times that column's word occurs in that row's document.
#transform() returns a sparse matrix; toarray() converts it to a dense array so it can be displayed.

doc_array = count_vector_forDoc.transform(documents).toarray()

doc_array

array([[1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 2, 0],
       [0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0],
       [0, 1, 0, 2, 0, 0, 0, 0, 0, 1, 0, 1]], dtype=int64)

In [15]:
# Label each count column with its vocabulary word.
# Newer scikit-learn releases expose the vocabulary through `get_feature_names_out()` and no
# longer provide `get_feature_names()`; older releases only have the latter, so support both.
if hasattr(count_vector_forDoc, 'get_feature_names_out'):
    feature_names = list(count_vector_forDoc.get_feature_names_out())
else:
    feature_names = count_vector_forDoc.get_feature_names()

freq_matrix = pd.DataFrame(
    doc_array,
    columns=feature_names)

freq_matrix

Unnamed: 0,are,call,from,hello,home,how,me,money,now,tomorrow,win,you
0,1,0,0,1,0,1,0,0,0,0,0,1
1,0,0,1,0,1,0,0,1,0,0,2,0
2,0,1,0,0,0,0,1,0,1,0,0,0
3,0,1,0,2,0,0,0,0,0,1,0,1


In [16]:
#End of section 2 (understanding BoW). Let's get back to our original dataset.


>>**# 3**
Split the dataset into a training and testing set by using the train_test_split method in sklearn. Split the data
using the following variables:
* `X_train` is our training data for the 'sms_message' column.
* `Y_train` is our training data for the 'label' column.
* `X_test` is our testing data for the 'sms_message' column.
* `Y_test` is our testing data for the 'label' column.
Print out the number of rows we have in each our training and testing data.

In [26]:
from sklearn.model_selection import train_test_split

# Split messages (features) and labels together so rows stay aligned.
# The return order is (features_train, features_test, labels_train, labels_test).
# No test_size is passed, so the library default split is used (the output below shows 4179 / 1393 rows).
# random_state is fixed so the same split is produced on every run.
X_train, X_test, Y_train, Y_test = train_test_split( df['sms_message'],
                                                     df['label'],
                                                     random_state= 1)


# Report the size of the full dataset and of each partition
print('Number of rows in the total set: {}'.format(df.shape[0]))
print('Number of rows in the training set: {}'.format(X_train.shape[0]))
print('Number of rows in the test set: {}'.format(X_test.shape[0]))


Number of rows in the total set: 5572
Number of rows in the training set: 4179
Number of rows in the test set: 1393


In [35]:
#3.2  Applying bag of words

# Instantiate the CountVectorizer
count_vector = CountVectorizer()

# fit learns the vocabulary (the features) from X_train;
# transform counts the occurrences of each vocabulary word in every row.
training_data = count_vector.fit_transform(X_train)

# Transform the testing data with the already-fitted vectorizer. Note that we do NOT fit on the
# testing data: the vocabulary comes from the training messages only, so the test matrix has the
# same columns as the training matrix and words seen only in X_test are ignored.
testing_data = count_vector.transform(X_test)

# Newer scikit-learn releases expose the vocabulary through `get_feature_names_out()` and no
# longer provide `get_feature_names()`; older releases only have the latter, so support both.
if hasattr(count_vector, 'get_feature_names_out'):
    feature_names = list(count_vector.get_feature_names_out())
else:
    feature_names = count_vector.get_feature_names()

# View the transformed frequency matrix for a single record (row 1 of the training matrix).
training_data_frequency_matrix = pd.DataFrame(
    training_data[1].toarray(),
    columns=feature_names)

# Keep the frame as the last expression of the cell so the notebook actually renders it
# (a bare expression in the middle of a cell is evaluated but never displayed).
training_data_frequency_matrix



##Step 4: Naive Bayes implementation using scikit-learn##

sklearn has several Naive Bayes implementations that we can use, so we do not have to do the math from scratch. We will be using a classifier from sklearn's `sklearn.naive_bayes` module to make predictions on our dataset.

Specifically, we will be using the multinomial Naive Bayes implementation. This particular classifier is suitable for classification with discrete features (such as in our case, word counts for text classification). It takes in integer word counts as its input. On the other hand Gaussian Naive Bayes is better suited for continuous data as it assumes that the input data has a Gaussian(normal) distribution.

In [34]:
# Prior(the probabilities that we are aware of or that is given to us)
# Posterior(the probabilities we are looking to compute using the priors).

'''
We have loaded the training data into the variable 'training_data' and the testing data into the 
variable 'testing_data'.

Import the MultinomialNB classifier and fit the training data into the classifier using fit(). Name your classifier
'naive_bayes'. You will be training the classifier using 'training_data' and 'Y_train' from our split earlier. 
'''


from sklearn.naive_bayes import MultinomialNB
# Create the multinomial Naive Bayes classifier with its default settings
naive_bayes = MultinomialNB()
# Train on the bag-of-words counts of the training messages and their 0/1 labels.
# fit() is the last expression, so the notebook shows the fitted estimator's repr.
naive_bayes.fit(training_data, Y_train)


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [43]:
'''
Now that our algorithm has been trained using the training data set we can now make some predictions on the test data
stored in 'testing_data' using predict(). Save your predictions into the 'predictions' variable.
'''


# Predict a label for every row of the vectorized test messages
predictions = naive_bayes.predict(testing_data)



In [56]:
'''
Compute the accuracy, precision, recall and F1 scores of your model using your test data 'Y_test' and the predictions
you made earlier stored in the 'predictions' variable.
'''

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# The positive class is spam (label 1); ham (label 0) is the negative class.

#accuracy = (right predictions) / (all samples)
print('Accuracy score: ', format(accuracy_score(Y_test, predictions)))


#Precision = (true positives) / (true positives + false positives)
print('Precision score: ', format(precision_score(Y_test, predictions)))

#Recall score aka 'True positive rate'
#Recall = (true positives) / (true positives + false negatives)
print('Recall score: ', format(recall_score(Y_test, predictions)))

#Harmonic mean of precision and recall
#F1 = 2 * (precision * recall) / (precision + recall)
print('F1 score: ', format(f1_score(Y_test, predictions)))


# confusion_matrix puts the true labels on the rows and the predicted labels on the columns,
# both in ascending label order (0 = ham, 1 = spam). The layout is therefore
#   [[true negative, false positive],
#    [false negative, true positive]]
# The legend previously printed the cells in the reverse order, which contradicted the
# precision and recall values reported above.
print('\n\n----confusion_matrix----\n(true negative) (false positive)')
print('(false negative) (true positive)')
print('',confusion_matrix(Y_test,predictions))


Accuracy score:  0.9885139985642498
Precision score:  0.9720670391061452
Recall score:  0.9405405405405406
F1 score:  0.9560439560439562


----confusion_matrix----
(true positive) (false negative)
(false positive) (true negative)
 [[1203    5]
 [  11  174]]
