In [76]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split

In [77]:
# Load the tab-separated message file (it has no header row, so column names
# are supplied) and encode the text labels numerically: ham -> 0, spam -> 1.
dataset = (
    pd.read_csv("./SpamMessageData.csv", sep='\t', names=['label', 'message'])
    .assign(label=lambda frame: frame['label'].map({'ham': 0, 'spam': 1}))
)
dataset.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [78]:
# Splitting Dataset
# Hold out 20% of the rows for evaluation. random_state pins the shuffle so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    dataset['message'],
    dataset['label'],
    test_size=0.2,
    random_state=1,
)

# Report the sizes of the full set and of each partition.
print('Number of rows in the total set: {}'.format(dataset.shape[0]))
print('Number of rows in the training set: {}'.format(X_train.shape[0]))
print('Number of rows in the test set: {}'.format(X_test.shape[0]))

Number of rows in the total set: 5572
Number of rows in the training set: 4457
Number of rows in the test set: 1115


In [79]:
# Bag-of-words vectorizer, created with scikit-learn's default settings.
# It is fitted on the training messages in the next cell.
count_vector = CountVectorizer()

In [80]:
# Learn the vocabulary from the training messages and count token occurrences.
train_counts_sparse = count_vector.fit_transform(X_train)
training_data = train_counts_sparse.toarray()

# Re-use the already-fitted vocabulary for the test messages. The vectorizer is
# deliberately NOT re-fitted here, so the test set cannot influence the features.
test_counts_sparse = count_vector.transform(X_test)
testing_data = test_counts_sparse.toarray()

In [81]:
# Label every count column with the vocabulary token it represents.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
if hasattr(count_vector, 'get_feature_names_out'):
    feature_names = count_vector.get_feature_names_out()
else:
    feature_names = count_vector.get_feature_names()

# One row per training message, one column per token.
frequency_matrix = pd.DataFrame(training_data, columns=feature_names)
frequency_matrix.head()

Unnamed: 0,00,000,008704050406,0121,01223585236,01223585334,0125698789,02,0207,02072069400,...,zed,zeros,zhong,zindgi,zoe,zoom,zouk,zyada,èn,〨ud
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [82]:
# Show the dense test-set count matrix; NumPy abbreviates large arrays with "...".
testing_data

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [83]:
# Create the classifier with a fixed seed, then train it on the token counts.
clf = LogisticRegression(random_state=0)
clf.fit(training_data, y_train);  # trailing ';' keeps the cell output empty

In [84]:
# Predict a 0/1 label for every held-out message.
pred = clf.predict(X=testing_data)

In [85]:
# Print each scalar metric for the held-out predictions, in a fixed order.
for metric_label, metric_fn in (
    ('Accuracy score: ', accuracy_score),
    ('Precision score: ', precision_score),
    ('Recall score: ', recall_score),
    ('F1 score: ', f1_score),
):
    print(metric_label, format(metric_fn(y_test, pred)))

# Finish with the raw confusion matrix computed from the same labels and predictions.
print('\nConfusion Matrix :\n', confusion_matrix(y_test, pred))

Accuracy score:  0.989237668161435
Precision score:  0.9927007299270073
Recall score:  0.9251700680272109
F1 score:  0.9577464788732395

Confusion Matrix :
 [[967   1]
 [ 11 136]]
