In [23]:
import string

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

In [24]:
# Load the labelled messages from spam.csv (v1 = ham/spam label, v2 = message text).
# NOTE(review): the file is decoded as ISO-8859-1, presumably because it is not
# valid UTF-8 — confirm against the data source.
df = pd.read_csv('spam.csv', encoding='ISO-8859-1')
# Preview the first rows to see which columns came in.
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [25]:
# Keep only the label (v1) and message text (v2) columns, discarding the
# 'Unnamed: 2'..'Unnamed: 4' columns that came in with the CSV.
# Selecting the kept columns by name (instead of dropping the others) makes the
# cell idempotent: re-running it after the extra columns are gone no longer
# raises KeyError. .copy() gives an independent frame so the later column
# assignments do not trigger SettingWithCopyWarning.
df = df[['v1', 'v2']].copy()

In [26]:
# Confirm that only the v1 (label) and v2 (text) columns remain.
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [27]:
# Encode labels as integers: spam -> 1, anything else (ham) -> 0.
# Skipped when v1 is already an integer column, so re-running this cell does not
# compare the encoded 0/1 values against 'spam' and reset every label to 0.
if not pd.api.types.is_integer_dtype(df['v1']):
    df['v1'] = (df['v1'] == 'spam').astype('int64')

In [28]:
# Check that the v1 labels are now 0/1 integers.
df.head()

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [32]:
def text_preprocess(text):
    """Normalise a message for bag-of-words counting.

    Deletes every character listed in ``string.punctuation`` (so "don't"
    becomes "dont"), lower-cases each whitespace-separated token, and joins
    the tokens back together with single spaces.

    Args:
        text: The raw message as a ``str``.

    Returns:
        The cleaned message as a ``str``; empty if no tokens remain.
    """
    # Mapping each punctuation character to None makes translate() delete it.
    punctuation_remover = str.maketrans(dict.fromkeys(string.punctuation))
    stripped = text.translate(punctuation_remover)
    return " ".join(token.lower() for token in stripped.split())

In [33]:
# Clean every message with text_preprocess (punctuation removed, lower-cased).
df['v2'] = df['v2'].apply(text_preprocess)

In [34]:
# Inspect the cleaned message text in v2.
df.head()

Unnamed: 0,v1,v2
0,0,go until jurong point crazy available only in ...
1,0,ok lar joking wif u oni
2,1,free entry in 2 a wkly comp to win fa cup fina...
3,0,u dun say so early hor u c already then say
4,0,nah i dont think he goes to usf he lives aroun...


In [41]:
# Hold out 20% of the messages as a test set; random_state pins the shuffle
# so the same split is produced on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    df['v2'], df['v1'], test_size=0.2, random_state=1
)

# Report the sizes of the full, training and test sets.
print(f'Number of rows in the total set: {len(df)}')
print(f'Number of rows in the training set: {len(X_train)}')
print(f'Number of rows in the test set: {len(X_test)}')

Number of rows in the total set: 5572
Number of rows in the training set: 4457
Number of rows in the test set: 1115


Frequency distribution
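Our objective here is to convert the set of messages into a frequency-distribution matrix: one row per message, one column per word in the vocabulary, and each cell holding the number of times that word occurs in that message.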


In [42]:
# Word-count vectorizer with default settings. CountVectorizer is imported in
# the notebook's top import cell, alongside the other dependencies.
cv = CountVectorizer()

In [43]:
# Display the training messages and their labels (shown as a tuple of two Series).
X_train, Y_train

(1642                             sleeping nt feeling well
 2899     come aftr ltdecimalgt now i m cleaning the house
 480                           almost there see u in a sec
 3485                      yeah probably earlier than that
 157     hello my love what are you doing did you get t...
                               ...                        
 905       hey whats up charles sorry about the late reply
 5192    oh oh den muz change plan liao go back have to...
 3980    huh i cant thk of more oredi how many pages do...
 235            i have printed it oh so ltgt come upstairs
 5157                                 k k sms chat with me
 Name: v2, Length: 4457, dtype: object,
 1642    0
 2899    0
 480     0
 3485    0
 157     0
        ..
 905     0
 5192    0
 3980    0
 235     0
 5157    0
 Name: v1, Length: 4457, dtype: int64)

In [44]:
# Fit the vectorizer on the training messages only and turn them into word counts.
# NOTE(review): .toarray() materialises a dense array; frequency_matrix below is
# built from it, but the classifier may accept the sparse output directly —
# confirm before changing.
training_data = cv.fit_transform(X_train).toarray()

# Encode the test messages with the already-fitted vectorizer (transform, not fit_transform).
testing_data = cv.transform(X_test).toarray()

In [48]:
# Label the training count matrix with the vectorizer's feature names so the
# counts can be inspected column by column.
frequency_matrix = pd.DataFrame(data=training_data, columns=cv.get_feature_names_out())

# Preview the first few rows of the matrix.
frequency_matrix.head()

Unnamed: 0,008704050406,0089my,0121,01223585236,01223585334,0125698789,02,020603,02070836089,02072069400,...,åômorrow,åôrents,ìll,ìï,ìïll,ûthanks,ûªve,ûïharry,ûò,ûówell
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
# Train the model: logistic regression on the training word counts and labels.
clf = LogisticRegression(random_state=0)

# fit() is the last expression, so the fitted estimator is also the cell's output.
clf.fit(training_data, Y_train)

In [50]:
# Predict a label (1 = spam, 0 = ham) for each vectorized test message.
y_pred = clf.predict(testing_data)

In [51]:
# Display the predicted test-set labels.
y_pred

array([0, 0, 0, ..., 0, 0, 0], shape=(1115,))

In [52]:
# Score the predictions against the true test labels, one metric per line.
for metric_name, metric_fn in (
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1', f1_score),
):
    print(f'{metric_name} score: ', format(metric_fn(Y_test, y_pred)))

# With confusion_matrix(Y_test, y_pred), rows are true labels and columns are predicted labels.
print('\nConfusion Matrix :\n', confusion_matrix(Y_test, y_pred))

Accuracy score:  0.9856502242152466
Precision score:  0.9767441860465116
Recall score:  0.9064748201438849
F1 score:  0.9402985074626866

Confusion Matrix :
 [[973   3]
 [ 13 126]]
