In [4]:
import string
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import pprint
from collections import Counter
from sklearn.model_selection import train_test_split

In [6]:
# Load the SMS Spam Collection. The file has no header row, so the two
# column names are supplied explicitly; fields are tab-delimited.
df = pd.read_csv(
    'smsspamcollection/SMSSpamCollection',
    sep='\t',
    header=None,
    names=['label', 'message'],
)

In [16]:
# Peek at the first five rows.
df.head(n=5)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
# (rows, columns) of the loaded frame.
df.shape

(5572, 2)

In [10]:
# Count missing values per column.
df.isnull().sum()

class      0
message    0
dtype: int64

In [11]:
# Number of distinct message texts.
df['message'].nunique()

5169

In [12]:
# How often each message text occurs, most frequent first.
df['message'].value_counts()

Sorry, I'll call later                                                                                                                                         30
I cant pick the phone right now. Pls send a message                                                                                                            12
Ok...                                                                                                                                                          10
Okie                                                                                                                                                            4
Your opinion about me? 1. Over 2. Jada 3. Kusruthi 4. Lovable 5. Silent 6. Spl character 7. Not matured 8. Stylish 9. Simple Pls reply..                        4
                                                                                                                                                               ..
No. On the way home. So if n

In [19]:
# Encode the labels numerically: ham -> 0, spam -> 1.
# A plain dict passed to Series.map() turns every value that is not a key
# into NaN, so re-running such a cell on already-encoded labels would wipe
# the column. Passing unknown values through unchanged keeps this cell safe
# to run more than once.
label_codes = {'ham': 0, 'spam': 1}
df['label'] = df['label'].map(lambda value: label_codes.get(value, value))

# Fail loudly if anything other than the two expected classes is present.
assert df['label'].isin([0, 1]).all(), 'unexpected value in label column'

In [19]:
# Show the encoded label column.
df['label']

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: label, Length: 5572, dtype: int64

# Bag of words

In [22]:
# Tiny toy corpus used to walk through bag-of-words by hand.
documents = [
    'Hello, how are you!',
    'Win money, win from home.',
    'Call me now.',
    'Hello, Call hello you tomorrow?',
]

In [24]:
# Step 1: normalise case so 'Hello' and 'hello' count as the same word.
lower_case_documents = [doc.lower() for doc in documents]
print(lower_case_documents)

['hello, how are you!', 'win money, win from home.', 'call me now.', 'hello, call hello you tomorrow?']


In [28]:
# Step 2: strip punctuation. The translation table deletes every character
# listed in string.punctuation.
punctuation_remover = str.maketrans('', '', string.punctuation)
sans_punctuation_documents = [
    doc.translate(punctuation_remover) for doc in lower_case_documents
]
print(sans_punctuation_documents)

['hello how are you', 'win money win from home', 'call me now', 'hello call hello you tomorrow']


In [30]:
# Step 3: tokenise each document on whitespace.
preprocessed_documents = [doc.split() for doc in sans_punctuation_documents]
print(preprocessed_documents)

[['hello', 'how', 'are', 'you'], ['win', 'money', 'win', 'from', 'home'], ['call', 'me', 'now'], ['hello', 'call', 'hello', 'you', 'tomorrow']]


In [36]:
# Step 4: count token occurrences per document.
frequency_list = [Counter(tokens) for tokens in preprocessed_documents]

pprint.pprint(frequency_list)

[Counter({'hello': 1, 'how': 1, 'are': 1, 'you': 1}),
 Counter({'win': 2, 'money': 1, 'from': 1, 'home': 1}),
 Counter({'call': 1, 'me': 1, 'now': 1}),
 Counter({'hello': 2, 'call': 1, 'you': 1, 'tomorrow': 1})]


In [35]:
# Quick illustration of what Counter produces for a list of items.
letters = ['a', 'b', 'c', 'a', 'b', 'b']
letter_counts = Counter(letters)
print(letter_counts)

Counter({'b': 3, 'a': 2, 'c': 1})


In [54]:
# Bag of words with scikit-learn. stop_words='english' drops scikit-learn's
# built-in list of English stop words from the vocabulary.
count_vector = CountVectorizer(stop_words='english')
count_vector.fit(documents)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# convert to a plain list for display.
count_vector.get_feature_names_out().tolist()

['hello', 'home', 'money', 'tomorrow', 'win']

In [55]:
# Encode the toy corpus with the fitted vocabulary and densify the sparse
# result so it can be shown as a regular array / DataFrame.
doc_array = count_vector.transform(documents).toarray()

In [56]:
# Present the counts as a DataFrame: one row per document, one column per
# vocabulary term. get_feature_names_out() replaces get_feature_names(),
# which was removed in scikit-learn 1.2.
frequency_matrix = pd.DataFrame(doc_array, columns=count_vector.get_feature_names_out())
frequency_matrix

Unnamed: 0,hello,home,money,tomorrow,win
0,1,0,0,0,0
1,0,1,1,0,2
2,0,0,0,0,0
3,2,0,0,1,0


In [50]:
# Vocabulary learned by the vectorizer. get_feature_names() was removed in
# scikit-learn 1.2; get_feature_names_out() returns an array, hence tolist().
count_vector.get_feature_names_out().tolist()

['are',
 'call',
 'from',
 'hello',
 'home',
 'how',
 'me',
 'money',
 'now',
 'tomorrow',
 'win',
 'you']

In [51]:
# Display the current value of doc_array.
doc_array

<4x12 sparse matrix of type '<class 'numpy.int64'>'
	with 15 stored elements in Compressed Sparse Row format>

In [20]:
# Split messages and labels into train and test parts. No test_size is
# passed, so scikit-learn's default split applies; random_state pins the
# shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['message'], df['label'], random_state=101
)

print('Number of rows in the total set: {}'.format(len(df)))
print('Number of rows in the training set: {}'.format(len(X_train)))
print('Number of rows in the test set: {}'.format(len(X_test)))

Number of rows in the total set: 5572
Number of rows in the training set: 4179
Number of rows in the test set: 1393


In [8]:
# Learn the vocabulary from the training messages only, then encode both
# splits with it. The test messages are transformed but never fitted, so
# nothing from the test set influences the vocabulary.
count_vector = CountVectorizer()
count_vector.fit(X_train)

training_data = count_vector.transform(X_train)
testing_data = count_vector.transform(X_test)

# Bayes' Rule

In [None]:
# manual

In [67]:
# Bayes' rule by hand.
P_D = 0.01       # P(D)
P_nD = 0.99      # P(~D)
P_Pos_D = 0.9    # P(Pos | D)
P_Neg_nD = 0.9   # P(Neg | ~D)

# Total probability of a positive result:
#   P(Pos) = P(D) * P(Pos|D) + P(~D) * P(Pos|~D),  where P(Pos|~D) = 1 - P(Neg|~D)
# This is the denominator of Bayes' rule: P(D|Pos) = P(D) * P(Pos|D) / P(Pos)
P_Pos_nD = 1 - P_Neg_nD
P_Pos = P_D * P_Pos_D + P_nD * P_Pos_nD
print(f'P(pos)={P_Pos}')

P(pos)=0.10799999999999998


In [1]:
# Scikit learn

In [2]:
from sklearn.naive_bayes import MultinomialNB

In [21]:
# The features are discrete word counts, so the multinomial variant is used.
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data, y_train)

MultinomialNB()

In [22]:
# Predict a label for every held-out message.
predictions = naive_bayes.predict(X=testing_data)

In [15]:
# evaluation

In [16]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [17]:
# Accuracy of the predictions against the true test labels.
accuracy = accuracy_score(y_test, predictions)
print('Accuracy score: ', format(accuracy))

Accuracy score:  0.9849246231155779


In [23]:
# Precision of the predictions against the true test labels.
precision = precision_score(y_test, predictions)
print('Precision score: ', format(precision))

Precision score:  0.9865771812080537


In [24]:
# Recall of the predictions against the true test labels.
recall = recall_score(y_test, predictions)
print('Recall score: ', format(recall))

Recall score:  0.8855421686746988


In [25]:
# F1 score of the predictions against the true test labels.
f1 = f1_score(y_test, predictions)
print('F1 score: ', format(f1))

F1 score:  0.9333333333333333
