# Naive Bayesian Classifier Model

Assuming a set of documents that need to be classified, use the naive Bayesian classifier model to perform this task. Built-in Java classes/API can be used to write the program. Calculate the accuracy, precision, and recall for your data set. 

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Load the labelled sentences. Supplying `names` without `header` makes pandas
# treat the first line of the file as data rather than as column titles.
msg = pd.read_csv(
    "naivetext.csv",
    names=['message', 'label'],
)
msg.head()

Unnamed: 0,message,label
0,I love this sandwich,pos
1,This is an amazing place,pos
2,I feel very good about these beers,pos
3,This is my best work,pos
4,What an awesome view,pos


In [4]:
# Encode the text labels as integers for the classifier: pos -> 1, neg -> 0.
# Any label missing from the mapping becomes NaN.
msg['labelnum'] = msg['label'].map({
    'pos': 1,
    'neg': 0,
})
msg.tail()

Unnamed: 0,message,label,labelnum
13,I am sick and tired of this place,neg,0
14,What a great holiday,pos,1
15,That is a bad locality to stay,neg,0
16,We will have good fun tomorrow,pos,1
17,I went to my enemy's house today,neg,0


In [5]:
# Feature column: the raw sentence text.
x = msg['message']
x

0                      I love this sandwich
1                  This is an amazing place
2        I feel very good about these beers
3                      This is my best work
4                      What an awesome view
5             I do not like this restaurant
6                  I am tired of this stuff
7                    I can't deal with this
8                      He is my sworn enemy
9                       My boss is horrible
10                 This is an awesome place
11    I do not like the taste of this juice
12                          I love to dance
13        I am sick and tired of this place
14                     What a great holiday
15           That is a bad locality to stay
16           We will have good fun tomorrow
17         I went to my enemy's house today
Name: message, dtype: object

In [6]:
# Target column: the integer-encoded sentiment label (1 = pos, 0 = neg).
y = msg['labelnum']
y

0     1
1     1
2     1
3     1
4     1
5     0
6     0
7     0
8     0
9     0
10    1
11    0
12    1
13    0
14    1
15    0
16    1
17    0
Name: labelnum, dtype: int64

In [7]:
# Hold out 25% of the sentences for evaluation. random_state pins the shuffle so
# the split, and every metric computed from it, is identical on each run.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.25, random_state=42)

In [8]:
count_vect = CountVectorizer()

# Learn the vocabulary from the training sentences only, then encode the test
# sentences with that same vocabulary (words unseen in training are dropped).
xtrain_dtm = count_vect.fit_transform(xtrain)
xtest_dtm = count_vect.transform(xtest)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. tolist() prints plain Python strings as before.
print(count_vect.get_feature_names_out().tolist())

['about', 'amazing', 'an', 'awesome', 'bad', 'beers', 'boss', 'can', 'deal', 'do', 'enemy', 'feel', 'fun', 'good', 'great', 'have', 'he', 'holiday', 'horrible', 'house', 'is', 'juice', 'like', 'locality', 'love', 'my', 'not', 'of', 'place', 'restaurant', 'sandwich', 'stay', 'sworn', 'taste', 'that', 'the', 'these', 'this', 'to', 'today', 'tomorrow', 'very', 'view', 'we', 'went', 'what', 'will', 'with']


In [9]:
# Dense view of the training document-term matrix: one row per training
# sentence, one column per vocabulary word, cells hold word counts.
# get_feature_names_out() replaces get_feature_names(), removed in scikit-learn 1.2.
df = pd.DataFrame(xtrain_dtm.toarray(), columns=count_vect.get_feature_names_out())
df.head()

Unnamed: 0,about,amazing,an,awesome,bad,beers,boss,can,deal,do,...,to,today,tomorrow,very,view,we,went,what,will,with
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,1,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [10]:
# Train a multinomial Naive Bayes model on the training word counts,
# then predict labels for the held-out sentences.
mclf = MultinomialNB()
mclf.fit(xtrain_dtm, ytrain)
predicted = mclf.predict(xtest_dtm)
predicted

array([0, 0, 0, 1, 0])

In [17]:
# Evaluate the held-out predictions against the true labels.
# Confusion-matrix rows are true labels and columns are predicted labels.
print("Confucion matrix")
print(metrics.confusion_matrix(ytest,predicted))
print("Accuracy: ",metrics.accuracy_score(ytest,predicted))
# Precision and recall use scikit-learn's default positive class, label 1 (pos).
print("Precision: ",metrics.precision_score(ytest,predicted))
print("Recall: ",metrics.recall_score(ytest,predicted))

Confucion matrix
[[2 0]
 [2 1]]
Accuracy:  0.6
Precision:  1.0
Recall:  0.3333333333333333


In [14]:
# Unseen sentence to classify, kept in a list because it is passed to
# count_vect.transform, which takes a collection of documents.
newText = [
    "my boss is best",
]

In [15]:
# Encode the new sentence with the already-fitted vectorizer (transform, not
# fit_transform, so the vocabulary stays the one built from the training data).
newText_dtm = count_vect.transform(newText)
# Predict with the trained model; per the label mapping, 1 = pos and 0 = neg.
newText_predicted = mclf.predict(newText_dtm)
newText_predicted

array([0])