# 4. Assuming a set of documents that need to be classified, use the naïve Bayesian classifier model to perform this task. Built-in classes /API can be used to write the program. Calculate the accuracy precision and recall for your data set.

In [103]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [104]:
df = pd.read_csv('./text_classification.csv')

In [105]:
X, Y = df['Text'], df['label']

x_train, x_test, y_train, y_test = train_test_split(X, Y, shuffle=False)

In [106]:
vocabulary = []
for row in range(x_train.shape[0]):
    vocabulary.extend(x_train.iloc[row].lower().split())
vocabulary = sorted(list(set(vocabulary)))

In [107]:
probPos = y_train[y_train=='pos'].count() / y_train.count()
probNeg = y_train[y_train=='neg'].count() / y_train.count()

In [108]:
textPos = []
for t in x_train[y_train=='pos']:
    textPos.extend(t.lower().split())
textPos = sorted(textPos)
nPos = len(set(textPos))

In [109]:
textNeg = []
for t in x_train[y_train=='neg']:
    textNeg.extend(t.lower().split())
textNeg = sorted(textNeg)
nNeg = len(set(textNeg))

In [110]:
wordProbs = {}
for word in vocabulary:
    positive = (textPos.count(word)+1) / (nPos + len(vocabulary))
    negative = (textNeg.count(word)+1) / (nNeg + len(vocabulary))
    wordProbs.update({word: [positive, negative]})

In [111]:
estimate = []

for row in range(x_test.shape[0]):
    vPositive = probPos
    vNegative = probNeg
    for word in x_test.iloc[row].lower().split():
        if word not in vocabulary:
            continue
        vPositive *= wordProbs[word][0]
        vNegative *= wordProbs[word][1]
    estimate.append('pos' if vPositive >= vNegative else 'neg')

In [113]:
confusion_matrix = pd.DataFrame([[0, 0], [0, 0]], columns=['neg', 'pos'], index=['neg', 'pos'])

for i in range(y_test.shape[0]):
    confusion_matrix.loc[y_test.iloc[i], estimate[i]] += 1

In [114]:
confusion_matrix

Unnamed: 0,neg,pos
neg,1,2
pos,0,2


In [116]:
tn, fp, fn, tp = confusion_matrix.loc['neg', 'neg'], confusion_matrix.loc['neg', 'pos'], \
                confusion_matrix.loc['pos', 'neg'], confusion_matrix.loc['pos', 'pos']

In [117]:
precision = tp / (tp + fp)
recall = tn / (tp + fn)
accuracy = (tp + tn) / (tp + tn + fp + fn)

In [118]:
print("Precision: ", precision)
print("Recall: ", recall)
print("Accuracy: ", accuracy)

Precision:  0.5
Recall:  0.5
Accuracy:  0.6


-----

In [94]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

msg = pd.read_csv('./text_classification.csv')
msg['lablenum'] = msg.label.map({'pos':1, 'neg':0})
X = msg.Text
Y = msg.lablenum

x_train, x_test, y_train, y_test = train_test_split(X, Y)
cv = CountVectorizer()
xtrain_dtm = cv.fit_transform(x_train)
xtest_dtm = cv.transform(x_test)
print(cv.get_feature_names_out())
data = pd.DataFrame(xtrain_dtm.toarray(), columns=cv.get_feature_names_out())

['about' 'am' 'amazing' 'an' 'and' 'awesome' 'bad' 'beers' 'best' 'boss'
 'dance' 'do' 'donot' 'enemy' 'feel' 'good' 'great' 'he' 'holiday'
 'horrible' 'is' 'juice' 'like' 'locality' 'love' 'my' 'not' 'of' 'place'
 'restaurant' 'sick' 'stay' 'stuff' 'sworn' 'taste' 'that' 'the' 'these'
 'this' 'tired' 'to' 'very' 'what' 'work']


In [95]:
clf = MultinomialNB().fit(xtrain_dtm, y_train)
predicted = clf.predict(xtest_dtm)

In [96]:
metrics.accuracy_score(y_test, predicted)

1.0

In [97]:
metrics.confusion_matrix(y_test, predicted)

array([[2, 0],
       [0, 3]])

In [98]:
metrics.precision_score(y_test, predicted)

1.0

In [99]:
metrics.recall_score(y_test, predicted)

1.0