# Naive Bayes Classifier

In [33]:
import pandas as pd

### Import data

In [34]:
# Read the preprocessed dataset, keep only the text column and the label
# column, and discard every row that has a missing value in either of them.
data = (
    pd.read_csv('preprocessed_data.csv')
    .loc[:, ['text_lemmatized', 'sentiment']]
    .dropna()
)
data

Unnamed: 0,text_lemmatized,sentiment
0,switchfoot httptwitpiccom2y1zl awww bummer sho...,0
1,upset cannot update facebook texting might cry...,0
2,kenichan dived many time ball managed save 50 ...,0
3,whole body feel itchy like fire,0
4,nationwideclass behaving mad cannot see,0
...,...,...
1599995,woke school best feeling ever,1
1599996,thewdbcom cool hear old walt interview ♫ httpb...,1
1599997,ready mojo makeover ask detail,1
1599998,happy 38th birthday boo alll time tupac amaru ...,1


In [35]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer

import numpy as np
import nltk

### Train/test split and TF-IDF

In [36]:
# Fixed seed for the shuffled split. Without it every kernel restart draws a
# different 80/20 partition, so the metrics reported further down cannot be
# reproduced run to run.
RANDOM_STATE = 42

# Hold out 20% of the rows for evaluation; features are the lemmatized text,
# the target is the sentiment label.
X_train, X_test, y_train, y_test = train_test_split(
    data["text_lemmatized"],
    data["sentiment"],
    test_size=0.2,
    shuffle=True,
    random_state=RANDOM_STATE,
)

In [37]:
# Fit the TF-IDF vocabulary and weights on the training texts only, then apply
# that same fitted vectorizer to the test texts so both matrices share columns.
tfidf_vectorizer = TfidfVectorizer(
    use_idf=True,
)
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(raw_documents=X_train)
X_test_vectors_tfidf = tfidf_vectorizer.transform(raw_documents=X_test)

### Naive Bayes Classifier

In [38]:
# Train a multinomial Naive Bayes model on the TF-IDF training matrix.
# fit() returns the estimator, so it can be bound directly; the bare name on
# the last line keeps the estimator repr as the cell's displayed output.
nb_tfidf = MultinomialNB().fit(X_train_vectors_tfidf, y_train)
nb_tfidf

In [39]:
# Predictions for the held-out split: hard labels, plus the probability the
# model assigns to the class in column 1 of predict_proba (used as ROC score).
y_predict = nb_tfidf.predict(X_test_vectors_tfidf)
y_prob = nb_tfidf.predict_proba(X_test_vectors_tfidf)[:, 1]

# Per-class precision / recall / F1, followed by the raw confusion counts.
print(classification_report(y_true=y_test, y_pred=y_predict))
print('Confusion Matrix:', confusion_matrix(y_true=y_test, y_pred=y_predict))

# ROC curve points from the probability scores, then the area under that curve.
fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_prob)
roc_auc = auc(x=fpr, y=tpr)
print('AUC:', roc_auc)

              precision    recall  f1-score   support

           0       0.74      0.80      0.77    160037
           1       0.78      0.72      0.75    159889

    accuracy                           0.76    319926
   macro avg       0.76      0.76      0.76    319926
weighted avg       0.76      0.76      0.76    319926

Confusion Matrix: [[127889  32148]
 [ 44020 115869]]
AUC: 0.8463543682694415


In [40]:
# Wrap the held-out texts in a one-column DataFrame (the Series name becomes
# the column name, the row index is kept) so result columns can be added to it.
test_cases = pd.DataFrame(X_test)
test_cases


Unnamed: 0,text_lemmatized
787779,going school cannot find cellphone tweet ya ge...
619184,aaaaah ready say goodbye maybe check earlier v...
1517648,gotta study english exam wait cupple hour
1134075,bobhinkle thats awesome detect bit proud daddy
688781,ranijoshi want pav bhaji umm u supposed invite...
...,...
1212924,managing twitter account newbie
1283338,maggiechicken sure fine well long wrote someth...
383731,studying examz
972510,omg mom made sum peanut butter cookie today co...


In [41]:
# Vectorize the texts in test_cases with the already-fitted TF-IDF vectorizer.
# test_cases was built from X_test, so these are the same held-out texts that
# were scored into y_predict / y_prob.
test_tfidf = tfidf_vectorizer.transform(raw_documents=test_cases['text_lemmatized'])

# Probability of class 1 (column 1 of predict_proba) and the hard label per row.
prediction_prob = nb_tfidf.predict_proba(test_tfidf)[:, 1]
prediction = nb_tfidf.predict(test_tfidf)

# Attach both results next to the text they were computed from; the arrays are
# in the same row order as test_cases.
test_cases['predict_prob'] = prediction_prob
test_cases['predicted sentiment'] = prediction

test_cases

Unnamed: 0,text_lemmatized,predict_prob,predicted sentiment
787779,going school cannot find cellphone tweet ya ge...,0.294888,0
619184,aaaaah ready say goodbye maybe check earlier v...,0.571556,1
1517648,gotta study english exam wait cupple hour,0.313116,0
1134075,bobhinkle thats awesome detect bit proud daddy,0.737557,1
688781,ranijoshi want pav bhaji umm u supposed invite...,0.624998,1
...,...,...,...
1212924,managing twitter account newbie,0.706753,1
1283338,maggiechicken sure fine well long wrote someth...,0.757351,1
383731,studying examz,0.229714,0
972510,omg mom made sum peanut butter cookie today co...,0.644264,1
