# Naive Bayes Classifier

In [1]:
import pandas as pd

### Import data

In [2]:
# Load the cleaned tweet dataset and discard every row containing a missing value.
# All columns are kept; later cells pick out 'Tweet', 'opinion_2' and 'sentiment_2'.
data = pd.read_csv('../data_cleaning/1000_dataset_cleaned.csv').dropna()
data

Unnamed: 0.1,Unnamed: 0,Tweet,opinion_2,sentiment_2
0,0,"Trying to move with a baby, having you mom liv...",-1,0
1,1,I honestly will probably never leave Washingto...,-1,0
2,2,"""Trying to move to another country is a humbli...",-1,0
3,3,Doubtful..move to another country,1,1
4,4,my brother is convinced that imma move to Toro...,1,1
...,...,...,...,...
1334,1334,the urge to move to another country and start ...,1,1
1335,1335,I have done this but I am not contented I need...,1,1
1336,1336,"the urge to fake your death, move to another c...",1,1
1337,1337,"That sudden urge to quit uni, move to another ...",1,1


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer

import numpy as np
import nltk

## Subjectivity Detection

### Convert Text to TF-IDF

In [4]:
# Inputs are the tweet texts; the target is the 'opinion_2' label
# (reported further down as 1 = OPINIONATED, -1 = NEUTRAL).
X, y = data['Tweet'], data['opinion_2']

In [5]:
# Hold out 25% of the tweets for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [6]:
# Learn the vocabulary and IDF weights from the training tweets only and, in the
# same pass, encode those tweets as a TF-IDF matrix.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_tfidf = tfidf_vectorizer.fit_transform(raw_documents=X_train)

### Naive Bayes Classifier

In [7]:
# Fit a multinomial Naive Bayes model on the TF-IDF training matrix.
# fit() returns the estimator itself, so the chained call binds the fitted model.
nb_subj = MultinomialNB().fit(X_train_tfidf, y_train)
nb_subj

### Results on the Held-Out Test Split (25% of the 1000 Evaluation Dataset)

In [8]:
# Encode the held-out tweets with the vectorizer fitted on the training split
# (transform only -- the vocabulary and IDF weights are not refitted here).
test_tfidf = tfidf_vectorizer.transform(X_test)

# Hard class labels for every test tweet.
prediction = nb_subj.predict(test_tfidf)

# Column 1 of predict_proba, i.e. the probability of the second entry in
# nb_subj.classes_ (label 1, given the -1/1 labels this notebook displays).
prediction_prob = nb_subj.predict_proba(test_tfidf)[:, 1]

In [9]:
# Copy the test labels into a plain NumPy array for the metric functions, then show them.
test_case_ground_truth = np.array(y_test)
test_case_ground_truth

array([ 1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1, -1,  1,  1,  1,  1, -1,
        1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1, -1,  1,  1,  1, -1,  1,
        1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,
        1, -1,  1,  1, -1,  1, -1,  1, -1,  1,  1,  1,  1,  1,  1,  1, -1,
       -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1, -1,  1, -1,
        1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1, -1,  1,  1,  1, -1, -1, -1,  1, -1,  1,  1,
       -1,  1,  1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1, -1, -1, -1,
        1, -1,  1, -1,  1,  1,  1,  1,  1, -1, -1,  1,  1,  1,  1, -1, -1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1, -1,  1, -1,  1, -1,  1,  1,  1,  1, -1,  1, -1,  1,
       -1, -1, -1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,
        1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1, -1,  1,  1, -1,  1,  1,
        1,  1,  1,  1,  1

In [10]:
# Display the labels predicted for the test split so they can be compared by eye
# with the ground-truth array.
prediction

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [11]:
# Print per-class precision/recall/F1 and the confusion matrix for the
# subjectivity classifier on the held-out test labels.
print("Subjectivity Detection Model Evaluation")
print("1: OPINIONATED")
print("-1: NEUTRAL")
# zero_division=0 reports 0.00 for a class that is never predicted (here the
# model predicts only label 1) instead of emitting UndefinedMetricWarning.
print(classification_report(test_case_ground_truth, prediction, zero_division=0))
# Rows are the true labels and columns the predicted labels, in sorted label order.
print('Confusion Matrix:\n', confusion_matrix(test_case_ground_truth, prediction))

Subjectivity Detection Model Evaluation
1: OPINIONATED
-1: NEUTRAL
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00        81
           1       0.76      1.00      0.86       254

    accuracy                           0.76       335
   macro avg       0.38      0.50      0.43       335
weighted avg       0.57      0.76      0.65       335

Confusion Matrix:
 [[  0  81]
 [  0 254]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Polarity Detection

In [12]:
# Keep only the tweets whose 'sentiment_2' is non-zero, leaving the 1 / -1 rows
# for polarity classification (0 presumably marks neutral tweets -- TODO confirm).
data_polarity = data.loc[data['sentiment_2'].ne(0)]
data_polarity

Unnamed: 0.1,Unnamed: 0,Tweet,opinion_2,sentiment_2
3,3,Doubtful..move to another country,1,1
4,4,my brother is convinced that imma move to Toro...,1,1
5,5,The way I want to drop everything and move to ...,1,1
6,6,Uk is such bad vybes that if I wa &gt; 25 with...,1,1
7,7,I say I move to another country /j I cannot,1,1
...,...,...,...,...
1334,1334,the urge to move to another country and start ...,1,1
1335,1335,I have done this but I am not contented I need...,1,1
1336,1336,"the urge to fake your death, move to another c...",1,1
1337,1337,"That sudden urge to quit uni, move to another ...",1,1


In [13]:
# Rebind X and y for the polarity task: tweet text as input, and the
# 'sentiment_2' label (reported below as 1 = POSITIVE, -1 = NEGATIVE) as target.
X, y = data_polarity['Tweet'], data_polarity['sentiment_2']

In [14]:
# Hold out 25% of the polarity tweets for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

#### Convert Text to TF-IDF

In [15]:
# Build a fresh vectorizer for the polarity task: vocabulary and IDF weights come
# from the polarity training tweets only, which are encoded in the same pass.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_tfidf = tfidf_vectorizer.fit_transform(raw_documents=X_train)

### Naive Bayes Classifier

In [16]:
# Fit a multinomial Naive Bayes model on the polarity TF-IDF training matrix.
# fit() returns the estimator itself, so the chained call binds the fitted model.
nb_pol = MultinomialNB().fit(X_train_tfidf, y_train)
nb_pol

### Evaluate on the Held-Out Test Split (25% of the 1000 Evaluation Dataset)

In [17]:
# Encode the held-out tweets with the vectorizer fitted on the training split
# (transform only -- the vocabulary and IDF weights are not refitted here).
test_tfidf = tfidf_vectorizer.transform(X_test)

# Hard class labels for every test tweet.
prediction = nb_pol.predict(test_tfidf)

# Column 1 of predict_proba, i.e. the probability of the second entry in
# nb_pol.classes_ (label 1, given the -1/1 labels this notebook displays).
prediction_prob = nb_pol.predict_proba(test_tfidf)[:, 1]

In [18]:
# Copy the test labels into a plain NumPy array for the metric functions, then show them.
test_case_ground_truth = np.array(y_test)
test_case_ground_truth

array([ 1,  1, -1, -1,  1,  1, -1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,
        1, -1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1, -1, -1,  1,  1,
       -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1, -1,
       -1,  1,  1, -1,  1,  1,  1,  1, -1, -1, -1,  1,  1, -1,  1,  1,  1,
        1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1, -1,  1,  1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1,  1,  1,
        1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1, -1,  1,
        1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1, -1,  1,  1,  1,
        1,  1,  1,  1, -1

In [19]:
# Display the labels predicted for the test split so they can be compared by eye
# with the ground-truth array.
prediction

array([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1

In [20]:
# Print per-class precision/recall/F1 and the confusion matrix for the
# polarity classifier on the held-out test labels.
print("Polarity Detection Model Evaluation")
print("1: POSITIVE")
print("-1: NEGATIVE")
# zero_division=0 makes the report print 0.00 rather than raise
# UndefinedMetricWarning should a class receive no predictions at all.
print(classification_report(test_case_ground_truth, prediction, zero_division=0))
# Rows are the true labels and columns the predicted labels, in sorted label order.
print('Confusion Matrix:\n', confusion_matrix(test_case_ground_truth, prediction))

Polarity Detection Model Evaluation
1: POSITIVE
-1: NEGATIVE
              precision    recall  f1-score   support

          -1       1.00      0.07      0.13        44
           1       0.84      1.00      0.91       216

    accuracy                           0.84       260
   macro avg       0.92      0.53      0.52       260
weighted avg       0.87      0.84      0.78       260

Confusion Matrix:
 [[  3  41]
 [  0 216]]
