# Naive Bayes Classifier

In [2]:
import pandas as pd

### Import data

In [9]:
# Read the preprocessed training CSV and discard every row that has a missing
# value in any column, so the text and label columns used below are complete.
data = pd.read_csv('../data_cleaning/preprocessed_train_2000.csv').dropna()
data

Unnamed: 0.1,Unnamed: 0,text,polarity,subjectivity
0,0,im moving another country starting new life,0,-1
1,1,want trump amp family packing single hold fede...,0,-1
2,2,think moving another country twenties one chal...,0,-1
3,3,actually moving another country one bars windows,0,-1
4,4,trump floats idea moving another country democ...,0,-1
...,...,...,...,...
2033,2033,ungrateful pompous ass quit job move another c...,0,-1
2034,2034,people like idiot hate america much makes wond...,-1,1
2035,2035,never moving california unless democracy kille...,-1,1
2036,2036,really want study abroad next year seems scary...,0,-1


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer

import numpy as np
import nltk

## Subjectivity Detection

### Convert Text to TF-IDF

In [11]:
# Disabled: no hold-out split is made in this notebook. The classifiers are fit
# on every row of `data` and scored on the separate evaluation CSV instead.
# The column names in the disabled call ("text_lemmatized", "sentiment") do not
# appear in the frame displayed above, so it would need updating before reuse.
# X_train, X_test, y_train, y_test = train_test_split(data["text_lemmatized"],data["sentiment"],test_size=0.2,shuffle=True)

In [12]:
# Features: the preprocessed text column. Targets: the subjectivity labels
# (the whole training frame is used; no rows are held out here).
X_train, y_train = data['text'], data['subjectivity']

In [13]:
# Learn the TF-IDF vocabulary and IDF weights from the training text only, and
# encode that same text as the training feature matrix.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_tfidf = tfidf_vectorizer.fit_transform(raw_documents=X_train)

### Naive Bayes Classifier

In [14]:
# Train the subjectivity classifier on the TF-IDF features.
# fit() returns the estimator itself, so it can be bound in the same statement;
# the bare name on the last line keeps the estimator as the cell's displayed value.
nb_subj = MultinomialNB().fit(X=X_train_tfidf, y=y_train)
nb_subj

### Results on the Evaluation Dataset (`preprocessed_eval_1000.csv`)

In [15]:
# Load the separate evaluation file used to score the subjectivity model.
# The training frame had incomplete rows removed with dropna(); apply the
# matching guard here for the two columns this section relies on, because a row
# without text cannot be vectorized and a row without a label cannot be scored.
test_cases = (
    pd.read_csv('../data_cleaning/preprocessed_eval_1000.csv')
    .dropna(subset=['text', 'subjectivity'])
)
test_cases

Unnamed: 0.1,Unnamed: 0,text,polarity,subjectivity
0,0,move completely different location costs hundr...,0,-1
1,1,started sharing information move another count...,1,1
2,2,parents made tough decision move family anothe...,-1,1
3,3,aye tryna head wild googles get visa move anot...,0,-1
4,4,val going move another country cooking opinion...,-1,1
...,...,...,...,...
1265,1265,separation anxiety gavin bradley tracks experi...,1,1
1266,1266,crippling asian elder daughter feelings chokes...,0,-1
1267,1267,phone dad last night asking job going amp dati...,-1,1
1268,1268,may qualify take advantage two possible tax ex...,0,-1


In [17]:
# Encode the evaluation text with the vectorizer fitted on the training text
# (transform only, so the vocabulary and IDF weights are not refit).
test_tfidf = tfidf_vectorizer.transform(test_cases['text'])
# predict_proba returns one column per entry of nb_subj.classes_. Locate the
# column for label 1 (OPINIONATED) by value instead of assuming it is column 1,
# so a change in the label set fails loudly rather than picking the wrong class.
opinionated_col = list(nb_subj.classes_).index(1)
prediction_prob = nb_subj.predict_proba(test_tfidf)[:, opinionated_col]
# Hard class predictions used by the report below.
prediction = nb_subj.predict(test_tfidf)

In [18]:
# Extract the reference subjectivity labels as a NumPy array for the metrics
# below, then display it.
test_case_ground_truth = np.array(test_cases['subjectivity'])
test_case_ground_truth

array([-1,  1,  1, ...,  1, -1,  1], dtype=int64)

In [19]:
# Display the subjectivity labels predicted by nb_subj for the evaluation rows.
prediction

array([-1, -1,  1, ...,  1, -1,  1], dtype=int64)

In [20]:
# Print a short legend for the label encoding, then the per-class metrics and
# the confusion matrix of the subjectivity predictions.
for legend_line in (
    "Subjectivity Detection Model Evaluation",
    "1: OPINIONATED",
    "-1: NEUTRAL",
):
    print(legend_line)
print(classification_report(y_true=test_case_ground_truth, y_pred=prediction))
# Two positional arguments on purpose: print() inserts a space between them,
# which is part of the recorded output format.
print('Confusion Matrix:\n', confusion_matrix(y_true=test_case_ground_truth, y_pred=prediction))

Subjectivity Detection Model Evaluation
1: OPINIONATED
-1: NEUTRAL
              precision    recall  f1-score   support

          -1       0.69      0.62      0.65       608
           1       0.68      0.75      0.71       662

    accuracy                           0.69      1270
   macro avg       0.69      0.68      0.68      1270
weighted avg       0.69      0.69      0.68      1270

Confusion Matrix:
 [[376 232]
 [168 494]]


## Polarity Detection

In [21]:
# Polarity is only modelled for rows whose polarity label is non-zero, so build
# a boolean mask for those rows and select them.
has_polarity = data['polarity'].ne(0)
data_polarity = data.loc[has_polarity]
data_polarity

Unnamed: 0.1,Unnamed: 0,text,polarity,subjectivity
5,5,literally even shop alone thinking moving anot...,1,1
7,7,still believe moving another country leaving f...,-1,1
10,10,surprised country called us ethiopian issue mo...,-1,1
11,11,honestly packing shit moving another country 2...,-1,1
12,12,went work germany year went offered 66 pay ris...,1,1
...,...,...,...,...
2025,2025,yah man u wont go another country supposedly m...,1,1
2028,2028,fuck gabe kapler move another country like one...,-1,1
2031,2031,move another country looking better everyday s...,1,1
2034,2034,people like idiot hate america much makes wond...,-1,1


In [22]:
# Features: the preprocessed text of the non-zero-polarity rows.
# Targets: their polarity labels.
# These rebind X_train / y_train from the subjectivity section.
X_train, y_train = data_polarity['text'], data_polarity['polarity']

#### Convert Text to TF-IDF

In [23]:
# Fit a fresh TF-IDF vectorizer on the polarity training text.
# Caution: this rebinds `tfidf_vectorizer` and `X_train_tfidf`, which the
# subjectivity section also uses. Re-running the subjectivity evaluation cells
# after this one would feed nb_subj features built from this vocabulary, so run
# the notebook top to bottom.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_tfidf = tfidf_vectorizer.fit_transform(raw_documents=X_train)

### Naive Bayes Classifier

In [24]:
# Train the polarity classifier on the TF-IDF features of the
# non-zero-polarity rows.
# NOTE(review): the evaluation recorded further down shows recall 0.13 for
# label 1, with almost every row predicted as -1. The default learned class
# prior may be a factor; consider comparing against fit_prior=False.
# fit() returns the estimator; the bare name keeps it as the cell's displayed value.
nb_pol = MultinomialNB().fit(X=X_train_tfidf, y=y_train)
nb_pol

### Evaluate on the Evaluation Dataset (`preprocessed_eval_1000.csv`)

In [25]:
# Reload the evaluation file for the polarity model.
# Drop rows missing the text or the polarity label first: the training frame
# was cleaned with dropna(), and a missing polarity would otherwise pass the
# `!= 0` filter below (NaN != 0 is True) and end up in the metrics.
test_cases = (
    pd.read_csv('../data_cleaning/preprocessed_eval_1000.csv')
    .dropna(subset=['text', 'polarity'])
)
# Keep only the rows with a non-zero polarity label, matching how the polarity
# training rows were selected.
test_cases = test_cases[test_cases['polarity'] != 0]
test_cases

Unnamed: 0.1,Unnamed: 0,text,polarity,subjectivity
1,1,started sharing information move another count...,1,1
2,2,parents made tough decision move family anothe...,-1,1
4,4,val going move another country cooking opinion...,-1,1
5,5,dont like leave wow great idea hey everybody k...,-1,1
6,6,new american dream move another country roevswade,1,1
...,...,...,...,...
1260,1260,today might last day linkedin premium getting ...,-1,1
1262,1262,im leaving twitter elon buys holds energy im m...,-1,1
1265,1265,separation anxiety gavin bradley tracks experi...,1,1
1267,1267,phone dad last night asking job going amp dati...,-1,1


In [27]:
# Encode the filtered evaluation text with the polarity vectorizer
# (transform only, so the vocabulary and IDF weights are not refit).
test_tfidf = tfidf_vectorizer.transform(test_cases['text'])
# predict_proba returns one column per entry of nb_pol.classes_. Locate the
# column for label 1 (POSITIVE) by value instead of assuming it is column 1,
# so a change in the label set fails loudly rather than picking the wrong class.
positive_col = list(nb_pol.classes_).index(1)
prediction_prob = nb_pol.predict_proba(test_tfidf)[:, positive_col]
# Hard class predictions used by the report below.
prediction = nb_pol.predict(test_tfidf)

In [28]:
# Extract the reference polarity labels of the filtered evaluation rows as a
# NumPy array for the metrics below, then display it.
test_case_ground_truth = np.array(test_cases['polarity'])
test_case_ground_truth

array([ 1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1,  1, -1, -1,  1, -1, -1,
        1, -1, -1, -1, -1, -1,  1,  1, -1, -1,  1, -1,  1,  1, -1,  1, -1,
       -1, -1, -1, -1, -1, -1,  1, -1,  1,  1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1,  1,  1, -1, -1, -1, -1, -1,  1, -1,  1, -1, -1,  1,
       -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1,
       -1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1,
       -1,  1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  1, -1,
        1, -1, -1, -1,  1, -1, -1, -1, -1,  1,  1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1,  1,  1, -1, -1,  1, -1,  1, -1, -1, -1,  1,  1, -1, -1, -1, -1,
       -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1,  1,
       -1, -1, -1, -1, -1, -1,  1,  1, -1, -1, -1, -1, -1, -1,  1, -1, -1,
       -1, -1, -1, -1, -1

In [29]:
# Display the polarity labels predicted by nb_pol for the filtered evaluation rows.
prediction

array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1

In [30]:
# Print a short legend for the label encoding, then the per-class metrics and
# the confusion matrix of the polarity predictions.
for legend_line in (
    "Polarity Detection Model Evaluation",
    "1: POSITIVE",
    "-1: NEGATIVE",
):
    print(legend_line)
print(classification_report(y_true=test_case_ground_truth, y_pred=prediction))
# Two positional arguments on purpose: print() inserts a space between them,
# which is part of the recorded output format.
print('Confusion Matrix:\n', confusion_matrix(y_true=test_case_ground_truth, y_pred=prediction))

Polarity Detection Model Evaluation
1: POSITIVE
-1: NEGATIVE
              precision    recall  f1-score   support

          -1       0.70      0.98      0.82       449
           1       0.73      0.13      0.22       213

    accuracy                           0.70       662
   macro avg       0.72      0.55      0.52       662
weighted avg       0.71      0.70      0.62       662

Confusion Matrix:
 [[439  10]
 [186  27]]
