In [1]:
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.naive_bayes import MultinomialNB

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the preprocessed training tweets and discard every row that contains a
# missing value in any column.
data = pd.read_csv(
    '../data_cleaning/preprocessed_train_2000_stopwords.csv'
).dropna()
# Bare name as the last expression renders the frame as the cell output.
data

Unnamed: 0.1,Unnamed: 0,text,polarity,subjectivity
0,0,this is it for me y all im moving to another c...,0,-1
1,1,i want trump amp his family packing their sing...,0,-1
2,2,if you think of moving to another country in y...,0,-1
3,3,actually he will be moving to another country ...,0,-1
4,4,trump float the idea of moving to another coun...,0,-1
...,...,...,...,...
2033,2033,how about if this ungrateful pompous as quit h...,0,-1
2034,2034,people like this idiot hate america so much it...,-1,1
2035,2035,i am never moving from california unless the d...,-1,1
2036,2036,i really want to study abroad next year but it...,0,-1


## Subjectivity Detection

In [4]:
# Subjectivity task: the tweet text is the input, `subjectivity` is the label.
X_train, y_train = data["text"], data["subjectivity"]

# Fit the TF-IDF vocabulary/weights on the training text and vectorize it in
# the same step; the fitted vectorizer is reused later on the evaluation text.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_tfidf = tfidf_vectorizer.fit_transform(raw_documents=X_train)

### Combining RF, SVM, NB and LR (stacking ensemble)

In [5]:
# Level-0 estimators of the stacking ensemble, given as (name, estimator) pairs.
# The names are the keys under which scikit-learn exposes each fitted model
# (e.g. `named_estimators_`, `get_params()`), so each one identifies its model:
# 'rf' is the random forest and 'LR' the logistic regression.
base_learners = [
    ('rf', RandomForestClassifier(n_estimators=10, random_state=33)),
    ('svm', LinearSVC(random_state=1)),
    ('NB', MultinomialNB()),
    ('LR', LogisticRegression()),
]

In [6]:
# Stack the base learners under a logistic-regression meta-learner. The
# meta-learner is trained on the base learners' `predict` outputs, which are
# generated with 10-fold cross-validation.
ensemble_clf = StackingClassifier(
    estimators=base_learners,
    final_estimator=LogisticRegression(),
    stack_method='predict',
    cv=10,
)

In [7]:
# Train the subjectivity ensemble on the TF-IDF training matrix; the returned
# fitted estimator is shown as the cell output.
ensemble_clf.fit(X=X_train_tfidf, y=y_train)

### Testing on the Evaluation Dataset (`eval_1000`)

In [8]:
# Read the held-out evaluation tweets (all rows kept; no filtering here).
test_cases = pd.read_csv(
    filepath_or_buffer='../data_cleaning/preprocessed_eval_1000_stopwords.csv'
)
# Bare name as the last expression renders the frame as the cell output.
test_cases

Unnamed: 0.1,Unnamed: 0,text,polarity,subjectivity
0,0,just move to a completely different location i...,0,-1
1,1,i have started sharing information again about...,1,1
2,2,my parent made the tough decision to move thei...,-1,1
3,3,aye you all tryna head out it s too wild over ...,0,-1
4,4,when is val going to move to another country w...,-1,1
...,...,...,...,...
1265,1265,in separation anxiety gavin bradley track the ...,1,1
1266,1266,the crippling asian elder daughter feeling tha...,0,-1
1267,1267,wa on the phone with my dad last night and he ...,-1,1
1268,1268,you may qualify to take advantage of two possi...,0,-1


In [9]:
# Vectorize the evaluation text with the vectorizer already fitted on the
# training text (transform only, no refit), then predict hard class labels.
test_tfidf = tfidf_vectorizer.transform(test_cases.loc[:, 'text'])
prediction = ensemble_clf.predict(test_tfidf)

In [10]:
# True subjectivity labels of the evaluation set, in the same row order as
# the predictions.
test_case_ground_truth = test_cases.loc[:, 'subjectivity']

In [11]:
# Copy the labels into a plain NumPy array so they have the same container
# type as `prediction`; the trailing bare name displays the array.
test_case_ground_truth = np.array(test_case_ground_truth, copy=True)
test_case_ground_truth

array([-1,  1,  1, ...,  1, -1,  1], dtype=int64)

In [12]:
# Display the predicted subjectivity labels for a visual comparison with the
# ground-truth array shown by the previous cell.
prediction

array([-1, -1,  1, ...,  1, -1, -1], dtype=int64)

In [13]:
# Report header plus a legend for the two subjectivity labels.
print(
    "Subjectivity Detection Model Evaluation",
    "1: OPINIONATED",
    "-1: NEUTRAL",
    sep="\n",
)
# Per-class precision/recall/F1, then the raw confusion counts
# (rows = true label, columns = predicted label).
print(classification_report(y_true=test_case_ground_truth, y_pred=prediction))
print(
    'Confusion Matrix:\n',
    confusion_matrix(y_true=test_case_ground_truth, y_pred=prediction),
)

Subjectivity Detection Model Evaluation
1: OPINIONATED
-1: NEUTRAL
              precision    recall  f1-score   support

          -1       0.72      0.67      0.69       608
           1       0.72      0.76      0.74       662

    accuracy                           0.72      1270
   macro avg       0.72      0.72      0.72      1270
weighted avg       0.72      0.72      0.72      1270

Confusion Matrix:
 [[409 199]
 [160 502]]


## Polarity Detection

In [14]:
# Polarity is only modelled on rows whose polarity label is non-zero, so the
# rows labelled 0 are dropped from the training frame.
data_polarity = data.loc[data['polarity'].ne(0)]

In [15]:
# Display the polarity training subset (rows whose polarity label is not 0).
data_polarity

Unnamed: 0.1,Unnamed: 0,text,polarity,subjectivity
5,5,i literally can t even shop alone and here i a...,1,1
7,7,still cannot believe that i am moving into ano...,-1,1
10,10,i am surprised a country called the u this is ...,-1,1
11,11,i m honestly packing up my shit and moving to ...,-1,1
12,12,when i went to work in germany for a year i we...,1,1
...,...,...,...,...
2025,2025,yah man u wont go to another country to suppos...,1,1
2028,2028,fuck you gabe kapler move to another country i...,-1,1
2031,2031,a move to another country is looking better ev...,1,1
2034,2034,people like this idiot hate america so much it...,-1,1


#### Convert data to TF-IDF

In [16]:
# Polarity task: rebind the training inputs/labels to the non-zero-polarity
# subset (this replaces the subjectivity X_train / y_train defined earlier).
X_train, y_train = data_polarity['text'], data_polarity['polarity']

In [17]:
# A fresh vectorizer is fitted on the polarity training text only, replacing
# the one fitted for the subjectivity task.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_tfidf = tfidf_vectorizer.fit_transform(raw_documents=X_train)

### Combining RF, SVM, NB and LR (stacking ensemble)

In [18]:
# Level-0 estimators of the polarity stacking ensemble, given as
# (name, estimator) pairs. The names are the keys under which scikit-learn
# exposes each fitted model (e.g. `named_estimators_`, `get_params()`), so each
# one identifies its model: 'rf' is the random forest and 'LR' the logistic
# regression.
base_learners = [
    ('rf', RandomForestClassifier(n_estimators=10, random_state=33)),
    ('svm', LinearSVC(random_state=1)),
    ('NB', MultinomialNB()),
    ('LR', LogisticRegression()),
]

In [19]:
# Fresh, unfitted stacking ensemble for the polarity task: a logistic-regression
# meta-learner trained on the base learners' `predict` outputs, which are
# generated with 10-fold cross-validation.
ensemble_clf = StackingClassifier(
    estimators=base_learners,
    final_estimator=LogisticRegression(),
    stack_method='predict',
    cv=10,
)

In [20]:
# Train the polarity ensemble on the TF-IDF training matrix; the returned
# fitted estimator is shown as the cell output.
ensemble_clf.fit(X=X_train_tfidf, y=y_train)

In [21]:
# Reload the evaluation set and keep only rows with a non-zero polarity label,
# mirroring the filter applied to the polarity training data.
test_cases = (
    pd.read_csv('../data_cleaning/preprocessed_eval_1000_stopwords.csv')
    .loc[lambda frame: frame['polarity'] != 0]
)
# Bare name as the last expression renders the frame as the cell output.
test_cases

Unnamed: 0.1,Unnamed: 0,text,polarity,subjectivity
1,1,i have started sharing information again about...,1,1
2,2,my parent made the tough decision to move thei...,-1,1
4,4,when is val going to move to another country w...,-1,1
5,5,if you dont like it here then leave wow great ...,-1,1
6,6,the new american dream is to move to another c...,1,1
...,...,...,...,...
1260,1260,today might be my last day on linkedin premium...,-1,1
1262,1262,im leaving twitter if elon buy it hold the sam...,-1,1
1265,1265,in separation anxiety gavin bradley track the ...,1,1
1267,1267,wa on the phone with my dad last night and he ...,-1,1


In [22]:
# Vectorize the evaluation text with the polarity vectorizer (transform only).
test_tfidf = tfidf_vectorizer.transform(test_cases.loc[:, 'text'])

# Second column of the probability matrix, i.e. the probability of the class
# listed second in `ensemble_clf.classes_`.
class_probabilities = ensemble_clf.predict_proba(test_tfidf)
prediction_prob = class_probabilities[:, 1]

# Hard polarity labels used for the report below.
prediction = ensemble_clf.predict(test_tfidf)

In [23]:
# True polarity labels of the filtered evaluation set, in the same row order
# as the predictions.
test_case_ground_truth = test_cases.loc[:, 'polarity']

In [24]:
# Copy the labels into a plain NumPy array so they have the same container
# type as `prediction`; the trailing bare name displays the array.
test_case_ground_truth = np.array(test_case_ground_truth, copy=True)
test_case_ground_truth

array([ 1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1,  1, -1, -1,  1, -1, -1,
        1, -1, -1, -1, -1, -1,  1,  1, -1, -1,  1, -1,  1,  1, -1,  1, -1,
       -1, -1, -1, -1, -1, -1,  1, -1,  1,  1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1,  1,  1, -1, -1, -1, -1, -1,  1, -1,  1, -1, -1,  1,
       -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1,
       -1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1,
       -1,  1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  1, -1,
        1, -1, -1, -1,  1, -1, -1, -1, -1,  1,  1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1,  1,  1, -1, -1,  1, -1,  1, -1, -1, -1,  1,  1, -1, -1, -1, -1,
       -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1,  1,
       -1, -1, -1, -1, -1, -1,  1,  1, -1, -1, -1, -1, -1, -1,  1, -1, -1,
       -1, -1, -1, -1, -1

In [25]:
# Display the predicted polarity labels for a visual comparison with the
# ground-truth array shown by the previous cell.
prediction

array([-1,  1, -1, -1,  1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1,  1, -1, -1,  1, -1,  1,  1, -1, -1,  1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1,  1, -1,  1,  1,  1, -1, -1, -1, -1, -1, -1,  1, -1, -1,  1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1,  1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1,  1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1,  1, -1,
        1, -1,  1, -1,  1, -1, -1,  1, -1,  1,  1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1,  1, -1, -1, -1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  1,
       -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1,  1, -1, -1,  1

In [26]:
# This cell evaluates the polarity model (ground truth comes from the
# `polarity` column), so the header and legend must describe polarity rather
# than the subjectivity labels.
print("Polarity Detection Model Evaluation")
print("1: POSITIVE")
print("-1: NEGATIVE")
# Per-class precision/recall/F1, then the raw confusion counts
# (rows = true label, columns = predicted label).
print(classification_report(test_case_ground_truth, prediction))
print('Confusion Matrix:\n', confusion_matrix(test_case_ground_truth, prediction))

Subjectivity Detection Model Evaluation
1: OPINIONATED
-1: NEUTRAL
              precision    recall  f1-score   support

          -1       0.84      0.83      0.84       449
           1       0.65      0.67      0.66       213

    accuracy                           0.78       662
   macro avg       0.75      0.75      0.75       662
weighted avg       0.78      0.78      0.78       662

Confusion Matrix:
 [[373  76]
 [ 71 142]]
