In [1]:
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read the cleaned tweet dataset and discard every row containing a missing
# value in one step, then display the resulting frame.
data = pd.read_csv('../data_cleaning/1000_dataset_cleaned.csv').dropna()
data

Unnamed: 0.1,Unnamed: 0,Tweet,opinion_2,sentiment_2
0,0,"Trying to move with a baby, having you mom liv...",-1,0
1,1,I honestly will probably never leave Washingto...,-1,0
2,2,"""Trying to move to another country is a humbli...",-1,0
3,3,Doubtful..move to another country,1,1
4,4,my brother is convinced that imma move to Toro...,1,1
...,...,...,...,...
1334,1334,the urge to move to another country and start ...,1,1
1335,1335,I have done this but I am not contented I need...,1,1
1336,1336,"the urge to fake your death, move to another c...",1,1
1337,1337,"That sudden urge to quit uni, move to another ...",1,1


## Subjectivity Detection

In [4]:
# Features are the tweet text; labels are the opinion flag
# (the evaluation cell prints 1 = OPINIONATED, -1 = NEUTRAL).
X, y = data["Tweet"], data["opinion_2"]

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

# The vectoriser is fitted on the training split only; the test split is
# transformed later with this same fitted object.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

### Combining RF, SVM, NB and LR

In [5]:
# Base estimators for the stacked ensemble, given as (name, estimator) pairs.
# The name is how each estimator is identified inside the StackingClassifier,
# so it should describe the estimator it labels.
base_learners = [
    # Was labelled 'lr', which misdescribed the random forest and differed
    # from the LogisticRegression entry ('LR') only by letter case.
    ('rf', RandomForestClassifier(n_estimators=100, random_state=33)),
    ('svm', LinearSVC(random_state=1)),
    ('NB', MultinomialNB()),
    ('LR', LogisticRegression()),
]

In [6]:
# Stacked ensemble: the base learners feed a logistic-regression final
# estimator; cv=20 sets the cross-validation used while fitting the stack.
ensemble_clf = StackingClassifier(
    estimators=base_learners,
    final_estimator=LogisticRegression(),
    cv=20,
)

In [7]:
# Train the stacked ensemble on the TF-IDF features of the training split.
ensemble_clf.fit(X_train_tfidf, y_train)

### Testing on the held-out 20% split of the 1000 Evaluation Dataset

In [8]:
# Vectorise the held-out tweets with the vectoriser fitted on the training
# split (transform only, no re-fitting).
test_tfidf = tfidf_vectorizer.transform(X_test)

# Hard class labels for the held-out tweets.
prediction = ensemble_clf.predict(test_tfidf)

# Second column of the probability matrix -- presumably P(label == 1), since
# scikit-learn orders the columns by `classes_`; confirm before using it.
# NOTE(review): no later cell visible here reads `prediction_prob`.
prediction_prob = ensemble_clf.predict_proba(test_tfidf)[:, 1]

In [9]:
# Reference labels of the held-out split, under the name the evaluation cells use.
test_case_ground_truth = y_test

In [10]:
# Convert the held-out labels to a NumPy array and display them so they can
# be compared with the predictions shown in the next cell.
test_case_ground_truth = np.array(test_case_ground_truth)
test_case_ground_truth

array([ 1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1, -1,  1,  1,  1,  1, -1,
        1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1, -1,  1,  1,  1, -1,  1,
        1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,
        1, -1,  1,  1, -1,  1, -1,  1, -1,  1,  1,  1,  1,  1,  1,  1, -1,
       -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1, -1,  1, -1,
        1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1, -1,  1,  1,  1, -1, -1, -1,  1, -1,  1,  1,
       -1,  1,  1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1, -1, -1, -1,
        1, -1,  1, -1,  1,  1,  1,  1,  1, -1, -1,  1,  1,  1,  1, -1, -1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1, -1,  1, -1,  1, -1,  1,  1,  1,  1, -1,  1, -1,  1,
       -1, -1, -1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,
        1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1, -1,  1,  1, -1,  1,  1,
        1,  1,  1,  1,  1

In [11]:
# Display the predicted labels for the held-out split.
prediction

array([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1, -1,
        1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,
        1,  1,  1, -1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1, -1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1, -1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1, -1,  1, -1,
        1,  1,  1,  1, -1

In [12]:
# Header plus label legend, emitted as three lines in a single print call.
print("\n".join([
    "Subjectivity Detection Model Evaluation",
    "1: OPINIONATED",
    "-1: NEUTRAL",
]))

# Per-class precision / recall / F1 on the held-out split.
print(classification_report(test_case_ground_truth, prediction))

# Raw counts of true labels against predicted labels.
print(
    'Confusion Matrix:\n',
    confusion_matrix(test_case_ground_truth, prediction),
)

Subjectivity Detection Model Evaluation
1: OPINIONATED
-1: NEUTRAL
              precision    recall  f1-score   support

          -1       0.67      0.21      0.32        66
           1       0.79      0.97      0.87       202

    accuracy                           0.78       268
   macro avg       0.73      0.59      0.60       268
weighted avg       0.76      0.78      0.73       268

Confusion Matrix:
 [[ 14  52]
 [  7 195]]


## Polarity Detection

In [13]:
# Keep only the rows whose sentiment_2 is non-zero; in the displayed sample,
# 0 appears on the rows labelled neutral (opinion_2 == -1).
data_polarity = data.loc[data['sentiment_2'].ne(0)]

In [14]:
# Display the filtered frame used for polarity detection.
data_polarity

Unnamed: 0.1,Unnamed: 0,Tweet,opinion_2,sentiment_2
3,3,Doubtful..move to another country,1,1
4,4,my brother is convinced that imma move to Toro...,1,1
5,5,The way I want to drop everything and move to ...,1,1
6,6,Uk is such bad vybes that if I wa &gt; 25 with...,1,1
7,7,I say I move to another country /j I cannot,1,1
...,...,...,...,...
1334,1334,the urge to move to another country and start ...,1,1
1335,1335,I have done this but I am not contented I need...,1,1
1336,1336,"the urge to fake your death, move to another c...",1,1
1337,1337,"That sudden urge to quit uni, move to another ...",1,1


#### Convert data to TF-IDF

In [15]:
# Features are the tweet text; labels are the polarity flag
# (the evaluation cell prints 1 = POSITIVE, -1 = NEGATIVE).
X, y = data_polarity['Tweet'], data_polarity['sentiment_2']

In [16]:
# Hold out 20% of the polarity rows for evaluation; the seed keeps the split
# repeatable. This rebinds the split variables used by the subjectivity section.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [17]:
# Fit a new TF-IDF vectoriser on the polarity training split only; the test
# split is transformed later with this same fitted object.
# NOTE(review): this rebinds `tfidf_vectorizer` / `X_train_tfidf` from the
# subjectivity section, so the earlier fitted vectoriser is no longer reachable.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

### Combining RF, SVM, NB and LR

In [18]:
# Base estimators for the polarity stacked ensemble, given as
# (name, estimator) pairs. The name is how each estimator is identified
# inside the StackingClassifier, so it should describe the estimator it labels.
base_learners = [
    # Was labelled 'lr', which misdescribed the random forest and differed
    # from the LogisticRegression entry ('LR') only by letter case.
    ('rf', RandomForestClassifier(n_estimators=100, random_state=33)),
    ('svm', LinearSVC(random_state=1)),
    ('NB', MultinomialNB()),
    ('LR', LogisticRegression()),
]

In [19]:
# Stacked ensemble for polarity: the base learners feed a logistic-regression
# final estimator; cv=20 sets the cross-validation used while fitting the stack.
ensemble_clf = StackingClassifier(
    estimators=base_learners,
    final_estimator=LogisticRegression(),
    cv=20,
)

In [20]:
# Train the polarity ensemble on the TF-IDF features of the training split.
ensemble_clf.fit(X_train_tfidf, y_train)

In [21]:
# Vectorise the held-out tweets with the vectoriser fitted on the polarity
# training split (transform only, no re-fitting).
test_tfidf = tfidf_vectorizer.transform(X_test)

# Hard class labels for the held-out tweets.
prediction = ensemble_clf.predict(test_tfidf)

# Second column of the probability matrix -- presumably P(label == 1), since
# scikit-learn orders the columns by `classes_`; confirm before using it.
# NOTE(review): no later cell visible here reads `prediction_prob`.
prediction_prob = ensemble_clf.predict_proba(test_tfidf)[:, 1]

In [22]:
# Reference labels of the polarity held-out split, under the name the
# evaluation cells use.
test_case_ground_truth = y_test

In [23]:
# Convert the held-out labels to a NumPy array and display them so they can
# be compared with the predictions shown in the next cell.
test_case_ground_truth = np.array(test_case_ground_truth)
test_case_ground_truth

array([ 1,  1, -1, -1,  1,  1, -1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,
        1, -1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1, -1, -1,  1,  1,
       -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1, -1,
       -1,  1,  1, -1,  1,  1,  1,  1, -1, -1, -1,  1,  1, -1,  1,  1,  1,
        1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1, -1,  1,  1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1,  1,  1,
        1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1, -1,  1,
        1,  1,  1,  1], dtype=int64)

In [24]:
# Display the predicted polarity labels for the held-out split.
prediction

array([ 1,  1,  1, -1, -1,  1, -1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1, -1,  1,  1,
       -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,
        1,  1,  1, -1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1, -1, -1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,
        1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1, -1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,
        1,  1,  1,  1], dtype=int64)

In [25]:
# Evaluation summary for the polarity model on the held-out split.
# The title previously read "Subjectivity Detection ..." (copied from the
# earlier section), which mislabelled this report.
print("Polarity Detection Model Evaluation")
print("1: POSITIVE")
print("-1: NEGATIVE")

# Per-class precision / recall / F1 on the held-out split.
print(classification_report(test_case_ground_truth, prediction))

# Raw counts of true labels against predicted labels.
print('Confusion Matrix:\n', confusion_matrix(test_case_ground_truth, prediction))

Subjectivity Detection Model Evaluation
1: POSITIVE
-1: NEGATIVE
              precision    recall  f1-score   support

          -1       0.75      0.45      0.57        33
           1       0.90      0.97      0.94       175

    accuracy                           0.89       208
   macro avg       0.83      0.71      0.75       208
weighted avg       0.88      0.89      0.88       208

Confusion Matrix:
 [[ 15  18]
 [  5 170]]
