In [1]:
import pandas as pd 
import numpy as np
import nltk
import re 
import os 
import random 
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer 
from nltk.tokenize import RegexpTokenizer
import pickle
import time
import warnings
warnings.filterwarnings("ignore")
df1 = pd.read_csv('manual.csv').iloc[:,1:]

## 0. Preprocess

In [2]:
import re
import nltk
def clean(text):
    text = re.sub('[^A-Za-z]+', ' ', text)
    return text
df1['cleaned'] = df1['Text'].apply(clean)
from tqdm.notebook import tqdm
tqdm.pandas()

nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk import pos_tag
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.corpus import wordnet
nltk.download('averaged_perceptron_tagger')

# POS tagger dictionary
pos_dict = {'J':wordnet.ADJ, 'V':wordnet.VERB, 'N':wordnet.NOUN, 'R':wordnet.ADV}
def token_stop_pos(text):
    tags = pos_tag(word_tokenize(text))
    newlist = []
    for word, tag in tags:
        if word.lower() not in set(stopwords.words('english')):
            newlist.append(tuple([word, pos_dict.get(tag[0])]))
    return newlist

df1['tagged'] = df1['cleaned'].progress_apply(token_stop_pos)

from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
def lemmatize(pos_data):
    lemma_rew = " "
    for word, pos in pos_data:
        if not pos:
            lemma = word
            lemma_rew = lemma_rew + " " + lemma
        else:
            lemma = wordnet_lemmatizer.lemmatize(word, pos=pos)
            lemma_rew = lemma_rew + " " + lemma
    return lemma_rew

df1['lemma'] = df1['tagged'].progress_apply(lemmatize)

from nltk.stem import LancasterStemmer
lancaster = LancasterStemmer()
def stemmize(tagged):
    stemmed = ''
    for word, _ in tagged:
        stemmed = stemmed +' '+lancaster.stem(word)
    return stemmed.strip()

df1['stemmed'] = df1['tagged'].progress_apply(stemmize)




[nltk_data] Downloading package punkt to /Users/zhang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/zhang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/zhang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/zhang/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


  0%|          | 0/1200 [00:00<?, ?it/s]

  0%|          | 0/1200 [00:00<?, ?it/s]

  0%|          | 0/1200 [00:00<?, ?it/s]

In [37]:
df1['Text'][0]

'@lindseyhilsum Ms Hilsum sad the atrocities committed across Ukraine and the war crimes. It appears war crimes are only committed by Russia in Aleppo and now on Ukraine. Where is your war crimes coverage in Iraq, Afghanistan also in Syria when west were arming the resistance to remove Assad?'

In [4]:
neutral_text = list(df1[df1.label==0]['stemmed'])
neutral_label = [0]*len(neutral_text)
opinioned_text = list(df1[df1.label!=0]['stemmed'])
opinioned_label = [1]*len(opinioned_text)
len(neutral_text),len(opinioned_text)

(536, 664)

In [5]:
neg_text = list(df1[df1.label==-1]['stemmed'])
neg_label = [0]*len(neg_text)
pos_text = list(df1[df1.label==1]['stemmed'])
pos_label = [1]*len(pos_text)
len(pos_text),len(neg_text)

(312, 352)

In [6]:
from sklearn.model_selection import train_test_split
subject_text = neutral_text+opinioned_text
subject_label = neutral_label+opinioned_label

polarity_text = pos_text+neg_text
polarity_label = pos_label+neg_label

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
tf=TfidfVectorizer()
text_tf= tf.fit_transform(polarity_text)
X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(text_tf, polarity_label, test_size=0.2, random_state=114514)

In [9]:
with open('tf-pkl/polarity_text.pkl','wb') as f:
    pickle.dump(tf,f)

In [10]:
tf=TfidfVectorizer()
text_tf= tf.fit_transform(subject_text)
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(text_tf, subject_label, test_size=0.2, random_state=114514)

In [11]:
with open('tf-pkl/subject_text.pkl','wb') as f:
    pickle.dump(tf,f)

In [12]:
X_train_p = X_train_p.toarray()
X_test_p = X_test_p.toarray()
X_train_s = X_train_s.toarray()
X_test_s = X_test_s.toarray()

In [13]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

## 1. Subjectivity Detection

In [14]:
from sklearn.model_selection import GridSearchCV

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn_params = {
    'n_neighbors':list(range(1,20,2))+list(range(20,105,5)),
    'weights':['uniform', 'distance']
}

from sklearn.linear_model import LogisticRegression
logistic = LogisticRegression()
logistic_params = {
    'penalty':['l1','l2','elasticnet','none']
}


from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc_params = {
    'n_estimators':list(range(2,21,2)),
    'criterion':['gini','entropy'],
}


from sklearn import tree
dtc = tree.DecisionTreeClassifier()
dtc_params = {
    'criterion':['gini', 'entropy'],
    'splitter':['best','random']
}

from sklearn.svm import SVC
rbf_svc = SVC()
rbf_svc_params = {
    'C':[0.01,0.1,1,10]
}

from sklearn.svm import SVC
lin_svc = SVC()
lin_svc_params = {
    'kernel':['linear'],
    'C':[0.01,0.1,1,10]
}


from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb_params = {
    'alpha':[0,0.01,0.1,1,10]
}

from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()
gbc_params = {
    'loss':['deviance','exponential'],
    'learning_rate':[0.0001,0.001,0.01,0.1],
    'n_estimators':[2,10,50,100,200],
    'criterion':['friedman_mse','squared_error','mse','mae']
}


from sklearn.ensemble import  AdaBoostClassifier
abc = AdaBoostClassifier()
abc_params={
    'learning_rate':[0.0001,0.001,0.01,0.1],
    'n_estimators':[2,10,50,100,200],
}


from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis()
lda_params = {
    'solver':['svd','lsqr','eigen']
}

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
qda = QuadraticDiscriminantAnalysis()
qda_params = {
    'reg_param':[0,0.001,0.001,0.01,0.1]
}

from sklearn.neural_network import MLPClassifier
nn = MLPClassifier()
nn_params = {
    'alpha':[0.01,0.1,1],
    'max_iter':[2000]
}



classifiers = {
    'knn':knn,
    'logistic_regression':logistic,
    'random_forest':rfc,
    'decision_tree':dtc,
    'rbf_svc':rbf_svc,
    'linear_svc':lin_svc,
    'naive_bayes':mnb,
    'adaboost':abc,
    'linear_discriminant':lda,
    'quadratic_discriminant':qda,
    'neural_network':nn
    
    
}
params = {
    'knn':knn_params,
    'logistic_regression':logistic_params,
    'random_forest':rfc_params,
    'decision_tree':dtc_params,
    'rbf_svc':rbf_svc_params,
    'linear_svc':lin_svc_params,
    'naive_bayes':mnb_params,
    'adaboost':abc_params,
    'linear_discriminant':lda_params,
    'quadratic_discriminant':qda_params,
    'neural_network':nn_params
}

best={}


In [286]:
for k in tqdm(classifiers):
    gcv = GridSearchCV(estimator=classifiers[k], param_grid=params[k], n_jobs=-1)
    gcv.fit(X_train_s, y_train_s)
    best[k]=gcv.best_estimator_
    print(k)

  0%|          | 0/11 [00:00<?, ?it/s]

knn


Traceback (most recent call last):
  File "/opt/homebrew/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/homebrew/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/opt/homebrew/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "/opt/homebrew/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/homebrew/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit


logistic_regression
random_forest
decision_tree
rbf_svc
linear_svc




naive_bayes
adaboost


Traceback (most recent call last):
  File "/opt/homebrew/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/homebrew/anaconda3/lib/python3.9/site-packages/sklearn/discriminant_analysis.py", line 558, in fit
    self._solve_eigen(X, y,
  File "/opt/homebrew/anaconda3/lib/python3.9/site-packages/sklearn/discriminant_analysis.py", line 419, in _solve_eigen
    evals, evecs = linalg.eigh(Sb, Sw)
  File "/opt/homebrew/anaconda3/lib/python3.9/site-packages/scipy/linalg/decomp.py", line 578, in eigh
    raise LinAlgError('The leading minor of order {} of B is not '
numpy.linalg.LinAlgError: The leading minor of order 2 of B is not positive definite. The factorization of B could not be completed and no eigenvalues or eigenvectors were computed.

Traceback (most recent call last):
  File "/opt/homebrew/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py

linear_discriminant




quadratic_discriminant


TypeError: 'MLPClassifier' object is not iterable

In [287]:
score_dict = {}
for k in tqdm(best):
    with open(f'sklearn-pkl/subjectivity/{k}.pkl','wb') as f:
        pickle.dump(best[k],f)
    y_pred_s = best[k].predict(X_test_s)
    score_dict[k]={
        'accuracy':accuracy_score(y_pred_s,y_test_s),
        'precision':precision_score(y_pred_s,y_test_s),
        'recall':recall_score(y_pred_s,y_test_s),
        'f1':f1_score(y_pred_s,y_test_s),
        'Details':str(best[k])
    }

  0%|          | 0/10 [00:00<?, ?it/s]

In [302]:
subjectivity_result = pd.DataFrame(score_dict).T
subjectivity_result.to_csv('subjectivity_result.csv',index=False)
subjectivity_result

Unnamed: 0,accuracy,precision,recall,f1,Details
knn,0.770833,0.811594,0.794326,0.802867,KNeighborsClassifier(n_neighbors=20)
logistic_regression,0.808333,0.847826,0.823944,0.835714,LogisticRegression(penalty='none')
random_forest,0.779167,0.702899,0.889908,0.785425,RandomForestClassifier(n_estimators=20)
decision_tree,0.7125,0.637681,0.82243,0.718367,"DecisionTreeClassifier(criterion='entropy', sp..."
rbf_svc,0.816667,0.862319,0.826389,0.843972,SVC(C=10)
linear_svc,0.8125,0.84058,0.834532,0.837545,"SVC(C=10, kernel='linear')"
naive_bayes,0.783333,0.934783,0.75,0.832258,MultinomialNB(alpha=0.1)
adaboost,0.783333,0.746377,0.858333,0.79845,"AdaBoostClassifier(learning_rate=0.1, n_estima..."
linear_discriminant,0.629167,0.73913,0.658065,0.696246,LinearDiscriminantAnalysis()
quadratic_discriminant,0.575,1.0,0.575,0.730159,QuadraticDiscriminantAnalysis(reg_param=0.001)


In [15]:
clf = MLPClassifier(alpha=0.1, max_iter=2000)

In [16]:
clf.fit(X_train_s, y_train_s)

MLPClassifier(alpha=0.1, max_iter=2000)

In [17]:
tf=TfidfVectorizer()
text_tf= tf.fit_transform(subject_text)
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(text_tf, subject_label, test_size=0.2, random_state=114514)

In [27]:
subject_text[10]

'check anarchotok video tiktok https co kjyliffxfx ukrain russiaukraineconflict'

In [24]:
clf.predict(X_test_s)
# tf.transform(['hi'])

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1,
       0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1,
       1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1])

## 2. Polarity Detection

In [304]:
from sklearn.model_selection import GridSearchCV

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn_params = {
    'n_neighbors':list(range(1,20,2))+list(range(20,105,5)),
    'weights':['uniform', 'distance']
}

from sklearn.linear_model import LogisticRegression
logistic = LogisticRegression()
logistic_params = {
    'penalty':['l1','l2','elasticnet','none']
}


from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc_params = {
    'n_estimators':list(range(2,21,2)),
    'criterion':['gini','entropy'],
}


from sklearn import tree
dtc = tree.DecisionTreeClassifier()
dtc_params = {
    'criterion':['gini', 'entropy'],
    'splitter':['best','random']
}

from sklearn.svm import SVC
rbf_svc = SVC()
rbf_svc_params = {
    'C':[0.01,0.1,1,10]
}

from sklearn.svm import SVC
lin_svc = SVC()
lin_svc_params = {
    'kernel':['linear'],
    'C':[0.01,0.1,1,10]
}


from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb_params = {
    'alpha':[0,0.01,0.1,1,10]
}

from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()
gbc_params = {
    'loss':['deviance','exponential'],
    'learning_rate':[0.0001,0.001,0.01,0.1],
    'n_estimators':[2,10,50,100,200],
    'criterion':['friedman_mse','squared_error','mse','mae']
}


from sklearn.ensemble import  AdaBoostClassifier
abc = AdaBoostClassifier()
abc_params={
    'learning_rate':[0.0001,0.001,0.01,0.1],
    'n_estimators':[2,10,50,100,200],
}


from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis()
lda_params = {
    'solver':['svd','lsqr','eigen']
}

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
qda = QuadraticDiscriminantAnalysis()
qda_params = {
    'reg_param':[0,0.001,0.001,0.01,0.1]
}

from sklearn.neural_network import MLPClassifier
nn = MLPClassifier()
nn_params = {
    'alpha':[0.01,0.1,1],
    'max_iter':[2000]
}



classifiers = {
    'knn':knn,
    'logistic_regression':logistic,
    'random_forest':rfc,
    'decision_tree':dtc,
    'rbf_svc':rbf_svc,
    'linear_svc':lin_svc,
    'naive_bayes':mnb,
    'adaboost':abc,
    'linear_discriminant':lda,
    'quadratic_discriminant':qda,
    'neural_network':nn
    
    
}
params = {
    'knn':knn_params,
    'logistic_regression':logistic_params,
    'random_forest':rfc_params,
    'decision_tree':dtc_params,
    'rbf_svc':rbf_svc_params,
    'linear_svc':lin_svc_params,
    'naive_bayes':mnb_params,
    'adaboost':abc_params,
    'linear_discriminant':lda_params,
    'quadratic_discriminant':qda_params,
    'neural_network':nn_params
}

best={}

In [307]:
for k in tqdm(classifiers):
    gcv = GridSearchCV(estimator=classifiers[k], param_grid=params[k], n_jobs=-1)
    gcv.fit(X_train_p, y_train_p)
    best[k]=gcv.best_estimator_
    print(k)

  0%|          | 0/11 [00:00<?, ?it/s]

knn


Traceback (most recent call last):
  File "/opt/homebrew/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/homebrew/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/opt/homebrew/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "/opt/homebrew/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/homebrew/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit


logistic_regression
random_forest
decision_tree
rbf_svc
linear_svc




naive_bayes
adaboost


Traceback (most recent call last):
  File "/opt/homebrew/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/homebrew/anaconda3/lib/python3.9/site-packages/sklearn/discriminant_analysis.py", line 558, in fit
    self._solve_eigen(X, y,
  File "/opt/homebrew/anaconda3/lib/python3.9/site-packages/sklearn/discriminant_analysis.py", line 419, in _solve_eigen
    evals, evecs = linalg.eigh(Sb, Sw)
  File "/opt/homebrew/anaconda3/lib/python3.9/site-packages/scipy/linalg/decomp.py", line 578, in eigh
    raise LinAlgError('The leading minor of order {} of B is not '
numpy.linalg.LinAlgError: The leading minor of order 7 of B is not positive definite. The factorization of B could not be completed and no eigenvalues or eigenvectors were computed.

Traceback (most recent call last):
  File "/opt/homebrew/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py

linear_discriminant




quadratic_discriminant
neural_network


In [309]:
score_dict = {}
for k in tqdm(best):
    with open(f'sklearn-pkl/polarity/{k}.pkl','wb') as f:
        pickle.dump(best[k],f)
    y_pred_p = best[k].predict(X_test_p)
    score_dict[k]={
        'accuracy':accuracy_score(y_pred_p,y_test_p),
        'precision':precision_score(y_pred_p,y_test_p),
        'recall':recall_score(y_pred_p,y_test_p),
        'f1':f1_score(y_pred_p,y_test_p),
        'Details':str(best[k])
    }

  0%|          | 0/11 [00:00<?, ?it/s]

In [311]:
polarity_result = pd.DataFrame(score_dict).T
polarity_result.to_csv('polarity_result.csv')
polarity_result

Unnamed: 0,accuracy,precision,recall,f1,Details
knn,0.804511,0.738462,0.842105,0.786885,KNeighborsClassifier(n_neighbors=45)
logistic_regression,0.849624,0.861538,0.835821,0.848485,LogisticRegression(penalty='none')
random_forest,0.766917,0.615385,0.869565,0.720721,RandomForestClassifier(n_estimators=18)
decision_tree,0.729323,0.692308,0.737705,0.714286,"DecisionTreeClassifier(criterion='entropy', sp..."
rbf_svc,0.842105,0.753846,0.907407,0.823529,SVC(C=10)
linear_svc,0.842105,0.861538,0.823529,0.842105,"SVC(C=10, kernel='linear')"
naive_bayes,0.804511,0.769231,0.819672,0.793651,MultinomialNB(alpha=0.1)
adaboost,0.774436,0.753846,0.777778,0.765625,"AdaBoostClassifier(learning_rate=0.1, n_estima..."
linear_discriminant,0.578947,0.307692,0.645161,0.416667,LinearDiscriminantAnalysis()
quadratic_discriminant,0.511278,0.0,0.0,0.0,QuadraticDiscriminantAnalysis(reg_param=0.001)


## 3. Ensemble classiﬁcation
for Polarity

In [318]:
from sklearn.ensemble import StackingClassifier
estimators = list(best.items())
clf = StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression()
)

In [320]:
X_train,X_test = X_train.toarray(),X_test.toarray()

In [321]:
clf.fit(X_train, y_train)

StackingClassifier(estimators=[('knn', KNeighborsClassifier(n_neighbors=45)),
                               ('logistic_regression',
                                LogisticRegression(penalty='none')),
                               ('random_forest',
                                RandomForestClassifier(n_estimators=18)),
                               ('decision_tree',
                                DecisionTreeClassifier(criterion='entropy',
                                                       splitter='random')),
                               ('rbf_svc', SVC(C=10)),
                               ('linear_svc', SVC(C=10, kernel='linear')),
                               ('naive_bayes', MultinomialNB(alpha=0.1)),
                               ('adaboost',
                                AdaBoostClassifier(learning_rate=0.1,
                                                   n_estimators=100)),
                               ('linear_discriminant',
                              

In [322]:
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_pred,y_test)
recall = recall_score(y_pred,y_test)
precision = precision_score(y_pred,y_test)
f1 = f1_score(y_pred,y_test)
print('accuracy: ',accuracy)
print('recall: ',recall)
print('precision: ',precision)
print('f1: ',f1)

accuracy:  0.8421052631578947
recall:  0.8142857142857143
precision:  0.8769230769230769
f1:  0.8444444444444444


## 4. Multitask classiﬁcation
classify subjectivity and polarity at the same time

In [345]:
text = df1.stemmed
label = df1.label

In [348]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
tf=TfidfVectorizer()
text_tf= tf.fit_transform(text)
X_train, X_test, y_train, y_test = train_test_split(text_tf, label, test_size=0.2, random_state=114514)
X_train, X_test = X_train.toarray(), X_test.toarray()

In [349]:
with open('tf-pkl/multi_task.pkl','wb') as f:
    pickle.dump(tf,f)

In [334]:
knn = KNeighborsClassifier(n_neighbors=10)
logistic = LogisticRegression()
rfc = RandomForestClassifier(n_estimators=10)
dtc = tree.DecisionTreeClassifier()
rbf_svc = SVC(C=0.125)
lin_svc = SVC(kernel='linear',C=1)
mnb = MultinomialNB(alpha=0.1)
abc = AdaBoostClassifier()
nn = MLPClassifier(alpha=0.1,max_iter=2000)
classifiers = {
    'knn':knn,
    'logistic_regression':logistic,
    'random_forest':rfc,
    'decision_tree':dtc,
    # 'rbf_svc':rbf_svc,
    # 'linear_svc':lin_svc,
    'naive_bayes':mnb,
    'adaboost':abc,
    # 'linear_discriminant':lda,
    # 'quadratic_discriminant':qda,
    'neural_network':nn
}


In [335]:
for k in tqdm(classifiers):
    classifiers[k]=classifiers[k].fit(X_train,y_train)
    print(k)

  0%|          | 0/7 [00:00<?, ?it/s]

knn
logistic_regression
random_forest
decision_tree
naive_bayes
adaboost
neural_network


In [336]:
score_dict = {}
for k in tqdm(classifiers):
    with open(f'sklearn-pkl/geo/{k}.pkl','wb') as f:
        pickle.dump(classifiers[k],f)
    y_pred = classifiers[k].predict(X_test)
    score_dict[k]={
        'accuracy':accuracy_score(y_pred,y_test),
        'Details':str(classifiers[k])
    }

  0%|          | 0/7 [00:00<?, ?it/s]

In [338]:
# multitask accuracy
pd.DataFrame(score_dict).T

Unnamed: 0,accuracy,Details
knn,0.666667,KNeighborsClassifier(n_neighbors=10)
logistic_regression,0.7125,LogisticRegression()
random_forest,0.679167,RandomForestClassifier(n_estimators=10)
decision_tree,0.641667,DecisionTreeClassifier()
naive_bayes,0.745833,MultinomialNB(alpha=0.1)
adaboost,0.7125,AdaBoostClassifier()
neural_network,0.766667,"MLPClassifier(alpha=0.1, max_iter=2000)"
