In [1]:
!pip install spark-sklearn

In [2]:
import pandas as pd
!wget https://github.com/nlp-pucrs/cross-media-sa/raw/master/finalized_dataset.csv.gz
result = pd.read_csv('finalized_dataset.csv.gz')
len(result), result.columns

In [3]:
# Names of the LIWC feature columns selected from the dataset
# (wrapped eight per line for readability; order is significant).
columns = [
    'funct', 'pronoun', 'ppron', 'i', 'we', 'you', 'shehe', 'they',
    'ipron', 'article', 'verb', 'auxverb', 'past', 'present', 'future', 'adverb',
    'preps', 'conj', 'negate', 'quant', 'number', 'swear', 'social', 'family',
    'friend', 'humans', 'affect', 'posemo', 'negemo', 'anx', 'anger', 'sad',
    'cogmech', 'insight', 'cause', 'discrep', 'tentat', 'certain', 'inhib', 'incl',
    'excl', 'percept', 'see', 'hear', 'feel', 'bio', 'body', 'health',
    'sexual', 'ingest', 'relativ', 'motion', 'space', 'time', 'work', 'achieve',
    'leisure', 'home', 'money', 'relig', 'death', 'assent', 'nonfl', 'filler',
]

# Names of the T4SA probability columns (the prefixes suggest
# negative / neutral / positive -- confirm against the dataset description).
t4sa_columns = [prefix + '_prob' for prefix in ('neg', 'neu', 'pos')]

In [4]:
from sklearn.ensemble import RandomForestClassifier
from spark_sklearn import GridSearchCV
from spark_sklearn.util import createLocalSparkSession
from sklearn.model_selection import cross_val_score
import numpy as np
# Local Spark context that spark_sklearn's GridSearchCV uses to run the search.
sc = createLocalSparkSession().sparkContext

# Hyper-parameter grid for the random forest:
# 2 * 4 * 3 * 3 * 2 * 4 * 2 = 1152 candidate combinations.
param_grid = {
    "max_depth": [3, None],
    "max_features": [1, 3, 10, 15],
    "min_samples_split": [2, 3, 10],
    "min_samples_leaf": [1, 3, 10],
    "bootstrap": [True, False],
    "n_estimators": [10, 20, 40, 80],
    "criterion": ["gini", "entropy"],
}

# A fixed random_state makes every candidate forest deterministic, so
# best_score_ / best_params_ are reproducible across kernel restarts.
clf = RandomForestClassifier(random_state=42)

# error_score=0: a candidate whose fit raises is scored 0 instead of aborting
# the whole search.
# NOTE(review): presumably needed because max_features values of 10 and 15
# exceed the feature count of the smaller feature sets -- confirm.
gs = GridSearchCV(sc, clf, param_grid=param_grid, error_score=0)

In [5]:
# Feature matrices for the full dataset.
data_t4sa = result[t4sa_columns].values   # T4SA columns only
data_liwc = result[columns].values        # columns listed in `columns` only

# Build the combined column list as a NEW list instead of calling
# columns.extend(t4sa_columns). Extending in place would permanently add the
# T4SA names to `columns`, so re-running this cell would duplicate them and any
# later `...[columns]` selection would silently include T4SA features.
data = result[columns + t4sa_columns].values

# Label to predict: the annotated sentiment of the image.
target = result['what_is_the_sentiment_feeling_of_the_image'].values

In [6]:
# T4SA + LIWC (All)
# Run the grid search on `data` (the combined feature matrix) against `target`,
# then display the best mean cross-validated score and the parameter
# combination that achieved it.
gs.fit(data, target)
gs.best_score_, gs.best_params_

In [7]:
# Rebuild a forest with the parameters the grid search selected and report its
# mean 10-fold cross-validated score on `data`.
# random_state pins the forest's internal sampling so the reported mean is
# reproducible from run to run.
# NOTE(review): the parameters were tuned on this same data, so this estimate
# is likely optimistic.
clf = RandomForestClassifier(random_state=42, **gs.best_params_)
np.mean(cross_val_score(clf, data, target, cv=10))

In [8]:
# T4SA (All)
# Re-run the same grid search on `data_t4sa` against `target`, then display the
# best mean cross-validated score and the parameters that achieved it.
# NOTE(review): the grid's max_features candidates of 10 and 15 exceed the
# number of T4SA columns; those fits presumably fail and are scored through the
# search's error_score setting -- confirm.
gs.fit(data_t4sa, target)
gs.best_score_, gs.best_params_

In [9]:
# Rebuild a forest with the parameters the grid search selected and report its
# mean 10-fold cross-validated score on `data_t4sa`.
# random_state pins the forest's internal sampling so the reported mean is
# reproducible from run to run.
# NOTE(review): the parameters were tuned on this same data, so this estimate
# is likely optimistic.
clf = RandomForestClassifier(random_state=42, **gs.best_params_)
np.mean(cross_val_score(clf, data_t4sa, target, cv=10))

In [10]:
# LIWC (All)
# Re-run the same grid search on `data_liwc` against `target`, then display the
# best mean cross-validated score and the parameters that achieved it.
gs.fit(data_liwc, target)
gs.best_score_, gs.best_params_

In [11]:
# Rebuild a forest with the parameters the grid search selected and report its
# mean 10-fold cross-validated score on `data_liwc`.
# random_state pins the forest's internal sampling so the reported mean is
# reproducible from run to run.
# NOTE(review): the parameters were tuned on this same data, so this estimate
# is likely optimistic.
clf = RandomForestClassifier(random_state=42, **gs.best_params_)
np.mean(cross_val_score(clf, data_liwc, target, cv=10))

In [12]:
# Keep only the rows where neither the text-sentiment label nor the
# image-sentiment label equals 1.
# NOTE(review): 1 is presumably the neutral class -- confirm against the
# dataset's label encoding.
keep_rows = (
    (result['qual_o_sentimento_predominante_no_texto'] != 1)
    & (result['what_is_the_sentiment_feeling_of_the_image'] != 1)
)
result_clean = result[keep_rows]

# Derive the LIWC-only names without mutating `columns`: drop any T4SA names
# that `columns` may already carry, so `data_liwc` never contains T4SA features
# and the combined matrix contains each T4SA column exactly once. Not calling
# columns.extend(...) also keeps this cell idempotent on re-run.
liwc_columns = [name for name in columns if name not in t4sa_columns]

data_t4sa = result_clean[t4sa_columns].values
data_liwc = result_clean[liwc_columns].values
data = result_clean[liwc_columns + t4sa_columns].values

# Label to predict: the annotated sentiment of the image.
target = result_clean['what_is_the_sentiment_feeling_of_the_image'].values

In [13]:
# T4SA + LIWC
# Run the grid search on `data` (now built from the filtered `result_clean`
# rows) against `target`, then display the best mean cross-validated score and
# the parameters that achieved it.
gs.fit(data, target)
gs.best_score_, gs.best_params_

In [14]:
# Rebuild a forest with the parameters the grid search selected and report its
# mean 10-fold cross-validated score on `data`.
# random_state pins the forest's internal sampling so the reported mean is
# reproducible from run to run.
# NOTE(review): the parameters were tuned on this same data, so this estimate
# is likely optimistic.
clf = RandomForestClassifier(random_state=42, **gs.best_params_)
np.mean(cross_val_score(clf, data, target, cv=10))

In [15]:
# T4SA
# Run the grid search on `data_t4sa` (now built from the filtered
# `result_clean` rows) against `target`, then display the best mean
# cross-validated score and the parameters that achieved it.
# NOTE(review): the grid's max_features candidates of 10 and 15 exceed the
# number of T4SA columns; those fits presumably fail and are scored through the
# search's error_score setting -- confirm.
gs.fit(data_t4sa, target)
gs.best_score_, gs.best_params_

In [16]:
# Rebuild a forest with the parameters the grid search selected and report its
# mean 10-fold cross-validated score on `data_t4sa`.
# random_state pins the forest's internal sampling so the reported mean is
# reproducible from run to run.
# NOTE(review): the parameters were tuned on this same data, so this estimate
# is likely optimistic.
clf = RandomForestClassifier(random_state=42, **gs.best_params_)
np.mean(cross_val_score(clf, data_t4sa, target, cv=10))

In [17]:
# LIWC
# Run the grid search on `data_liwc` (now built from the filtered
# `result_clean` rows) against `target`, then display the best mean
# cross-validated score and the parameters that achieved it.
gs.fit(data_liwc, target)
gs.best_score_, gs.best_params_

In [18]:
# Rebuild a forest with the parameters the grid search selected and report its
# mean 10-fold cross-validated score on `data_liwc`.
# random_state pins the forest's internal sampling so the reported mean is
# reproducible from run to run.
# NOTE(review): the parameters were tuned on this same data, so this estimate
# is likely optimistic.
clf = RandomForestClassifier(random_state=42, **gs.best_params_)
np.mean(cross_val_score(clf, data_liwc, target, cv=10))