In [1]:
periodo = 'pre-pandemia'

# Importe as bibliotecas necessárias

In [2]:
%matplotlib inline
from warnings import filterwarnings

from IPython.display import clear_output
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score 
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.neural_network._multilayer_perceptron import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.tree._classes import DecisionTreeClassifier
import multiprocessing
import numpy as np
import pandas as pd

filterwarnings('ignore')
import tensorflow as tf


np.random.seed(42)
tf.random.set_seed(42)

n_jobs = multiprocessing.cpu_count() - 1




In [3]:
def avaliacao(y_val, predicao):
    '''
    Computa as mericas de avaliação dos clasficadores
    '''
        
    precisao = precision_score(y_val, predicao) * 100
    revocacao = recall_score(y_val, predicao) * 100
    f1score = f1_score(y_val, predicao) * 100
    acuracia = accuracy_score(y_val, predicao) * 100
    print('precisao {:.2f}'.format(precisao))
    print('revocacao {:.2f}'.format(revocacao))
    print('f1-score {:.2f}'.format(f1score))
    print('acuracia {:.2f}'.format(acuracia))
    print()
    
def modeloML(model, x_train, y_train, x_val, y_val, cv, name):
    
    print(name)
    model.fit(x_train, y_train)
    predicao = cross_val_predict(model, x_val, y_val, cv=cv)
    avaliacao(y_val, predicao)
    
    #plotarMatrizConfusao(y_val, predicao, nome_classificador)


In [4]:

train_ratio = 0.70
validation_ratio = 0.15
test_ratio = 0.15

choq = ['volumeTweets_media', 'volumeTweets_variancia', 'volumeTweets_mediaMovelPonterada', 'volumeTweets_entropia', 
 'indiceInsonia_mediaMovelPonterada', 'indiceInsonia_entropia', 
 'pronome1Pessoa_media', 'pronome1Pessoa_variancia', 'pronome1Pessoa_mediaMovelPonterada', 'pronome1Pessoa_entropia', 
 'pronome2Pessoa_mediaMovelPonterada', 
 'pronome3Pessoa_variancia', 'pronome3Pessoa_mediaMovelPonterada', 'pronome3Pessoa_entropia', 
 'valencia_variancia', 'valencia_entropia', 
 'ativacao_variancia', 'ativacao_entropia', 
 'termosDepressivos_mediaMovelPonterada', 
 'grafoSocial_variancia', 'grafoSocial_mediaMovelPonterada', 'grafoSocial_entropia',
 'medicamentosAntiDepressivo_variancia'
 ]


atributos_anterioes = ['caracteresOrientais_variancia', 
 'emojis_mediaMovelPonterada', 
 'curtidas_media', 'curtidas_variancia', 'curtidas_mediaMovelPonterada',
 'midia_variancia',
 'links_entropia',
 ]

atributos_novos = ['hashtags_variancia', 'hashtags_mediaMovelPonterada', 
                   'retweets_media', 'retweets_variancia', 'retweets_mediaMovelPonterada', 
 'mencoes_media', 'mencoes_variancia', 'mencoes_mediaMovelPonterada', 
 'polaridade_entropia', 
 'subjetividade_entropia']




In [5]:
df = pd.read_csv(f'datasets/twitterbase_{periodo}.csv', sep=';')
label_encoder = LabelEncoder()
df['classe'] = label_encoder.fit_transform(df['classe']).astype('float64')

cv = KFold(n_splits=10, shuffle=True, random_state=42)
zscore = StandardScaler()

x = df.drop('classe', axis=1).copy()
y = df['classe'].copy()
x = zscore.fit_transform(x)

df1 = df[choq + ['classe']].copy()
df2 = df[choq + atributos_anterioes + ['classe']].copy()
df3 = df[choq + atributos_novos + ['classe']].copy()

x1 = df1.drop('classe', axis=1).copy()
y1 = df1['classe'].copy()
x1 = zscore.fit_transform(x1)

x2 = df2.drop('classe', axis=1).copy()
y2 = df2['classe'].copy()
x2 = zscore.fit_transform(x2)

x3 = df3.drop('classe', axis=1).copy()
y3 = df3['classe'].copy()
x3 = zscore.fit_transform(x3)

x4 = df.drop('classe', axis=1).copy()
y4 = df['classe'].copy()
x4 = zscore.fit_transform(x4)

In [6]:
x_train, x_val, y_train, y_val = train_test_split(x1, y1, test_size=validation_ratio, random_state=42)
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=test_ratio, random_state=42)

baseline = DecisionTreeClassifier(random_state=42, max_depth=10, criterion='gini')
florestaRandomica = RandomForestClassifier(random_state=42, verbose=0, n_jobs=n_jobs, max_depth=10, criterion='gini', n_estimators=50)
perceptronMulticamadas = MLPClassifier(random_state=42, max_iter=100, n_iter_no_change=20, batch_size=128, early_stopping=True, solver='adam', shuffle=True, activation='relu', hidden_layer_sizes=(50, 50))
svm = LinearSVC(random_state=42, verbose=0, max_iter=100, C=0.1, loss='squared_hinge', penalty='l2')
modeloML(baseline, x_train, y_train, x_val, y_val, cv, 'Árvore de Decisão (Baseline)')
modeloML(florestaRandomica, x_train, y_train, x_val, y_val, cv, 'Floresta Randômica')
modeloML(perceptronMulticamadas, x_train, y_train, x_val, y_val, cv, 'Perceptron Multicamadas')
modeloML(svm, x_train, y_train, x_val, y_val, cv, 'SVM')

Árvore de Decisão (Baseline)
precisao 71.15
revocacao 82.84
f1-score 76.55
acuracia 73.21

Floresta Randômica
precisao 73.35
revocacao 86.01
f1-score 79.18
acuracia 76.12

Perceptron Multicamadas
precisao 73.99
revocacao 83.96
f1-score 78.66
acuracia 75.95

SVM
precisao 75.73
revocacao 68.71
f1-score 72.05
acuracia 71.86



In [7]:
x_train, x_val, y_train, y_val = train_test_split(x2, y2, test_size=validation_ratio, random_state=42)
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=test_ratio, random_state=42)

baseline = DecisionTreeClassifier(random_state=42, max_depth=10, criterion='gini')
florestaRandomica = RandomForestClassifier(random_state=42, verbose=0, n_jobs=n_jobs, max_depth=10, criterion='gini', n_estimators=50)
perceptronMulticamadas = MLPClassifier(random_state=42, max_iter=100, n_iter_no_change=20, batch_size=128, early_stopping=True, solver='adam', shuffle=True, activation='relu', hidden_layer_sizes=(50, 50))
svm = LinearSVC(random_state=42, verbose=0, max_iter=100, C=0.1, loss='squared_hinge', penalty='l2')
modeloML(baseline, x_train, y_train, x_val, y_val, cv, 'Árvore de Decisão (Baseline)')
modeloML(florestaRandomica, x_train, y_train, x_val, y_val, cv, 'Floresta Randômica')
modeloML(perceptronMulticamadas, x_train, y_train, x_val, y_val, cv, 'Perceptron Multicamadas')
modeloML(svm, x_train, y_train, x_val, y_val, cv, 'SVM')

Árvore de Decisão (Baseline)
precisao 71.93
revocacao 81.63
f1-score 76.48
acuracia 73.49

Floresta Randômica
precisao 73.82
revocacao 86.46
f1-score 79.64
acuracia 76.66

Perceptron Multicamadas
precisao 74.66
revocacao 82.49
f1-score 78.38
acuracia 75.98

SVM
precisao 75.79
revocacao 69.38
f1-score 72.45
acuracia 72.14



In [8]:
x_train, x_val, y_train, y_val = train_test_split(x3, y3, test_size=validation_ratio, random_state=42)
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=test_ratio, random_state=42)

baseline = DecisionTreeClassifier(random_state=42, max_depth=10, criterion='gini')
florestaRandomica = RandomForestClassifier(random_state=42, verbose=0, n_jobs=n_jobs, max_depth=10, criterion='gini', n_estimators=50)
perceptronMulticamadas = MLPClassifier(random_state=42, max_iter=100, n_iter_no_change=20, batch_size=128, early_stopping=True, solver='adam', shuffle=True, activation='relu', hidden_layer_sizes=(50, 50))
svm = LinearSVC(random_state=42, verbose=0, max_iter=100, C=0.1, loss='squared_hinge', penalty='l2')
modeloML(baseline, x_train, y_train, x_val, y_val, cv, 'Árvore de Decisão (Baseline)')
modeloML(florestaRandomica, x_train, y_train, x_val, y_val, cv, 'Floresta Randômica')
modeloML(perceptronMulticamadas, x_train, y_train, x_val, y_val, cv, 'Perceptron Multicamadas')
modeloML(svm, x_train, y_train, x_val, y_val, cv, 'SVM')

Árvore de Decisão (Baseline)
precisao 71.98
revocacao 81.88
f1-score 76.61
acuracia 73.61

Floresta Randômica
precisao 73.86
revocacao 86.69
f1-score 79.76
acuracia 76.77

Perceptron Multicamadas
precisao 74.46
revocacao 84.75
f1-score 79.27
acuracia 76.60

SVM
precisao 76.33
revocacao 69.67
f1-score 72.85
acuracia 72.58



In [9]:
x_train, x_val, y_train, y_val = train_test_split(x4, y4, test_size=validation_ratio, random_state=42)
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=test_ratio, random_state=42)

baseline = DecisionTreeClassifier(random_state=42, max_depth=10, criterion='gini')
florestaRandomica = RandomForestClassifier(random_state=42, verbose=0, n_jobs=n_jobs, max_depth=10, criterion='gini', n_estimators=50)
perceptronMulticamadas = MLPClassifier(random_state=42, max_iter=100, n_iter_no_change=20, batch_size=128, early_stopping=True, solver='adam', shuffle=True, activation='relu', hidden_layer_sizes=(50, 50))
svm = LinearSVC(random_state=42, verbose=0, max_iter=100, C=0.1, loss='squared_hinge', penalty='l2')
modeloML(baseline, x_train, y_train, x_val, y_val, cv, 'Árvore de Decisão (Baseline)')
modeloML(florestaRandomica, x_train, y_train, x_val, y_val, cv, 'Floresta Randômica')
modeloML(perceptronMulticamadas, x_train, y_train, x_val, y_val, cv, 'Perceptron Multicamadas')
modeloML(svm, x_train, y_train, x_val, y_val, cv, 'SVM')

Árvore de Decisão (Baseline)
precisao 71.46
revocacao 80.69
f1-score 75.80
acuracia 72.79

Floresta Randômica
precisao 74.70
revocacao 87.22
f1-score 80.48
acuracia 77.66

Perceptron Multicamadas
precisao 77.10
revocacao 81.83
f1-score 79.39
acuracia 77.58

SVM
precisao 76.05
revocacao 72.66
f1-score 74.32
acuracia 73.49



# Pandemia

In [10]:
periodo = 'pandemia'

In [11]:

train_ratio = 0.70
validation_ratio = 0.15
test_ratio = 0.15

choq = ['volumeTweets_media', 'volumeTweets_variancia', 'volumeTweets_mediaMovelPonterada', 'volumeTweets_entropia', 
'indiceInsonia_variancia', 'indiceInsonia_mediaMovelPonterada', 'indiceInsonia_entropia', 
'pronome1Pessoa_media', 'pronome1Pessoa_variancia', 'pronome1Pessoa_mediaMovelPonterada', 'pronome1Pessoa_entropia', 
'pronome2Pessoa_mediaMovelPonterada', 
'pronome3Pessoa_media', 'pronome3Pessoa_variancia', 'pronome3Pessoa_mediaMovelPonterada', 'pronome3Pessoa_entropia', 
'valencia_mediaMovelPonterada', 'valencia_entropia', 
'ativacao_mediaMovelPonterada', 'ativacao_entropia', 
'termosDepressivos_variancia', 
'grafoSocial_variancia', 'grafoSocial_mediaMovelPonterada', 'grafoSocial_entropia',
'medicamentosAntiDepressivo_mediaMovelPonterada'
]


atributos_anterioes = ['caracteresOrientais_variancia', 
'emojis_variancia', 
'midia_variancia', 'midia_mediaMovelPonterada', 
'curtidas_media', 'curtidas_variancia', 'curtidas_mediaMovelPonterada',
'links_mediaMovelPonterada'
]

atributos_novos = ['hashtags_variancia', 'hashtags_mediaMovelPonterada', 
'retweets_variancia', 
'mencoes_variancia', 'mencoes_mediaMovelPonterada', 
'polaridade_entropia', 
'subjetividade_entropia'
]

In [12]:
df = pd.read_csv(f'datasets/twitterbase_{periodo}.csv', sep=';')
label_encoder = LabelEncoder()
df['classe'] = label_encoder.fit_transform(df['classe']).astype('float64')

cv = KFold(n_splits=10, shuffle=True, random_state=42)
zscore = StandardScaler()

df1 = df[choq + ['classe']].copy()
df2 = df[choq + atributos_anterioes + ['classe']].copy()
df3 = df[choq + atributos_novos + ['classe']].copy()

x1 = df1.drop('classe', axis=1).copy()
y1 = df1['classe'].copy()
x1 = zscore.fit_transform(x1)

x2 = df2.drop('classe', axis=1).copy()
y2 = df2['classe'].copy()
x2 = zscore.fit_transform(x2)

x3 = df3.drop('classe', axis=1).copy()
y3 = df3['classe'].copy()
x3 = zscore.fit_transform(x3)

x4 = df.drop('classe', axis=1).copy()
y4 = df['classe'].copy()
x4 = zscore.fit_transform(x4)

In [13]:
x_train, x_val, y_train, y_val = train_test_split(x1, y1, test_size=validation_ratio, random_state=42)
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=test_ratio, random_state=42)

baseline = DecisionTreeClassifier(random_state=42, max_depth=10, criterion='entropy')
florestaRandomica = RandomForestClassifier(random_state=42, verbose=0, n_jobs=n_jobs, max_depth=10, criterion='gini', n_estimators=50)
perceptronMulticamadas = MLPClassifier(random_state=42, max_iter=100, n_iter_no_change=20, batch_size=128, early_stopping=True, solver='adam', shuffle=True, activation='relu', hidden_layer_sizes=(50, 50))
svm = LinearSVC(random_state=42, verbose=0, max_iter=100, C=1.0, loss='squared_hinge', penalty='l2')
modeloML(baseline, x_train, y_train, x_val, y_val, cv, 'Árvore de Decisão (Baseline)')
modeloML(florestaRandomica, x_train, y_train, x_val, y_val, cv, 'Floresta Randômica')
modeloML(perceptronMulticamadas, x_train, y_train, x_val, y_val, cv, 'Perceptron Multicamadas')
modeloML(svm, x_train, y_train, x_val, y_val, cv, 'SVM')

Árvore de Decisão (Baseline)
precisao 70.66
revocacao 82.19
f1-score 75.99
acuracia 72.98

Floresta Randômica
precisao 71.98
revocacao 84.27
f1-score 77.64
acuracia 74.75

Perceptron Multicamadas
precisao 73.53
revocacao 81.95
f1-score 77.51
acuracia 75.27

SVM
precisao 74.48
revocacao 71.03
f1-score 72.71
acuracia 72.27



In [14]:
x_train, x_val, y_train, y_val = train_test_split(x2, y2, test_size=validation_ratio, random_state=42)
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=test_ratio, random_state=42)

baseline = DecisionTreeClassifier(random_state=42, max_depth=10, criterion='entropy')
florestaRandomica = RandomForestClassifier(random_state=42, verbose=0, n_jobs=n_jobs, max_depth=10, criterion='gini', n_estimators=50)
perceptronMulticamadas = MLPClassifier(random_state=42, max_iter=100, n_iter_no_change=20, batch_size=128, early_stopping=True, solver='adam', shuffle=True, activation='relu', hidden_layer_sizes=(50, 50))
svm = LinearSVC(random_state=42, verbose=0, max_iter=100, C=1.0, loss='squared_hinge', penalty='l2')
modeloML(baseline, x_train, y_train, x_val, y_val, cv, 'Árvore de Decisão (Baseline)')
modeloML(florestaRandomica, x_train, y_train, x_val, y_val, cv, 'Floresta Randômica')
modeloML(perceptronMulticamadas, x_train, y_train, x_val, y_val, cv, 'Perceptron Multicamadas')
modeloML(svm, x_train, y_train, x_val, y_val, cv, 'SVM')

Árvore de Decisão (Baseline)
precisao 70.02
revocacao 81.33
f1-score 75.25
acuracia 72.18

Floresta Randômica
precisao 72.04
revocacao 84.87
f1-score 77.93
acuracia 74.99

Perceptron Multicamadas
precisao 73.84
revocacao 81.39
f1-score 77.43
acuracia 75.32

SVM
precisao 74.10
revocacao 71.79
f1-score 72.93
acuracia 72.27



In [15]:
x_train, x_val, y_train, y_val = train_test_split(x3, y3, test_size=validation_ratio, random_state=42)
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=test_ratio, random_state=42)

baseline = DecisionTreeClassifier(random_state=42, max_depth=10, criterion='entropy')
florestaRandomica = RandomForestClassifier(random_state=42, verbose=0, n_jobs=n_jobs, max_depth=10, criterion='gini', n_estimators=50)
perceptronMulticamadas = MLPClassifier(random_state=42, max_iter=100, n_iter_no_change=20, batch_size=128, early_stopping=True, solver='adam', shuffle=True, activation='relu', hidden_layer_sizes=(50, 50))
svm = LinearSVC(random_state=42, verbose=0, max_iter=100, C=1.0, loss='squared_hinge', penalty='l2')
modeloML(baseline, x_train, y_train, x_val, y_val, cv, 'Árvore de Decisão (Baseline)')
modeloML(florestaRandomica, x_train, y_train, x_val, y_val, cv, 'Floresta Randômica')
modeloML(perceptronMulticamadas, x_train, y_train, x_val, y_val, cv, 'Perceptron Multicamadas')
modeloML(svm, x_train, y_train, x_val, y_val, cv, 'SVM')

Árvore de Decisão (Baseline)
precisao 70.28
revocacao 82.04
f1-score 75.71
acuracia 72.61

Floresta Randômica
precisao 72.35
revocacao 84.45
f1-score 77.93
acuracia 75.12

Perceptron Multicamadas
precisao 74.09
revocacao 81.21
f1-score 77.49
acuracia 75.45

SVM
precisao 75.31
revocacao 71.88
f1-score 73.56
acuracia 73.11



In [16]:
x_train, x_val, y_train, y_val = train_test_split(x4, y4, test_size=validation_ratio, random_state=42)
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=test_ratio, random_state=42)

baseline = DecisionTreeClassifier(random_state=42, max_depth=10, criterion='entropy')
florestaRandomica = RandomForestClassifier(random_state=42, verbose=0, n_jobs=n_jobs, max_depth=10, criterion='gini', n_estimators=50)
perceptronMulticamadas = MLPClassifier(random_state=42, max_iter=100, n_iter_no_change=20, batch_size=128, early_stopping=True, solver='adam', shuffle=True, activation='relu', hidden_layer_sizes=(50, 50))
svm = LinearSVC(random_state=42, verbose=0, max_iter=100, C=1.0, loss='squared_hinge', penalty='l2')
modeloML(baseline, x_train, y_train, x_val, y_val, cv, 'Árvore de Decisão (Baseline)')
modeloML(florestaRandomica, x_train, y_train, x_val, y_val, cv, 'Floresta Randômica')
modeloML(perceptronMulticamadas, x_train, y_train, x_val, y_val, cv, 'Perceptron Multicamadas')
modeloML(svm, x_train, y_train, x_val, y_val, cv, 'SVM')

Árvore de Decisão (Baseline)
precisao 69.65
revocacao 81.50
f1-score 75.11
acuracia 71.90

Floresta Randômica
precisao 73.00
revocacao 85.27
f1-score 78.66
acuracia 75.93

Perceptron Multicamadas
precisao 75.12
revocacao 81.86
f1-score 78.34
acuracia 76.46

SVM
precisao 75.71
revocacao 73.28
f1-score 74.48
acuracia 73.87

