# Previsão de Engajamento - Classificação

Este notebook define modelos e hiperparâmetros e executa um validação cruzada aninhada com grid search para otimização de hiperparâmetros e treinamento.

- Entrada: `full-preproc2-inputs` e `full-preproc2-outputs`
- Saída: `resultados/resultados50p_{START_DATE_STR}_{SUB_DATASET}.npy`


In [189]:
#@title Importações de pacote
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [190]:
BASE_PATH = 'dados/'
EMBEDDINGS_PATH = BASE_PATH +'preprocessed/'

# 1 - Carregando os Dados

Carregando dados brutos gerais.

In [191]:
dfx = pd.read_excel(EMBEDDINGS_PATH + 'full-preproc2-inputs.xlsx', index_col='ID')
dfx.drop(columns=["Only Hashtags"],inplace=True)
dfx.head()

Unnamed: 0_level_0,Candidato_Bolsonaro,Candidato_Lula
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
7114971700365692165,1,0
7115033431473474822,0,1
7115050482179050758,1,0
7115120078982630661,1,0
7115161088219565317,1,0


In [192]:
dfy_full = pd.read_excel(EMBEDDINGS_PATH + 'full-preproc2-outputs.xlsx', index_col='ID')
dfy_full.head()

Unnamed: 0_level_0,Curtidas,Curtidas-Log,Curtidas-MinMax,Curtidas-Log-MinMax,Curtidas-2Classes-50p
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
7114971700365692165,24400,4.38739,0.003881,0.299717,0
7115033431473474822,11700,4.068186,0.00397,0.281879,0
7115050482179050758,9163,3.962038,0.001197,0.173869,0
7115120078982630661,3485,3.542203,0.000197,0.049653,0
7115161088219565317,22100,4.344392,0.003476,0.286995,0


In [193]:
path_model_embedding = EMBEDDINGS_PATH + 'embeddings/' + 'Embeddings_all-MiniLM-paraphrase-multilingual.xlsx'
pe = pd.read_excel(path_model_embedding)

pe


Unnamed: 0,ID,Candidato,Curtidas,Only Hashtags,x1,x2,x3,x4,x5,x6,...,x375,x376,x377,x378,x379,x380,x381,x382,x383,x384
0,7114971700365692165,Jair Bolsonaro,24400,1.0,-0.035405,0.091901,0.029305,-0.098718,0.243411,0.111185,...,0.175389,-0.227796,-0.141506,-0.010863,0.124115,0.060907,0.403370,-0.108669,-0.133018,0.144618
1,7115033431473474822,Lula,11700,0.0,-0.222600,0.258170,0.165599,-0.210579,0.359918,-0.001123,...,0.370087,-0.112563,-0.277257,0.095219,-0.293415,0.101862,0.038032,0.054848,0.059788,0.128184
2,7115050482179050758,Jair Bolsonaro,9163,1.0,0.024929,0.079934,0.098492,-0.100231,0.224104,0.025696,...,0.168720,-0.311732,-0.105087,-0.059519,0.052269,0.108190,0.341778,-0.142647,-0.096181,0.125113
3,7115120078982630661,Jair Bolsonaro,3485,1.0,0.037256,0.072214,-0.000965,-0.107916,0.288683,0.103231,...,0.202172,-0.211109,-0.122603,0.045138,0.120379,0.205129,0.261554,-0.083097,-0.185110,0.220620
4,7115161088219565317,Jair Bolsonaro,22100,1.0,0.184212,-0.063834,-0.014328,-0.073712,0.067048,-0.046129,...,0.109674,-0.212620,-0.229578,-0.191411,0.090878,0.120460,0.276041,-0.130134,-0.236629,0.108136
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3355,18041421523376097,Jair Bolsonaro,2816298,0.0,-0.154099,0.030383,-0.181850,-0.035343,0.562470,0.228206,...,0.163863,-0.138396,-0.090157,-0.139640,-0.059515,-0.016232,0.274704,-0.026963,-0.197315,0.070981
3356,211857482296579_737954751015282,Jair Bolsonaro,196046,0.0,-0.010288,-0.002675,-0.097292,-0.056652,0.546457,0.185445,...,0.241903,-0.172705,-0.021164,-0.151274,0.138189,-0.128408,0.292017,0.000566,-0.133829,0.088963
3357,267949976607343_735093754644310,Lula,64214,0.0,-0.074462,0.045235,0.054071,0.110205,0.167678,-0.059837,...,0.257696,-0.145697,-0.081612,0.097586,0.031411,0.010342,0.241534,-0.127639,-0.266172,0.174676
3358,211857482296579_737980491012708,Jair Bolsonaro,557648,0.0,-0.136056,0.375990,-0.089333,0.151647,0.215663,-0.089546,...,0.035895,-0.107625,0.034370,-0.070815,-0.152468,-0.012056,0.162576,-0.227032,0.103442,0.101070


## 1.1 - Escolha do Dataset (ALTERE)

In [194]:
filter_lula = (dfx['Candidato_Lula'] == 1)
filter_bolsonaro = (dfx['Candidato_Bolsonaro'] == 1)
dy_lula = dfx[filter_lula].copy()
dy_bolsonaro = dfx[filter_bolsonaro].copy()

filter_lula_y      = (pe['Candidato'] == 'Lula')
filter_bolsonaro_y = (pe['Candidato'] == 'Jair Bolsonaro')
dx_lula = pe[filter_lula_y].copy()
dx_bolsonaro = pe[filter_bolsonaro_y].copy()

dx_lula.drop(columns=["Candidato", "Only Hashtags", "Curtidas"], inplace=True)
dx_lula.set_index('ID', inplace=True)

## 1.2 - Informações do Treinamento (ALTERE)

In [195]:
from datetime import datetime

RANDOM_STATE = 1231

START_DATE_STR = datetime.now().strftime("%Y-%m-%d-%Hh%Mm")
#print(start_date_str)

metadata = dict()
metadata['date'] = START_DATE_STR
metadata['subdataset'] = SUB_DATASET
metadata['target'] = TARGET_COL
metadata['random_state'] = RANDOM_STATE

metadata['description'] =  f'''Treinamento com modelos com os 
dados balanceados (50p), com o dataset {SUB_DATASET}, com todos os modelos 
e com todos os ENSEMBLE, com a correção de escala para a MLP,
variando class_weight para os modelos que suportam'''

# 2 - Modelos

In [196]:
from sklearn.pipeline import Pipeline
from data_transformations_util import IdentityTransformer

In [197]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [198]:
lista_modelos = [
    {
        "nome_do_modelo": 'MLP Neural Network',
        "estimador": Pipeline([
            ('pca', PCA()),
            ('scaler', StandardScaler()), 
            ('predictor', MLPClassifier(random_state=42))
        ]),
        "parametros": {
            'pca': [PCA(n_components=0.50), PCA(n_components=0.65), PCA(n_components=0.80), IdentityTransformer()],
            'scaler': [IdentityTransformer(), StandardScaler(), MinMaxScaler()],
            'predictor__hidden_layer_sizes': [(256,), (512,), (1024,)],  # Number of neurons in the hidden layer
            'predictor__learning_rate_init': [0.001, 0.01, 0.05],        # Learning rate
        }
    },
    {
        "nome_do_modelo": 'Support Vector Machine',
        "estimador": Pipeline([
            ('pca', PCA()),
            ('scaler', StandardScaler()), 
            ('predictor', SVC(random_state=42))   #SVC(probability=True) # muito lento?  Tempo médio de treinamento sem proba: 21.55 segundos / com proba: 35.11 segundos
        ]),
        "parametros": {
            'pca': [PCA(n_components=0.50), PCA(n_components=0.65), PCA(n_components=0.80), IdentityTransformer()],
            'scaler': [IdentityTransformer(), StandardScaler(), MinMaxScaler()],
            'predictor__class_weight': ['balanced', None], 
            'predictor__C': [0.1, 1.0, 10.0, 20.0, 50.0],
            'predictor__gamma': ['scale', 'auto'],
            'predictor__kernel': ['rbf', 'sigmoid']  # removi 'linear' porque deixava lento!
        }
    },
    {
        "nome_do_modelo": 'Random Forest',
        "estimador": Pipeline([
            ('pca', PCA()),
            ('predictor', RandomForestClassifier(random_state=42))
        ]),
        "parametros": {
            'pca': [PCA(n_components=0.50), PCA(n_components=0.65), PCA(n_components=0.80), IdentityTransformer()],
            'predictor__n_estimators': [10, 30, 70], 
            'predictor__class_weight': ['balanced', None], 
            'predictor__max_depth': [3, 4, 5, None],
            'predictor__min_samples_split': [2, 4, 8]
        }
    },
    {
        "nome_do_modelo": 'Logistic Regression',
        "estimador": Pipeline([
            ('pca', PCA()),
            ('scaler', StandardScaler()), 
            ('predictor', LogisticRegression(solver='saga', random_state=42))   # Saga is the only solver to support the 3 options for 'penalty'
        ]),
        "parametros": {
            'pca': [PCA(n_components=0.50), PCA(n_components=0.65), PCA(n_components=0.80), IdentityTransformer()],
            'scaler': [IdentityTransformer(), StandardScaler(), MinMaxScaler()],
            'predictor__class_weight': ['balanced', None], 
            'predictor__C': [0.01, 0.1, 1.0, 2.0],         # Regularization parameter
            'predictor__penalty': ['l1', 'l2', None]       # Penalty term
        }
    },
    {
        "nome_do_modelo": 'KNN',
        "estimador": Pipeline([
            ('pca', PCA),
            ('scaler', StandardScaler()), 
            ('predictor', KNeighborsClassifier())
        ]),
        "parametros": {
            'pca': [PCA(n_components=0.50), PCA(n_components=0.65), PCA(n_components=0.80), IdentityTransformer()],
            'scaler': [IdentityTransformer(), StandardScaler(), MinMaxScaler()],
            'predictor__n_neighbors': [5, 10, 15, 20], 
            'predictor__weights': ['uniform', 'distance'], 
            'predictor__p': [1, 2]
        }
    },

]

In [199]:
BASIC_MODEL_NAMES = [ mdl["nome_do_modelo"] for mdl in lista_modelos ]
metadata['basic_models'] = BASIC_MODEL_NAMES

BASIC_MODEL_NAMES

['MLP Neural Network',
 'Support Vector Machine',
 'Random Forest',
 'Logistic Regression',
 'KNN']

# 3 - Treinamento

In [200]:
from classification_train_util import nested_cross_validation_grid_search


A linha abaixo executa nested cross-validation, com um grid search nos *inner folds*, para cada modelo.

In [201]:
last_run_basic = nested_cross_validation_grid_search(lista_modelos, dx_lula, dy_lula, 5, 5, rand_state=RANDOM_STATE)





 **** RESULTADO DOS MODELOS + CURVAS ROC E PR ****

Treinando modelo MLP Neural Network 

ValueError: Supported target types are: ('binary', 'multiclass'). Got 'multilabel-indicator' instead.

## 3.3 - Treina Ensembles (não foi publicado)

In [16]:
from copy import deepcopy

from ensemble_train_util import select_best_models_of_type_fn, select_best_models_per_fold
from ensemble_train_util import train_ensemble

In [17]:
last_run_all = deepcopy(last_run_basic)

NameError: name 'last_run_basic' is not defined

In [18]:
#models_list = ['MLP Neural Network', 'Support Vector Machine', 'Random Forest', 'Logistic Regression', 'KNN']

for model_name in BASIC_MODEL_NAMES:
    print(f"\n\n\n **** TREINANDO ENSEMBLE PARA {model_name} ****\n")
    contracted_name = ''.join([name[0] for name in model_name.split()])
    results_ensemble = train_ensemble(select_best_models_of_type_fn(model_name), last_run_all, dfx, dfy, name_prefix=contracted_name)

    # merge results with results_with_ensemble
    last_run_all.update(results_ensemble)




 **** TREINANDO ENSEMBLE PARA MLP Neural Network ****



NameError: name 'last_run_all' is not defined

In [None]:
results_ensemble = train_ensemble(select_best_models_per_fold, last_run_basic, dfx, dfy, name_prefix="BPF")  # BPF - best per fold

# check if keys have no intersection (no common key)
assert len(set(last_run_all.keys()) & set(results_ensemble.keys())) == 0

# merge results with results_with_ensemble
last_run_all.update(results_ensemble)

## 3.4 - Salva Resultados

In [None]:
from util import save_results

In [None]:
last_run_all.keys()

In [None]:
OUTPUT_FILE_PATH = f'resultados/resultados50p_{START_DATE_STR}_{SUB_DATASET}.npy'

save_results(OUTPUT_FILE_PATH, last_run_all, metadata)

print("Arquivo salvo:", OUTPUT_FILE_PATH)

----

# FIM