# Previsão de Engajamento - Classificação

Este notebook define modelos e hiperparâmetros e executa uma validação cruzada aninhada, com grid search para otimização de hiperparâmetros e treinamento.

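- Entrada: `dados/preprocessed/full-preproc2-inputs_{MODEL_NAME}_{SOCIAL_NETWORK}.xlsx` 
e `dados/preprocessed/full-preproc2-outputs_{MODEL_NAME}_{SOCIAL_NETWORK}.xlsx`
- Saída: `resultados/resultados50p_{START_DATE_STR}_{SUB_DATASET}({MODEL_NAME})_({SOCIAL_NETWORK}).npy` (na saída, os parênteses fazem parte do nome do arquivo gerado)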


In [1]:
#@title Importações de pacote
import pandas as pd

In [2]:
# Run configuration: directory holding the preprocessed spreadsheets, plus the
# model and social-network identifiers used to build the input/output file names.
BASE_PATH = "dados/preprocessed/"
MODEL_NAME = "mxbai-embed-large-v1"
SOCIAL_NETWORK = "tiktok"

# 1 - Carregando os Dados

Carregando dados brutos gerais.

In [3]:
# Load the input (feature) table indexed by the "ID" column and discard the
# "Only Hashtags" column; chaining returns a new frame instead of mutating in place.
dfx = (
    pd.read_excel(
        f"{BASE_PATH}full-preproc2-inputs_{MODEL_NAME}_{SOCIAL_NETWORK}.xlsx",
        index_col='ID',
    )
    .drop(columns=["Only Hashtags"])
)
dfx.head()

Unnamed: 0_level_0,Candidato_Bolsonaro,Candidato_Lula,Dias Decorridos,x1,x2,x3,x4,x5,x6,x7,...,x1015,x1016,x1017,x1018,x1019,x1020,x1021,x1022,x1023,x1024
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7115033431473474822,0,1,94,-0.136752,-0.004471,-0.197404,0.771875,0.271732,-0.831048,0.572798,...,0.991231,-0.179094,0.009374,-0.510456,0.732227,-0.25317,0.185578,0.151233,-0.44379,-0.47907
7115174031162215686,0,1,94,-0.202921,0.046833,-0.182256,0.666116,0.354371,-0.201803,0.526789,...,1.219669,-0.223046,-0.21473,-0.466289,0.785575,-0.121878,0.218252,0.497084,-0.66233,0.082123
7115357413712153861,0,1,93,0.029744,0.393452,-0.301938,0.995043,-0.297031,-0.431491,1.016083,...,0.703671,0.198557,-0.226507,-0.561226,0.717053,0.401612,-0.630034,-0.050426,-0.719983,-0.286128
7115560675824422149,0,1,93,-0.185425,0.782692,-0.086695,0.831251,0.337393,-0.693311,0.471492,...,0.841613,-0.037051,-0.316676,-0.510069,0.825399,0.095525,-0.296313,0.055677,-0.575535,-0.053547
7115793869152734470,0,1,92,0.068613,-0.065852,-0.359612,0.638832,-0.074697,-0.239683,0.304781,...,0.919486,0.173887,-0.563785,-0.028777,0.640279,0.146346,-0.096667,0.012495,-0.735688,-0.274756


In [4]:
# Load the full outputs table (all candidate target columns), indexed by the
# "ID" column. The column actually used as target is selected later via TARGET_COL.
dfy_full = pd.read_excel(f"{BASE_PATH}full-preproc2-outputs_{MODEL_NAME}_{SOCIAL_NETWORK}.xlsx", index_col='ID')
dfy_full.head()

Unnamed: 0_level_0,Curtidas,Curtidas-Log,Curtidas-MinMax,Curtidas-Log-MinMax,Curtidas-2Classes-50p
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
7115033431473474822,11700,4.068186,0.005881,0.278511,0
7115174031162215686,33600,4.526339,0.018777,0.431384,1
7115357413712153861,34600,4.539076,0.019365,0.435634,1
7115560675824422149,47500,4.676694,0.026961,0.481554,1
7115793869152734470,22400,4.350248,0.012182,0.372628,1


## 1.1 - Escolha do Dataset (ALTERE)

In [5]:
from util import filter_dataset

# Sub-dataset to train on; accepted values: 'geral', 'lula' or 'bolsonaro'.
SUB_DATASET = "lula"

# Column of the outputs table passed to filter_dataset as the target.
TARGET_COL = "Curtidas-2Classes-50p"

In [6]:
# Select the features/target for the chosen sub-dataset and target column.
# filter_dataset is a project helper (util.py); presumably it keeps only the rows
# of SUB_DATASET and returns TARGET_COL as `dfy` — confirm against its definition.
# NOTE(review): this rebinds `dfx`, so re-running this cell passes the already
# filtered frame back into filter_dataset; re-run the loading cell first.
dfx, dfy = filter_dataset(dfx, dfy_full, SUB_DATASET, TARGET_COL)

Dataset LULA (indicador 1)


In [7]:
# Sanity check: column labels and (rows, columns) shape of the filtered features.
dfx.columns, dfx.shape

(Index(['Dias Decorridos', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9',
        ...
        'x1015', 'x1016', 'x1017', 'x1018', 'x1019', 'x1020', 'x1021', 'x1022',
        'x1023', 'x1024'],
       dtype='object', length=1025),
 (308, 1025))

In [8]:

# Preview the first rows of the selected target.
dfy.head()
# dfy.shape  (left disabled; enable to check the number of samples instead)

ID
7115033431473474822    0
7115174031162215686    1
7115357413712153861    1
7115560675824422149    1
7115793869152734470    1
Name: Curtidas-2Classes-50p, dtype: int64

## 1.2 - Informações do Treinamento (ALTERE)

In [9]:
from datetime import datetime

# Seed recorded in the run metadata and used by the outer CV splitter.
RANDOM_STATE = 1231

# Timestamp of this run; it is also embedded in the output file name.
START_DATE_STR = datetime.now().strftime("%Y-%m-%d-%Hh%Mm")

# Run metadata stored alongside the results (more keys are added further down).
metadata = {
    'date': START_DATE_STR,
    'subdataset': SUB_DATASET,
    'target': TARGET_COL,
    'random_state': RANDOM_STATE,
    'description': f'''Treinamento com modelos com os 
dados balanceados (50p), com o dataset {SUB_DATASET}, com todos os modelos 
e com novos ENSEMBLE''',
}

# 2 - Modelos

In [10]:
from sklearn.pipeline import Pipeline
from data_transformations_util import IdentityTransformer

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
# Base models evaluated by the nested cross-validation.
#
# Each entry is a dict with:
#   - "nome_do_modelo": display name, also used as the key of the results;
#   - "estimador":      a scikit-learn Pipeline whose last step is 'predictor';
#   - "parametros":     the grid-search space. The 'pca' and 'scaler' keys swap the
#                       whole pipeline step for one of the listed instances, while
#                       'predictor__<name>' keys set hyperparameters of the last step.
#
# NOTE(review): the estimators are seeded with random_state=42, not with the
# RANDOM_STATE value recorded in `metadata` — confirm this is intentional.
lista_modelos = [
    {
        "nome_do_modelo": 'MLP Neural Network',
        "estimador": Pipeline([
            ('pca', PCA()),
            ('scaler', StandardScaler()),
            ('predictor', MLPClassifier(random_state=42))
        ]),
        "parametros": {
            'pca': [PCA(n_components=10), PCA(n_components=20), PCA(n_components=30)],
            'scaler': [IdentityTransformer(), StandardScaler(), MinMaxScaler()],
            'predictor__hidden_layer_sizes': [(4,), (8,), (32,), (128,)],  # neurons in the single hidden layer
            'predictor__learning_rate_init': [0.001, 0.01, 0.05],          # initial learning rate
        }
    },
    {
        "nome_do_modelo": 'Support Vector Machine',
        "estimador": Pipeline([
            ('pca', PCA()),
            ('scaler', StandardScaler()),
            # SVC(probability=True) was considered but may be too slow: mean training
            # time was 21.55 s without probabilities vs 35.11 s with them.
            ('predictor', SVC(random_state=42))
        ]),
        "parametros": {
            'pca': [PCA(n_components=10), PCA(n_components=20), PCA(n_components=30)],
            'scaler': [IdentityTransformer(), StandardScaler(), MinMaxScaler()],
            'predictor__class_weight': ['balanced', None],
            'predictor__C': [0.1, 1.0, 10.0, 20.0, 50.0],
            'predictor__gamma': ['scale', 'auto'],
            'predictor__kernel': ['rbf', 'sigmoid', 'linear']
        }
    },
    {
        "nome_do_modelo": 'Random Forest',
        "estimador": Pipeline([
            ('pca', PCA()),
            ('predictor', RandomForestClassifier(random_state=42))
        ]),
        "parametros": {
            'pca': [PCA(n_components=10), PCA(n_components=20), PCA(n_components=30)],
            'predictor__n_estimators': [10, 30, 70],
            #'predictor__class_weight': ['balanced', None],
            'predictor__max_depth': [3, 4, 5, None],
            'predictor__min_samples_split': [2, 4, 8]
        }
    },
    {
        "nome_do_modelo": 'Logistic Regression',
        "estimador": Pipeline([
            ('pca', PCA()),
            ('scaler', StandardScaler()),
            # 'saga' is the only solver that supports all three 'penalty' options below
            ('predictor', LogisticRegression(solver='saga', random_state=42))
        ]),
        "parametros": {
            'pca': [PCA(n_components=10), PCA(n_components=20), PCA(n_components=30)],
            'scaler': [IdentityTransformer(), StandardScaler(), MinMaxScaler()],
            #'predictor__class_weight': ['balanced', None],
            'predictor__C': [0.01, 0.1, 1.0, 2.0],         # regularization parameter
            'predictor__penalty': ['l1', 'l2', None]       # penalty term
        }
    },
    {
        "nome_do_modelo": 'KNN',
        "estimador": Pipeline([
            # Fixed: this step used to be the PCA *class* instead of an instance,
            # unlike every other pipeline in this list.
            ('pca', PCA()),
            ('scaler', StandardScaler()),
            ('predictor', KNeighborsClassifier())
        ]),
        "parametros": {
            'pca': [PCA(n_components=10), PCA(n_components=20), PCA(n_components=30)],
            'scaler': [IdentityTransformer(), StandardScaler(), MinMaxScaler()],
            'predictor__n_neighbors': [5, 10, 15, 20],
            'predictor__weights': ['uniform', 'distance'],
            'predictor__metric': ['manhattan', 'euclidean', 'cosine']
        }
    },

]

In [13]:
# Display names of the base models, in definition order; the same list is
# recorded in the run metadata.
BASIC_MODEL_NAMES = [
    model_spec["nome_do_modelo"]
    for model_spec in lista_modelos
]
metadata['basic_models'] = BASIC_MODEL_NAMES

BASIC_MODEL_NAMES

['MLP Neural Network',
 'Support Vector Machine',
 'Random Forest',
 'Logistic Regression',
 'KNN']

# 3 - Treina Modelos Básicos

In [None]:
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from classification_train_util import nested_cross_validation_grid_search

A célula abaixo executa *nested cross-validation* (validação cruzada aninhada), com um grid search nos *inner folds*, para cada modelo.

In [15]:

# Outer loop: repeated stratified k-fold, giving outer_folds * outer_folds_repetitions
# train/test splits.
outer_folds = 5
outer_folds_repetitions = 2
cv_outer = RepeatedStratifiedKFold(
    n_splits=outer_folds,
    n_repeats=outer_folds_repetitions,
    random_state=RANDOM_STATE,
)

# Inner loop: stratified k-fold passed to the grid search. It shuffles, so it must
# be seeded too; otherwise the inner splits differ on every run and the results
# are not reproducible from the recorded RANDOM_STATE.
inner_folds = 5
cv_inner = StratifiedKFold(n_splits=inner_folds, shuffle=True, random_state=RANDOM_STATE)

# Runs the nested cross-validation with grid search for every entry of lista_modelos.
last_run_basic = nested_cross_validation_grid_search(
    lista_modelos,
    dfx,
    dfy,
    cv_outer=cv_outer,
    cv_inner=cv_inner
)


.....
-- coletando e armazenando resultados --

 - Acurácia   : 0.5080 +/- 0.0544
 - Precisão   : 0.5063 +/- 0.0520
 - Revocação  : 0.5166 +/- 0.1056
 - F1 - Score : 0.5084 +/- 0.0707
 - ROC - AUC  : 0.5157 +/- 0.0748
 - PR - AUC   : 0.5557 +/- 0.0685
 - Tempo médio de treinamento: 55.37 segundos


Terminado em 02/04/2025 19:54:40


# 4 - Treina Ensembles

In [16]:
from copy import deepcopy

from ensemble_train_util import extract_all_best_models, extract_best_models_of_fold
from ensemble_train_util import train_ensemble

In [17]:
# Work on an independent copy so the ensemble results merged below do not
# modify the base-model results held in last_run_basic.
last_run_all = deepcopy(last_run_basic)

In [None]:
# Number of outer splits, taken from the splitter itself
# (equivalent to outer_folds * outer_folds_repetitions).
total_outer_folds = cv_outer.get_n_splits()

# Train the ensembles of each outer fold from the models extracted for that
# fold, then merge their results into last_run_all.
for fold_i in range(total_outer_folds):
    models = extract_best_models_of_fold(last_run_basic, fold_i)
    results_ensemble = train_ensemble(f"FOLD_{fold_i:02d}", models, dfx, dfy, cv_outer)

    # Merging must never overwrite an existing entry: the key sets have to be disjoint.
    assert set(last_run_all.keys()).isdisjoint(results_ensemble.keys())
    last_run_all.update(results_ensemble)

In [19]:
# Warning: this has data leakage! See the explanation in the cell below.
# Kept only for comparison.

# Trains 2 mixed ensembles (hard and soft versions) using the model obtained in
# each outer fold (the model can be of any type).
models = extract_all_best_models(last_run_basic, total_outer_folds)
results_ensemble = train_ensemble("Best", models, dfx, dfy, cv_outer)

# Merging must never overwrite an existing entry: the key sets have to be disjoint.
assert set(last_run_all.keys()).isdisjoint(results_ensemble.keys())
last_run_all.update(results_ensemble)

....................

In [20]:
# NOTE: this whole cell is a single triple-quoted string literal (the trailing ';'
# suppresses its display), so none of the code inside it is executed.
# It holds a disabled experiment: one ensemble per model type, which the author
# flags as having data leakage. Inside it, `models_list` is assigned but the loop
# iterates over BASIC_MODEL_NAMES instead.
'''
# Atenção: Tem data leakage! 
# Explicação:
# - modelos escolhidas com base em diferentes folds 
# - assim, o ensemble se baseia em todo o dataset

from ensemble_train_util import extract_best_models_of_type

models_list = ['MLP Neural Network', 'Support Vector Machine', 'Random Forest', 'Logistic Regression', 'KNN']

for model_name in BASIC_MODEL_NAMES:
    print(f"\n\n\n **** TREINANDO ENSEMBLE PARA {model_name} ****\n")
    contracted_name = ''.join([name[0] for name in model_name.split()])
    models = extract_best_models_of_type(last_run_all, model_name)
    results_ensemble = train_ensemble(contracted_name, models, dfx, dfy, cv_outer)

    # check if keys have no intersection (no common key), then merge results
    assert len(set(last_run_all.keys()) & set(results_ensemble.keys())) == 0
    last_run_all.update(results_ensemble)

#''';

# 5 - Salva Resultados

In [21]:
from util import save_results

In [22]:
# List every result entry that will be saved (base models plus ensembles).
last_run_all.keys()

dict_keys(['MLP Neural Network', 'Support Vector Machine', 'Random Forest', 'Logistic Regression', 'KNN', 'FOLD_00-hard-vote', 'FOLD_00-soft-vote', 'FOLD_01-hard-vote', 'FOLD_01-soft-vote', 'FOLD_02-hard-vote', 'FOLD_02-soft-vote', 'FOLD_03-hard-vote', 'FOLD_03-soft-vote', 'FOLD_04-hard-vote', 'FOLD_04-soft-vote', 'FOLD_05-hard-vote', 'FOLD_05-soft-vote', 'FOLD_06-hard-vote', 'FOLD_06-soft-vote', 'FOLD_07-hard-vote', 'FOLD_07-soft-vote', 'FOLD_08-hard-vote', 'FOLD_08-soft-vote', 'FOLD_09-hard-vote', 'FOLD_09-soft-vote', 'Best-hard-vote', 'Best-soft-vote'])

In [23]:
# Output path: run timestamp and sub-dataset, followed by the model name and the
# social network, each wrapped in literal parentheses.
OUTPUT_FILE_PATH = f'resultados/resultados50p_{START_DATE_STR}_{SUB_DATASET}({MODEL_NAME})_({SOCIAL_NETWORK}).npy'

# save_results is a project helper (util.py); it receives the path, the merged
# results and the run metadata.
# NOTE(review): presumably it expects the 'resultados/' directory to exist — confirm.
save_results(OUTPUT_FILE_PATH, last_run_all, metadata)

print("Arquivo salvo:", OUTPUT_FILE_PATH)

Arquivo salvo: resultados/resultados50p_2025-04-02-18h10m_lula(mxbai-embed-large-v1)_(tiktok).npy


----

# FIM