# Treinamento do modelo final

Até aqui, todos os nossos desenvolvimentos se basearam em treinar o modelo em uma base de treino, mantendo uma base de validação para avaliá-lo ao final de tudo. Neste momento, em que já temos todas as definições de modelagem, vamos treinar o modelo com a configuração escolhida em toda a base disponível (treino + validação).

In [1]:
#carregando as bibliotecas necessárias
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import f1_score, recall_score, precision_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, cross_val_score

import FeatureGenerator as fg

from joblib import dump, load

from lightgbm import LGBMClassifier

In [2]:
# Load both partitions that are available on disk.
train_path, val_path = "../inputs/train.csv", "../inputs/validation.csv"
df_train = pd.read_csv(train_path)
df_val = pd.read_csv(val_path)

In [3]:
# Concatenate the train and validation partitions into the full dataset.
# ignore_index=True rebuilds a fresh 0..n-1 index; without it each partition
# keeps its own 0-based row labels, so the combined frame would carry
# duplicate index labels.
data = pd.concat([df_train, df_val], ignore_index=True)

In [4]:
# Split the full dataset into the model input (every column except the
# target) and the model output (the target column as an array).
X = data.drop(columns=["target"])
y = data["target"].values

In [5]:
# Features chosen by the feature selection process.
# The order of the entries is kept exactly as it was produced.
selected_features = [
    "BILL_AMT2_minus_BILL_AMT1",
    "PAY_AMT5/BILL_AMT5",
    "PAY_AMT6/BILL_AMT6",
    "BILL_AMT3_minus_BILL_AMT2",
    "PAY_AMT4/BILL_AMT4",
    "PAY_AMT1/BILL_AMT1",
    "PAY_AMT4_minus_PAY_AMT3",
    "PAY_AMT2/BILL_AMT2",
    "BILL_AMT5_minus_BILL_AMT4",
    "BILL_AMT4_minus_BILL_AMT3",
    "AGE",
    "BILL_AMT6_minus_BILL_AMT5",
    "PAY_AMT3/BILL_AMT3",
    "PAY_AMT2_minus_PAY_AMT1",
    "PAY_AMT6_minus_PAY_AMT5",
    "PAY_AMT5_minus_PAY_AMT4",
    "LIMIT_BAL/EDUCATION_max",
    "PAY_AMT2",
    "PAY_AMT3_minus_PAY_AMT2",
    "LIMIT_BAL/EDUCATION_mean",
    "PAY_AMT1",
    "BILL_AMT5/EDUCATION_min",
    "LIMIT_BAL",
    "PAY_AMT6",
    "PAY_0",
    "PAY_AMT3",
    "PAY_AMT4",
    "BILL_AMT1",
    "PAY_AMT5",
    "BILL_AMT1/EDUCATION_max",
    "BILL_AMT4/EDUCATION_min",
    "BILL_AMT2/EDUCATION_min",
    "BILL_AMT3/EDUCATION_min",
    "BILL_AMT6/EDUCATION_min",
]

In [6]:
# Instantiate the feature generation classes.
# Wrapping each step in a class lets everything live inside a single pipeline.

# Column families shared by several generators; every generator receives its
# own copy of the list so no list object is shared between instances.
pay_amt_columns = ["PAY_AMT1", "PAY_AMT2", "PAY_AMT3", "PAY_AMT4", "PAY_AMT5", "PAY_AMT6"]
bill_amt_columns = ["BILL_AMT1", "BILL_AMT2", "BILL_AMT3", "BILL_AMT4", "BILL_AMT5", "BILL_AMT6"]

# Division features: payment amounts as numerators, bill amounts as denominators.
fbf = fg.FeatureByFeature(
    features_num=list(pay_amt_columns),
    features_denom=list(bill_amt_columns),
)

# Difference features, one generator per column family.
diff1 = fg.DiffFeatures(features=list(pay_amt_columns))
diff2 = fg.DiffFeatures(features=list(bill_amt_columns))

# Grouping features computed per EDUCATION group.
agpf = fg.GroupFeatures(
    group_columns=["EDUCATION"],
    features=["LIMIT_BAL", *bill_amt_columns],
)

# Keeps only the features that will be used as model input.
final_features = fg.FinalFeatures(features=selected_features)

In [7]:
# Parameter dictionary used to configure the model.
# NOTE(review): max_depth=2 allows at most 4 leaves per tree, so num_leaves=12
# presumably never becomes the binding limit - confirm this is intended.
# NOTE(review): colsample_bytree=None presumably falls back to the library
# default - confirm.
selected_params = dict(
    reg_lambda=10,
    reg_alpha=2,
    num_leaves=12,
    n_estimators=800,
    min_child_samples=40,
    max_depth=2,
    learning_rate=0.02,
    colsample_bytree=None,
)

In [9]:
# Create a fresh LightGBM classifier configured with the chosen parameters,
# balanced class weights and a fixed random seed.
estimator = LGBMClassifier(
    class_weight="balanced",
    random_state=42,
    **selected_params,
)

In [11]:
# Assemble the pipeline: the feature generation steps run in the listed order
# and the newly created estimator is the final step.
pipeline_steps = [
    ("FeatureByFeature", fbf),
    ("diff_features1", diff1),
    ("diff_features2", diff2),
    ("aggrouped_features", agpf),
    ("FinalFeatures", final_features),
    ("Estimator", estimator),
]
pipe = Pipeline(steps=pipeline_steps)

In [12]:
# Fit the whole pipeline (feature generation steps followed by the estimator)
# on the full dataset, i.e. the concatenated train and validation partitions.
pipe.fit(X, y)

Pipeline(steps=[('FeatureByFeature',
                 FeatureByFeature(features_denom=['BILL_AMT1', 'BILL_AMT2',
                                                  'BILL_AMT3', 'BILL_AMT4',
                                                  'BILL_AMT5', 'BILL_AMT6'],
                                  features_num=['PAY_AMT1', 'PAY_AMT2',
                                                'PAY_AMT3', 'PAY_AMT4',
                                                'PAY_AMT5', 'PAY_AMT6'])),
                ('diff_features1',
                 DiffFeatures(features=['PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3',
                                        'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6'])),
                ('diff_features...
                                         'LIMIT_BAL/EDUCATION_mean', 'PAY_AMT1',
                                         'BILL_AMT5/EDUCATION_min', 'LIMIT_BAL',
                                         'PAY_AMT6', 'PAY_0', 'PAY_AMT3',
                                         'PAY_AMT4', '

In [13]:
# Persist the entire pipeline (feature steps and estimator) to a .pkl file.
# NOTE(review): whoever loads this file will presumably need the
# FeatureGenerator module importable, since the pipeline holds instances of
# its classes - confirm against the consumer under ../app.
dump(pipe, "../app/model.pkl")

['../app/model.pkl']