# Avaliando as features selecionadas

Agora, vamos treinar o modelo com as features selecionadas e ver se elas realmente trazem ganho para o modelo.

In [1]:
#importando as bibliotecas que serão utilizadas no processo
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import f1_score, recall_score, precision_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, cross_val_score

import FeatureGenerator as fg

from lightgbm import LGBMClassifier

In [2]:
#carregando os dados de treino
data_path = "../inputs/train.csv"
df_train = pd.read_csv(data_path)
df_train.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,target
0,28104,50000.0,2,1,1,31,1,2,2,0,...,50332.0,29690.0,30246.0,2200.0,4.0,2300.0,1100.0,1400.0,1200.0,1
1,29094,330000.0,2,2,2,59,0,0,0,0,...,80589.0,76180.0,61693.0,20000.0,3500.0,19000.0,15000.0,3000.0,2139.0,0
2,11280,220000.0,2,1,2,41,-1,-1,-2,-2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,28435,50000.0,2,2,1,45,0,0,0,0,...,8469.0,8411.0,8361.0,2124.0,2037.0,1130.0,295.0,302.0,296.0,0
4,10873,480000.0,2,3,1,42,-2,-2,-2,-2,...,0.0,790.0,0.0,0.0,0.0,0.0,790.0,0.0,0.0,0


In [3]:
#selecionando as variáveis categóricas para definir no modelo
cat_cols = ['SEX',
 'EDUCATION',
 'MARRIAGE',
 'PAY_0',
 'PAY_2',
 'PAY_3',
 'PAY_4',
 'PAY_5',
 'PAY_6']

In [4]:
#o lightgbm irá converter para NaN todas as variáveis categóricas que forem menor que 0
#para esses casos, teremos que aplicar um tratamento diferente
#vamos escolher um valor qualquer negativo para multiplicar as categorias negativas e criar novas categorias positivas
def negative_cat(value):
    if value < 0:
        value = value*(-15)
        
    else:
        pass
    
    return value

In [5]:
#aplicando a função
for c in cat_cols:
    df_train[c] = df_train[c].apply(negative_cat)

In [None]:
# #criando uma lista com as features que foram selecionadas 
# selected_features = ['LIMIT_BAL',
#  'BILL_AMT2_minus_BILL_AMT1',
#  'BILL_AMT5_minus_BILL_AMT4',
#  'PAY_AMT5/BILL_AMT5',
#  'PAY_AMT3/BILL_AMT3',
#  'BILL_AMT3_minus_BILL_AMT2',
#  'PAY_AMT4/BILL_AMT4',
#  'BILL_AMT4_minus_BILL_AMT3',
#  'AGE',
#  'BILL_AMT6_minus_BILL_AMT5',
#  'PAY_AMT2/BILL_AMT2',
#  'PAY_AMT1/BILL_AMT1',
#  'PAY_AMT6/BILL_AMT6',
#  'PAY_AMT6_minus_PAY_AMT5',
#  'BILL_AMT1',
#  'PAY_AMT6',
#  'PAY_AMT1',
#  'PAY_AMT4_minus_PAY_AMT3',
#  'PAY_AMT5_minus_PAY_AMT4',
#  'PAY_AMT2_minus_PAY_AMT1',
#  'PAY_AMT2',
#  'PAY_0',
#  'PAY_AMT3_minus_PAY_AMT2',
#  'PAY_AMT3',
#  'PAY_AMT4',
#  'BILL_AMT6',
#  'BILL_AMT2',
#  'BILL_AMT4']

In [6]:
#inicializando as classes que serão utilizadas para geração de features
#a ideia de utilizar classes é poder encapsular tudo num só pipeline

#classe para criar as features de divisão
fbf = fg.FeatureByFeature(features_num=["PAY_AMT1", "PAY_AMT2", "PAY_AMT3", "PAY_AMT4", "PAY_AMT5", "PAY_AMT6"],
                         features_denom=["BILL_AMT1", "BILL_AMT2", "BILL_AMT3", "BILL_AMT4", "BILL_AMT5", "BILL_AMT6"])

#classe para criar as features de diferença
diff1 = fg.DiffFeatures(features=["PAY_AMT1", "PAY_AMT2", "PAY_AMT3", "PAY_AMT4", "PAY_AMT5", "PAY_AMT6"])

diff2 = fg.DiffFeatures(features=["BILL_AMT1", "BILL_AMT2", "BILL_AMT3", "BILL_AMT4", "BILL_AMT5", "BILL_AMT6"])

#classe para dropar features que não serão necessárias
dc = fg.DropCols(features=["ID"])

In [7]:
#criando input e output
X_train, y_train = df_train.drop(columns=["target"], axis=1), df_train.target.values

In [8]:
#o lightgbm só aceita index para identificação das colunas categoricas
cat_idx = []
for c in cat_cols:
    idx = X_train.columns.get_loc(c) - 1
    cat_idx.append(idx)

In [9]:
#inicializando o lightgbm
base_estimator = LGBMClassifier(categorical_features=cat_idx,
                              class_weight="balanced",
                              random_state=42)

In [10]:
#definindo o nosso pipeline com as transformações e o algoritmo final
pipe = Pipeline(steps=[("DropCols", dc),
                       ("FeatureByFeature", fbf),
                      ("diff_features1", diff1),
                      ("diff_features2", diff2),
                      ("Estimator", base_estimator)])

In [11]:
#vamos utilizar a validação cruzada para avaliar o modelo em diferentes partições do dataset
cross_val = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [12]:
#resultados da validação cruzada
print(cross_val_score(pipe, X_train, y_train, cv=cross_val, scoring="f1"))

[0.54778325 0.52057511 0.55443645 0.52394917 0.55439642]


In [13]:
#resultado médio da validação cruzada
print(np.mean([0.54778325, 0.52057511, 0.55443645, 0.52394917, 0.55439642]))

0.54022808
