# Feature Engineering

## Configurações

In [27]:
# Configuration: imports and module search path for the notebook.
import importlib # used when packages need to be re-imported

import sys
# Extend the module search path before the project-local imports below
# (presumably where config, utils, data_manager and feature_manager live — verify).
sys.path.append("../")
sys.path.append("../ml-project-template")

import config
import utils
import data_manager as data_mgr
import feature_manager as feat_mgr
from config import NUMERICAL_FEATURES

import pandas as pd

In [3]:
data_manager = data_mgr.DataManager()
feature_manager = feat_mgr.FeatureEngineer()

## v0_basico (Apenas remoção de variáveis indesejadas)

Essa versão do database é útil para o catboost, uma vez que não é necessário fazer o tratamento das variáveis categóricas, que são tratadas automaticamente dentro do algoritmo.

Ela serve como uma versão de benchmark para o catboost, permitindo identificar avanços posteriores com a incorporação de técnicas de feature engineering.

In [15]:
raw_train_df, raw_test_df = data_manager.load_raw_data(filelist = ['train.pkl', 'test.pkl'])

train.pkl - formato: (8000, 14)
test.pkl - formato: (2000, 14)


In [34]:
train_processed_v0 = feature_manager.remove_problematic_columns(raw_train_df)
train_processed_v0

️ Removendo colunas: ['RowNumber', 'CustomerId', 'Surname']


Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
8769,622,France,Male,31,7,0.00,1,1,0,35408.77,0
3920,634,Germany,Male,43,3,212696.32,1,1,0,115268.86,0
3983,626,France,Female,44,2,0.00,1,0,1,173117.22,1
3944,612,France,Female,31,8,117989.76,1,1,1,54129.86,0
3279,652,Spain,Male,37,7,0.00,2,1,0,68789.93,0
...,...,...,...,...,...,...,...,...,...,...,...
3218,663,France,Male,36,10,0.00,2,1,0,136349.55,0
4134,678,France,Male,43,5,102338.19,1,1,1,79649.62,0
4453,809,Germany,Male,33,8,148055.74,1,0,0,199203.21,0
9346,716,France,Male,41,9,0.00,1,1,1,113267.48,0


In [35]:
test_processed_v0 = feature_manager.remove_problematic_columns(raw_test_df)
test_processed_v0

️ Removendo colunas: ['RowNumber', 'CustomerId', 'Surname']


Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
4559,528,Germany,Female,62,7,133201.17,1,0,0,168507.68,1
2531,607,Germany,Male,47,4,148826.32,1,1,1,79450.61,0
7504,654,Spain,Female,32,2,0.00,1,1,1,51972.92,1
4879,783,Spain,Female,44,3,81811.71,1,1,0,164213.53,1
6418,754,Spain,Female,27,8,0.00,2,0,0,121821.16,0
...,...,...,...,...,...,...,...,...,...,...,...
7997,465,Germany,Male,24,6,156007.09,1,1,0,191368.37,0
2782,646,France,Male,24,8,0.00,2,0,0,92612.88,0
8552,468,France,Female,22,1,76318.64,1,1,1,194783.12,0
5562,620,Spain,Male,42,9,121490.05,1,1,1,29296.74,0


In [36]:
data_manager.save_processed_data(
    train_processed_v0, test_processed_v0, 
    feature_set_name = "v0_basico"
)

 Dados salvos em: D:\mba\Data Science e Analytics (USP-Esalq)\99 - TCC\Projeto\notebooks\..\data\processed\v0_basico


## v1_one-hot_encoding (v0_basico + one-hot encoding aplicado às variáveis categóricas)

O database v1_one-hot_encoding é equivalente ao database v0_basico, mas destinado a outros algoritmos que, assim como o catboost, não precisam de normalização das variáveis numéricas, mas que necessitam de one-hot encoding para sua utilização (ex.: xgboost, lightgbm, random forest).

In [37]:
train_v0, test_v0 = data_manager.load_processed_data(feature_set = "v0_basico")

 Carregando features: v0_basico
 FeatureSet info: 11 features


In [38]:
feature_manager.fit_onehot_encoder(train_v0)

<feature_manager.FeatureEngineer at 0x126e82e9a90>

In [39]:
train_processed_v1 = feature_manager.transform_with_onehot(train_v0)
train_processed_v1

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain,Gender_Male,HasCrCard_1,IsActiveMember_1
8769,622,31,7,0.00,1,35408.77,0,1.0,0.0,0.0,1.0,1.0,0.0
3920,634,43,3,212696.32,1,115268.86,0,0.0,1.0,0.0,1.0,1.0,0.0
3983,626,44,2,0.00,1,173117.22,1,1.0,0.0,0.0,0.0,0.0,1.0
3944,612,31,8,117989.76,1,54129.86,0,1.0,0.0,0.0,0.0,1.0,1.0
3279,652,37,7,0.00,2,68789.93,0,0.0,0.0,1.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3218,663,36,10,0.00,2,136349.55,0,1.0,0.0,0.0,1.0,1.0,0.0
4134,678,43,5,102338.19,1,79649.62,0,1.0,0.0,0.0,1.0,1.0,1.0
4453,809,33,8,148055.74,1,199203.21,0,0.0,1.0,0.0,1.0,0.0,0.0
9346,716,41,9,0.00,1,113267.48,0,1.0,0.0,0.0,1.0,1.0,1.0


In [41]:
test_processed_v1 = feature_manager.transform_with_onehot(test_v0)
test_processed_v1

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain,Gender_Male,HasCrCard_1,IsActiveMember_1
4559,528,62,7,133201.17,1,168507.68,1,0.0,1.0,0.0,0.0,0.0,0.0
2531,607,47,4,148826.32,1,79450.61,0,0.0,1.0,0.0,1.0,1.0,1.0
7504,654,32,2,0.00,1,51972.92,1,0.0,0.0,1.0,0.0,1.0,1.0
4879,783,44,3,81811.71,1,164213.53,1,0.0,0.0,1.0,0.0,1.0,0.0
6418,754,27,8,0.00,2,121821.16,0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7997,465,24,6,156007.09,1,191368.37,0,0.0,1.0,0.0,1.0,1.0,0.0
2782,646,24,8,0.00,2,92612.88,0,1.0,0.0,0.0,1.0,0.0,0.0
8552,468,22,1,76318.64,1,194783.12,0,1.0,0.0,0.0,0.0,1.0,1.0
5562,620,42,9,121490.05,1,29296.74,0,0.0,0.0,1.0,1.0,1.0,1.0


In [42]:
data_manager.save_processed_data(
    train_processed_v1, test_processed_v1, 
    feature_set_name = "v1_one-hot_encoding"
)

 Dados salvos em: D:\mba\Data Science e Analytics (USP-Esalq)\99 - TCC\Projeto\notebooks\..\data\processed\v1_one-hot_encoding


## v2_one-hot_encoding_plus_normalizacao

Partindo do database v1, adiciona a padronização z-score (StandardScaler do sklearn) às variáveis numéricas.

Portanto, é composto por one-hot encoding em variáveis categóricas + z-score em variáveis numéricas.

In [51]:
train_v1, test_v1 = data_manager.load_processed_data(feature_set = "v1_one-hot_encoding")

 Carregando features: v1_one-hot_encoding
 FeatureSet info: 13 features


In [52]:
feature_manager.fit_standard_scaler(train_v1)

<feature_manager.FeatureEngineer at 0x126e82ea930>

In [53]:
train_processed_v2 = feature_manager.transform_with_standard_scaler(train_v1)
train_processed_v2

Unnamed: 0,Exited,Geography_France,Geography_Germany,Geography_Spain,Gender_Male,HasCrCard_1,IsActiveMember_1,CreditScore,Age,Tenure,Balance,NumOfProducts,EstimatedSalary
8769,0,1.0,0.0,0.0,1.0,1.0,0.0,-0.297153,-0.759691,0.687591,-1.227884,-0.912204,-1.124006
3920,0,0.0,1.0,0.0,1.0,1.0,0.0,-0.172801,0.386021,-0.695631,2.184041,-0.912204,0.269924
3983,1,1.0,0.0,0.0,0.0,0.0,1.0,-0.255702,0.481497,-1.041437,-1.227884,-0.912204,1.279647
3944,0,1.0,0.0,0.0,0.0,1.0,1.0,-0.400780,-0.759691,1.033397,0.664825,-0.912204,-0.797236
3279,0,0.0,0.0,1.0,1.0,1.0,0.0,0.013728,-0.186835,0.687591,-1.227884,0.798045,-0.541349
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3218,0,1.0,0.0,0.0,1.0,1.0,0.0,0.127718,-0.282311,1.725009,-1.227884,0.798045,0.637880
4134,0,1.0,0.0,0.0,1.0,1.0,1.0,0.283158,0.386021,-0.004020,0.413754,-0.912204,-0.351797
4453,0,0.0,1.0,0.0,1.0,0.0,0.0,1.640673,-0.568739,1.033397,1.147123,-0.912204,1.734968
9346,0,1.0,0.0,0.0,1.0,1.0,1.0,0.676941,0.195069,1.379203,-1.227884,-0.912204,0.234991


In [55]:
test_processed_v2 = feature_manager.transform_with_standard_scaler(test_v1)
test_processed_v2

Unnamed: 0,Exited,Geography_France,Geography_Germany,Geography_Spain,Gender_Male,HasCrCard_1,IsActiveMember_1,CreditScore,Age,Tenure,Balance,NumOfProducts,EstimatedSalary
4559,1,0.0,1.0,0.0,0.0,0.0,0.0,-1.271248,2.200065,0.687591,0.908836,-0.912204,1.199189
2531,0,0.0,1.0,0.0,1.0,1.0,1.0,-0.452594,0.767925,-0.349826,1.159484,-0.912204,-0.355271
7504,1,0.0,0.0,1.0,0.0,1.0,1.0,0.034453,-0.664215,-1.041437,-1.227884,-0.912204,-0.834884
4879,1,0.0,0.0,1.0,0.0,1.0,0.0,1.371242,0.481497,-0.695631,0.084482,-0.912204,1.124236
6418,0,0.0,0.0,1.0,0.0,0.0,0.0,1.070724,-1.141595,1.033397,-1.227884,0.798045,0.384292
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7997,0,0.0,1.0,0.0,1.0,1.0,0.0,-1.924098,-1.428022,0.341786,1.274673,-0.912204,1.598214
2782,0,1.0,0.0,0.0,1.0,0.0,0.0,-0.048448,-1.428022,1.033397,-1.227884,0.798045,-0.125528
8552,0,1.0,0.0,0.0,0.0,1.0,1.0,-1.893010,-1.618974,-1.387243,-0.003634,-0.912204,1.657817
5562,0,0.0,0.0,1.0,1.0,1.0,1.0,-0.317879,0.290545,1.379203,0.720974,-0.912204,-1.230689


In [56]:
data_manager.save_processed_data(
    train_processed_v2, test_processed_v2, 
    feature_set_name = "v2_one-hot_encoding_plus_normalizacao"
)

 Dados salvos em: D:\mba\Data Science e Analytics (USP-Esalq)\99 - TCC\Projeto\notebooks\..\data\processed\v2_one-hot_encoding_plus_normalizacao


## v3_one-hot_encoding_plus_normalizacao_plus_poly

Partindo de v2, criam-se variáveis numéricas a partir do produto dois a dois das variáveis numéricas iniciais normalizadas.

In [57]:
train_v2, test_v2 = data_manager.load_processed_data(feature_set = "v2_one-hot_encoding_plus_normalizacao")

 Carregando features: v2_one-hot_encoding_plus_normalizacao
 FeatureSet info: 13 features


In [58]:
feature_manager.fit_poly(train_v2)

<feature_manager.FeatureEngineer at 0x126e82ea930>

In [59]:
train_processed_v3 = feature_manager.transform_with_poly(train_v2)
train_processed_v3

Unnamed: 0,Exited,Geography_France,Geography_Germany,Geography_Spain,Gender_Male,HasCrCard_1,IsActiveMember_1,CreditScore,Age,Tenure,...,Tenure^2,Tenure Balance,Tenure NumOfProducts,Tenure EstimatedSalary,Balance^2,Balance NumOfProducts,Balance EstimatedSalary,NumOfProducts^2,NumOfProducts EstimatedSalary,EstimatedSalary^2
8769,0,1.0,0.0,0.0,1.0,1.0,0.0,-0.297153,-0.759691,0.687591,...,0.472782,-0.844283,-0.627224,-0.772857,1.507699,1.120081,1.380148,0.832116,1.025322,1.263388
3920,0,0.0,1.0,0.0,1.0,1.0,0.0,-0.172801,0.386021,-0.695631,...,0.483903,-1.519288,0.634558,-0.187768,4.770037,-1.992291,0.589525,0.832116,-0.246226,0.072859
3983,1,1.0,0.0,0.0,0.0,0.0,1.0,-0.255702,0.481497,-1.041437,...,1.084591,1.278764,0.950003,-1.332672,1.507699,1.120081,-1.571258,0.832116,-1.167299,1.637496
3944,0,1.0,0.0,0.0,0.0,1.0,1.0,-0.400780,-0.759691,1.033397,...,1.067910,0.687028,-0.942669,-0.823861,0.441992,-0.606456,-0.530022,0.832116,0.727241,0.635585
3279,0,0.0,0.0,1.0,1.0,1.0,0.0,0.013728,-0.186835,0.687591,...,0.472782,-0.844283,0.548729,-0.372227,1.507699,-0.979906,0.664714,0.636876,-0.432021,0.293059
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3218,0,1.0,0.0,0.0,1.0,1.0,0.0,0.127718,-0.282311,1.725009,...,2.975655,-2.118111,1.376634,1.100349,1.507699,-0.979906,-0.783243,0.636876,0.509057,0.406891
4134,0,1.0,0.0,0.0,1.0,1.0,1.0,0.283158,0.386021,-0.004020,...,0.000016,-0.001663,0.003667,0.001414,0.171192,-0.377428,-0.145557,0.832116,0.320911,0.123761
4453,0,0.0,1.0,0.0,1.0,0.0,0.0,1.640673,-0.568739,1.033397,...,1.067910,1.185433,-0.942669,1.792912,1.315890,-1.046410,1.990221,0.832116,-1.582645,3.010115
9346,0,1.0,0.0,0.0,1.0,1.0,1.0,0.676941,0.195069,1.379203,...,1.902201,-1.693501,-1.258114,0.324100,1.507699,1.120081,-0.288541,0.832116,-0.214359,0.055221


In [60]:
test_processed_v3 = feature_manager.transform_with_poly(test_v2)
test_processed_v3

Unnamed: 0,Exited,Geography_France,Geography_Germany,Geography_Spain,Gender_Male,HasCrCard_1,IsActiveMember_1,CreditScore,Age,Tenure,...,Tenure^2,Tenure Balance,Tenure NumOfProducts,Tenure EstimatedSalary,Balance^2,Balance NumOfProducts,Balance EstimatedSalary,NumOfProducts^2,NumOfProducts EstimatedSalary,EstimatedSalary^2
4559,1,0.0,1.0,0.0,0.0,0.0,0.0,-1.271248,2.200065,0.687591,...,0.472782,0.624908,-0.627224,0.824552,0.825983,-0.829044,1.089866,0.832116,-1.093905,1.438054
2531,0,0.0,1.0,0.0,1.0,1.0,1.0,-0.452594,0.767925,-0.349826,...,0.122378,-0.405617,0.319112,0.124283,1.344402,-1.057685,-0.411931,0.832116,0.324079,0.126217
7504,1,0.0,0.0,1.0,0.0,1.0,1.0,0.034453,-0.664215,-1.041437,...,1.084591,1.278764,0.950003,0.869479,1.507699,1.120081,1.025141,0.832116,0.761585,0.697032
4879,1,0.0,0.0,1.0,0.0,1.0,0.0,1.371242,0.481497,-0.695631,...,0.483903,-0.058768,0.634558,-0.782054,0.007137,-0.077065,0.094978,0.832116,-1.025532,1.263906
6418,0,0.0,0.0,1.0,0.0,0.0,0.0,1.070724,-1.141595,1.033397,...,1.067910,-1.268892,0.824697,0.397126,1.507699,-0.979906,-0.471866,0.636876,0.306682,0.147680
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7997,0,0.0,1.0,0.0,1.0,1.0,0.0,-1.924098,-1.428022,0.341786,...,0.116817,0.435665,-0.311778,0.546247,1.624790,-1.162761,2.037199,0.832116,-1.457897,2.554288
2782,0,1.0,0.0,0.0,1.0,0.0,0.0,-0.048448,-1.428022,1.033397,...,1.067910,-1.268892,0.824697,-0.129720,1.507699,-0.979906,0.154134,0.636876,-0.100177,0.015757
8552,0,1.0,0.0,0.0,0.0,1.0,1.0,-1.893010,-1.618974,-1.387243,...,1.924443,0.005041,1.265448,-2.299795,0.000013,0.003315,-0.006024,0.832116,-1.512267,2.748358
5562,0,0.0,0.0,1.0,1.0,1.0,1.0,-0.317879,0.290545,1.379203,...,1.902201,0.994370,-1.258114,-1.697370,0.519804,-0.657676,-0.887295,0.832116,1.122639,1.514595


In [61]:
data_manager.save_processed_data(
    train_processed_v3, test_processed_v3, 
    feature_set_name = "v3_one-hot_encoding_plus_normalizacao_plus_poly"
)

 Dados salvos em: D:\mba\Data Science e Analytics (USP-Esalq)\99 - TCC\Projeto\notebooks\..\data\processed\v3_one-hot_encoding_plus_normalizacao_plus_poly


## v4_normalizacao_plus_poly

Partindo de v0, criam-se variáveis numéricas a partir do produto dois a dois das variáveis numéricas iniciais normalizadas.

Parte-se de v0 para manter as variáveis categóricas sem one-hot encoding para o catboost.

In [66]:
train_v0, test_v0 = data_manager.load_processed_data(feature_set = "v0_basico")

 Carregando features: v0_basico
 FeatureSet info: 11 features


In [67]:
feature_manager.fit_standard_scaler(train_v0)

<feature_manager.FeatureEngineer at 0x126e82ea930>

In [68]:
train_processed_v4_temp = feature_manager.transform_with_standard_scaler(train_v0)
train_processed_v4_temp

Unnamed: 0,Geography,Gender,HasCrCard,IsActiveMember,Exited,CreditScore,Age,Tenure,Balance,NumOfProducts,EstimatedSalary
8769,France,Male,1,0,0,-0.297153,-0.759691,0.687591,-1.227884,-0.912204,-1.124006
3920,Germany,Male,1,0,0,-0.172801,0.386021,-0.695631,2.184041,-0.912204,0.269924
3983,France,Female,0,1,1,-0.255702,0.481497,-1.041437,-1.227884,-0.912204,1.279647
3944,France,Female,1,1,0,-0.400780,-0.759691,1.033397,0.664825,-0.912204,-0.797236
3279,Spain,Male,1,0,0,0.013728,-0.186835,0.687591,-1.227884,0.798045,-0.541349
...,...,...,...,...,...,...,...,...,...,...,...
3218,France,Male,1,0,0,0.127718,-0.282311,1.725009,-1.227884,0.798045,0.637880
4134,France,Male,1,1,0,0.283158,0.386021,-0.004020,0.413754,-0.912204,-0.351797
4453,Germany,Male,0,0,0,1.640673,-0.568739,1.033397,1.147123,-0.912204,1.734968
9346,France,Male,1,1,0,0.676941,0.195069,1.379203,-1.227884,-0.912204,0.234991


In [69]:
test_processed_v4_temp = feature_manager.transform_with_standard_scaler(test_v0)
test_processed_v4_temp

Unnamed: 0,Geography,Gender,HasCrCard,IsActiveMember,Exited,CreditScore,Age,Tenure,Balance,NumOfProducts,EstimatedSalary
4559,Germany,Female,0,0,1,-1.271248,2.200065,0.687591,0.908836,-0.912204,1.199189
2531,Germany,Male,1,1,0,-0.452594,0.767925,-0.349826,1.159484,-0.912204,-0.355271
7504,Spain,Female,1,1,1,0.034453,-0.664215,-1.041437,-1.227884,-0.912204,-0.834884
4879,Spain,Female,1,0,1,1.371242,0.481497,-0.695631,0.084482,-0.912204,1.124236
6418,Spain,Female,0,0,0,1.070724,-1.141595,1.033397,-1.227884,0.798045,0.384292
...,...,...,...,...,...,...,...,...,...,...,...
7997,Germany,Male,1,0,0,-1.924098,-1.428022,0.341786,1.274673,-0.912204,1.598214
2782,France,Male,0,0,0,-0.048448,-1.428022,1.033397,-1.227884,0.798045,-0.125528
8552,France,Female,1,1,0,-1.893010,-1.618974,-1.387243,-0.003634,-0.912204,1.657817
5562,Spain,Male,1,1,0,-0.317879,0.290545,1.379203,0.720974,-0.912204,-1.230689


In [70]:
feature_manager.fit_poly(train_processed_v4_temp)

<feature_manager.FeatureEngineer at 0x126e82ea930>

In [74]:
train_processed_v4 = feature_manager.transform_with_poly(train_processed_v4_temp)
train_processed_v4

Unnamed: 0,Geography,Gender,HasCrCard,IsActiveMember,Exited,CreditScore,Age,Tenure,Balance,NumOfProducts,...,Tenure^2,Tenure Balance,Tenure NumOfProducts,Tenure EstimatedSalary,Balance^2,Balance NumOfProducts,Balance EstimatedSalary,NumOfProducts^2,NumOfProducts EstimatedSalary,EstimatedSalary^2
8769,France,Male,1,0,0,-0.297153,-0.759691,0.687591,-1.227884,-0.912204,...,0.472782,-0.844283,-0.627224,-0.772857,1.507699,1.120081,1.380148,0.832116,1.025322,1.263388
3920,Germany,Male,1,0,0,-0.172801,0.386021,-0.695631,2.184041,-0.912204,...,0.483903,-1.519288,0.634558,-0.187768,4.770037,-1.992291,0.589525,0.832116,-0.246226,0.072859
3983,France,Female,0,1,1,-0.255702,0.481497,-1.041437,-1.227884,-0.912204,...,1.084591,1.278764,0.950003,-1.332672,1.507699,1.120081,-1.571258,0.832116,-1.167299,1.637496
3944,France,Female,1,1,0,-0.400780,-0.759691,1.033397,0.664825,-0.912204,...,1.067910,0.687028,-0.942669,-0.823861,0.441992,-0.606456,-0.530022,0.832116,0.727241,0.635585
3279,Spain,Male,1,0,0,0.013728,-0.186835,0.687591,-1.227884,0.798045,...,0.472782,-0.844283,0.548729,-0.372227,1.507699,-0.979906,0.664714,0.636876,-0.432021,0.293059
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3218,France,Male,1,0,0,0.127718,-0.282311,1.725009,-1.227884,0.798045,...,2.975655,-2.118111,1.376634,1.100349,1.507699,-0.979906,-0.783243,0.636876,0.509057,0.406891
4134,France,Male,1,1,0,0.283158,0.386021,-0.004020,0.413754,-0.912204,...,0.000016,-0.001663,0.003667,0.001414,0.171192,-0.377428,-0.145557,0.832116,0.320911,0.123761
4453,Germany,Male,0,0,0,1.640673,-0.568739,1.033397,1.147123,-0.912204,...,1.067910,1.185433,-0.942669,1.792912,1.315890,-1.046410,1.990221,0.832116,-1.582645,3.010115
9346,France,Male,1,1,0,0.676941,0.195069,1.379203,-1.227884,-0.912204,...,1.902201,-1.693501,-1.258114,0.324100,1.507699,1.120081,-0.288541,0.832116,-0.214359,0.055221


In [75]:
test_processed_v4 = feature_manager.transform_with_poly(test_processed_v4_temp)
test_processed_v4

Unnamed: 0,Geography,Gender,HasCrCard,IsActiveMember,Exited,CreditScore,Age,Tenure,Balance,NumOfProducts,...,Tenure^2,Tenure Balance,Tenure NumOfProducts,Tenure EstimatedSalary,Balance^2,Balance NumOfProducts,Balance EstimatedSalary,NumOfProducts^2,NumOfProducts EstimatedSalary,EstimatedSalary^2
4559,Germany,Female,0,0,1,-1.271248,2.200065,0.687591,0.908836,-0.912204,...,0.472782,0.624908,-0.627224,0.824552,0.825983,-0.829044,1.089866,0.832116,-1.093905,1.438054
2531,Germany,Male,1,1,0,-0.452594,0.767925,-0.349826,1.159484,-0.912204,...,0.122378,-0.405617,0.319112,0.124283,1.344402,-1.057685,-0.411931,0.832116,0.324079,0.126217
7504,Spain,Female,1,1,1,0.034453,-0.664215,-1.041437,-1.227884,-0.912204,...,1.084591,1.278764,0.950003,0.869479,1.507699,1.120081,1.025141,0.832116,0.761585,0.697032
4879,Spain,Female,1,0,1,1.371242,0.481497,-0.695631,0.084482,-0.912204,...,0.483903,-0.058768,0.634558,-0.782054,0.007137,-0.077065,0.094978,0.832116,-1.025532,1.263906
6418,Spain,Female,0,0,0,1.070724,-1.141595,1.033397,-1.227884,0.798045,...,1.067910,-1.268892,0.824697,0.397126,1.507699,-0.979906,-0.471866,0.636876,0.306682,0.147680
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7997,Germany,Male,1,0,0,-1.924098,-1.428022,0.341786,1.274673,-0.912204,...,0.116817,0.435665,-0.311778,0.546247,1.624790,-1.162761,2.037199,0.832116,-1.457897,2.554288
2782,France,Male,0,0,0,-0.048448,-1.428022,1.033397,-1.227884,0.798045,...,1.067910,-1.268892,0.824697,-0.129720,1.507699,-0.979906,0.154134,0.636876,-0.100177,0.015757
8552,France,Female,1,1,0,-1.893010,-1.618974,-1.387243,-0.003634,-0.912204,...,1.924443,0.005041,1.265448,-2.299795,0.000013,0.003315,-0.006024,0.832116,-1.512267,2.748358
5562,Spain,Male,1,1,0,-0.317879,0.290545,1.379203,0.720974,-0.912204,...,1.902201,0.994370,-1.258114,-1.697370,0.519804,-0.657676,-0.887295,0.832116,1.122639,1.514595


In [76]:
data_manager.save_processed_data(
    train_processed_v4, test_processed_v4, 
    feature_set_name = "v4_normalizacao_plus_poly"
)

 Dados salvos em: D:\mba\Data Science e Analytics (USP-Esalq)\99 - TCC\Projeto\notebooks\..\data\processed\v4_normalizacao_plus_poly


## v5_one-hot_encoding_plus_poly

Partindo de v1, criam-se variáveis numéricas a partir do produto dois a dois das variáveis numéricas iniciais (sem normalização).

In [3]:
train_v1, test_v1 = data_manager.load_processed_data(feature_set = "v1_one-hot_encoding")

 Carregando features: v1_one-hot_encoding
 FeatureSet info: 13 features


In [4]:
feature_manager.fit_poly(train_v1)

<feature_manager.FeatureEngineer at 0x1d2a9cb8380>

In [5]:
train_processed_v5 = feature_manager.transform_with_poly(train_v1)
train_processed_v5

Unnamed: 0,Exited,Geography_France,Geography_Germany,Geography_Spain,Gender_Male,HasCrCard_1,IsActiveMember_1,CreditScore,Age,Tenure,...,Tenure^2,Tenure Balance,Tenure NumOfProducts,Tenure EstimatedSalary,Balance^2,Balance NumOfProducts,Balance EstimatedSalary,NumOfProducts^2,NumOfProducts EstimatedSalary,EstimatedSalary^2
8769,0,1.0,0.0,0.0,1.0,1.0,0.0,622.0,31.0,7.0,...,49.0,0.00,7.0,247861.39,0.000000e+00,0.00,0.000000e+00,1.0,35408.77,1.253781e+09
3920,0,0.0,1.0,0.0,1.0,1.0,0.0,634.0,43.0,3.0,...,9.0,638088.96,3.0,345806.58,4.523972e+10,212696.32,2.451726e+10,1.0,115268.86,1.328691e+10
3983,1,1.0,0.0,0.0,0.0,0.0,1.0,626.0,44.0,2.0,...,4.0,0.00,2.0,346234.44,0.000000e+00,0.00,0.000000e+00,1.0,173117.22,2.996957e+10
3944,0,1.0,0.0,0.0,0.0,1.0,1.0,612.0,31.0,8.0,...,64.0,943918.08,8.0,433038.88,1.392158e+10,117989.76,6.386769e+09,1.0,54129.86,2.930042e+09
3279,0,0.0,0.0,1.0,1.0,1.0,0.0,652.0,37.0,7.0,...,49.0,0.00,14.0,481529.51,0.000000e+00,0.00,0.000000e+00,4.0,137579.86,4.732054e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3218,0,1.0,0.0,0.0,1.0,1.0,0.0,663.0,36.0,10.0,...,100.0,0.00,20.0,1363495.50,0.000000e+00,0.00,0.000000e+00,4.0,272699.10,1.859120e+10
4134,0,1.0,0.0,0.0,1.0,1.0,1.0,678.0,43.0,5.0,...,25.0,511690.95,5.0,398248.10,1.047311e+10,102338.19,8.151198e+09,1.0,79649.62,6.344062e+09
4453,0,0.0,1.0,0.0,1.0,0.0,0.0,809.0,33.0,8.0,...,64.0,1184445.92,8.0,1593625.68,2.192050e+10,148055.74,2.949318e+10,1.0,199203.21,3.968192e+10
9346,0,1.0,0.0,0.0,1.0,1.0,1.0,716.0,41.0,9.0,...,81.0,0.00,9.0,1019407.32,0.000000e+00,0.00,0.000000e+00,1.0,113267.48,1.282952e+10


In [7]:
test_processed_v5 = feature_manager.transform_with_poly(test_v1)
test_processed_v5

Unnamed: 0,Exited,Geography_France,Geography_Germany,Geography_Spain,Gender_Male,HasCrCard_1,IsActiveMember_1,CreditScore,Age,Tenure,...,Tenure^2,Tenure Balance,Tenure NumOfProducts,Tenure EstimatedSalary,Balance^2,Balance NumOfProducts,Balance EstimatedSalary,NumOfProducts^2,NumOfProducts EstimatedSalary,EstimatedSalary^2
4559,1,0.0,1.0,0.0,0.0,0.0,0.0,528.0,62.0,7.0,...,49.0,932408.19,7.0,1179553.76,1.774255e+10,133201.17,2.244542e+10,1.0,168507.68,2.839484e+10
2531,0,0.0,1.0,0.0,1.0,1.0,1.0,607.0,47.0,4.0,...,16.0,595305.28,4.0,317802.44,2.214927e+10,148826.32,1.182434e+10,1.0,79450.61,6.312399e+09
7504,1,0.0,0.0,1.0,0.0,1.0,1.0,654.0,32.0,2.0,...,4.0,0.00,2.0,103945.84,0.000000e+00,0.00,0.000000e+00,1.0,51972.92,2.701184e+09
4879,1,0.0,0.0,1.0,0.0,1.0,0.0,783.0,44.0,3.0,...,9.0,245435.13,3.0,492640.59,6.693156e+09,81811.71,1.343459e+10,1.0,164213.53,2.696608e+10
6418,0,0.0,0.0,1.0,0.0,0.0,0.0,754.0,27.0,8.0,...,64.0,0.00,16.0,974569.28,0.000000e+00,0.00,0.000000e+00,4.0,243642.32,1.484040e+10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7997,0,0.0,1.0,0.0,1.0,1.0,0.0,465.0,24.0,6.0,...,36.0,936042.54,6.0,1148210.22,2.433821e+10,156007.09,2.985482e+10,1.0,191368.37,3.662185e+10
2782,0,1.0,0.0,0.0,1.0,0.0,0.0,646.0,24.0,8.0,...,64.0,0.00,16.0,740903.04,0.000000e+00,0.00,0.000000e+00,4.0,185225.76,8.577146e+09
8552,0,1.0,0.0,0.0,0.0,1.0,1.0,468.0,22.0,1.0,...,1.0,76318.64,1.0,194783.12,5.824535e+09,76318.64,1.486558e+10,1.0,194783.12,3.794046e+10
5562,0,0.0,0.0,1.0,1.0,1.0,1.0,620.0,42.0,9.0,...,81.0,1093410.45,9.0,263670.66,1.475983e+10,121490.05,3.559262e+09,1.0,29296.74,8.582990e+08


In [8]:
data_manager.save_processed_data(
    train_processed_v5, test_processed_v5, 
    feature_set_name = "v5_one-hot_encoding_plus_poly"
)

 Dados salvos em: D:\mba\Data Science e Analytics (USP-Esalq)\99 - TCC\Projeto\notebooks\..\data\processed\v5_one-hot_encoding_plus_poly


## v6_one-hot_encoding_plus_top3_poly

Partindo de v1, criam-se variáveis numéricas a partir do produto dois a dois das 3 variáveis numéricas iniciais (sem normalização) mais importantes (com base em feature_importances_ dos 5 modelos sobre v0 e v1). As três variáveis originais são removidas, permanecendo apenas os seus termos de grau 2.

In [4]:
cat_opt_fv0_model = utils.load_model('cat_opt_fv0')
cat_opt_fv0_model

<catboost.core.CatBoostClassifier at 0x265f854df40>

In [5]:
xgb_opt_fv1_model = utils.load_model('xgb_opt_fv1')
xgb_opt_fv1_model

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.6364698221789004
,device,
,early_stopping_rounds,
,enable_categorical,False


In [6]:
lgb_opt_fv1_model = utils.load_model('lgb_opt_fv1')
lgb_opt_fv1_model

0,1,2
,boosting_type,'gbdt'
,num_leaves,235
,max_depth,4
,learning_rate,0.018908840487666216
,n_estimators,2795
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.004628418664177039


In [7]:
rf_opt_fv1_model = utils.load_model('rf_opt_fv1')
rf_opt_fv1_model

0,1,2
,n_estimators,1339
,criterion,'gini'
,max_depth,9
,min_samples_split,23
,min_samples_leaf,6
,min_weight_fraction_leaf,0.0
,max_features,0.48726312627554236
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [8]:
ngb_opt_fv1_model = utils.load_model('ngb_opt_fv1')
ngb_opt_fv1_model

0,1,2
,Dist,<class 'ngboo....Categorical'>
,Score,<class 'ngboo...res.LogScore'>
,Base,DecisionTreeR...om_state=1234)
,natural_gradient,True
,n_estimators,919
,learning_rate,0.0019391638801433634
,minibatch_frac,0.5585780611073167
,col_sample,1.0
,verbose,True
,random_state,RandomState(M... 0x2658AA19640

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,6
,min_samples_split,42
,min_samples_leaf,3
,min_weight_fraction_leaf,0.0
,max_features,0.6296122913657486
,random_state,1234
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [30]:
NUMERICAL_FEATURES

['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']

In [44]:
# CatBoost importances (v0 model), with columns renamed to the common
# feature/importance schema and restricted to the numerical features.
cat_fv0_importances_numerical = (
    cat_opt_fv0_model
    .get_feature_importance(prettified=True)
    .rename(columns={'Feature Id': "feature", "Importances": "importance"})
    .query("`feature` in @NUMERICAL_FEATURES")
)
cat_fv0_importances_numerical

Unnamed: 0,feature,importance
0,NumOfProducts,29.38364
1,Age,22.130798
2,Balance,13.423836
5,EstimatedSalary,6.736258
6,CreditScore,5.974121
7,Tenure,2.671533


In [43]:
# XGBoost importances (v1 model): pair the booster's feature names with the
# importances, rank them in descending order and keep the numerical features.
xgb_fv1_importances_numerical = (
    pd.DataFrame(
        {
            'feature': xgb_opt_fv1_model.get_booster().feature_names,
            'importance': xgb_opt_fv1_model.feature_importances_,
        }
    )
    .sort_values('importance', ascending=False)
    .query("`feature` in @NUMERICAL_FEATURES")
)
xgb_fv1_importances_numerical

Unnamed: 0,feature,importance
4,NumOfProducts,0.208403
1,Age,0.171509
3,Balance,0.051798
5,EstimatedSalary,0.036866
0,CreditScore,0.03683
2,Tenure,0.035324


In [49]:
# LightGBM importances (v1 model): the importances are divided by their total
# so that they sum to 1, then ranked and restricted to the numerical features.
lgb_fv1_importances_numerical = (
    pd.DataFrame(
        {
            'feature': lgb_opt_fv1_model.booster_.feature_name(),
            'importance': lgb_opt_fv1_model.feature_importances_/lgb_opt_fv1_model.feature_importances_.sum(),
        }
    )
    .sort_values('importance', ascending=False)
    .query("`feature` in @NUMERICAL_FEATURES")
)
lgb_fv1_importances_numerical

Unnamed: 0,feature,importance
5,EstimatedSalary,0.202162
0,CreditScore,0.194249
3,Balance,0.180665
1,Age,0.161474
2,Tenure,0.08312
4,NumOfProducts,0.042993


In [52]:
train_v1, test_v1 = data_manager.load_processed_data(feature_set = "v1_one-hot_encoding")
train_X_v1, train_y_v1 = data_manager.split_features_target(train_v1)

# Prefer the feature names recorded on the fitted estimator (scikit-learn sets
# `feature_names_in_` when the model is fit on a DataFrame). The column order of
# the freshly loaded frame is only a fallback: if it differed from the order used
# at training time, importances would be attached to the wrong features.
rf_feature_names = list(
    getattr(rf_opt_fv1_model, "feature_names_in_", train_X_v1.columns)
)
if len(rf_feature_names) != len(rf_opt_fv1_model.feature_importances_):
    raise ValueError(
        "Number of feature names does not match the number of "
        "random-forest feature importances"
    )

# Rank importances in descending order and keep only the numerical features.
rf_fv1_importances_numerical = (
    pd.DataFrame(
        {
            'feature': rf_feature_names,
            'importance': rf_opt_fv1_model.feature_importances_,
        }
    )
    .sort_values('importance', ascending=False)
    .query("`feature` in @NUMERICAL_FEATURES")
)
rf_fv1_importances_numerical

 Carregando features: v1_one-hot_encoding
 FeatureSet info: 13 features
 Features: (8000, 12), Target: (8000,)


Unnamed: 0,feature,importance
1,Age,0.34442
4,NumOfProducts,0.284429
3,Balance,0.097768
0,CreditScore,0.04376
5,EstimatedSalary,0.043681
2,Tenure,0.022488


In [53]:
# Dedicated FeatureEngineer whose numerical feature list is overridden with
# three hard-coded features. These are the three highest-ranked numerical features
# in the CatBoost, XGBoost and random-forest importance tables of this notebook
# (the LightGBM table ranks them differently).
feature_manager_select = feat_mgr.FeatureEngineer()
feature_manager_select.numerical_features = ['Age', 'NumOfProducts', 'Balance']

In [54]:
train_v1, test_v1 = data_manager.load_processed_data(feature_set = "v1_one-hot_encoding")

 Carregando features: v1_one-hot_encoding
 FeatureSet info: 13 features


In [55]:
feature_manager_select.fit_poly(train_v1)

<feature_manager.FeatureEngineer at 0x26595536870>

In [62]:
train_processed_v6 = feature_manager_select.transform_with_poly(train_v1).drop(columns = ['Age', 'NumOfProducts', 'Balance'])
train_processed_v6

Unnamed: 0,CreditScore,Tenure,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain,Gender_Male,HasCrCard_1,IsActiveMember_1,Age^2,Age NumOfProducts,Age Balance,NumOfProducts^2,NumOfProducts Balance,Balance^2
8769,622,7,35408.77,0,1.0,0.0,0.0,1.0,1.0,0.0,961.0,31.0,0.00,1.0,0.00,0.000000e+00
3920,634,3,115268.86,0,0.0,1.0,0.0,1.0,1.0,0.0,1849.0,43.0,9145941.76,1.0,212696.32,4.523972e+10
3983,626,2,173117.22,1,1.0,0.0,0.0,0.0,0.0,1.0,1936.0,44.0,0.00,1.0,0.00,0.000000e+00
3944,612,8,54129.86,0,1.0,0.0,0.0,0.0,1.0,1.0,961.0,31.0,3657682.56,1.0,117989.76,1.392158e+10
3279,652,7,68789.93,0,0.0,0.0,1.0,1.0,1.0,0.0,1369.0,74.0,0.00,4.0,0.00,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3218,663,10,136349.55,0,1.0,0.0,0.0,1.0,1.0,0.0,1296.0,72.0,0.00,4.0,0.00,0.000000e+00
4134,678,5,79649.62,0,1.0,0.0,0.0,1.0,1.0,1.0,1849.0,43.0,4400542.17,1.0,102338.19,1.047311e+10
4453,809,8,199203.21,0,0.0,1.0,0.0,1.0,0.0,0.0,1089.0,33.0,4885839.42,1.0,148055.74,2.192050e+10
9346,716,9,113267.48,0,1.0,0.0,0.0,1.0,1.0,1.0,1681.0,41.0,0.00,1.0,0.00,0.000000e+00


In [63]:
test_processed_v6 = feature_manager_select.transform_with_poly(test_v1).drop(columns = ['Age', 'NumOfProducts', 'Balance'])
test_processed_v6

Unnamed: 0,CreditScore,Tenure,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain,Gender_Male,HasCrCard_1,IsActiveMember_1,Age^2,Age NumOfProducts,Age Balance,NumOfProducts^2,NumOfProducts Balance,Balance^2
4559,528,7,168507.68,1,0.0,1.0,0.0,0.0,0.0,0.0,3844.0,62.0,8258472.54,1.0,133201.17,1.774255e+10
2531,607,4,79450.61,0,0.0,1.0,0.0,1.0,1.0,1.0,2209.0,47.0,6994837.04,1.0,148826.32,2.214927e+10
7504,654,2,51972.92,1,0.0,0.0,1.0,0.0,1.0,1.0,1024.0,32.0,0.00,1.0,0.00,0.000000e+00
4879,783,3,164213.53,1,0.0,0.0,1.0,0.0,1.0,0.0,1936.0,44.0,3599715.24,1.0,81811.71,6.693156e+09
6418,754,8,121821.16,0,0.0,0.0,1.0,0.0,0.0,0.0,729.0,54.0,0.00,4.0,0.00,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7997,465,6,191368.37,0,0.0,1.0,0.0,1.0,1.0,0.0,576.0,24.0,3744170.16,1.0,156007.09,2.433821e+10
2782,646,8,92612.88,0,1.0,0.0,0.0,1.0,0.0,0.0,576.0,48.0,0.00,4.0,0.00,0.000000e+00
8552,468,1,194783.12,0,1.0,0.0,0.0,0.0,1.0,1.0,484.0,22.0,1679010.08,1.0,76318.64,5.824535e+09
5562,620,9,29296.74,0,0.0,0.0,1.0,1.0,1.0,1.0,1764.0,42.0,5102582.10,1.0,121490.05,1.475983e+10


In [64]:
data_manager.save_processed_data(
    train_processed_v6, test_processed_v6, 
    feature_set_name = "v6_one-hot_encoding_plus_top3_poly"
)

 Dados salvos em: D:\mba\Data Science e Analytics (USP-Esalq)\99 - TCC\Projeto\notebooks\..\data\processed\v6_one-hot_encoding_plus_top3_poly


## v7_plus_top3_poly
Partindo de v0, criam-se variáveis numéricas a partir do produto dois a dois das 3 variáveis numéricas iniciais (sem normalização) mais importantes (com base em feature_importances_ dos 5 modelos sobre v0 e v1). As três variáveis originais são removidas, permanecendo apenas os seus termos de grau 2.

In [65]:
# Fresh FeatureEngineer for the v7 feature set, with its numerical feature list
# overridden by three hard-coded features.
# NOTE(review): the list duplicates the one used for v6 — consider a shared constant.
feature_manager_select = feat_mgr.FeatureEngineer()
feature_manager_select.numerical_features = ['Age', 'NumOfProducts', 'Balance']

In [66]:
train_v0, test_v0 = data_manager.load_processed_data(feature_set = "v0_basico")

 Carregando features: v0_basico
 FeatureSet info: 11 features


In [67]:
feature_manager_select.fit_poly(train_v0)

<feature_manager.FeatureEngineer at 0x26595d56c90>

In [68]:
train_processed_v7 = feature_manager_select.transform_with_poly(train_v0).drop(columns = ['Age', 'NumOfProducts', 'Balance'])
train_processed_v7

Unnamed: 0,CreditScore,Geography,Gender,Tenure,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Age^2,Age NumOfProducts,Age Balance,NumOfProducts^2,NumOfProducts Balance,Balance^2
8769,622,France,Male,7,1,0,35408.77,0,961.0,31.0,0.00,1.0,0.00,0.000000e+00
3920,634,Germany,Male,3,1,0,115268.86,0,1849.0,43.0,9145941.76,1.0,212696.32,4.523972e+10
3983,626,France,Female,2,0,1,173117.22,1,1936.0,44.0,0.00,1.0,0.00,0.000000e+00
3944,612,France,Female,8,1,1,54129.86,0,961.0,31.0,3657682.56,1.0,117989.76,1.392158e+10
3279,652,Spain,Male,7,1,0,68789.93,0,1369.0,74.0,0.00,4.0,0.00,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3218,663,France,Male,10,1,0,136349.55,0,1296.0,72.0,0.00,4.0,0.00,0.000000e+00
4134,678,France,Male,5,1,1,79649.62,0,1849.0,43.0,4400542.17,1.0,102338.19,1.047311e+10
4453,809,Germany,Male,8,0,0,199203.21,0,1089.0,33.0,4885839.42,1.0,148055.74,2.192050e+10
9346,716,France,Male,9,1,1,113267.48,0,1681.0,41.0,0.00,1.0,0.00,0.000000e+00


In [69]:
test_processed_v7 = feature_manager_select.transform_with_poly(test_v0).drop(columns = ['Age', 'NumOfProducts', 'Balance'])
test_processed_v7

Unnamed: 0,CreditScore,Geography,Gender,Tenure,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Age^2,Age NumOfProducts,Age Balance,NumOfProducts^2,NumOfProducts Balance,Balance^2
4559,528,Germany,Female,7,0,0,168507.68,1,3844.0,62.0,8258472.54,1.0,133201.17,1.774255e+10
2531,607,Germany,Male,4,1,1,79450.61,0,2209.0,47.0,6994837.04,1.0,148826.32,2.214927e+10
7504,654,Spain,Female,2,1,1,51972.92,1,1024.0,32.0,0.00,1.0,0.00,0.000000e+00
4879,783,Spain,Female,3,1,0,164213.53,1,1936.0,44.0,3599715.24,1.0,81811.71,6.693156e+09
6418,754,Spain,Female,8,0,0,121821.16,0,729.0,54.0,0.00,4.0,0.00,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7997,465,Germany,Male,6,1,0,191368.37,0,576.0,24.0,3744170.16,1.0,156007.09,2.433821e+10
2782,646,France,Male,8,0,0,92612.88,0,576.0,48.0,0.00,4.0,0.00,0.000000e+00
8552,468,France,Female,1,1,1,194783.12,0,484.0,22.0,1679010.08,1.0,76318.64,5.824535e+09
5562,620,Spain,Male,9,1,1,29296.74,0,1764.0,42.0,5102582.10,1.0,121490.05,1.475983e+10


In [70]:
data_manager.save_processed_data(
    train_processed_v7, test_processed_v7, 
    feature_set_name = "v7_plus_top3_poly"
)

 Dados salvos em: D:\mba\Data Science e Analytics (USP-Esalq)\99 - TCC\Projeto\notebooks\..\data\processed\v7_plus_top3_poly


## v8_numerical_to_categorical

Partindo de v0, criam-se variáveis categóricas a partir das variáveis numéricas.

In [89]:
# Open-ended outer edges: the first and last label of every variable describe an
# unbounded interval, so the outer bin edges are infinite. With finite sentinels
# any value beyond them would silently become NaN in pd.cut.
NEG_INF = float("-inf")
POS_INF = float("inf")

# Bin edges per numerical feature, passed to pd.cut (right-closed intervals by
# default). Each list has exactly one more entry than the matching label list.
bins_dict = {
    'CreditScore': [NEG_INF, 500, 600, 700, 800, POS_INF],
    'Age': [NEG_INF, 25, 35, 45, 60, POS_INF],
    'Tenure': [NEG_INF, 2, 4, 6, 8, POS_INF],
    'Balance': [NEG_INF, 50000, 100000, 150000, POS_INF],
    'NumOfProducts': [NEG_INF, 1, 2, 3, POS_INF],
    'EstimatedSalary': [NEG_INF, 25000, 50000, 100000, 150000, POS_INF]
}

# Category labels, one per interval of bins_dict.
# NOTE(review): the closing bracket of the last label is not uniform across
# variables ('(800-)' vs '(8-]'); the strings are left untouched because they
# are the category values stored in the generated feature set.
labels_dict = {
    'CreditScore': ['(-500]', '(500-600]', '(600-700]', '(700-800]', '(800-)'],
    'Age': ['(-25]', '(25-35]', '(35-45]', '(45-60]', '(60-)'],
    'Tenure': ['(-2]', '(2-4]', '(4-6]', '(6-8]', '(8-]'],
    'Balance': ['(-50.000]', '(50.000-100.000]', '(100.000-150.000]', '(150.000-)'],
    'NumOfProducts': ['(-1]', '(1-2]', '(2-3]', '(3-)'],
    'EstimatedSalary': ['(-25.000]', '(25.000-50.000]', '(50.000-100.000]', '(100.000-150.000]', '(150.000-]']
}

In [90]:
def categorizar_variaveis(df, colunas, bins_dict, labels_dict):
    """Replace numeric columns with binned categorical ``<name>_cat`` columns.

    Args:
        df: Input DataFrame; it is not modified.
        colunas: Names of the columns to bin.
        bins_dict: Maps each column name to the bin edges passed to ``pd.cut``.
        labels_dict: Maps each column name to the labels passed to ``pd.cut``.

    Returns:
        A new DataFrame where each column in ``colunas`` has been dropped and a
        ``<name>_cat`` column holding the ``pd.cut`` result has been added.
    """
    faixas = {
        f'{nome}_cat': pd.cut(df[nome], bins=bins_dict[nome], labels=labels_dict[nome])
        for nome in colunas
    }
    # assign() builds a new frame, so the caller's DataFrame is left untouched.
    return df.assign(**faixas).drop(columns=colunas)

In [91]:
# Reload the saved "v0_basico" feature set; v8 is derived from these frames.
train_v0, test_v0 = data_manager.load_processed_data(feature_set = "v0_basico")

 Carregando features: v0_basico
 FeatureSet info: 11 features


In [88]:
# Preview the v0 training frame whose numeric columns are binned below.
train_v0

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
8769,622,France,Male,31,7,0.00,1,1,0,35408.77,0
3920,634,Germany,Male,43,3,212696.32,1,1,0,115268.86,0
3983,626,France,Female,44,2,0.00,1,0,1,173117.22,1
3944,612,France,Female,31,8,117989.76,1,1,1,54129.86,0
3279,652,Spain,Male,37,7,0.00,2,1,0,68789.93,0
...,...,...,...,...,...,...,...,...,...,...,...
3218,663,France,Male,36,10,0.00,2,1,0,136349.55,0
4134,678,France,Male,43,5,102338.19,1,1,1,79649.62,0
4453,809,Germany,Male,33,8,148055.74,1,0,0,199203.21,0
9346,716,France,Male,41,9,0.00,1,1,1,113267.48,0


In [93]:
# Bin the numeric columns of the v0 training split to build the v8 feature set.
train_processed_v8 = categorizar_variaveis(
    df=train_v0,
    colunas=NUMERICAL_FEATURES,
    bins_dict=bins_dict,
    labels_dict=labels_dict,
)
train_processed_v8

Unnamed: 0,Geography,Gender,HasCrCard,IsActiveMember,Exited,CreditScore_cat,Age_cat,Tenure_cat,Balance_cat,NumOfProducts_cat,EstimatedSalary_cat
8769,France,Male,1,0,0,(600-700],(25-35],(6-8],(-50.000],(-1],(25.000-50.000]
3920,Germany,Male,1,0,0,(600-700],(35-45],(2-4],(150.000-),(-1],(100.000-150.000]
3983,France,Female,0,1,1,(600-700],(35-45],(-2],(-50.000],(-1],(150.000-]
3944,France,Female,1,1,0,(600-700],(25-35],(6-8],(100.000-150.000],(-1],(50.000-100.000]
3279,Spain,Male,1,0,0,(600-700],(35-45],(6-8],(-50.000],(1-2],(50.000-100.000]
...,...,...,...,...,...,...,...,...,...,...,...
3218,France,Male,1,0,0,(600-700],(35-45],(8-],(-50.000],(1-2],(100.000-150.000]
4134,France,Male,1,1,0,(600-700],(35-45],(4-6],(100.000-150.000],(-1],(50.000-100.000]
4453,Germany,Male,0,0,0,(800-),(25-35],(6-8],(100.000-150.000],(-1],(150.000-]
9346,France,Male,1,1,0,(700-800],(35-45],(8-],(-50.000],(-1],(100.000-150.000]


In [94]:
# Apply the same binning (same edges and labels) to the v0 test split.
test_processed_v8 = categorizar_variaveis(
    df=test_v0,
    colunas=NUMERICAL_FEATURES,
    bins_dict=bins_dict,
    labels_dict=labels_dict,
)
test_processed_v8

Unnamed: 0,Geography,Gender,HasCrCard,IsActiveMember,Exited,CreditScore_cat,Age_cat,Tenure_cat,Balance_cat,NumOfProducts_cat,EstimatedSalary_cat
4559,Germany,Female,0,0,1,(500-600],(60-),(6-8],(100.000-150.000],(-1],(150.000-]
2531,Germany,Male,1,1,0,(600-700],(45-60],(2-4],(100.000-150.000],(-1],(50.000-100.000]
7504,Spain,Female,1,1,1,(600-700],(25-35],(-2],(-50.000],(-1],(50.000-100.000]
4879,Spain,Female,1,0,1,(700-800],(35-45],(2-4],(50.000-100.000],(-1],(150.000-]
6418,Spain,Female,0,0,0,(700-800],(25-35],(6-8],(-50.000],(1-2],(100.000-150.000]
...,...,...,...,...,...,...,...,...,...,...,...
7997,Germany,Male,1,0,0,(-500],(-25],(4-6],(150.000-),(-1],(150.000-]
2782,France,Male,0,0,0,(600-700],(-25],(6-8],(-50.000],(1-2],(50.000-100.000]
8552,France,Female,1,1,0,(-500],(-25],(-2],(50.000-100.000],(-1],(150.000-]
5562,Spain,Male,1,1,0,(600-700],(35-45],(8-],(100.000-150.000],(-1],(25.000-50.000]


In [97]:
# Persist the binned train/test frames under the "v8_numerical_to_categorical" feature-set name.
data_manager.save_processed_data(
    train_processed_v8, test_processed_v8, 
    feature_set_name = "v8_numerical_to_categorical"
)

 Dados salvos em: D:\mba\Data Science e Analytics (USP-Esalq)\99 - TCC\Projeto\notebooks\..\data\processed\v8_numerical_to_categorical


## v9_numerical_to_categorical_plus_one_hot-encoding

Partindo de v8, aplica-se one-hot encoding sobre todas as variáveis.

In [98]:
# Reload the saved v8 feature set; v9 is derived from these frames.
train_v8, test_v8 = data_manager.load_processed_data(feature_set = "v8_numerical_to_categorical")

 Carregando features: v8_numerical_to_categorical
 FeatureSet info: 11 features


In [101]:
# Every v8 column other than 'Exited' is listed as categorical for the encoder.
novas_categoricas = train_v8.columns.drop('Exited').tolist()
novas_categoricas

['Geography',
 'Gender',
 'HasCrCard',
 'IsActiveMember',
 'CreditScore_cat',
 'Age_cat',
 'Tenure_cat',
 'Balance_cat',
 'NumOfProducts_cat',
 'EstimatedSalary_cat']

In [102]:
# Start from a fresh FeatureEngineer and override its categorical column list
# with every v8 column except 'Exited'.
# NOTE(review): presumably fit_onehot_encoder reads `categorical_features` — confirm in feature_manager.
feature_manager = feat_mgr.FeatureEngineer()
feature_manager.categorical_features = novas_categoricas

In [103]:
# Fit the encoder on the training split only; the test split is later
# transformed with this same fitted object.
feature_manager.fit_onehot_encoder(train_v8)

<feature_manager.FeatureEngineer at 0x26595fe4dd0>

In [108]:
# One-hot encode the training split, then strip '[', ']', '(' and ')' from the
# resulting column names (the interval labels created for v8 contain them).
# NOTE(review): presumably needed because some model libraries reject brackets
# in feature names — confirm.
train_processed_v9 = feature_manager.transform_with_onehot(train_v8)
train_processed_v9.columns = train_processed_v9.columns.str.replace(r'[\[\]\(\)]', '', regex=True)
train_processed_v9

Unnamed: 0,Exited,Geography_France,Geography_Germany,Geography_Spain,Gender_Male,HasCrCard_1,IsActiveMember_1,CreditScore_cat_-500,CreditScore_cat_500-600,CreditScore_cat_600-700,...,Balance_cat_50.000-100.000,NumOfProducts_cat_-1,NumOfProducts_cat_1-2,NumOfProducts_cat_2-3,NumOfProducts_cat_3-,EstimatedSalary_cat_-25.000,EstimatedSalary_cat_100.000-150.000,EstimatedSalary_cat_150.000-,EstimatedSalary_cat_25.000-50.000,EstimatedSalary_cat_50.000-100.000
8769,0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3920,0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3983,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3944,0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3279,0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3218,0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4134,0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4453,0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
9346,0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [109]:
# Encode the test split with the encoder fitted on the training split and apply
# the same bracket/parenthesis clean-up to the column names, so both frames
# share the same naming.
test_processed_v9 = feature_manager.transform_with_onehot(test_v8)
test_processed_v9.columns = test_processed_v9.columns.str.replace(r'[\[\]\(\)]', '', regex=True)
test_processed_v9

Unnamed: 0,Exited,Geography_France,Geography_Germany,Geography_Spain,Gender_Male,HasCrCard_1,IsActiveMember_1,CreditScore_cat_-500,CreditScore_cat_500-600,CreditScore_cat_600-700,...,Balance_cat_50.000-100.000,NumOfProducts_cat_-1,NumOfProducts_cat_1-2,NumOfProducts_cat_2-3,NumOfProducts_cat_3-,EstimatedSalary_cat_-25.000,EstimatedSalary_cat_100.000-150.000,EstimatedSalary_cat_150.000-,EstimatedSalary_cat_25.000-50.000,EstimatedSalary_cat_50.000-100.000
4559,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2531,0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7504,1,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4879,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6418,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7997,0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2782,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8552,0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5562,0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [110]:
# Persist the one-hot encoded train/test frames under the v9 feature-set name.
data_manager.save_processed_data(
    train_processed_v9, test_processed_v9, 
    feature_set_name = "v9_numerical_to_categorical_plus_one_hot-encoding"
)

 Dados salvos em: D:\mba\Data Science e Analytics (USP-Esalq)\99 - TCC\Projeto\notebooks\..\data\processed\v9_numerical_to_categorical_plus_one_hot-encoding
