# Desafio 05 - Filtering discriminitory variables 

# Importing Libs

In [1]:
# Libs for POC evaluation

# Basic libs
import numpy as np 
import pandas as pd 

from pandas import set_option

# Statistics
from scipy import stats

# Measure variance
from sklearn.preprocessing import MinMaxScaler

# dividing data set
from sklearn.model_selection import train_test_split

# model for testing 
from sklearn.linear_model import LogisticRegression # logistic Regression Classifier
from sklearn.ensemble import RandomForestRegressor  # feature importance for boruta and RFE

# Filtering engineering
from boruta import BorutaPy
from sklearn.feature_selection import RFECV, f_classif, chi2

# Regression metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss

# outliers detection
from sklearn.ensemble import IsolationForest

# show all available columns
pd.set_option('display.max_columns', 500)
# show all available rows
pd.set_option('display.max_rows', 500)

# Avoiding error exhibitions
import warnings
warnings.filterwarnings('ignore')

# Charging and dividing DataSet

In [2]:
# Getting data from cleanead csv

file = 'D:\DataScience\Python\Jupyter\Desafio05\Data\Cleaned\Default_Credit_Card_toModel.csv'
df_credit_card = pd.read_csv(file)
df_credit_card.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,Default
0,20000,2,2,1,24,2,2,-1,-1,-2,-2,3913,3102,689,0,0,0,0,689,0,0,0,0,1
1,120000,2,2,2,26,-1,2,0,0,0,2,2682,1725,2682,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,34,0,0,0,0,0,0,29239,14027,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,37,0,0,0,0,0,0,46990,48233,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,57,-1,0,-1,0,0,0,8617,5670,35835,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [3]:
# Defining the target constant
TARGET = 'Default'

# Dividing train and test, ensuring consistency amount
df_train, df_test = train_test_split(df_credit_card, stratify=df_credit_card[TARGET], test_size=0.2, random_state=42)

# Dividing variables
X_train = df_train.drop(TARGET, axis=1)
y_train = df_train[TARGET]

X_test = df_test.drop(TARGET, axis=1)
y_test = df_test[TARGET]

# Adding Outliers

In [4]:
# Isolation Forest ----

# training the model
clf = IsolationForest(random_state=42)


clf.fit(X_train)

# predictions
y_out_train = clf.predict(X_train)
y_out_test = clf.predict(X_test)

In [5]:
# Creating dataset containing score Isolation Forest
X_train_outscores = X_train.copy()
X_train_outscores['score'] = clf.score_samples(X_train)

X_test_outscores = X_test.copy()
X_test_outscores['score'] = clf.score_samples(X_test)

In [6]:
# Creating dataset containing outlier flag identification
X_train_outflags = X_train.copy()
X_train_outflags['anomaly'] = y_out_train

X_test_outflags = X_test.copy()
X_test_outflags['anomaly'] = y_out_test

In [7]:
# checking consistency
# Default == 1 and outlier flag == -1 

# 6,4% are considered as outliers and default

X_train_outflags.loc[X_train_outflags['anomaly']==-1].shape[0] / ((y_train == 1).shape[0])  

0.064125

In [None]:
# Considering variables that has 0, -1 and -2 non-default 

In [21]:
payments = ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']

for col in payments:
    i=col[4]
    X_train_outscores[f'PAY_STATUS_{i}'] = np.where(X_train_outscores[col]>0,1,0)
    
X_train_outscores.sample(10)

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,score,PAY_STATUS_0,PAY_STATUS_2,PAY_STATUS_3,PAY_STATUS_4,PAY_STATUS_5,PAY_STATUS_6
5918,180000,2,3,2,33,1,-1,2,-1,-1,-2,0,199,199,580,0,0,199,0,580,0,0,0,-0.392931,1,0,1,0,0,0
17280,20000,2,2,1,46,0,0,0,0,0,0,19113,20068,17481,15572,15902,35565,1292,1323,1111,441,2158,2400,-0.352926,0,0,0,0,0,0
1424,90000,2,1,2,33,0,0,0,0,0,0,80919,87993,17316,18026,18559,19097,9000,1500,1000,824,1000,857,-0.36857,0,0,0,0,0,0
15070,10000,1,2,1,35,0,0,0,0,0,0,5828,7717,8830,9687,10025,10400,2000,1400,1162,500,600,0,-0.350983,0,0,0,0,0,0
26685,60000,1,2,2,25,2,0,0,0,0,0,60760,60011,61333,60082,38897,39422,2740,2893,2210,938,1188,0,-0.376687,1,0,0,0,0,0
29516,70000,1,2,1,33,-1,-1,0,0,0,0,390,4924,5566,6139,5868,6399,4924,1113,1076,195,1000,235,-0.353294,0,0,0,0,0,0
21514,220000,2,2,2,25,0,0,0,0,0,0,215036,216083,217461,212995,182534,169250,7800,8200,7039,6528,6100,6200,-0.458895,0,0,0,0,0,0
22257,60000,2,1,2,29,1,-1,2,-1,-1,-1,0,351,199,3697,0,532,351,0,3697,0,532,0,-0.387437,1,0,1,0,0,0
10619,80000,1,2,2,26,-1,0,0,0,0,0,2159,4132,5949,7144,5309,500,2000,2002,2005,15,523,0,-0.352976,0,0,0,0,0,0
12022,50000,1,2,1,36,0,0,2,0,0,0,14668,17194,13948,12165,10242,8428,2800,0,500,360,500,125,-0.362511,0,0,1,0,0,0


In [25]:
X_train_outscores.columns

Index(['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2',
       'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'score',
       'PAY_STATUS_0', 'PAY_STATUS_2', 'PAY_STATUS_3', 'PAY_STATUS_4',
       'PAY_STATUS_5', 'PAY_STATUS_6'],
      dtype='object')

In [None]:
# Rainsing the percentage of credit limit balance

In [26]:
payments = ['BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6']

for col in payments:
    i=col[-1]
    x = f'PERC_LIMIT{i}'
    X_train_outscores[f'PERC_LIMIT{i}'] = X_train_outscores[col]/X_train_outscores['LIMIT_BAL']

X_train_outscores.sample(10)

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,score,PAY_STATUS_0,PAY_STATUS_2,PAY_STATUS_3,PAY_STATUS_4,PAY_STATUS_5,PAY_STATUS_6,PERC_LIMIT1,PERC_LIMIT2,PERC_LIMIT3,PERC_LIMIT4,PERC_LIMIT5,PERC_LIMIT6
9642,200000,2,3,2,51,0,0,0,0,0,0,196875,180428,131111,112995,107300,101918,7657,4814,3921,4000,4365,11000,-0.454038,0,0,0,0,0,0,0.984375,0.90214,0.655555,0.564975,0.5365,0.50959
11086,220000,2,2,1,35,0,0,0,0,0,-1,228754,227903,93311,67900,64150,6000,10626,3600,3100,4000,6000,0,-0.430149,0,0,0,0,0,0,1.039791,1.035923,0.424141,0.308636,0.291591,0.027273
27439,150000,1,1,2,32,2,2,2,2,2,2,126115,128860,131426,132703,129080,136996,6300,6100,4900,0,10200,5100,-0.504154,1,1,1,1,1,1,0.840767,0.859067,0.876173,0.884687,0.860533,0.913307
26268,60000,2,3,1,47,0,0,0,0,0,0,33867,35303,36032,37431,37863,38954,2000,1601,2000,1354,1700,1010,-0.367859,0,0,0,0,0,0,0.56445,0.588383,0.600533,0.62385,0.63105,0.649233
2553,210000,2,1,2,26,0,0,0,0,-1,-1,9899,6000,4100,0,10790,7373,1008,1000,0,10790,7373,0,-0.371368,0,0,0,0,0,0,0.047138,0.028571,0.019524,0.0,0.051381,0.03511
11970,30000,1,2,1,47,-1,-1,0,0,0,0,778,5898,8610,8402,10878,10790,5900,3000,299,3000,608,274,-0.360365,0,0,0,0,0,0,0.025933,0.1966,0.287,0.280067,0.3626,0.359667
29521,210000,1,1,1,40,-2,-1,-1,-1,-1,-1,5411,5922,2174,6522,1532,9327,5956,2180,6568,1532,9373,8090,-0.396828,0,0,0,0,0,0,0.025767,0.0282,0.010352,0.031057,0.007295,0.044414
721,10000,1,2,2,22,0,0,0,0,0,0,7960,9649,8518,8628,9293,5033,2000,1000,500,1500,0,2500,-0.359735,0,0,0,0,0,0,0.796,0.9649,0.8518,0.8628,0.9293,0.5033
26976,360000,1,1,1,39,0,-1,-1,-1,0,-1,167685,3288,75244,6468,3169,3130,3304,75260,6484,24,3144,3458,-0.473065,0,0,0,0,0,0,0.465792,0.009133,0.209011,0.017967,0.008803,0.008694
16679,140000,1,2,2,32,1,2,2,2,2,3,67923,69412,67734,73079,75255,75757,3200,0,6500,4000,2000,3000,-0.456899,1,1,1,1,1,1,0.485164,0.4958,0.483814,0.521993,0.537536,0.541121


In [27]:
# applying in testing dataframe

payments = ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']

for col in payments:
    i=col[4]
    X_test_outscores[f'PAY_STATUS_{i}'] = np.where(X_test_outscores[col]>0,1,0)
    
billings = ['BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6']

for col in billings:
    i=col[-1]
    x = f'PERC_LIMIT{i}'
    X_test_outscores[f'PERC_LIMIT{i}'] = X_test_outscores[col]/X_test_outscores['LIMIT_BAL']

X_test_outscores.sample(10)

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,score,PAY_STATUS_0,PAY_STATUS_2,PAY_STATUS_3,PAY_STATUS_4,PAY_STATUS_5,PAY_STATUS_6,PERC_LIMIT1,PERC_LIMIT2,PERC_LIMIT3,PERC_LIMIT4,PERC_LIMIT5,PERC_LIMIT6
23528,450000,2,1,2,36,1,-2,-1,-1,-1,-2,5909,964,613,1797,679,10643,968,615,1834,682,10697,30451,-0.423989,1,0,0,0,0,0,0.013131,0.002142,0.001362,0.003993,0.001509,0.023651
28218,20000,2,2,2,47,0,0,0,-1,-1,-2,20151,19419,19864,4723,0,0,2000,2013,4731,0,0,0,-0.379386,0,0,0,0,0,0,1.00755,0.97095,0.9932,0.23615,0.0,0.0
13837,200000,1,2,2,31,-1,-1,-1,-1,-1,-1,7179,7626,3802,1271,1489,1260,7631,3803,1271,1489,1260,1373,-0.367144,0,0,0,0,0,0,0.035895,0.03813,0.01901,0.006355,0.007445,0.0063
24575,150000,1,1,1,31,-1,-1,-2,-2,-2,-1,15000,0,0,0,0,11694,0,0,0,0,11694,30000,-0.42822,0,0,0,0,0,0,0.1,0.0,0.0,0.0,0.0,0.07796
4275,270000,1,2,1,34,2,0,0,0,0,0,275572,277585,232661,150298,134808,129645,10332,10000,18021,8026,6011,5000,-0.490321,1,0,0,0,0,0,1.020637,1.028093,0.861707,0.556659,0.499289,0.480167
14902,180000,2,2,2,24,0,0,0,0,0,0,126420,130905,136925,56669,24610,20476,6519,10000,3000,2000,2000,1000,-0.392494,0,0,0,0,0,0,0.702333,0.72725,0.760694,0.314828,0.136722,0.113756
4665,470000,1,3,1,47,0,0,0,0,0,0,163529,129341,106390,82181,79402,72121,4700,30002,20000,40000,35000,40000,-0.527887,0,0,0,0,0,0,0.347934,0.275194,0.226362,0.174853,0.16894,0.153449
1591,50000,2,2,2,33,1,2,2,0,0,0,49617,50352,49243,49819,19324,20103,1813,0,2169,702,1101,529,-0.379579,1,1,1,0,0,0,0.99234,1.00704,0.98486,0.99638,0.38648,0.40206
10324,220000,1,3,1,39,0,0,0,0,0,-1,192815,208365,88004,31237,15980,529,20000,5003,3047,5000,1000,81000,-0.458625,0,0,0,0,0,0,0.876432,0.947114,0.400018,0.141986,0.072636,0.002405
2268,30000,1,2,1,39,2,2,2,2,2,0,13888,16652,16087,17317,16900,17465,3000,0,1500,0,1000,3500,-0.417476,1,1,1,1,1,0,0.462933,0.555067,0.536233,0.577233,0.563333,0.582167


# Features Standardization 

In [29]:
# Original DataSet Features

# std_Scale = StandardScaler()

X_train_std = (X_train - X_train.mean()) / (X_train.std()) 

X_test_std = (X_test - X_test.mean()) / (X_test.std()) 

In [30]:
# Outliers DataSet

X_train_outscores_std = (X_train_outscores - X_train_outscores.mean()) / (X_train_outscores.std()) 

X_test_outscores_std = (X_test_outscores - X_test_outscores.mean()) / (X_test_outscores.std())

# ANOVA Analysis

In [31]:
# ANOVA Analysis
selected_anova = f_classif(X_train_outscores_std, y_train)
p_values_num = pd.Series(selected_anova[1])
p_values_num.index = X_train_outscores_std.columns 
p_values_num = p_values_num[p_values_num<0.05]
p_values_num.index

Index(['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2',
       'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3',
       'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'score', 'PAY_STATUS_0',
       'PAY_STATUS_2', 'PAY_STATUS_3', 'PAY_STATUS_4', 'PAY_STATUS_5',
       'PAY_STATUS_6', 'PERC_LIMIT1', 'PERC_LIMIT2', 'PERC_LIMIT3',
       'PERC_LIMIT4', 'PERC_LIMIT5', 'PERC_LIMIT6'],
      dtype='object')

# CHI2 Analysis

In [35]:
# chi2 Analysis

# Because there is negative number it would not possible to run chi2 analysis for category variables

# RFE - Recursive Feature Elimination

>> The complete standardized DataSet will be used:
    - original + outliers standard values - X_train_outscores_std / X_test_outscores_std

In [32]:
category_vars = ['SEX', 'EDUCATION', 'MARRIAGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
num_vars = ['LIMIT_BAL', 
            'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
            'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
            'score' ]

In [33]:
# Evaluating Category Variables
forest = RandomForestRegressor(n_jobs = -1, max_depth = 3)

# Random Forest based on neg_mean_squared_error 
rfecv_RFC = RFECV(estimator=forest, scoring='neg_mean_squared_error')
rfecv_RFC.fit(np.array(X_train_outscores_std[category_vars]), np.array(y_train))

RFECV(estimator=RandomForestRegressor(max_depth=3, n_jobs=-1),
      scoring='neg_mean_squared_error')

In [34]:
print(rfecv_RFC.n_features_) 
print(rfecv_RFC.support_) 
mask_RFC = rfecv_RFC.support_

7
[False  True False  True  True  True  True  True  True]


In [26]:
# Category fields selected by RFE
cols_drop_RFE= [ x for x in mask_RFC] 
cols_drop_RFE= X_train_outscores_std[category_vars].loc[:,cols_drop_RFE].columns.tolist()
cols_drop_RFE

['EDUCATION', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']

In [35]:
# Evaluating Numeric Variables
rfecv_RFC.fit(np.array(X_train_outscores_std[num_vars]), np.array(y_train))

RFECV(estimator=RandomForestRegressor(max_depth=3, n_jobs=-1),
      scoring='neg_mean_squared_error')

In [36]:
# Numeric fields selected by RFE
print(rfecv_RFC.n_features_) 
print(rfecv_RFC.support_) 
mask_RFC = rfecv_RFC.support_
cols_drop_RFE= [ x for x in mask_RFC] 
cols_drop_RFE= X_train_outscores_std[num_vars].loc[:,cols_drop_RFE].columns.tolist()
cols_drop_RFE

10
[ True  True  True  True  True False False  True  True  True  True False
 False  True]


['LIMIT_BAL',
 'BILL_AMT1',
 'BILL_AMT2',
 'BILL_AMT3',
 'BILL_AMT4',
 'PAY_AMT1',
 'PAY_AMT2',
 'PAY_AMT3',
 'PAY_AMT4',
 'score']

# Boruta

In [37]:
# processing Boruta estimator through Random Forest
boruta_selector = BorutaPy(forest, n_estimators = 50, max_iter=100, random_state = 0)

In [38]:
# fit boruta for category variables
boruta_selector.fit(np.array(X_train_outscores_std[category_vars]), np.array(y_train))
# columns selected by Boruta for category variables
X_train_outscores_std[category_vars].loc[:, boruta_selector.support_].columns.tolist()

['EDUCATION', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']

In [39]:
# fit boruta for numeric variables
boruta_selector.fit(np.array(X_train_outscores_std[num_vars]), np.array(y_train))
# columns selected by Boruta for numeric variables
X_train_outscores_std[num_vars].loc[:, boruta_selector.support_].columns.tolist()

['LIMIT_BAL',
 'BILL_AMT1',
 'BILL_AMT2',
 'BILL_AMT3',
 'BILL_AMT4',
 'BILL_AMT5',
 'BILL_AMT6',
 'PAY_AMT1',
 'PAY_AMT2',
 'PAY_AMT3',
 'PAY_AMT4',
 'PAY_AMT5',
 'PAY_AMT6',
 'score']

>> Conclusion
>> Full list for ANOVA hypothesis tests
['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'score', 'PAY_STATUS_0', 'PAY_STATUS_2', 'PAY_STATUS_3', 'PAY_STATUS_4', 'PAY_STATUS_5', 'PAY_STATUS_6', 'PERC_LIMIT1', 'PERC_LIMIT2', 'PERC_LIMIT3','PERC_LIMIT4', 'PERC_LIMIT5', 'PERC_LIMIT6']
>> RFE list:
['EDUCATION', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
['LIMIT_BAL', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'score']
>> Boruta List:
['EDUCATION', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
['LIMIT_BAL', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3',
 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'score']

# Checking Best Performance by Logistic Regression

>> It will be used 2 different datasets:
   1 - original + outliers - X_train_outscores / X_test_outscores
   2 - original + outliers standard values - X_train_outscores_std / X_test_outscores_std
>> Combined wiht 3 set of chosen variables:
   1 - ANOVA
   2 - RFE
   3 - Boruta

In [41]:
logreg = LogisticRegression(random_state=8)

# 1.1 Original Data + ANOVA features

In [42]:
X_log = X_train_outscores[['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2',
       'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3',
       'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'score', 'PAY_STATUS_0',
       'PAY_STATUS_2', 'PAY_STATUS_3', 'PAY_STATUS_4', 'PAY_STATUS_5',
       'PAY_STATUS_6', 'PERC_LIMIT1', 'PERC_LIMIT2', 'PERC_LIMIT3',
       'PERC_LIMIT4', 'PERC_LIMIT5', 'PERC_LIMIT6']]
logreg.fit(X_log, y_train)

X = X_test_outscores[['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2',
       'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3',
       'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'score', 'PAY_STATUS_0',
       'PAY_STATUS_2', 'PAY_STATUS_3', 'PAY_STATUS_4', 'PAY_STATUS_5',
       'PAY_STATUS_6', 'PERC_LIMIT1', 'PERC_LIMIT2', 'PERC_LIMIT3',
       'PERC_LIMIT4', 'PERC_LIMIT5', 'PERC_LIMIT6']]
yhat = logreg.predict(X)

# % Accuracy 


print('Original Data with ANOVA features:')
print(35*'-')

perc_logreg_train = round(logreg.score(X_log, y_train) * 100, 2)
print("% Estimated Logistic Regression Accuracy for Data Set training {:.2f}".format(perc_logreg_train))

perc_logreg_test = round(accuracy_score(y_test, yhat) * 100, 2)
print("% Estimated Logistic Regression Accuracy for Data Set testing {:.2f}".format(perc_logreg_test))

Original Data with ANOVA features:
-----------------------------------
% Estimated Logistic Regression Accuracy for Data Set training 77.87
% Estimated Logistic Regression Accuracy for Data Set testing 77.88


In [43]:
# log loss - erro metric - vote
model_run_loss_train = log_loss(y_train, logreg.predict_proba(X_log))
model_run_loss_test = log_loss(y_test, logreg.predict_proba(X))
print('--- Original Data with ANOVA features:  ---')
print(38*'-')
print(model_run_loss_train)
print(model_run_loss_test)

--- Original Data with ANOVA features:  ---
--------------------------------------
0.5192665217430938
0.5262773767870734


# 2.1 - Standardization data + ANOVA features

In [44]:
X_log = X_train_outscores_std[['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2',
       'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3',
       'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'score', 'PAY_STATUS_0',
       'PAY_STATUS_2', 'PAY_STATUS_3', 'PAY_STATUS_4', 'PAY_STATUS_5',
       'PAY_STATUS_6', 'PERC_LIMIT1', 'PERC_LIMIT2', 'PERC_LIMIT3',
       'PERC_LIMIT4', 'PERC_LIMIT5', 'PERC_LIMIT6']]
logreg.fit(X_log, y_train)

X = X_test_outscores_std[['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2',
       'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3',
       'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'score', 'PAY_STATUS_0',
       'PAY_STATUS_2', 'PAY_STATUS_3', 'PAY_STATUS_4', 'PAY_STATUS_5',
       'PAY_STATUS_6', 'PERC_LIMIT1', 'PERC_LIMIT2', 'PERC_LIMIT3',
       'PERC_LIMIT4', 'PERC_LIMIT5', 'PERC_LIMIT6']]
yhat = logreg.predict(X)

# % Accuracy 


print('Standardization Data with ANOVA features:')
print(42*'-')

perc_logreg_train = round(logreg.score(X_log, y_train) * 100, 2)
print("% Estimated Logistic Regression Accuracy for Data Set training {:.2f}".format(perc_logreg_train))

perc_logreg_test = round(accuracy_score(y_test, yhat) * 100, 2)
print("% Estimated Logistic Regression Accuracy for Data Set testing {:.2f}".format(perc_logreg_test))

Standardization Data with ANOVA features:
------------------------------------------
% Estimated Logistic Regression Accuracy for Data Set training 80.89
% Estimated Logistic Regression Accuracy for Data Set testing 80.65


In [45]:
# log loss - erro metric - vote
model_run_loss_train = log_loss(y_train, logreg.predict_proba(X_log))
model_run_loss_test = log_loss(y_test, logreg.predict_proba(X))
print('--- Standardization Data with ANOVA features:  ---')
print(45*'-')
print(model_run_loss_train)
print(model_run_loss_test)

--- Standardization Data with ANOVA features:  ---
---------------------------------------------
0.44451133776820384
0.45269830993887333


# 2.1 Original Data + RFE features

In [56]:
X_log = X_train_outscores[[ 'EDUCATION', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'LIMIT_BAL', 'BILL_AMT1', 
                            'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'score' ]]
logreg.fit(X_log, y_train)

X = X_test_outscores[[ 'EDUCATION', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'LIMIT_BAL', 'BILL_AMT1',
                       'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'score' ]]

yhat = logreg.predict(X)

# % Accuracy 

print('Original Data with RFE features:')
print(32*'-')

perc_logreg_train = round(logreg.score(X_log, y_train) * 100, 2)
print("% Estimated Logistic Regression Accuracy for Data Set training {:.2f}".format(perc_logreg_train))

perc_logreg_test = round(accuracy_score(y_test, yhat) * 100, 2)
print("% Estimated Logistic Regression Accuracy for Data Set testing {:.2f}".format(perc_logreg_test))

Original Data with RFE features:
--------------------------------
% Estimated Logistic Regression Accuracy for Data Set training 77.88
% Estimated Logistic Regression Accuracy for Data Set testing 77.88


In [57]:
# log loss - erro metric - vote
model_run_loss_train = log_loss(y_train, logreg.predict_proba(X_log))
model_run_loss_test = log_loss(y_test, logreg.predict_proba(X))
print('--- Original Data with RFE features:  ---')
print(45*'-')
print(model_run_loss_train)
print(model_run_loss_test)

--- Original Data with RFE features:  ---
---------------------------------------------
0.5214292985067186
0.5298660144245338


# 2.2 Standardization Data + RFE features

In [58]:
X_log = X_train_outscores_std[[ 'EDUCATION', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'LIMIT_BAL', 'BILL_AMT1', 
                            'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'score' ]]
logreg.fit(X_log, y_train)

X = X_test_outscores_std[[ 'EDUCATION', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'LIMIT_BAL', 'BILL_AMT1',
                       'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'score' ]]

yhat = logreg.predict(X)

# % Accuracy 

print('Standardization Data with RFE features:')
print(35*'-')

perc_logreg_train = round(logreg.score(X_log, y_train) * 100, 2)
print("% Estimated Logistic Regression Accuracy for Data Set training {:.2f}".format(perc_logreg_train))

perc_logreg_test = round(accuracy_score(y_test, yhat) * 100, 2)
print("% Estimated Logistic Regression Accuracy for Data Set testing {:.2f}".format(perc_logreg_test))

Standardization Data with RFE features:
-----------------------------------
% Estimated Logistic Regression Accuracy for Data Set training 81.03
% Estimated Logistic Regression Accuracy for Data Set testing 80.70


In [59]:
# log loss - erro metric - vote
model_run_loss_train = log_loss(y_train, logreg.predict_proba(X_log))
model_run_loss_test = log_loss(y_test, logreg.predict_proba(X))
print('--- Standardization Data with RFE features:  ---')
print(50*'-')
print(model_run_loss_train)
print(model_run_loss_test)

--- Standardization Data with RFE features:  ---
--------------------------------------------------
0.4546084807103964
0.4612766178256287


# 3.1 Original Data + Boruta features

In [62]:
X_log = X_train_outscores[[ 'EDUCATION', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'LIMIT_BAL', 'BILL_AMT1', 
                            'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2',
                            'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'score' ]]
logreg.fit(X_log, y_train)

X = X_test_outscores[[ 'EDUCATION', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'LIMIT_BAL', 'BILL_AMT1', 
                       'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2',
                       'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'score' ]]

yhat = logreg.predict(X)

# % Accuracy 


print('Original Data with Boruta features:')
print(35*'-')

perc_logreg_train = round(logreg.score(X_log, y_train) * 100, 2)
print("% Estimated Logistic Regression Accuracy for Data Set training {:.2f}".format(perc_logreg_train))

perc_logreg_test = round(accuracy_score(y_test, yhat) * 100, 2)
print("% Estimated Logistic Regression Accuracy for Data Set testing {:.2f}".format(perc_logreg_test))

Original Data with Boruta features:
-----------------------------------
% Estimated Logistic Regression Accuracy for Data Set training 77.88
% Estimated Logistic Regression Accuracy for Data Set testing 77.88


In [63]:
# log loss - erro metric - vote
model_run_loss_train = log_loss(y_train, logreg.predict_proba(X_log))
model_run_loss_test = log_loss(y_test, logreg.predict_proba(X))
print('--- Original Data with Boruta features:  ---')
print(50*'-')
print(model_run_loss_train)
print(model_run_loss_test)

--- Original Data with Boruta features:  ---
--------------------------------------------------
0.5208157351921203
0.5278936959057569


# 3.2 Standardization Data + Boruta features

In [60]:
X_log = X_train_outscores_std[[ 'EDUCATION', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'LIMIT_BAL', 'BILL_AMT1', 
                            'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2',
                            'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'score' ]]
logreg.fit(X_log, y_train)

X = X_test_outscores_std[[ 'EDUCATION', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'LIMIT_BAL', 'BILL_AMT1', 
                            'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2',
                            'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'score' ]]
yhat = logreg.predict(X)

# % Accuracy 


print('Standardization Data with Boruta features:')
print(42*'-')

perc_logreg_train = round(logreg.score(X_log, y_train) * 100, 2)
print("% Estimated Logistic Regression Accuracy for Data Set training {:.2f}".format(perc_logreg_train))

perc_logreg_test = round(accuracy_score(y_test, yhat) * 100, 2)
print("% Estimated Logistic Regression Accuracy for Data Set testing {:.2f}".format(perc_logreg_test))

Standardization Data with Boruta features:
------------------------------------------
% Estimated Logistic Regression Accuracy for Data Set training 81.02
% Estimated Logistic Regression Accuracy for Data Set testing 80.73


In [61]:
# log loss - erro metric - vote
model_run_loss_train = log_loss(y_train, logreg.predict_proba(X_log))
model_run_loss_test = log_loss(y_test, logreg.predict_proba(X))
print('--- Standardization Data with Boruta features:  ---')
print(50*'-')
print(model_run_loss_train)
print(model_run_loss_test)

--- Standardization Data with Boruta features:  ---
--------------------------------------------------
0.45249784853149294
0.4563175045773527


# Conclusion

>> All accuracies above related have practically the same measurement, althogh all variables ANOVA with standardized data has lower log loss, besides containing the same variables of Boruta and RFE.

# Delivering processed database

>> It will be used the X_train_outscores_std and X_test_outscores_std datasets:
    concatenate X_train_outscores_std and y_train
    concatenate X_test_outscores_std and y_test
    concatenate the datasets that were done before
    filter ANOVA columns plus Default (TARGET value)
    save processed .csv

In [79]:
df_train_clean_std = pd.concat([X_train_outscores_std, y_train], axis=1) # columns
df_test_clean_std = pd.concat([X_test_outscores_std, y_test], axis=1) # columns
df_clean_std = pd.concat([df_train_clean_std, df_test_clean_std], axis=0) # lines

In [80]:
df_clean_std.shape, df_train_clean_std.shape, df_test_clean_std.shape # checking consistency

((30000, 25), (24000, 25), (6000, 25))

In [81]:
# Creating final dataset
df_clean_processed_std = df_clean[[TARGET, 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 
                                   'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'PAY_AMT1', 
                                   'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'score']]
df_clean_processed_std.shape

(30000, 23)

In [82]:
df_clean_processed_std.sample(3)

Unnamed: 0,Default,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,score
4298,0,-0.520145,0.808424,-1.131442,0.846484,-1.352069,-0.877773,-0.722309,-0.697628,0.190489,0.240255,0.256082,-0.683587,-0.691872,-0.561223,-0.541552,-0.348244,0.127982,-0.297814,-0.318068,-0.316972,0.213334,0.537612
5529,0,-0.211292,-1.236924,1.554473,0.846484,0.387958,0.012576,0.111846,0.139777,0.190489,-0.648482,-0.616925,0.88205,0.727833,-0.527541,-0.505579,0.271015,-0.240951,-0.297814,-0.247753,-0.316972,-0.292986,-0.011398
17306,0,-1.137852,0.808424,0.211516,0.846484,0.49671,-0.877773,-0.722309,-0.697628,-0.670153,-0.648482,-1.489932,-0.691137,-0.681313,-0.685105,-0.67038,-0.301924,-0.290326,-0.285081,-0.318068,-0.316972,-0.180047,0.726923


>> It will be used the X_train_outscores and X_test_outscores datasets:
    concatenate X_train_outscores_std and y_train
    concatenate X_test_outscores_std and y_test
    concatenate the datasets that were done before
    filter ANOVA columns plus Default (TARGET value)
    save processed .csv

In [83]:
df_train_clean = pd.concat([X_train_outscores, y_train], axis=1) # columns
df_test_clean = pd.concat([X_test_outscores, y_test], axis=1) # columns
df_clean = pd.concat([df_train_clean, df_test_clean], axis=0) # lines

In [84]:
df_clean.shape, df_train_clean.shape, df_test_clean.shape # checking consistency

((30000, 25), (24000, 25), (6000, 25))

In [85]:
# Creating final dataset
df_clean_processed = df_clean[[TARGET, 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 
                              'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'PAY_AMT1', 'PAY_AMT2', 
                              'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'score']]
df_clean_processed.shape

(30000, 23)

In [86]:
df_clean_processed.sample(3)

Unnamed: 0,Default,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,score
18604,0,60000,2,1,2,24,-1,-1,-1,-1,-1,-1,6686,1049,14020,4880,1049,14020,4880,2233,2968,8067,-0.392769
23305,0,100000,2,2,1,42,-2,-2,-2,-2,-2,-2,0,0,3168,0,0,3168,0,0,550,0,-0.399321
24727,0,470000,1,1,1,40,1,2,0,0,0,0,276880,241153,204223,151253,5014,10000,5214,4500,5000,4200,-0.488433


# Saving .csv processed

In [87]:
# saving processed standard scaled
path = 'D:\DataScience\Python\Jupyter\Desafio05\Data\Processed'
file = '\Default_Credit_Card_processedStd_toModel_1.csv'
df_clean_processed_std.to_csv(path+file, index=False)

In [88]:
# saving processed cleaned data
path = 'D:\DataScience\Python\Jupyter\Desafio05\Data\Processed'
file = '\Default_Credit_Card_processed_toModel_1.csv'
df_clean_processed.to_csv(path+file, index=False)