In [104]:
import os
import sys
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score, f1_score
from sklearn.metrics import make_scorer, precision_recall_curve, auc
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV
from bayes_opt import BayesianOptimization
#from sklearn.experimental import enable_halving_search_cv 
#from sklearn.model_selection import HalvingRandomSearchCV
#from sklearn.model_selection import HalvingGridSearchCV 
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn_extensions.extreme_learning_machines.elm import ELMClassifier, ELMRegressor, GenELMClassifier, GenELMRegressor
from sklearn_extensions.extreme_learning_machines.random_layer import RBFRandomLayer, MLPRandomLayer, GRBFRandomLayer, RandomLayer

def ks_stat(y, yhat):
    return ks_2samp(yhat[y==1], yhat[y!=1]).statistic

# Menu

<a name="navegacao"></a>

## 1) [Preparação dos dados](#parte1)
- 1.1 [Leitura base principal](#principal)
- 1.2 [Leitura base mes](#mes)
- 1.3 [Leitura base hora](#hora)
- 1.4 [Merge principal e base mensal](#merge1)
- 1.5 [Merge principal e base hora](#merge2)
- 1.6 [Confere marcação](#marcacao)


## 2 [Salvando as bases de treino](#parte2)
- 2.1 [Salvando base com histórico](#comhist)
- 2.2 [Salvando base sem histórico](#semhist)


<a name="principal"></a>

## 1.1) Leitura base principal


## Leitura dos dados

In [3]:
%%time
df = pd.read_csv("BNDES_UNIFICADO.csv",converters={'CNPJ8': str,'INTERMEDIARIA': str},  delimiter=";" , encoding='latin-1')

CPU times: total: 484 ms
Wall time: 485 ms


In [4]:
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)

In [5]:
df.shape

(125516, 27)

In [6]:
df.columns

Index(['CNPJ8', 'ANO', 'EMPRESA_PUBLICA', 'GARANTIA', 'INDIRETA', 'INOVACAO',
       'INSTRUMENTO', 'UF', 'INTERMEDIARIA', 'CUSTO', 'MEDIA_JUROS',
       'PRAZO_AMORTIZACAO', 'TESOURO', 'PRAZO_CARENCIA', 'VALOR_CONTRATO',
       'VALOR_DESENBOLSO', 'Porte_Cliente', 'CAPITAL_SOCIAL', 'IDADE',
       'NATJUR', 'NCONTRATOS', 'NFILIAIS', 'PORTE_RECEITA', 'SITUACAO',
       'IDADE_SOCIOS', 'QTDSOCIOS', 'SOCIO_PJ'],
      dtype='object')

In [7]:
%%time
df.head()

CPU times: total: 0 ns
Wall time: 0 ns


Unnamed: 0,CNPJ8,ANO,EMPRESA_PUBLICA,GARANTIA,INDIRETA,INOVACAO,INSTRUMENTO,UF,INTERMEDIARIA,CUSTO,...,CAPITAL_SOCIAL,IDADE,NATJUR,NCONTRATOS,NFILIAIS,PORTE_RECEITA,SITUACAO,IDADE_SOCIOS,QTDSOCIOS,SOCIO_PJ
0,0,2002,0,OUTROS,0,1,OUTROS,IE,0.0,TJLP,...,90000020000.0,37.0,2.0,15,5089,5.0,1,1.0,1,0
1,0,2003,0,OUTROS,0,0,OUTROS,IE,92816560.0,TJLP,...,90000020000.0,38.0,2.0,2,5123,5.0,1,1.0,1,0
2,0,2009,1,SEM GARANTIA,0,0,OUTROS,RJ,,TAXAFIXA,...,90000020000.0,44.0,2.0,5,6912,5.0,1,1.0,1,0
3,0,2010,1,SEM GARANTIA,0,0,OUTROS,RJ,,TAXAFIXA,...,90000020000.0,45.0,2.0,2,7002,5.0,1,1.0,1,0
4,0,2012,1,MISTA,0,0,OUTROS,IE,,TJLP,...,90000020000.0,47.0,2.0,1,7407,5.0,1,1.0,1,0


In [8]:
%%time
df.dtypes

CPU times: total: 0 ns
Wall time: 0 ns


CNPJ8                 object
ANO                    int64
EMPRESA_PUBLICA        int64
GARANTIA              object
INDIRETA               int64
INOVACAO               int64
INSTRUMENTO           object
UF                    object
INTERMEDIARIA         object
CUSTO                 object
MEDIA_JUROS          float64
PRAZO_AMORTIZACAO    float64
TESOURO                int64
PRAZO_CARENCIA       float64
VALOR_CONTRATO       float64
VALOR_DESENBOLSO     float64
Porte_Cliente         object
CAPITAL_SOCIAL       float64
IDADE                float64
NATJUR               float64
NCONTRATOS             int64
NFILIAIS               int64
PORTE_RECEITA        float64
SITUACAO               int64
IDADE_SOCIOS         float64
QTDSOCIOS              int64
SOCIO_PJ               int64
dtype: object

In [9]:
pd.crosstab(df.PORTE_RECEITA,df.SITUACAO)

SITUACAO,0,1
PORTE_RECEITA,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,72792,2639
3.0,22618,1864
5.0,21244,4359


In [10]:
pd.crosstab(df.Porte_Cliente,df.SITUACAO)

SITUACAO,0,1
Porte_Cliente,Unnamed: 1_level_1,Unnamed: 2_level_1
GRANDE,3692,1219
MICRO,40280,1110
MÉDIA,20556,3430
PEQUENA,52126,3103


In [11]:
%%time
pd.crosstab(df.CUSTO, df.SITUACAO )

CPU times: total: 62.5 ms
Wall time: 36.3 ms


SITUACAO,0,1
CUSTO,Unnamed: 1_level_1,Unnamed: 2_level_1
CDI,14,2
IPCA,48,8
OUTROS,248,119
SELIC,28761,1823
TAXAFIXA,30327,1840
TJLP,20437,2296
TLP,36819,2774


In [12]:
%%time
pd.crosstab(df.NATJUR, df.SITUACAO)

CPU times: total: 31.2 ms
Wall time: 33 ms


SITUACAO,0,1
NATJUR,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,184,49
2.0,116254,8786
3.0,187,21
4.0,29,6


In [13]:
pd.crosstab(df.EMPRESA_PUBLICA, df.SITUACAO)

SITUACAO,0,1
EMPRESA_PUBLICA,Unnamed: 1_level_1,Unnamed: 2_level_1
0,116375,8770
1,279,92


In [14]:
pd.crosstab(df.INDIRETA, df.SITUACAO)

SITUACAO,0,1
INDIRETA,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2711,792
1,113943,8070


In [15]:
pd.crosstab(df.INOVACAO, df.SITUACAO)

SITUACAO,0,1
INOVACAO,Unnamed: 1_level_1,Unnamed: 2_level_1
0,116310,8730
1,344,132


In [16]:
pd.crosstab(df.TESOURO, df.SITUACAO)

SITUACAO,0,1
TESOURO,Unnamed: 1_level_1,Unnamed: 2_level_1
0,110155,7886
1,6499,976


In [17]:
pd.crosstab(df.SOCIO_PJ, df.SITUACAO)

SITUACAO,0,1
SOCIO_PJ,Unnamed: 1_level_1,Unnamed: 2_level_1
0,116654,8862


In [18]:
%%time
df.loc[ (df['CAPITAL_SOCIAL']<1)     ,'CAPITAL_SOCIAL']    = 1
df.loc[ (df['IDADE']<1)              ,'IDADE']             = 1
df.loc[ (df['NCONTRATOS']<1)         ,'NCONTRATOS']        = 1
df.loc[ (df['NFILIAIS']<1)           ,'NFILIAIS']          = 1
df.loc[ (df['IDADE_SOCIOS']<1)       ,'IDADE_SOCIOS']      = 1
df.loc[ (df['QTDSOCIOS']<1)          ,'QTDSOCIOS']         = 1
df.loc[ (df['MEDIA_JUROS']<1)        ,'MEDIA_JUROS']       = 1
df.loc[ (df['PRAZO_AMORTIZACAO']<1)  ,'PRAZO_AMORTIZACAO'] = 1
df.loc[ (df['PRAZO_CARENCIA']<1)     ,'PRAZO_CARENCIA']    = 1
df.loc[ (df['VALOR_CONTRATO']<1)     ,'VALOR_CONTRATO']    = 1
df.loc[ (df['VALOR_DESENBOLSO']<1)   ,'VALOR_DESENBOLSO']  = 1

CPU times: total: 0 ns
Wall time: 13 ms


In [19]:
%%time
df['enc_NATJUR']        = df.NATJUR.astype("category").cat.codes
df['enc_GARANTIA']      = df.GARANTIA.astype("category").cat.codes
df['enc_INSTRUMENTO']   = df.INSTRUMENTO.astype("category").cat.codes
df['enc_CUSTO']         = df.CUSTO.astype("category").cat.codes
df['enc_PORTE_CLIENTE'] = df.Porte_Cliente.astype("category").cat.codes
df['enc_PORTE_RECEITA'] = df.PORTE_RECEITA.astype("category").cat.codes
df['enc_SITUACAO']      = df.SITUACAO.astype("category").cat.codes
df['enc_UF']            = df.UF.astype("category").cat.codes

CPU times: total: 62.5 ms
Wall time: 57.3 ms


In [20]:
%%time
df['ln_capsoc']        = np.log(df['CAPITAL_SOCIAL']+1)
df['ln_idade']         = np.log(df['IDADE']+1)
df['ln_contratos']     = np.log(df['NCONTRATOS']+1)
df['ln_filiais']       = np.log(df['NFILIAIS']+1)
df['ln_sociosage']     = np.log(df['IDADE_SOCIOS']+1)
df['ln_qtdsocios']     = np.log(df['QTDSOCIOS']+1)
df['ln_juros']         = np.log(df['MEDIA_JUROS']+1)
df['ln_amortizacao']   = np.log(df['PRAZO_AMORTIZACAO']+1)
df['ln_carencia']      = np.log(df['PRAZO_CARENCIA']+1)
df['ln_vlrcontrato']   = np.log(df['VALOR_CONTRATO']+1)
df['ln_vlrdesembolso'] = np.log(df['VALOR_DESENBOLSO']+1)

CPU times: total: 31.2 ms
Wall time: 30.9 ms


In [21]:
import numpy as np
df['VALOR_CONTRATO'].dropna().quantile([0.0, 0.10, 0.25, 0.50, 0.75, 0.90])

0.00        400.00
0.10      20000.00
0.25      50000.00
0.50     130000.00
0.75     360275.75
0.90    1074983.50
Name: VALOR_CONTRATO, dtype: float64

In [22]:
def categ(row):
    if row['VALOR_CONTRATO']<=20000:
        return 0
    elif row['VALOR_CONTRATO']<=50000:
        return 1
    elif row['VALOR_CONTRATO']<=130000:
        return 2
    elif row['VALOR_CONTRATO']<=360000:
        return 3
    elif row['VALOR_CONTRATO']<=1000000:
        return 4
    else:
        return 5

In [23]:
df['combo'] = df.apply(categ, axis=1)

In [24]:
df[['combo','VALOR_CONTRATO','CNPJ8']].groupby(['combo']).mean()

  df[['combo','VALOR_CONTRATO','CNPJ8']].groupby(['combo']).mean()


Unnamed: 0_level_0,VALOR_CONTRATO
combo,Unnamed: 1_level_1
0,10077.23
1,36728.02
2,90570.2
3,225250.0
4,588491.8
5,61808240.0


In [25]:
%%time
for pow in range(2,5):
    for spl in range(0,7):
        df.loc[ (df['combo']==spl),'spl_contrato_{0}_{1}'.format(pow,spl)] = df['ln_vlrcontrato']**pow
        df.loc[~(df['combo']==spl),'spl_contrato_{0}_{1}'.format(pow,spl)] = 0

CPU times: total: 219 ms
Wall time: 237 ms


## Modelo sem histórico foi treinado com as variáveis na seguinte ordem:
['faixa_hora', 'vl_medio_mes_atual', 'dif_vl_1', 'tres_prim_dig_codbarras', 'pagador_pf', 'dif_vl_4', 'dia_do_mes', 'qtd_operacoes_mes_corrente', 'vl_medio_dia_corrente', 'sec_dig', 'qtd_operacoes_dia_corrente', 'qtd_trn_60min', 'centavos', 'dia_da_semana']

In [26]:
%%time
files = df.columns
selected_files = files.str.contains('ln_|spl_|INDIRETA|EMPRESA_PUBLICA|INOVACAO|TESOURO|SOCIO_PJ')
atributes = files[selected_files]

CPU times: total: 0 ns
Wall time: 0 ns


In [27]:
atributes

Index(['EMPRESA_PUBLICA', 'INDIRETA', 'INOVACAO', 'TESOURO', 'SOCIO_PJ',
       'ln_capsoc', 'ln_idade', 'ln_contratos', 'ln_filiais', 'ln_sociosage',
       'ln_qtdsocios', 'ln_juros', 'ln_amortizacao', 'ln_carencia',
       'ln_vlrcontrato', 'ln_vlrdesembolso', 'spl_contrato_2_0',
       'spl_contrato_2_1', 'spl_contrato_2_2', 'spl_contrato_2_3',
       'spl_contrato_2_4', 'spl_contrato_2_5', 'spl_contrato_2_6',
       'spl_contrato_3_0', 'spl_contrato_3_1', 'spl_contrato_3_2',
       'spl_contrato_3_3', 'spl_contrato_3_4', 'spl_contrato_3_5',
       'spl_contrato_3_6', 'spl_contrato_4_0', 'spl_contrato_4_1',
       'spl_contrato_4_2', 'spl_contrato_4_3', 'spl_contrato_4_4',
       'spl_contrato_4_5', 'spl_contrato_4_6'],
      dtype='object')

In [28]:
df[atributes].dtypes

EMPRESA_PUBLICA       int64
INDIRETA              int64
INOVACAO              int64
TESOURO               int64
SOCIO_PJ              int64
ln_capsoc           float64
ln_idade            float64
ln_contratos        float64
ln_filiais          float64
ln_sociosage        float64
ln_qtdsocios        float64
ln_juros            float64
ln_amortizacao      float64
ln_carencia         float64
ln_vlrcontrato      float64
ln_vlrdesembolso    float64
spl_contrato_2_0    float64
spl_contrato_2_1    float64
spl_contrato_2_2    float64
spl_contrato_2_3    float64
spl_contrato_2_4    float64
spl_contrato_2_5    float64
spl_contrato_2_6    float64
spl_contrato_3_0    float64
spl_contrato_3_1    float64
spl_contrato_3_2    float64
spl_contrato_3_3    float64
spl_contrato_3_4    float64
spl_contrato_3_5    float64
spl_contrato_3_6    float64
spl_contrato_4_0    float64
spl_contrato_4_1    float64
spl_contrato_4_2    float64
spl_contrato_4_3    float64
spl_contrato_4_4    float64
spl_contrato_4_5    

In [29]:
df[atributes].describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
EMPRESA_PUBLICA,125516.0,0.002956,0.054287,0.0,0.0,0.0,0.0,1.0
INDIRETA,125516.0,0.972091,0.164712,0.0,1.0,1.0,1.0,1.0
INOVACAO,125516.0,0.003792,0.061465,0.0,0.0,0.0,0.0,1.0
TESOURO,125516.0,0.059554,0.23666,0.0,0.0,0.0,0.0,1.0
SOCIO_PJ,125516.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ln_capsoc,125516.0,10.674521,3.548741,0.693147,9.615872,11.002117,11.982935,26.048381
ln_idade,125516.0,1.250731,0.858364,0.693147,0.693147,0.693147,1.791759,4.718499
ln_contratos,125516.0,0.813766,0.313496,0.693147,0.693147,0.693147,0.693147,5.613128
ln_filiais,125516.0,0.876072,0.495155,0.693147,0.693147,0.693147,0.693147,8.921591
ln_sociosage,125516.0,2.572323,1.539442,0.693147,0.693147,3.610918,3.850148,4.51086


## SMOTE: Synthetic Minority Oversampling Technique

In [30]:
y0 = df['SITUACAO'].values.reshape(-1, 1)
#y0 = df['SITUACAO'].values
X0 = df[atributes]

In [31]:
#import joblib
#scaler = joblib.load("scaler.saved") 

from numpy import asarray
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
# transform data
X0 = scaler.fit_transform(df[atributes])

In [32]:
# base sintética: treino e teste
x_train  , x_test0 , y_train, y_test0 = train_test_split(X0, y0, test_size = 0.4, random_state=123)

# base sintética: teste e out of sample
x_test , x_out , y_test, y_out = train_test_split(x_test0, y_test0, test_size = 0.4, random_state=123)

In [91]:
len([{'power':pow/10} for pow in range(1,252)])

251

<a name="gen_elm_rl"></a>

## 2.1) GEN Random Layer

- ir para [Menu Principal](#navegacao)

In [None]:
%%time
from bayes_opt import BayesianOptimization
from sklearn.metrics import roc_auc_score
from catboost import Pool, CatBoost, CatBoostClassifier
import catboost

af_range = ['tanh','sine','tribas','sigmoid','hardlim','softlim','gaussian','multiquadric','inv_multiquadric']
aa_range = [{'power':pow/10} for pow in range(1,251)]

def gbm_xgb(activation_func, n_hidden, rbf_width, alpha, activation_args):
    model = GenELMClassifier(
                  hidden_layer=RandomLayer( random_state    = 1 ,
                                            activation_func = af_range[int(activation_func)],
                                            n_hidden        = int(n_hidden),
                                            rbf_width       = rbf_width,
                                            alpha           = alpha,
                                            activation_args = aa_range[int(activation_args )]),
                   )
    model.fit(x_train, y_train)
    pred_labels = model.predict(x_test)
    return roc_auc_score(y_test, pred_labels)

params_xgb = {
    'activation_func'  : (0.000 ,8.0),
    'n_hidden'         : (20.00 ,350),
    'rbf_width'        : (0.005 ,0.9),
    'alpha'            : (0.005 ,0.9),
    'activation_args'  : (0.000 ,250)
}

xgb0 = BayesianOptimization(f=gbm_xgb, pbounds=params_xgb, random_state=123,allow_duplicate_points=True) 
xgb0.set_gp_params(alpha=1e-4)
xgb0.maximize(init_points=30, n_iter=370)

In [95]:
params_xgb = xgb0.max['params']
params_xgb

{'activation_args': 235.77831652667336,
 'activation_func': 6.313980746369444,
 'alpha': 0.1898181118253121,
 'n_hidden': 349.9158657751784,
 'rbf_width': 0.7769668484668575}

In [98]:
activation_func = af_range[int(xgb0.max['params']['activation_func'])]
n_hidden        = int(xgb0.max['params']['n_hidden'])
rbf_width       = xgb0.max['params']['rbf_width']
alpha           = xgb0.max['params']['alpha']
activation_args = aa_range[int(xgb0.max['params']['activation_args'])]

print('\n activation_func:'    , activation_func,
      '\n n_hidden:'           , n_hidden,
      '\n rbf_width:'          , rbf_width,
      '\n alpha:'              , alpha,
      '\n activation_args:'    , activation_args)


 activation_func: gaussian 
 n_hidden: 349 
 rbf_width: 0.7769668484668575 
 alpha: 0.1898181118253121 
 activation_args: {'power': 23.6}


In [99]:
%%time
cbbb = GenELMClassifier( hidden_layer=RandomLayer( random_state     =1 ,
                         activation_func  = activation_func,
                         n_hidden         = n_hidden,
                         rbf_width        = rbf_width,
                         alpha            = alpha,
                         activation_args  = activation_args )  )

cbbb.fit(x_train, y_train)

CPU times: total: 27.2 s
Wall time: 6.6 s


In [114]:
#np.exp(cbbb.decision_function(X0))/(1+np.exp(-cbbb.decision_function(X0)))
from scipy.stats import ks_2samp
def ks_stat(y, yhat):
    return ks_2samp(yhat[y==1], yhat[y!=1]).statistic

ks_stat( y_train, 
         np.exp(cbbb.decision_function(x_train))/(1+np.exp(-cbbb.decision_function(x_train)))
         )

0.0669545545917211

In [115]:
%%time
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score
print("Accuracy:" , accuracy_score(  y_train, cbbb.predict(x_train))) 
print("F1 score:" , f1_score(        y_train, cbbb.predict(x_train))) 
print("Recall:"   , recall_score(    y_train, cbbb.predict(x_train))) 
print("Precision:", precision_score( y_train, cbbb.predict(x_train))) 
print("KS score:" , ks_stat(         y_train, cbbb.decision_function(x_train) ))
print("AUC score:", roc_auc_score(   y_train, cbbb.predict(x_train)))

Accuracy: 0.8796558180297176
F1 score: 0.19404179635393506
Recall: 0.2036587642337129
Precision: 0.18529211956521738
KS score: 0.13863860363205294
AUC score: 0.5675415847701044
CPU times: total: 16.4 s
Wall time: 11.9 s


In [116]:
%%time
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
print("Accuracy:" ,accuracy_score(  y_test, cbbb.predict(x_test))) 
print("F1 score:" ,f1_score(        y_test, cbbb.predict(x_test))) 
print("Recall:"   ,recall_score(    y_test, cbbb.predict(x_test))) 
print("Precision:",precision_score( y_test, cbbb.predict(x_test))) 
print("KS score:" , ks_stat(        y_test, cbbb.decision_function(x_test) ))
print("AUC score:", roc_auc_score(  y_test, cbbb.predict(x_test)))

Accuracy: 0.8786017793121763
F1 score: 0.20100502512562812
Recall: 0.21739130434782608
Precision: 0.18691588785046728
KS score: 0.14926217408042752
AUC score: 0.5729737155843672
CPU times: total: 6.2 s
Wall time: 4.79 s


In [117]:
%%time
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
print("Accuracy:" ,accuracy_score(  y_out, cbbb.predict(x_out))) 
print("F1 score:" ,f1_score(        y_out, cbbb.predict(x_out))) 
print("Recall:"   ,recall_score(    y_out, cbbb.predict(x_out))) 
print("Precision:",precision_score( y_out, cbbb.predict(x_out))) 
print("KS score:" , ks_stat(        y_out, cbbb.decision_function(x_out) ))
print("AUC score:", roc_auc_score(  y_out, cbbb.predict(x_out)))


Accuracy: 0.8831847831499278
F1 score: 0.19986357435197818
Recall: 0.210943124550036
Precision: 0.1898898250162022
KS score: 0.1471217747107887
AUC score: 0.5720383751561563
CPU times: total: 4.38 s
Wall time: 3.39 s


## Leitura dos dados originais

In [None]:
class ELMWrapper(ELMClassifier):
    def predict_proba(self, x):
        return self.decision_function(x)

from sklearn.preprocessing import LabelBinarizer
elm = ELMWrapper(binarizer=LabelBinarizer())

In [82]:
max(cbbb.decision_function(X0)), min(cbbb.decision_function(X0))

(array([1.06466694]), array([-2.30949413]))

In [86]:
%%time
#df['proba'] = cbbb.predict_proba(X0)[:,1]
df['proba'] = np.exp(cbbb.decision_function(X0))/(1+np.exp(-cbbb.decision_function(X0)))
#df['proba'] = (cbbb.decision_function(X0)-min(cbbb.decision_function(X0)))/(max(cbbb.decision_function(X0))-min(cbbb.decision_function(X0)))

CPU times: total: 4.73 s
Wall time: 3.37 s


In [87]:
df[['SITUACAO','proba']].groupby(['SITUACAO']).count()

Unnamed: 0_level_0,proba
SITUACAO,Unnamed: 1_level_1
0,116654
1,8862


In [88]:
%%time
df[['SITUACAO','proba']].groupby(['SITUACAO']).describe()

CPU times: total: 31.2 ms
Wall time: 43.9 ms


Unnamed: 0_level_0,proba,proba,proba,proba,proba,proba,proba,proba
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
SITUACAO,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,116654.0,0.499951,0.004306,0.008972,0.5,0.5,0.5,1.117434
1,8862.0,0.500606,0.025107,0.13147,0.5,0.5,0.5,2.156292


In [207]:
%%time
df.loc[ df['proba']>=0.75 ,'PRED'] = 1
df.loc[ df['proba'] <0.75 ,'PRED'] = 0

CPU times: user 2.55 ms, sys: 4.13 ms, total: 6.68 ms
Wall time: 4.9 ms


<a name="gen_mlpr"></a>

## 2.2) GEN MLPR Random Layer

- ir para [Menu Principal](#navegacao)

In [121]:
%%time
from bayes_opt import BayesianOptimization
from sklearn.metrics import roc_auc_score
from catboost import Pool, CatBoost, CatBoostClassifier
import catboost

af_range = ['tanh','sine','tribas','sigmoid','hardlim','softlim','gaussian','multiquadric','inv_multiquadric']
aa_range = [{'power':pow/10} for pow in range(1,252)]

def gbm_xgb(activation_func, n_hidden, activation_args):
    model = GenELMClassifier(
                  hidden_layer=MLPRandomLayer( random_state    = 1 ,
                                               activation_func = af_range[int(activation_func)],
                                               n_hidden        = int(n_hidden),
                                               activation_args = aa_range[int(activation_args )]),
                   )
    model.fit(x_train, y_train)
    pred_labels = model.predict(x_test)
    return roc_auc_score(y_test, pred_labels)

params_xgb = {
    'activation_func'  : (0.000 ,8.0),
    'n_hidden'         : (20.00 ,350),
    'activation_args'  : (0.000 ,250)
}

xgb0 = BayesianOptimization(f=gbm_xgb, pbounds=params_xgb, random_state=123,allow_duplicate_points=True) 
xgb0.set_gp_params(alpha=1e-4)
xgb0.maximize(init_points=30, n_iter=370)

|   iter    |  target   | activa... | activa... | n_hidden  |
-------------------------------------------------------------
| [0m1        [0m | [0m0.5106   [0m | [0m174.1    [0m | [0m2.289    [0m | [0m94.86    [0m |
| [95m2        [0m | [95m0.5174   [0m | [95m137.8    [0m | [95m5.756    [0m | [95m159.6    [0m |
| [95m3        [0m | [95m0.52     [0m | [95m245.2    [0m | [95m5.479    [0m | [95m178.7    [0m |
| [0m4        [0m | [0m0.5155   [0m | [0m98.03    [0m | [0m2.745    [0m | [0m260.6    [0m |
| [95m5        [0m | [95m0.5218   [0m | [95m109.6    [0m | [95m0.4774   [0m | [95m151.4    [0m |
| [0m6        [0m | [0m0.5172   [0m | [0m184.5    [0m | [0m1.46     [0m | [0m77.9     [0m |
| [0m7        [0m | [0m0.5097   [0m | [0m132.9    [0m | [0m4.255    [0m | [0m229.4    [0m |
| [0m8        [0m | [0m0.5204   [0m | [0m212.4    [0m | [0m5.796    [0m | [0m221.6    [0m |
| [0m9        [0m | [0m0.5117   [0m | [0

In [122]:
params_xgb = xgb0.max['params']
params_xgb

{'activation_args': 47.10748745136126,
 'activation_func': 3.674010999171463,
 'n_hidden': 348.7326283636427}

In [123]:
activation_func = af_range[int(xgb0.max['params']['activation_func'])]
n_hidden        = int(xgb0.max['params']['n_hidden'])
activation_args = aa_range[int(xgb0.max['params']['activation_args'])]

print('\n activation_func:'    , activation_func,
      '\n n_hidden:'           , n_hidden,
      '\n activation_args:'    , activation_args)


 activation_func: sigmoid 
 n_hidden: 348 
 activation_args: {'power': 4.8}


In [124]:
%%time
cbbb = GenELMClassifier( hidden_layer=RandomLayer( random_state  =1 ,
                         activation_func  = activation_func,
                         n_hidden         = n_hidden,
                         activation_args  = activation_args )  )

cbbb.fit(x_train, y_train)

CPU times: total: 27.2 s
Wall time: 6.27 s


In [125]:
%%time
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score
print("Accuracy:" , accuracy_score(  y_train, cbbb.predict(x_train))) 
print("F1 score:" , f1_score(        y_train, cbbb.predict(x_train))) 
print("Recall:"   , recall_score(    y_train, cbbb.predict(x_train))) 
print("Precision:", precision_score( y_train, cbbb.predict(x_train))) 
print("KS score:" , ks_stat(         y_train, cbbb.decision_function(x_train) ))
print("AUC score:", roc_auc_score(   y_train, cbbb.predict(x_train)))

Accuracy: 0.9284547663625862
F1 score: 0.11352418558736427
Recall: 0.06440171737913011
Precision: 0.478502080443828
KS score: 0.7790013101172787
AUC score: 0.5295133015074973
CPU times: total: 14.3 s
Wall time: 10 s


In [126]:
%%time
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
print("Accuracy:" ,accuracy_score(  y_test, cbbb.predict(x_test))) 
print("F1 score:" ,f1_score(        y_test, cbbb.predict(x_test))) 
print("Recall:"   ,recall_score(    y_test, cbbb.predict(x_test))) 
print("Precision:",precision_score( y_test, cbbb.predict(x_test))) 
print("KS score:" , ks_stat(        y_test, cbbb.decision_function(x_test) ))
print("AUC score:", roc_auc_score(  y_test, cbbb.predict(x_test)))

Accuracy: 0.9283959633514806
F1 score: 0.10978126289723485
Recall: 0.06285444234404537
Precision: 0.43322475570032576
KS score: 0.7795320699621875
AUC score: 0.52832096581641
CPU times: total: 5.89 s
Wall time: 3.99 s


In [144]:
%%time
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
print("Accuracy:" ,accuracy_score(  y_out, cbbb.predict(x_out))) 
print("F1 score:" ,f1_score(        y_out, cbbb.predict(x_out))) 
print("Recall:"   ,recall_score(    y_out, cbbb.predict(x_out))) 
print("Precision:",precision_score( y_out, cbbb.predict(x_out))) 
print("KS score:" , ks_stat(        y_out, cbbb.decision_function(x_out)))
print("AUC score:", roc_auc_score(  y_out, cbbb.decision_function(x_out)))

Accuracy: 0.9286461186077777
F1 score: 0.11379097093382808
Recall: 0.06623470122390208
Precision: 0.40350877192982454
KS score: 0.7712198729675607
AUC score: 0.918362501899602
CPU times: total: 3.72 s
Wall time: 2.6 s


<a name="falsopos"></a>

## 3.2) Taxa de Falso Positivo
  
- ir para [Menu Principal](#navegacao)

In [129]:
%%time

#df['proba'] = cbbb.predict_proba(X0)[:,1]
df['proba'] = np.exp(cbbb.decision_function(X0))/(1+np.exp(-cbbb.decision_function(X0)))


CPU times: total: 8.5 s
Wall time: 6.29 s


In [130]:
%%time
df[['SITUACAO','proba']].groupby(['SITUACAO']).describe()

CPU times: total: 78.1 ms
Wall time: 68.8 ms


Unnamed: 0_level_0,proba,proba,proba,proba,proba,proba,proba,proba
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
SITUACAO,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,116654.0,0.129582,0.081057,0.002502,0.097595,0.100824,0.126025,5.38455
1,8862.0,0.291836,0.247643,0.042775,0.20341,0.248162,0.318468,13.275781


In [134]:
%%time
df.loc[ df['proba']>=0.15 ,'PRED'] = 1
df.loc[ df['proba'] <0.15 ,'PRED'] = 0

CPU times: total: 0 ns
Wall time: 3.99 ms


In [135]:
%%time
df[['SITUACAO','PRED']].groupby(['PRED']).count()

CPU times: total: 0 ns
Wall time: 4.99 ms


Unnamed: 0_level_0,SITUACAO
PRED,Unnamed: 1_level_1
0.0,96272
1.0,29244


In [136]:
pd.crosstab(df.SITUACAO, df.PRED)

PRED,0.0,1.0
SITUACAO,Unnamed: 1_level_1,Unnamed: 2_level_1
0,95847,20807
1,425,8437


In [140]:
metrics_list_safra = []
safras = df['ANO'].unique()
safras.sort()
i=0
for ano in safras:
    metrics = dict()
    metrics['Model Index']=i
    metrics['ANO']=ano
    valid_hue = df[df['ANO']==ano]
    print(ano)
    confusao_pop = confusion_matrix(valid_hue['SITUACAO'], valid_hue['PRED'])
    
    metrics["Accuracy"]  = accuracy_score(valid_hue['SITUACAO'] , valid_hue['PRED'])
    metrics['AUC']       = roc_auc_score(valid_hue['SITUACAO']  , valid_hue['proba'])
    metrics['KS Score']  = ks_stat(valid_hue['SITUACAO']        , valid_hue['proba'])
    metrics["F1 score"]  = f1_score(valid_hue['SITUACAO']       , valid_hue['PRED'])
    metrics["Recall"]    = recall_score(valid_hue['SITUACAO']   , valid_hue['PRED']) 
    metrics["Precision"] = precision_score(valid_hue['SITUACAO'], valid_hue['PRED'])
    metrics["False positive rate"] = confusao_pop[0][1]/(confusao_pop[0][0]+confusao_pop[0][1])
    metrics["True positive rate"]  = confusao_pop[1][1]/(confusao_pop[1][0]+confusao_pop[1][1])
    metrics_list_safra.append(metrics)
    del valid_hue, confusao_pop

2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022


In [141]:
df_metrics = pd.DataFrame(metrics_list_safra)

In [142]:
df_metrics.to_csv('df_metrics.csv')

In [143]:
df_metrics

Unnamed: 0,Model Index,ANO,Accuracy,AUC,KS Score,F1 score,Recall,Precision,False positive rate,True positive rate
0,0,2002,0.921053,0.934848,0.793939,0.955224,0.969697,0.941176,0.4,0.969697
1,0,2003,0.75,0.814453,0.5625,0.837838,0.96875,0.738095,0.6875,0.96875
2,0,2004,0.784615,0.919048,0.819048,0.825,0.942857,0.733333,0.4,0.942857
3,0,2005,0.435185,0.67656,0.366216,0.469565,0.931034,0.313953,0.746835,0.931034
4,0,2006,0.676259,0.883594,0.725664,0.536082,1.0,0.366197,0.39823,1.0
5,0,2007,0.615385,0.836675,0.57477,0.511278,0.944444,0.350515,0.473684,0.944444
6,0,2008,0.626374,0.868593,0.670643,0.521127,1.0,0.352381,0.468966,1.0
7,0,2009,0.54023,0.833916,0.568366,0.454545,0.961538,0.297619,0.564593,0.961538
8,0,2010,0.536697,0.806097,0.487368,0.516746,0.915254,0.36,0.603774,0.915254
9,0,2011,0.631373,0.822953,0.529412,0.5,0.921569,0.343066,0.441176,0.921569
