In [211]:
import os
import sys
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score, f1_score
from sklearn.metrics import make_scorer, precision_recall_curve, auc
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV
from bayes_opt import BayesianOptimization
#from sklearn.experimental import enable_halving_search_cv 
#from sklearn.model_selection import HalvingRandomSearchCV
#from sklearn.model_selection import HalvingGridSearchCV 
from collections import Counter
from imblearn.over_sampling import SMOTE

# Menu

<a name="navegacao"></a>

## 1) [Preparação dos dados](#parte1)
- 1.1 [Leitura base principal](#principal)
- 1.2 [Leitura base mes](#mes)
- 1.3 [Leitura base hora](#hora)
- 1.4 [Merge principal e base mensal](#merge1)
- 1.5 [Merge principal e base hora](#merge2)
- 1.6 [Confere marcação](#marcacao)


## 2 [Salvando as bases de treino](#parte2)
- 2.1 [Salvando base com histórico](#comhist)
- 2.2 [Salvando base sem histórico](#semhist)


<a name="principal"></a>

## 1.1) Leitura base principal


## Leitura dos dados

In [None]:
%%time
df = pd.read_csv("BNDES_UNIFICADO.csv",converters={'CNPJ8': str,'INTERMEDIARIA': str},
                 delimiter=";" , encoding='latin-1')

In [None]:
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)

In [None]:
df.shape

In [None]:
df.columns

In [None]:
%%time
df.head()

In [None]:
%%time
df.dtypes

In [None]:
pd.crosstab(df.PORTE_RECEITA,df.SITUACAO)

In [None]:
pd.crosstab(df.Porte_Cliente,df.SITUACAO)

In [None]:
%%time
pd.crosstab(df.CUSTO, df.SITUACAO )

In [None]:
%%time
pd.crosstab(df.NATJUR, df.SITUACAO)

In [None]:
pd.crosstab(df.EMPRESA_PUBLICA, df.SITUACAO)

In [None]:
pd.crosstab(df.INDIRETA, df.SITUACAO)

In [None]:
pd.crosstab(df.INOVACAO, df.SITUACAO)

In [None]:
pd.crosstab(df.TESOURO, df.SITUACAO)

In [None]:
pd.crosstab(df.SOCIO_PJ, df.SITUACAO)

In [None]:
%%time
df.loc[ (df['CAPITAL_SOCIAL']<1)     ,'CAPITAL_SOCIAL']    = 1
df.loc[ (df['IDADE']<1)              ,'IDADE']             = 1
df.loc[ (df['NCONTRATOS']<1)         ,'NCONTRATOS']        = 1
df.loc[ (df['NFILIAIS']<1)           ,'NFILIAIS']          = 1
df.loc[ (df['IDADE_SOCIOS']<1)       ,'IDADE_SOCIOS']      = 1
df.loc[ (df['QTDSOCIOS']<1)          ,'QTDSOCIOS']         = 1
df.loc[ (df['MEDIA_JUROS']<1)        ,'MEDIA_JUROS']       = 1
df.loc[ (df['PRAZO_AMORTIZACAO']<1)  ,'PRAZO_AMORTIZACAO'] = 1
df.loc[ (df['PRAZO_CARENCIA']<1)     ,'PRAZO_CARENCIA']    = 1
df.loc[ (df['VALOR_CONTRATO']<1)     ,'VALOR_CONTRATO']    = 1
df.loc[ (df['VALOR_DESENBOLSO']<1)   ,'VALOR_DESENBOLSO']  = 1

In [None]:
%%time
df['enc_NATJUR']        = df.NATJUR.astype("category").cat.codes
df['enc_GARANTIA']      = df.GARANTIA.astype("category").cat.codes
df['enc_INSTRUMENTO']   = df.INSTRUMENTO.astype("category").cat.codes
df['enc_CUSTO']         = df.CUSTO.astype("category").cat.codes
df['enc_PORTE_CLIENTE'] = df.Porte_Cliente.astype("category").cat.codes
df['enc_PORTE_RECEITA'] = df.PORTE_RECEITA.astype("category").cat.codes
df['enc_SITUACAO']      = df.SITUACAO.astype("category").cat.codes
df['enc_UF']            = df.UF.astype("category").cat.codes

In [None]:
%%time
df['ln_capsoc']        = np.log(df['CAPITAL_SOCIAL']+1)
df['ln_idade']         = np.log(df['IDADE']+1)
df['ln_contratos']     = np.log(df['NCONTRATOS']+1)
df['ln_filiais']       = np.log(df['NFILIAIS']+1)
df['ln_sociosage']     = np.log(df['IDADE_SOCIOS']+1)
df['ln_qtdsocios']     = np.log(df['QTDSOCIOS']+1)
df['ln_juros']         = np.log(df['MEDIA_JUROS']+1)
df['ln_amortizacao']   = np.log(df['PRAZO_AMORTIZACAO']+1)
df['ln_carencia']      = np.log(df['PRAZO_CARENCIA']+1)
df['ln_vlrcontrato']   = np.log(df['VALOR_CONTRATO']+1)
df['ln_vlrdesembolso'] = np.log(df['VALOR_DESENBOLSO']+1)

## Modelo sem histórico foi treinado com as variáveis na seguinte ordem:
['faixa_hora', 'vl_medio_mes_atual', 'dif_vl_1', 'tres_prim_dig_codbarras', 'pagador_pf', 'dif_vl_4', 'dia_do_mes', 'qtd_operacoes_mes_corrente', 'vl_medio_dia_corrente', 'sec_dig', 'qtd_operacoes_dia_corrente', 'qtd_trn_60min', 'centavos', 'dia_da_semana']

In [None]:
%%time
files = df.columns
#selected_files = files.str.contains('ln_|enc_|INDIRETA|EMPRESA_PUBLICA|INOVACAO|TESOURO|SOCIO_PJ')
selected_files = files.str.contains('ln_|INDIRETA|EMPRESA_PUBLICA|INOVACAO|TESOURO|SOCIO_PJ')
atributes = files[selected_files]

In [None]:
atributes

In [None]:
df[atributes].dtypes

In [None]:
df[atributes].describe().transpose()

## SMOTE: Synthetic Minority Oversampling Technique

In [None]:
y0 = df['SITUACAO'].values.reshape(-1, 1)
#y0 = df['SITUACAO'].values
X0 = df[atributes]

In [None]:
#import joblib
#scaler = joblib.load("scaler.saved") 

from numpy import asarray
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
# transform data
X0 = scaler.fit_transform(df[atributes])

In [None]:
# base sintética: treino e teste
x_train  , x_test0 , y_train, y_test0 = train_test_split(X0, y0, test_size = 0.4, random_state=123)

# base sintética: teste e out of sample
x_test , x_out , y_test, y_out = train_test_split(x_test0, y_test0, test_size = 0.4, random_state=123)

In [None]:
# summarize the new class distribution
#counter0 = Counter(y_train)
#counter1 = Counter(y_test)
#counter2 = Counter(y_out)
#print(counter0, counter1, counter2)

In [None]:
%%time
from bayes_opt import BayesianOptimization
from sklearn.metrics import roc_auc_score

gr_range = ['depthwise','lossguide']
bs_range = ['gbtree','dart']

def gbm_xgb(learning_rate, max_depth, gamma, min_child_weight, subsample, eta, reg_alpha, reg_lambda,
            n_estimators, max_delta_step, max_leaves, max_cat_threshold, grow_policy, scale_pos_weight, 
            booster , max_bin):
    model = xgb.XGBClassifier(
                  learning_rate       = learning_rate,
                  max_depth           = int(max_depth),
                  gamma               = gamma,
                  min_child_weight    = int(min_child_weight),
                  subsample           = subsample,
                  eta                 = eta,
                  reg_alpha           = reg_alpha,
                  reg_lambda          = reg_lambda,
                  n_estimators        = int(n_estimators),
                  max_delta_step      = max_delta_step,
                  max_leaves          = int(max_leaves),
                  max_bin             = int(max_bin),
                  max_cat_threshold   = int(max_cat_threshold),
                  booster             = bs_range[int(booster)],
                  grow_policy         = gr_range[int(grow_policy)],
                  scale_pos_weight    = scale_pos_weight,
                  missing             = 0,
                  random_state        = 666,
                  nthread =10 )
    
    model.fit(x_train, y_train, verbose=False)
    pred_labels = model.predict(x_test)
    return roc_auc_score(y_test, pred_labels)

params_xgb = {
    'learning_rate'          : (0.010 ,0.500),
    'max_depth'              : (2.000 ,11.00),
    'gamma'                  : (1.000 ,100.0),
    'min_child_weight'       : (1.000 ,100.0),
    'subsample'              : (0.222 ,0.999),
    'eta'                    : (0.005 ,0.500),
    'reg_alpha'              : (0.050 ,10.00),
    'reg_lambda'             : (0.050 ,10.00),
    'n_estimators'           : (50.00 ,500.0),
    'max_delta_step'         : (0.005 ,5.000),
    'max_leaves'             : (2.000 ,50.00),
    'max_bin'                : (2.000 ,100.0),
    'max_cat_threshold'      : (2.000 ,100.0),
    'booster'                : (0.001 ,1.000),
    'max_cat_threshold'      : (5.000 ,50.00),
    'grow_policy'            : (0.001 ,1.000),
    'scale_pos_weight'       : (0.222 ,100.0),
}
   
xgb0 = BayesianOptimization(f=gbm_xgb, pbounds=params_xgb, random_state=123) 
xgb0.maximize(init_points=30, n_iter=350, acq='ucb')

In [None]:
params_xgb = xgb0.max['params']
params_xgb

In [None]:
learning_rate       = xgb0.max['params']['learning_rate']
max_depth           = int(xgb0.max['params']['max_depth'])
gamma               = xgb0.max['params']['gamma']
min_child_weight    = int(xgb0.max['params']['min_child_weight'])
subsample           = xgb0.max['params']['subsample']
eta                 = xgb0.max['params']['eta']
reg_alpha           = xgb0.max['params']['reg_alpha']
reg_lambda          = xgb0.max['params']['reg_lambda']
n_estimators        = int(xgb0.max['params']['n_estimators'])
max_delta_step      = xgb0.max['params']['max_delta_step']
max_leaves          = int(xgb0.max['params']['max_leaves'])
max_bin             = int(xgb0.max['params']['max_bin'])
max_cat_threshold   = int(xgb0.max['params']['max_cat_threshold'])
booster             = bs_range[int(xgb0.max['params']['booster'])]
grow_policy         = gr_range[int(xgb0.max['params']['grow_policy'])]
scale_pos_weight    = xgb0.max['params']['scale_pos_weight']

    
print('\n learning_rate:'     , learning_rate,
      '\n max_depth:'         , max_depth,
      '\n gamma:'             , gamma,
      '\n min_child_weight:'  , min_child_weight,
      '\n subsample:'         , subsample,
      '\n eta:'               , eta,
      '\n reg_alpha:'         , reg_alpha,
      '\n reg_lambda:'        , reg_lambda,
      '\n n_estimators:'      , n_estimators,
      '\n max_delta_step:'    , max_delta_step,
      '\n max_leaves:'        , max_leaves,
      '\n max_bin:'           , max_bin,
      '\n max_cat_threshold:' , max_cat_threshold,
      '\n grow_policy:'       , grow_policy,
      '\n booster:'           , booster,
      '\n scale_pos_weight:'  , scale_pos_weight)

In [177]:
%%time
cbbb= xgb.XGBClassifier(learning_rate     = learning_rate,
                        max_depth         = max_depth,
                        gamma             = gamma,
                        min_child_weight  = min_child_weight,
                        subsample         = subsample,
                        eta               = eta,
                        reg_alpha         = reg_alpha,
                        reg_lambda        = reg_lambda,
                        n_estimators      = n_estimators,
                        max_delta_step    = max_delta_step,
                        max_leaves        = max_leaves,
                        max_bin           = max_bin,
                        max_cat_threshold = max_cat_threshold,
                        grow_policy       = grow_policy,
                        booster           = booster,
                        scale_pos_weight  = scale_pos_weight)
cbbb.fit(x_train, y_train)

CPU times: user 3min 18s, sys: 10.7 s, total: 3min 28s
Wall time: 6.54 s


In [178]:
%%time
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
print("Accuracy:" ,accuracy_score(  y_train, cbbb.predict(x_train))) #Accuracy: 0.998776450266302
print("F1 score:" ,f1_score(        y_train, cbbb.predict(x_train))) #F1 score: 0.37065637065637064
print("Recall:"   ,recall_score(    y_train, cbbb.predict(x_train))) #Recall: 0.25
print("Precision:",precision_score( y_train, cbbb.predict(x_train))) #Precision: 0.7164179104477612

Accuracy: 0.8571883838585029
F1 score: 0.49228154652315537
Recall: 0.973305954825462
Precision: 0.3294578541640339
CPU times: user 4.27 s, sys: 237 ms, total: 4.51 s
Wall time: 141 ms


In [179]:
%%time
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
print("Accuracy:" ,accuracy_score(  y_test, cbbb.predict(x_test))) #Accuracy: 0.998776450266302
print("F1 score:" ,f1_score(        y_test, cbbb.predict(x_test))) #F1 score: 0.37065637065637064
print("Recall:"   ,recall_score(    y_test, cbbb.predict(x_test))) #Recall: 0.25
print("Precision:",precision_score( y_test, cbbb.predict(x_test))) #Precision: 0.7164179104477612

Accuracy: 0.8529743725932811
F1 score: 0.48265389557294713
Recall: 0.9763705103969754
Precision: 0.3205585725368503
CPU times: user 1.89 s, sys: 114 ms, total: 2 s
Wall time: 63.1 ms


In [180]:
%%time
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
print("Accuracy:" ,accuracy_score(  y_out, cbbb.predict(x_out))) #Accuracy: 0.998776450266302
print("F1 score:" ,f1_score(        y_out, cbbb.predict(x_out))) #F1 score: 0.37065637065637064
print("Recall:"   ,recall_score(    y_out, cbbb.predict(x_out))) #Recall: 0.25
print("Precision:",precision_score( y_out, cbbb.predict(x_out))) #Precision: 0.7164179104477612

Accuracy: 0.8564457501369317
F1 score: 0.48379588182632055
Recall: 0.97264218862491
Precision: 0.32197330791229745
CPU times: user 1.72 s, sys: 107 ms, total: 1.83 s
Wall time: 60.4 ms


## Leitura dos dados originais

In [181]:
%%time
df['proba'] = cbbb.predict_proba(X0)[:,1]

CPU times: user 1.02 s, sys: 24.9 ms, total: 1.05 s
Wall time: 35.9 ms


In [182]:
df[['SITUACAO','proba']].groupby(['SITUACAO']).count()

Unnamed: 0_level_0,proba
SITUACAO,Unnamed: 1_level_1
0,116654
1,8862


In [183]:
%%time
df[['SITUACAO','proba']].groupby(['SITUACAO']).describe()

CPU times: user 24.3 ms, sys: 15 µs, total: 24.3 ms
Wall time: 22.5 ms


Unnamed: 0_level_0,proba,proba,proba,proba,proba,proba,proba,proba
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
SITUACAO,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,116654.0,0.155548,0.298783,0.001258,0.005333,0.010202,0.077631,0.966269
1,8862.0,0.867053,0.122412,0.002886,0.866077,0.898643,0.920186,0.968509


In [207]:
%%time
df.loc[ df['proba']>=0.75 ,'PRED'] = 1
df.loc[ df['proba'] <0.75 ,'PRED'] = 0

CPU times: user 2.55 ms, sys: 4.13 ms, total: 6.68 ms
Wall time: 4.9 ms


<a name="metricas"></a>


# 3) Métricas
  
- ir para [Menu Principal](#navegacao)

<a name="amostra"></a>


## 3.1) Métricas na Amostra
  
- ir para [Menu Principal](#navegacao)

In [208]:
from sklearn.metrics import confusion_matrix
confusao_pop = confusion_matrix(df['SITUACAO'], df['PRED'])
print("Confusion matrix for test:\n%s" % confusao_pop )

Confusion matrix for test:
[[101582  15072]
 [   571   8291]]


In [209]:
%%time
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

print("Accuracy:" , accuracy_score(df['SITUACAO'], df['PRED'])) 
print("F1 score:" , f1_score(df['SITUACAO'], df['PRED'])) 
print("Recall:"   , recall_score(df['SITUACAO'], df['PRED'])) 
print("Precision:", precision_score(df['SITUACAO'], df['PRED'])) 

Accuracy: 0.8753704706969629
F1 score: 0.5145694336695112
Recall: 0.9355675919656963
Precision: 0.3548773702007448
CPU times: user 175 ms, sys: 18 µs, total: 175 ms
Wall time: 171 ms


<a name="falsopos"></a>

## 3.2) Taxa de Falso Positivo
  
- ir para [Menu Principal](#navegacao)

In [210]:
%%time
tn, fp, fn, tp = confusao_pop.ravel()
print('FPR:', fp/(fp + tn))
print('TPR:', tp/(tp + fn))

FPR: 0.12920259913933513
TPR: 0.9355675919656963
CPU times: user 138 µs, sys: 7 µs, total: 145 µs
Wall time: 152 µs
