In [1]:
import os
import sys
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score, f1_score
from sklearn.metrics import make_scorer, precision_recall_curve, auc
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV
from bayes_opt import BayesianOptimization
#from sklearn.experimental import enable_halving_search_cv 
#from sklearn.model_selection import HalvingRandomSearchCV
#from sklearn.model_selection import HalvingGridSearchCV 
from collections import Counter
from imblearn.over_sampling import SMOTE

  from pandas import MultiIndex, Int64Index


# Menu

<a name="navegacao"></a>

## 1) [Preparação dos dados](#parte1)
- 1.1 [Leitura base principal](#principal)
- 1.2 [Leitura base mes](#mes)
- 1.3 [Leitura base hora](#hora)
- 1.4 [Merge principal e base mensal](#merge1)
- 1.5 [Merge principal e base hora](#merge2)
- 1.6 [Confere marcação](#marcacao)


## 2 [Salvando as bases de treino](#parte2)
- 2.1 [Salvando base com histórico](#comhist)
- 2.2 [Salvando base sem histórico](#semhist)


<a name="principal"></a>

## 1.1) Leitura base principal


## Leitura dos dados

In [2]:
%%time
df = pd.read_csv("BNDES_UNIFICADO.csv",converters={'CNPJ8': str,'INTERMEDIARIA': str},
                 delimiter=";" , encoding='latin-1')

CPU times: user 584 ms, sys: 99.5 ms, total: 683 ms
Wall time: 1.69 s


In [3]:
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)

In [4]:
df.shape

(125516, 27)

In [5]:
df.columns

Index(['CNPJ8', 'ANO', 'EMPRESA_PUBLICA', 'GARANTIA', 'INDIRETA', 'INOVACAO',
       'INSTRUMENTO', 'UF', 'INTERMEDIARIA', 'CUSTO', 'MEDIA_JUROS',
       'PRAZO_AMORTIZACAO', 'TESOURO', 'PRAZO_CARENCIA', 'VALOR_CONTRATO',
       'VALOR_DESENBOLSO', 'Porte_Cliente', 'CAPITAL_SOCIAL', 'IDADE',
       'NATJUR', 'NCONTRATOS', 'NFILIAIS', 'PORTE_RECEITA', 'SITUACAO',
       'IDADE_SOCIOS', 'QTDSOCIOS', 'SOCIO_PJ'],
      dtype='object')

In [6]:
%%time
df.head()

CPU times: user 266 µs, sys: 0 ns, total: 266 µs
Wall time: 271 µs


Unnamed: 0,CNPJ8,ANO,EMPRESA_PUBLICA,GARANTIA,INDIRETA,INOVACAO,INSTRUMENTO,UF,INTERMEDIARIA,CUSTO,...,CAPITAL_SOCIAL,IDADE,NATJUR,NCONTRATOS,NFILIAIS,PORTE_RECEITA,SITUACAO,IDADE_SOCIOS,QTDSOCIOS,SOCIO_PJ
0,0,2002,0,OUTROS,0,1,OUTROS,IE,0.0,TJLP,...,90000020000.0,37.0,2.0,15,5089,5.0,1,1.0,1,0
1,0,2003,0,OUTROS,0,0,OUTROS,IE,92816560.0,TJLP,...,90000020000.0,38.0,2.0,2,5123,5.0,1,1.0,1,0
2,0,2009,1,SEM GARANTIA,0,0,OUTROS,RJ,,TAXAFIXA,...,90000020000.0,44.0,2.0,5,6912,5.0,1,1.0,1,0
3,0,2010,1,SEM GARANTIA,0,0,OUTROS,RJ,,TAXAFIXA,...,90000020000.0,45.0,2.0,2,7002,5.0,1,1.0,1,0
4,0,2012,1,MISTA,0,0,OUTROS,IE,,TJLP,...,90000020000.0,47.0,2.0,1,7407,5.0,1,1.0,1,0


In [7]:
%%time
df.dtypes

CPU times: user 303 µs, sys: 40 µs, total: 343 µs
Wall time: 350 µs


CNPJ8                 object
ANO                    int64
EMPRESA_PUBLICA        int64
GARANTIA              object
INDIRETA               int64
INOVACAO               int64
INSTRUMENTO           object
UF                    object
INTERMEDIARIA         object
CUSTO                 object
MEDIA_JUROS          float64
PRAZO_AMORTIZACAO    float64
TESOURO                int64
PRAZO_CARENCIA       float64
VALOR_CONTRATO       float64
VALOR_DESENBOLSO     float64
Porte_Cliente         object
CAPITAL_SOCIAL       float64
IDADE                float64
NATJUR               float64
NCONTRATOS             int64
NFILIAIS               int64
PORTE_RECEITA        float64
SITUACAO               int64
IDADE_SOCIOS         float64
QTDSOCIOS              int64
SOCIO_PJ               int64
dtype: object

In [8]:
pd.crosstab(df.PORTE_RECEITA,df.SITUACAO)

SITUACAO,0,1
PORTE_RECEITA,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,72792,2639
3.0,22618,1864
5.0,21244,4359


In [9]:
pd.crosstab(df.Porte_Cliente,df.SITUACAO)

SITUACAO,0,1
Porte_Cliente,Unnamed: 1_level_1,Unnamed: 2_level_1
GRANDE,3692,1219
MICRO,40280,1110
MÉDIA,20556,3430
PEQUENA,52126,3103


In [10]:
%%time
pd.crosstab(df.CUSTO, df.SITUACAO )

CPU times: user 30.4 ms, sys: 24 µs, total: 30.5 ms
Wall time: 106 ms


SITUACAO,0,1
CUSTO,Unnamed: 1_level_1,Unnamed: 2_level_1
CDI,14,2
IPCA,48,8
OUTROS,248,119
SELIC,28761,1823
TAXAFIXA,30327,1840
TJLP,20437,2296
TLP,36819,2774


In [11]:
%%time
pd.crosstab(df.NATJUR, df.SITUACAO)

CPU times: user 22.6 ms, sys: 144 µs, total: 22.7 ms
Wall time: 81.3 ms


SITUACAO,0,1
NATJUR,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,184,49
2.0,116254,8786
3.0,187,21
4.0,29,6


In [12]:
pd.crosstab(df.EMPRESA_PUBLICA, df.SITUACAO)

SITUACAO,0,1
EMPRESA_PUBLICA,Unnamed: 1_level_1,Unnamed: 2_level_1
0,116375,8770
1,279,92


In [13]:
pd.crosstab(df.INDIRETA, df.SITUACAO)

SITUACAO,0,1
INDIRETA,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2711,792
1,113943,8070


In [14]:
pd.crosstab(df.INOVACAO, df.SITUACAO)

SITUACAO,0,1
INOVACAO,Unnamed: 1_level_1,Unnamed: 2_level_1
0,116310,8730
1,344,132


In [15]:
pd.crosstab(df.TESOURO, df.SITUACAO)

SITUACAO,0,1
TESOURO,Unnamed: 1_level_1,Unnamed: 2_level_1
0,110155,7886
1,6499,976


In [16]:
pd.crosstab(df.SOCIO_PJ, df.SITUACAO)

SITUACAO,0,1
SOCIO_PJ,Unnamed: 1_level_1,Unnamed: 2_level_1
0,116654,8862


In [17]:
%%time
df.loc[ (df['CAPITAL_SOCIAL']<1)     ,'CAPITAL_SOCIAL']    = 1
df.loc[ (df['IDADE']<1)              ,'IDADE']             = 1
df.loc[ (df['NCONTRATOS']<1)         ,'NCONTRATOS']        = 1
df.loc[ (df['NFILIAIS']<1)           ,'NFILIAIS']          = 1
df.loc[ (df['IDADE_SOCIOS']<1)       ,'IDADE_SOCIOS']      = 1
df.loc[ (df['QTDSOCIOS']<1)          ,'QTDSOCIOS']         = 1
df.loc[ (df['MEDIA_JUROS']<1)        ,'MEDIA_JUROS']       = 1
df.loc[ (df['PRAZO_AMORTIZACAO']<1)  ,'PRAZO_AMORTIZACAO'] = 1
df.loc[ (df['PRAZO_CARENCIA']<1)     ,'PRAZO_CARENCIA']    = 1
df.loc[ (df['VALOR_CONTRATO']<1)     ,'VALOR_CONTRATO']    = 1
df.loc[ (df['VALOR_DESENBOLSO']<1)   ,'VALOR_DESENBOLSO']  = 1

CPU times: user 18 ms, sys: 456 µs, total: 18.5 ms
Wall time: 75.1 ms


In [18]:
%%time
df['enc_NATJUR']        = df.NATJUR.astype("category").cat.codes
df['enc_GARANTIA']      = df.GARANTIA.astype("category").cat.codes
df['enc_INSTRUMENTO']   = df.INSTRUMENTO.astype("category").cat.codes
df['enc_CUSTO']         = df.CUSTO.astype("category").cat.codes
df['enc_PORTE_CLIENTE'] = df.Porte_Cliente.astype("category").cat.codes
df['enc_PORTE_RECEITA'] = df.PORTE_RECEITA.astype("category").cat.codes
df['enc_SITUACAO']      = df.SITUACAO.astype("category").cat.codes
df['enc_UF']            = df.UF.astype("category").cat.codes

CPU times: user 71.6 ms, sys: 6.12 ms, total: 77.7 ms
Wall time: 301 ms


In [19]:
%%time
df['ln_capsoc']        = np.log(df['CAPITAL_SOCIAL']+1)
df['ln_idade']         = np.log(df['IDADE']+1)
df['ln_contratos']     = np.log(df['NCONTRATOS']+1)
df['ln_filiais']       = np.log(df['NFILIAIS']+1)
df['ln_sociosage']     = np.log(df['IDADE_SOCIOS']+1)
df['ln_qtdsocios']     = np.log(df['QTDSOCIOS']+1)
df['ln_juros']         = np.log(df['MEDIA_JUROS']+1)
df['ln_amortizacao']   = np.log(df['PRAZO_AMORTIZACAO']+1)
df['ln_carencia']      = np.log(df['PRAZO_CARENCIA']+1)
df['ln_vlrcontrato']   = np.log(df['VALOR_CONTRATO']+1)
df['ln_vlrdesembolso'] = np.log(df['VALOR_DESENBOLSO']+1)

CPU times: user 39.9 ms, sys: 467 µs, total: 40.3 ms
Wall time: 85.3 ms


In [23]:
import numpy as np
df['VALOR_CONTRATO'].dropna().quantile([0.0, 0.10, 0.25, 0.50, 0.75, 0.90])

0.00        400.00
0.10      20000.00
0.25      50000.00
0.50     130000.00
0.75     360275.75
0.90    1074983.50
Name: VALOR_CONTRATO, dtype: float64

In [29]:
def categ(row):
    if row['VALOR_CONTRATO']<=20000:
        return 0
    elif row['VALOR_CONTRATO']<=50000:
        return 1
    elif row['VALOR_CONTRATO']<=130000:
        return 2
    elif row['VALOR_CONTRATO']<=360000:
        return 3
    elif row['VALOR_CONTRATO']<=1000000:
        return 4
    else:
        return 5

In [30]:
df['combo'] = df.apply(categ, axis=1) 

In [33]:
df[['combo','VALOR_CONTRATO','CNPJ8']].groupby(['combo']).mean()

  df[['combo','VALOR_CONTRATO','CNPJ8']].groupby(['combo']).mean()


Unnamed: 0_level_0,VALOR_CONTRATO
combo,Unnamed: 1_level_1
0,10077.23
1,36728.02
2,90570.2
3,225250.0
4,588491.8
5,61808240.0


In [34]:
%%time
for pow in range(2,5):
    for spl in range(0,7):
        df.loc[ (df['combo']==spl),'spl_contrato_{0}_{1}'.format(pow,spl)] = df['ln_vlrcontrato']**pow
        df.loc[~(df['combo']==spl),'spl_contrato_{0}_{1}'.format(pow,spl)] = 0

CPU times: user 276 ms, sys: 7.15 ms, total: 283 ms
Wall time: 1.08 s


## Modelo sem histórico foi treinado com as variáveis na seguinte ordem:
['faixa_hora', 'vl_medio_mes_atual', 'dif_vl_1', 'tres_prim_dig_codbarras', 'pagador_pf', 'dif_vl_4', 'dia_do_mes', 'qtd_operacoes_mes_corrente', 'vl_medio_dia_corrente', 'sec_dig', 'qtd_operacoes_dia_corrente', 'qtd_trn_60min', 'centavos', 'dia_da_semana']

In [35]:
%%time
files = df.columns
selected_files = files.str.contains('ln_|spl_|INDIRETA|EMPRESA_PUBLICA|INOVACAO|TESOURO|SOCIO_PJ')
atributes = files[selected_files]

CPU times: user 904 µs, sys: 68 µs, total: 972 µs
Wall time: 987 µs


In [36]:
atributes

Index(['EMPRESA_PUBLICA', 'INDIRETA', 'INOVACAO', 'TESOURO', 'SOCIO_PJ',
       'ln_capsoc', 'ln_idade', 'ln_contratos', 'ln_filiais', 'ln_sociosage',
       'ln_qtdsocios', 'ln_juros', 'ln_amortizacao', 'ln_carencia',
       'ln_vlrcontrato', 'ln_vlrdesembolso', 'spl_contrato_2_0',
       'spl_contrato_2_1', 'spl_contrato_2_2', 'spl_contrato_2_3',
       'spl_contrato_2_4', 'spl_contrato_2_5', 'spl_contrato_2_6',
       'spl_contrato_3_0', 'spl_contrato_3_1', 'spl_contrato_3_2',
       'spl_contrato_3_3', 'spl_contrato_3_4', 'spl_contrato_3_5',
       'spl_contrato_3_6', 'spl_contrato_4_0', 'spl_contrato_4_1',
       'spl_contrato_4_2', 'spl_contrato_4_3', 'spl_contrato_4_4',
       'spl_contrato_4_5', 'spl_contrato_4_6'],
      dtype='object')

In [37]:
df[atributes].dtypes

EMPRESA_PUBLICA       int64
INDIRETA              int64
INOVACAO              int64
TESOURO               int64
SOCIO_PJ              int64
ln_capsoc           float64
ln_idade            float64
ln_contratos        float64
ln_filiais          float64
ln_sociosage        float64
ln_qtdsocios        float64
ln_juros            float64
ln_amortizacao      float64
ln_carencia         float64
ln_vlrcontrato      float64
ln_vlrdesembolso    float64
spl_contrato_2_0    float64
spl_contrato_2_1    float64
spl_contrato_2_2    float64
spl_contrato_2_3    float64
spl_contrato_2_4    float64
spl_contrato_2_5    float64
spl_contrato_2_6    float64
spl_contrato_3_0    float64
spl_contrato_3_1    float64
spl_contrato_3_2    float64
spl_contrato_3_3    float64
spl_contrato_3_4    float64
spl_contrato_3_5    float64
spl_contrato_3_6    float64
spl_contrato_4_0    float64
spl_contrato_4_1    float64
spl_contrato_4_2    float64
spl_contrato_4_3    float64
spl_contrato_4_4    float64
spl_contrato_4_5    

In [38]:
df[atributes].describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
EMPRESA_PUBLICA,125516.0,0.002956,0.054287,0.0,0.0,0.0,0.0,1.0
INDIRETA,125516.0,0.972091,0.164712,0.0,1.0,1.0,1.0,1.0
INOVACAO,125516.0,0.003792,0.061465,0.0,0.0,0.0,0.0,1.0
TESOURO,125516.0,0.059554,0.23666,0.0,0.0,0.0,0.0,1.0
SOCIO_PJ,125516.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ln_capsoc,125516.0,10.674521,3.548741,0.693147,9.615872,11.002117,11.982935,26.048381
ln_idade,125516.0,1.250731,0.858364,0.693147,0.693147,0.693147,1.791759,4.718499
ln_contratos,125516.0,0.813766,0.313496,0.693147,0.693147,0.693147,0.693147,5.613128
ln_filiais,125516.0,0.876072,0.495155,0.693147,0.693147,0.693147,0.693147,8.921591
ln_sociosage,125516.0,2.572323,1.539442,0.693147,0.693147,3.610918,3.850148,4.51086


## SMOTE: Synthetic Minority Oversampling Technique

In [39]:
y0 = df['SITUACAO'].values.reshape(-1, 1)
#y0 = df['SITUACAO'].values
X0 = df[atributes]

In [40]:
#import joblib
#scaler = joblib.load("scaler.saved") 

from numpy import asarray
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
# transform data
X0 = scaler.fit_transform(df[atributes])

In [41]:
# base sintética: treino e teste
x_train  , x_test0 , y_train, y_test0 = train_test_split(X0, y0, test_size = 0.4, random_state=123)

# base sintética: teste e out of sample
x_test , x_out , y_test, y_out = train_test_split(x_test0, y_test0, test_size = 0.4, random_state=123)

In [53]:
# summarize the new class distribution
#counter0 = Counter(y_train)
#counter1 = Counter(y_test)
#counter2 = Counter(y_out)
#print(counter0, counter1, counter2)

In [None]:
%%time
from bayes_opt import BayesianOptimization
from sklearn.metrics import roc_auc_score
from catboost import Pool, CatBoost, CatBoostClassifier

gr_range = ['depthwise','lossguide']
bs_range = ['gbtree','dart']


def gbm_xgb(max_depth, gamma, min_child_weight, subsample, colsample_bytree, colsample_bynode, colsample_bylevel,
            eta, reg_alpha, reg_lambda, learning_rate, n_estimators, max_delta_step ):
    
    model = xgb.XGBClassifier(
                  max_depth           = int(max_depth),
                  learning_rate       = learning_rate,      
                  gamma               = gamma,
                  min_child_weight    = int(min_child_weight),
                  colsample_bytree    = colsample_bytree,
                  colsample_bynode    = colsample_bynode,
                  colsample_bylevel   = colsample_bylevel,
                  eta                 = eta,
                  reg_alpha           = reg_alpha,
                  reg_lambda          = reg_lambda,
                  subsample           = subsample,
                  n_estimators        = int(n_estimators),
                  max_delta_step      = max_delta_step,
                  missing             = 0,
                  random_state        = 666,
                  nthread             = 10 ,
                  use_label_encoder   = False,
                  eval_metric         = roc_auc_score)
    
    model.fit(x_train, y_train, verbose=False)
    pred_labels = model.predict(x_test)
    return roc_auc_score(y_test, pred_labels)

params_xgb = {
    'max_depth'                  : (3.000 ,12.00),
    'gamma'                      : (0.050 ,150.0),
    'min_child_weight'           : (0.050 ,0.850),
    'subsample'                  : (0.050 ,0.850),
    'colsample_bytree'           : (0.050 ,0.850),
    'colsample_bynode'           : (0.050 ,0.850),
    'colsample_bylevel'          : (0.050 ,0.850),
    'eta'                        : (0.050 ,0.850),
    'reg_alpha'                  : (0.050 ,150.0),
    'reg_lambda'                 : (0.050 ,150.0),
    'subsample'                  : (0.050 ,0.850),
    'learning_rate'              : (0.010 ,0.888),
    'n_estimators'               : (200.0 , 1000),
    'max_delta_step'             : (2.000 ,12.00)
}
xgb0 = BayesianOptimization(f=gbm_xgb, pbounds=params_xgb, random_state=123) 
xgb0.set_gp_params(alpha=1e-4)
xgb0.maximize(init_points=30, n_iter=370)

|   iter    |  target   | colsam... | colsam... | colsam... |    eta    |   gamma   | learni... | max_de... | max_depth | min_ch... | n_esti... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m0.5006   [0m | [0m0.6072   [0m | [0m0.2789   [0m | [0m0.2315   [0m | [0m0.4911   [0m | [0m107.9    [0m | [0m0.3815   [0m | [0m11.81    [0m | [0m9.163    [0m | [0m0.4347   [0m | [0m513.7    [0m | [0m51.51    [0m | [0m109.4    [0m | [0m0.4009   [0m |
| [0m2        [0m | [0m0.5      [0m | [0m0.09774  [0m | [0m0.3684   [0m | [0m0.6404   [0m | [0m0.196    [0m | [0m26.36    [0m | [0m0.4767   [0m | [0m7.318    [0m | [0m8.71     [0m | [0m0.7295   [0m | [0m779.6    [0m | [0m91.67    [0m | [0m108.4    [0m | [0m0.3084   [0m |
| [95m3        [0m | [95m0.51

In [None]:
params_xgb = xgb0.max['params']
params_xgb

In [None]:
max_depth           = int(xgb0.max['params']['max_depth'])
learning_rate       = xgb0.max['params']['learning_rate']
gamma               = xgb0.max['params']['gamma']
min_child_weight    = int(xgb0.max['params']['min_child_weight'])
colsample_bytree    = xgb0.max['params']['colsample_bytree']
colsample_bynode    = xgb0.max['params']['colsample_bynode']
colsample_bylevel   = xgb0.max['params']['colsample_bylevel']
eta                 = xgb0.max['params']['eta']
reg_alpha           = xgb0.max['params']['reg_alpha']
reg_lambda          = xgb0.max['params']['reg_lambda']
subsample           = xgb0.max['params']['subsample']
n_estimators        = int(xgb0.max['params']['n_estimators'])
max_delta_step      = xgb0.max['params']['max_delta_step']
missing             = 0
random_state        = 666
nthread             = 10 
use_label_encoder   = False
    
    
print('\n max_depth:'          , max_depth,
      '\n learning_rate:'      , learning_rate,
      '\n gamma:'              , gamma,
      '\n min_child_weight:'   , min_child_weight,
      '\n colsample_bytree:'   , colsample_bytree,
      '\n colsample_bynode:'   , colsample_bynode,
      '\n colsample_bylevel:'  , colsample_bylevel,
      '\n eta:'                , eta,
      '\n reg_alpha:'          , reg_alpha,
      '\n reg_lambda:'         , reg_lambda,
      '\n subsample:'          , subsample,
      '\n n_estimators:'       , n_estimators,
      '\n max_delta_step:'     , max_delta_step)

In [None]:
%%time
cbbb= xgb.XGBClassifier(learning_rate     = learning_rate,
                        max_depth         = max_depth,
                        gamma             = gamma,
                        min_child_weight  = min_child_weight,
                        subsample         = subsample,
                        eta               = eta,
                        reg_alpha         = reg_alpha,
                        reg_lambda        = reg_lambda,
                        colsample_bytree  = colsample_bytree,
                        colsample_bynode  = colsample_bynode,
                        colsample_bylevel = colsample_bylevel,
                        n_estimators      = n_estimators,
                        max_delta_step    = max_delta_step,
                        missing           = 0
                        random_state      = 666
                        nthread           = 10 
                        use_label_encoder = False )

cbbb.fit(x_train, y_train)

In [49]:
%%time
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
print("Accuracy:" ,accuracy_score(  y_train, cbbb.predict(x_train))) #Accuracy: 0.998776450266302
print("F1 score:" ,f1_score(        y_train, cbbb.predict(x_train))) #F1 score: 0.37065637065637064
print("Recall:"   ,recall_score(    y_train, cbbb.predict(x_train))) #Recall: 0.25
print("Precision:",precision_score( y_train, cbbb.predict(x_train))) #Precision: 0.7164179104477612

NotFittedError: need to call fit or load_model beforehand

In [50]:
%%time
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
print("Accuracy:" ,accuracy_score(  y_test, cbbb.predict(x_test))) #Accuracy: 0.998776450266302
print("F1 score:" ,f1_score(        y_test, cbbb.predict(x_test))) #F1 score: 0.37065637065637064
print("Recall:"   ,recall_score(    y_test, cbbb.predict(x_test))) #Recall: 0.25
print("Precision:",precision_score( y_test, cbbb.predict(x_test))) #Precision: 0.7164179104477612

NotFittedError: need to call fit or load_model beforehand

In [51]:
%%time
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
print("Accuracy:" ,accuracy_score(  y_out, cbbb.predict(x_out))) #Accuracy: 0.998776450266302
print("F1 score:" ,f1_score(        y_out, cbbb.predict(x_out))) #F1 score: 0.37065637065637064
print("Recall:"   ,recall_score(    y_out, cbbb.predict(x_out))) #Recall: 0.25
print("Precision:",precision_score( y_out, cbbb.predict(x_out))) #Precision: 0.7164179104477612


KeyboardInterrupt



## Leitura dos dados originais

In [181]:
%%time
df['proba'] = cbbb.predict_proba(X0)[:,1]

CPU times: user 1.02 s, sys: 24.9 ms, total: 1.05 s
Wall time: 35.9 ms


In [182]:
df[['SITUACAO','proba']].groupby(['SITUACAO']).count()

Unnamed: 0_level_0,proba
SITUACAO,Unnamed: 1_level_1
0,116654
1,8862


In [183]:
%%time
df[['SITUACAO','proba']].groupby(['SITUACAO']).describe()

CPU times: user 24.3 ms, sys: 15 µs, total: 24.3 ms
Wall time: 22.5 ms


Unnamed: 0_level_0,proba,proba,proba,proba,proba,proba,proba,proba
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
SITUACAO,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,116654.0,0.155548,0.298783,0.001258,0.005333,0.010202,0.077631,0.966269
1,8862.0,0.867053,0.122412,0.002886,0.866077,0.898643,0.920186,0.968509


In [207]:
%%time
df.loc[ df['proba']>=0.75 ,'PRED'] = 1
df.loc[ df['proba'] <0.75 ,'PRED'] = 0

CPU times: user 2.55 ms, sys: 4.13 ms, total: 6.68 ms
Wall time: 4.9 ms


<a name="metricas"></a>


# 3) Métricas
  
- ir para [Menu Principal](#navegacao)

<a name="amostra"></a>


## 3.1) Métricas na Amostra
  
- ir para [Menu Principal](#navegacao)

In [208]:
from sklearn.metrics import confusion_matrix
confusao_pop = confusion_matrix(df['SITUACAO'], df['PRED'])
print("Confusion matrix for test:\n%s" % confusao_pop )

Confusion matrix for test:
[[101582  15072]
 [   571   8291]]


In [209]:
%%time
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

print("Accuracy:" , accuracy_score(df['SITUACAO'], df['PRED'])) 
print("F1 score:" , f1_score(df['SITUACAO'], df['PRED'])) 
print("Recall:"   , recall_score(df['SITUACAO'], df['PRED'])) 
print("Precision:", precision_score(df['SITUACAO'], df['PRED'])) 

Accuracy: 0.8753704706969629
F1 score: 0.5145694336695112
Recall: 0.9355675919656963
Precision: 0.3548773702007448
CPU times: user 175 ms, sys: 18 µs, total: 175 ms
Wall time: 171 ms


<a name="falsopos"></a>

## 3.2) Taxa de Falso Positivo
  
- ir para [Menu Principal](#navegacao)

In [210]:
%%time
tn, fp, fn, tp = confusao_pop.ravel()
print('FPR:', fp/(fp + tn))
print('TPR:', tp/(tp + fn))

FPR: 0.12920259913933513
TPR: 0.9355675919656963
CPU times: user 138 µs, sys: 7 µs, total: 145 µs
Wall time: 152 µs
