## Fonte de Dados
https://www.kaggle.com/datasets/kabure/german-credit-data-with-risk/metadata


## Instalação e Carga de Pacotes:

In [1]:
# Versão da Linguagem Python
from platform import python_version
print('Versão da Linguagem Python Utilizada:', python_version())

Versão da Linguagem Python Utilizada: 3.10.4


In [2]:
# Filtro de warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [3]:
# Imports
import os
import copy 
import json
import statistics
import tensorflow              as     tf
import autokeras               as     ak
import numpy                   as     np
import pandas                  as     pd
import matplotlib.pyplot       as     plt 
import seaborn                 as     sns
from   sklearn.model_selection import train_test_split
from   sklearn.metrics         import accuracy_score, f1_score
import logging

logging.getLogger('tensorflow').disabled = True

2022-08-16 16:24:10.597546: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-08-16 16:24:10.597567: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [4]:
# Versões dos pacotes usados neste jupyter notebook
%reload_ext watermark
%watermark -a "Pedro G. Dubiela" --iversions

Author: Pedro G. Dubiela

json      : 2.0.9
seaborn   : 0.11.2
statistics: 1.0.3.5
matplotlib: 3.5.2
tensorflow: 2.9.1
logging   : 0.5.1.2
autokeras : 1.0.19
numpy     : 1.23.1
pandas    : 1.4.3



## Carga e Exploração dos Dados

In [5]:
# Carregando os arquivos
df_original = pd.read_csv("./german_credit_data.csv")
df = copy.deepcopy(df_original)

In [6]:
# Alterando nome da coluna Unnamed
df.rename(columns = {"Unnamed: 0":"ID"}, inplace = True)

In [7]:
# Procurando por valores faltantes:
# Existem valores missing nas colunas Saving accounts e Checking account
df.isnull().any()

ID                  False
Age                 False
Sex                 False
Job                 False
Housing             False
Saving accounts      True
Checking account     True
Credit amount       False
Duration            False
Purpose             False
Risk                False
dtype: bool

In [8]:
# 47% dos registros possuem campos missing, vamos substituir pela moda
df[df.isnull().T.any()].shape[0] / df.shape[0]

# Substituir pela moda
colnames = ["Saving accounts", "Checking account"]
for name in colnames:
    df.loc[df[name].isnull(), [name]] = statistics.mode(df[name][df[name].notnull()])

In [9]:
# Avaliando se as classes estão desbalanceadas:
df[["ID", "Risk"]].groupby("Risk").count()

#df_class_bad   = df[df["Risk"] == "bad"]
#df_class_good  = df[df["Risk"] == "good"]
#df_class_good2 = df_class_good.sample(df_class_bad.shape[0])
#df             = pd.concat([df_class_bad, df_class_good2], axis = 0)

Unnamed: 0_level_0,ID
Risk,Unnamed: 1_level_1
bad,300
good,700


In [10]:
# Dividino entre treino e teste
y = df.pop("Risk")
X = df
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

## Modelagem via AutoML

In [11]:
# Cria o objeto de busca de modelos (vamos testar 3 modelos diferentes)
clf = ak.StructuredDataClassifier(overwrite = True, max_trials = 100)

In [12]:
# Treina o objeto definindo qual é a classe (variável target)
clf.fit(X_train, y_train)

Trial 35 Complete [00h 00m 03s]
val_accuracy: 0.706250011920929

Best val_accuracy So Far: 0.71875
Total elapsed time: 00h 04m 07s


<keras.callbacks.History at 0x7fa58824cdc0>

In [13]:
# Sumário do modelo
model = clf.export_model()
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 10)]              0         
                                                                 
 multi_category_encoding (Mu  (None, 10)               0         
 ltiCategoryEncoding)                                            
                                                                 
 normalization (Normalizatio  (None, 10)               21        
 n)                                                              
                                                                 
 dense (Dense)               (None, 32)                352       
                                                                 
 re_lu (ReLU)                (None, 32)                0         
                                                                 
 dense_1 (Dense)             (None, 32)                1056  

## Avaliação do Modelo nos Dados de Teste

In [14]:
# Previsões com o melhor modelo
yp = clf.predict(X_test)



In [15]:
# Avaliação do modelo
accuracy_score(y_test, yp)

0.73

In [16]:
# Função para visualizar o resultado do AutoML
def imprimeTabela(path = 'structured_data_classifier'):
    
    # Abre o arquivo json de cada trial
    trial_json = os.popen('ls ./{}/trial_*/trial.json'.format(path)).read().split('\n')[:-1]
    
    # Lista para os dados
    DATA = []
    
    # Loop pelo arquivo json para alimentar a lista DATA
    for file in trial_json:
        with open(file) as f: 
            DATA.append(json.load(f))
    
    # Separa a lista pelos campos
    for k in range(len(DATA)):
        DATA[k]['hyperparameters']['values']['score'] = DATA[k]['score']
    
    # Cria o dataframe
    hyper_df = pd.concat([pd.DataFrame.from_dict(data['hyperparameters']['values'], orient = 'index') for data in DATA], axis=1)
    
    # Gera as colunas
    hyper_df.columns = ["trial#{}".format(k+1) for k in range(len(DATA))]
    
    return(hyper_df)

In [17]:
# Comparativo entre todas as tentativas:
display(imprimeTabela())

Unnamed: 0,trial#1,trial#2,trial#3,trial#4,trial#5,trial#6,trial#7,trial#8,trial#9,trial#10,...,trial#26,trial#27,trial#28,trial#29,trial#30,trial#31,trial#32,trial#33,trial#34,trial#35
structured_data_block_1/normalize,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
structured_data_block_1/dense_block_1/num_layers,2,2,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
structured_data_block_1/dense_block_1/use_batchnorm,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
structured_data_block_1/dense_block_1/dropout,0,0,0,0,0,0,0,0,0,0,...,0.25,0,0,0,0,0,0,0,0,0
structured_data_block_1/dense_block_1/units_0,32,32,32,32,32,32,32,32,32,32,...,32,128,64,256,32,512,1024,32,32,32
structured_data_block_1/dense_block_1/units_1,32,32,32,32,32,32,32,32,32,32,...,32,32,32,32,32,32,32,32,32,32
classification_head_1/dropout,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
optimizer,adam,adam,adam,adam,adam,adam,adam,adam_weight_decay,adam,adam,...,adam,adam,adam,adam,adam,adam,adam,adam_weight_decay,adam,adam
learning_rate,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,...,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.0001,0.001
score,0.70625,0.7,0.7125,0.70625,0.70625,0.70625,0.65625,0.64375,0.7125,0.71875,...,0.7125,0.70625,0.70625,0.70625,0.70625,0.71875,0.70625,0.65,0.7125,0.70625
