In [0]:
# importando bibliotecas

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [0]:
# carregando dados via raw github
data = pd.read_csv("https://raw.githubusercontent.com/mp-rocha/data-projects/master/heart-disease-predict/heart-disease-predict/heart.csv")

In [3]:
# amostra de dados
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [4]:
# proporção da classificação entre doentes e não doentes
data["target"].value_counts()

1    165
0    138
Name: target, dtype: int64

In [5]:
# verificando missing datas
data.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [0]:
# definindo as variaveis explicativas e resposta

from sklearn.model_selection import train_test_split

y = data["target"]
X = data.drop(labels=["target"], axis = 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [7]:
# dimensão dos dados
X_train.shape, X_test.shape, y_train.shape, y_test.

((242, 13), (61, 13), (242,), (61,))

In [8]:
# treinamento de um modelo randomForest em sua forma padrão
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(X_train, y_train)
predict_rf = rf.predict(X_test)



In [9]:
# Modelo base

from sklearn.metrics import roc_auc_score

roc_auc_score(y_test, predict_rf)

0.7747844827586207

In [12]:
# Instalando a biblioteca responsável pelo 'grid search' aprimorado
pip install scikit-optimize

Collecting scikit-optimize
[?25l  Downloading https://files.pythonhosted.org/packages/f4/44/60f82c97d1caa98752c7da2c1681cab5c7a390a0fdd3a55fac672b321cac/scikit_optimize-0.5.2-py2.py3-none-any.whl (74kB)
[K     |████████████████████████████████| 81kB 3.0MB/s 
Installing collected packages: scikit-optimize
Successfully installed scikit-optimize-0.5.2


In [0]:
# criando função para treinar o modelo
# será usada como parâmetro
from skopt import dummy_minimize

def treinar_modelo(params):
  max_leaf_nodes = params[0]
  n_estimators = params[1]

  rf = RandomForestClassifier(max_leaf_nodes = max_leaf_nodes, n_estimators = n_estimators)
  rf.fit(X_train, y_train)
  predict_rf = rf.predict_proba(X_test)[:,1]

  return -roc_auc_score(y_test, predict_rf)

space = [(2, 145), (50, 1000)]

In [15]:
# Random Optimization - Busca de forma randomica os melhores parametros
resultado_random = dummy_minimize(treinar_modelo, dimensions=space, random_state=42, verbose=0)

# melhores parametros
print(resultado_random.x)

# precisão do melhor modelo
print(resultado_random.fun)

[45, 211]
-0.9461206896551725


In [0]:
# Bayesian Optimization - Busca o intervalo que possui os parametros que demonstrem melhores precisão (processos gaussianos)
# Exploration expploitation tradeoff

from skopt import gp_minimize

resultados_bayesian = gp_minimize(treinar_modelo, space, n_calls=30, n_random_starts=20, random_state=42, verbose=1)

# melhores parametros
print(resultados_bayesian.x)

# precisão do melhor modelo
print(resultados_bayesian.fun)