In [1]:
# Importando as bibliotecas
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Importando a base de dados
base = pd.read_csv(r'Dados\base.csv')
base.head()

Unnamed: 0,preco,vendido,idade_do_modelo,km_por_ano
0,30941.02,1,18,35085.22134
1,40557.96,1,20,12622.05362
2,89627.5,0,12,11440.79806
3,95276.14,0,3,43167.32682
4,117384.68,1,4,12770.1129


In [3]:
# Definindo inputs e outputs
x = base[['preco', 'idade_do_modelo', 'km_por_ano']].values
y = base['vendido'].values.ravel()

In [5]:
# Os dados serão divididos em 3 etapas: 20% para validação final, 20% para teste e 60% para treino.

# Separando inicialmente os 20% para validação:
SEED=301
np.random.seed(SEED)

x_treino_teste, x_val, y_treino_teste, y_val = train_test_split(x, y, test_size=0.2, shuffle=True, stratify=y) 

In [9]:
# Realizando agora uma busca aleatória sem validação cruzada. Para isso sera utilizada a classe stratified shuffle split, onde pode ser definido apenas 1 split:

SEED=301
np.random.seed(SEED)

parameters = {
    'max_depth': [3,4,5],
    'min_samples_split':range(32,129),
    'min_samples_leaf': range(32,129),
    'criterion': ['gini', 'entropy']
}

busca = RandomizedSearchCV(
    DecisionTreeClassifier(),
    parameters,
    cv = StratifiedShuffleSplit(n_splits=1, test_size=0.25),
    n_iter=30
)

In [10]:
# Realizando a busca:
busca.fit(x_treino_teste,y_treino_teste)
resultados_simples = pd.DataFrame(busca.cv_results_)

In [11]:
resultados_simples.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_split,param_min_samples_leaf,param_max_depth,param_criterion,params,split0_test_score,mean_test_score,std_test_score,rank_test_score
0,0.0105,0.0,0.001001,0.0,97,92,3,entropy,"{'min_samples_split': 97, 'min_samples_leaf': ...",0.782,0.782,0.0,7
1,0.012001,0.0,0.0015,0.0,115,112,4,entropy,"{'min_samples_split': 115, 'min_samples_leaf':...",0.782,0.782,0.0,7
2,0.012495,0.0,0.001001,0.0,57,70,5,entropy,"{'min_samples_split': 57, 'min_samples_leaf': ...",0.7825,0.7825,0.0,5
3,0.007499,0.0,0.0015,0.0,58,117,3,entropy,"{'min_samples_split': 58, 'min_samples_leaf': ...",0.782,0.782,0.0,7
4,0.0095,0.0,0.001004,0.0,68,98,5,entropy,"{'min_samples_split': 68, 'min_samples_leaf': ...",0.782,0.782,0.0,7


In [12]:
# Validando o modelo com os dados separados para validação:

SEED=301
np.random.seed(SEED)

cv_score = cross_val_score(busca, x_val, y_val, cv=StratifiedShuffleSplit(n_splits=1, test_size=0.25))

In [14]:
# Verificando o score de validação final:

print(f'Score de validação: {cv_score.mean()*100:.2f}')

Score de validação: 78.80


In [15]:
# Verificando o melhor estimador:
busca.best_estimator_