# Extremely Randomized Forest

In [1]:
!pip install -U scikit-learn

Requirement already up-to-date: scikit-learn in /opt/anaconda3/lib/python3.8/site-packages (0.24.1)


In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits
from sklearn.preprocessing import scale
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [3]:
# Carregando o dataset
data = pd.read_excel('./data/credit.xls', skiprows = 1)

In [4]:
# Variável target
target = 'default payment next month'
y = np.asarray(data[target])

In [5]:
# Variáveis preditoras
features = data.columns.drop(['ID', target])
x = np.asarray(data[features])

In [6]:
# Dataset de treino e de teste
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.30, random_state = 99)

In [7]:
# Classificador
clf = ExtraTreesClassifier(n_estimators = 500, random_state = 99)

In [8]:
# Modelo
clf.fit(x_train, y_train)

ExtraTreesClassifier(n_estimators=500, random_state=99)

In [9]:
# Score
scores = cross_val_score(clf, x_train, y_train, cv = 3, scoring = 'accuracy', n_jobs = -1)

In [10]:
# Imprimindo o resultado
print("ExtraTreesClassifier -> Acurácia: Média = %0.3f Desvio Padrão = %0.3f" % (np.mean(scores), np.std(scores)))

ExtraTreesClassifier -> Acurácia: Média = 0.812 Desvio Padrão = 0.002


In [11]:
# Fazendo previsões
y_pred = clf.predict(x_test)

In [12]:
# Confusion Matrix
_confusion_matrix = confusion_matrix(y_test, y_pred)
print(_confusion_matrix)

[[6532  446]
 [1273  749]]


In [13]:
# Acurácia
accuracy_score(y_test, y_pred)

0.809

## Otimização dos Parâmetros com Randomized Search

O Randomized Search gera amostras dos parâmetros dos algoritmos a partir de uma distribuição randômica uniforme para um número fixo de interações. Um modelo é construído e testado para cada combinação de parâmetros.

In [14]:
from sklearn.model_selection import RandomizedSearchCV

In [15]:
# Definição dos parâmetros
param_dist = {
    'max_depth': [1, 3, 7, 8, 12, None],
    'max_features': [8, 9, 10, 11, 16, 22],
    'min_samples_split': [8, 10, 11, 14, 16, 19],
    'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7],
    'bootstrap': [True, False]
}

# Para o classificador criado na célula anterior, testamos diferentes combinações de parâmetros
rsearch = RandomizedSearchCV(clf, param_distributions = param_dist, n_iter = 25)

# Aplicando o resultado ao conjunto de dados de treino e obtendo o score
rsearch.fit(x_train, y_train)
rsearch.cv_results_

# Imprimindo o melhor estmador
bestclf = rsearch.best_estimator_
print(bestclf)

# Aplicando o melhor estimador para realizar as previsões
y_pred = bestclf.predict(x_test)

# Confusion Matrix
_confusion_matrix = confusion_matrix(y_test, y_pred)
print(_confusion_matrix)

# Acurácia
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

ExtraTreesClassifier(bootstrap=True, max_depth=8, max_features=22,
                     min_samples_split=8, n_estimators=500, random_state=99)
[[6654  324]
 [1288  734]]
0.8208888888888889


In [16]:
rsearch.cv_results_

{'mean_fit_time': array([ 1.16590271,  5.67750726,  3.17332258,  1.39087658,  1.07060847,
         0.85585608,  1.03839827,  4.58688703,  3.73968358,  4.09802809,
         4.77679162,  1.35221643,  4.83037777,  2.95000601,  4.11074038,
         1.90570402,  2.58693485, 12.11373625,  2.32303567,  5.0761734 ,
         4.31432385,  0.86821613,  3.015976  ,  5.84362898,  4.6401155 ]),
 'std_fit_time': array([0.01499334, 0.04027128, 0.01896438, 0.00446779, 0.0102452 ,
        0.00772747, 0.01020301, 0.02057054, 0.00433722, 0.02586097,
        0.01935777, 0.00751487, 0.02912392, 0.01042506, 0.01988448,
        0.01172427, 0.01304821, 0.16047113, 0.01658516, 0.02955787,
        0.02077433, 0.01395256, 0.01416137, 0.01855262, 0.02450287]),
 'mean_score_time': array([0.07581692, 0.28871322, 0.14301872, 0.08900795, 0.07570562,
        0.0764461 , 0.07566748, 0.25169253, 0.12222066, 0.12988534,
        0.12232981, 0.08788161, 0.25260243, 0.12790947, 0.13061261,
        0.08848081, 0.12821217, 0.2

## Grid Search x Randomized Search para Estimação dos Hiperparâmetros

O Grid Search realiza metodicamente combinações entre todos os parâmetros do algoritmo, criando um grid.

In [17]:
import numpy as np
from time import time
from scipy.stats import randint as sp_randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_digits

In [18]:
# Obtém o dataset
digits = load_digits()
x, y = digits.data, digits.target

In [19]:
# Construindo o classificador
clf = RandomForestClassifier(n_estimators = 20)

In [20]:
# Randomized Search

# Valores dos parâmetros que serão testados
param_dist = {
    'max_depth': [3, None],
    'max_features': sp_randint(1, 11),
    'min_samples_leaf': sp_randint(1, 11),
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy']
}

# Executando o Randomized Search
n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions =  param_dist, n_iter = n_iter_search)

start = time()
random_search.fit(x, y)
print("RandomizedSearchCV executou em %.2f segundos para %d combinações de candidatos a parâmetros do modelo."
     % ((time() - start), n_iter_search))

# Imprime as combinações dos parâmetros e suas respectivas médias de acurácia
random_search.cv_results_

RandomizedSearchCV executou em 2.96 segundos para 20 combinações de candidatos a parâmetros do modelo.


{'mean_fit_time': array([0.0371727 , 0.02590742, 0.01444125, 0.05625634, 0.02339063,
        0.02211781, 0.02335062, 0.03215342, 0.02758598, 0.03376198,
        0.02139378, 0.01732135, 0.031566  , 0.0373374 , 0.02164164,
        0.02230115, 0.0242578 , 0.02152739, 0.01858125, 0.02169013]),
 'std_fit_time': array([1.13027878e-03, 2.72163412e-04, 1.46533851e-04, 4.94228118e-04,
        1.98976987e-04, 6.25464957e-05, 8.73245728e-05, 3.41496758e-04,
        2.16608967e-04, 3.74425490e-04, 2.94253038e-04, 1.19390826e-04,
        5.51006095e-04, 4.92543532e-04, 1.46427646e-04, 3.65756928e-04,
        2.41705910e-04, 3.20203256e-04, 3.58615579e-04, 1.81089921e-04]),
 'mean_score_time': array([0.00229764, 0.00203414, 0.00199161, 0.00234041, 0.00203733,
        0.00205193, 0.00203347, 0.00230417, 0.00229225, 0.00228882,
        0.00198531, 0.00204277, 0.0024004 , 0.00224075, 0.00202436,
        0.00227966, 0.00201583, 0.00201149, 0.00207167, 0.00202422]),
 'std_score_time': array([3.08521380e-

In [21]:
# Grid Search

# Usando um grid completo de todos os parâmetros
# Valores dos parâmetros que serão testados
param_grid = {
    'max_depth': [3, None],
    'max_features': [1, 3, 10],
    'min_samples_leaf': [1, 3, 10],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy']
}

# Executando o Grid Search
grid_search = GridSearchCV(clf, param_grid = param_grid)

start = time()
grid_search.fit(x, y)

print("GridSearchCV executou em %.2f segundos para %d combinações de candidatos a parâmetros do modelo."
     % ((time() - start), n_iter_search))

# Imprime as combinações dos parâmetros e suas respectivas médias de acurácia
grid_search.cv_results_

GridSearchCV executou em 10.42 segundos para 20 combinações de candidatos a parâmetros do modelo.


{'mean_fit_time': array([0.0192286 , 0.01859665, 0.01852241, 0.02024679, 0.02040157,
        0.02011104, 0.02493949, 0.02493315, 0.02488046, 0.02604823,
        0.02185926, 0.01992106, 0.02922196, 0.02650123, 0.02383208,
        0.04196424, 0.03914785, 0.03489099, 0.01901522, 0.01885972,
        0.01883817, 0.02089663, 0.02078795, 0.02064333, 0.02679682,
        0.02681994, 0.02661762, 0.02868838, 0.02309661, 0.02046719,
        0.03300486, 0.0295711 , 0.02559352, 0.04855795, 0.04511037,
        0.03891077, 0.01454282, 0.01450543, 0.01462002, 0.0168519 ,
        0.01669283, 0.01667662, 0.02411914, 0.02390738, 0.02395453,
        0.02561903, 0.01959801, 0.01634698, 0.03067055, 0.02703376,
        0.02347474, 0.05042782, 0.0472002 , 0.04088564, 0.01457224,
        0.01443357, 0.01462831, 0.01722293, 0.01728702, 0.01714873,
        0.02612267, 0.02609158, 0.02600026, 0.02902699, 0.02138586,
        0.01749973, 0.03552384, 0.03104587, 0.0256546 , 0.05890303,
        0.05468178, 0.04619327]