# Extremely Randomized Forest

In [1]:
!pip install -U scikit-learn

Requirement already up-to-date: scikit-learn in /opt/anaconda3/lib/python3.8/site-packages (0.24.1)


In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits
from sklearn.preprocessing import scale
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [3]:
# Load the dataset from the Excel workbook, skipping its first row
data = pd.read_excel(
    './data/credit.xls',
    skiprows=1,
)

In [4]:
# Target variable: name of the label column and its values as a NumPy array
target = 'default payment next month'
y = data[target].to_numpy()

In [5]:
# Predictor variables: every column except the row identifier and the target
features = data.columns.drop(labels=['ID', target])
x = data[features].to_numpy()

In [6]:
# Training and test sets: hold out 30% of the samples, with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=99,
)

In [7]:
# Classifier: extra-trees ensemble with 500 estimators and a fixed seed
clf = ExtraTreesClassifier(random_state=99, n_estimators=500)

In [8]:
# Model: fit the classifier on the training split
clf.fit(X=x_train, y=y_train)

ExtraTreesClassifier(n_estimators=500, random_state=99)

In [9]:
# Score: 3-fold cross-validated accuracy computed on the training split
scores = cross_val_score(
    estimator=clf,
    X=x_train,
    y=y_train,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)

In [10]:
# Print the mean and standard deviation of the cross-validation accuracies
print("ExtraTreesClassifier -> Acurácia: Média = %0.3f Desvio Padrão = %0.3f" % (scores.mean(), scores.std()))

ExtraTreesClassifier -> Acurácia: Média = 0.812 Desvio Padrão = 0.002


In [11]:
# Make predictions for the test split with the fitted classifier
y_pred = clf.predict(X=x_test)

In [12]:
# Confusion matrix of the true test labels against the predictions
_confusion_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(_confusion_matrix)

[[6532  446]
 [1273  749]]


In [13]:
# Accuracy of the predictions on the test split
accuracy_score(y_true=y_test, y_pred=y_pred)

0.809

## Otimização dos Parâmetros com Randomized Search

O Randomized Search gera amostras dos parâmetros dos algoritmos a partir de uma distribuição randômica uniforme para um número fixo de iterações. Um modelo é construído e testado para cada combinação de parâmetros.

In [14]:
from sklearn.model_selection import RandomizedSearchCV

In [15]:
# Search space: candidate values for each hyperparameter of the classifier
param_dist = {
    'max_depth': [1, 3, 7, 8, 12, None],
    'max_features': [8, 9, 10, 11, 16, 22],
    'min_samples_split': [8, 10, 11, 14, 16, 19],
    'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7],
    'bootstrap': [True, False]
}

# For the classifier created in an earlier cell, try 25 sampled parameter combinations.
# random_state pins the sampling so the search is reproducible from run to run
# (same seed as the train/test split and the classifier).
rsearch = RandomizedSearchCV(clf, param_distributions = param_dist, n_iter = 25, random_state = 99)

# Run the search on the training data
rsearch.fit(x_train, y_train)

# Print the best estimator found by the search
bestclf = rsearch.best_estimator_
print(bestclf)

# Use the best estimator to make predictions for the test split
y_pred = bestclf.predict(x_test)

# Confusion matrix
_confusion_matrix = confusion_matrix(y_test, y_pred)
print(_confusion_matrix)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

ExtraTreesClassifier(max_depth=7, max_features=22, min_samples_leaf=4,
                     min_samples_split=14, n_estimators=500, random_state=99)
[[6651  327]
 [1293  729]]
0.82


In [16]:
# Display the full cross-validation results recorded by the randomized search
rsearch.cv_results_

{'mean_fit_time': array([ 0.8617794 ,  2.82891197,  1.15419421,  2.35185604,  4.792138  ,
         2.01504698,  2.46505451,  3.15514479,  4.82129602,  7.17904058,
         5.47414804,  1.13973179,  4.1660006 ,  1.33798723,  6.07389603,
         1.36740513,  2.97899981, 11.60043378,  2.2287848 ,  2.2996789 ,
         2.03707299,  1.48857117,  3.83514605,  3.79627948,  1.42383313]),
 'std_fit_time': array([0.00540284, 0.01336565, 0.00664007, 0.01149424, 0.01604412,
        0.01079323, 0.01590787, 0.01006008, 0.02611425, 0.06962546,
        0.08144506, 0.02028258, 0.03226233, 0.017388  , 0.03589271,
        0.01502587, 0.00888411, 0.14529175, 0.02062152, 0.01640906,
        0.02127965, 0.02289078, 0.02409919, 0.01736145, 0.0235708 ]),
 'mean_score_time': array([0.07591519, 0.12023025, 0.08575873, 0.0892005 , 0.1235146 ,
        0.12145705, 0.12931747, 0.13076138, 0.27223625, 0.29228177,
        0.27044992, 0.07449679, 0.13440523, 0.08876801, 0.27236991,
        0.0882143 , 0.16979642, 0.2