# Ensemble: RandomForest & AdaBoost

Avalia a performance da classificação da base de dados sonar com os métodos de combinação de classificadores.

Este notebook foi desenvolvido para o ambiente GOOGLE COLAB ([colab.research.google.com](https://colab.research.google.com)).

Prof. Cristiano Carvalho

-------------------------------------------------------------------------------

### Base de dados: Sonar, Mines vs. Rocks

https://archive.ics.uci.edu/ml/datasets/Connectionist+Bench+%28Sonar,+Mines+vs.+Rocks%29

208 instâncias

60 atributos

2 classes (rocha, mina)


In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

from sklearn import tree

from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.preprocessing import LabelEncoder



### Carga de dados

90% da base para treinamento (187 registros)

10% da base para teste (21 registros)

In [None]:
# Turn the Google Sheets sharing link into a direct-download link: the file id
# is the second-to-last path segment of the sharing URL.
download_url = 'https://drive.google.com/uc?export=download&id='
url_drive_file = (
    'https://docs.google.com/spreadsheets/d/1cGZN3X8ydgwbbsaiQK5_'
    'yUwf3VTLH19W/edit?usp=sharing&ouid=114919786921075985733&rtpof=true&sd=true'
)

file_id = url_drive_file.rsplit('/', 2)[-2]
download_path = f"{download_url}{file_id}"

# Read the first worksheet of the downloaded workbook into a DataFrame.
sonar = pd.read_excel(download_path, sheet_name=0)

# Preview the first rows as the cell output.
sonar.head()


Unnamed: 0,Atributo_1,Atributo_2,Atributo_3,Atributo_4,Atributo_5,Atributo_6,Atributo_7,Atributo_8,Atributo_9,Atributo_10,...,Atributo_52,Atributo_53,Atributo_54,Atributo_55,Atributo_56,Atributo_57,Atributo_58,Atributo_59,Atributo_60,Classe
0,0.02,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,...,0.0027,0.0065,0.0159,0.0072,0.0167,0.018,0.0084,0.009,0.0032,Rocha
1,0.0453,0.0523,0.0843,0.0689,0.1183,0.2583,0.2156,0.3481,0.3337,0.2872,...,0.0084,0.0089,0.0048,0.0094,0.0191,0.014,0.0049,0.0052,0.0044,Rocha
2,0.0262,0.0582,0.1099,0.1083,0.0974,0.228,0.2431,0.3771,0.5598,0.6194,...,0.0232,0.0166,0.0095,0.018,0.0244,0.0316,0.0164,0.0095,0.0078,Rocha
3,0.01,0.0171,0.0623,0.0205,0.0205,0.0368,0.1098,0.1276,0.0598,0.1264,...,0.0121,0.0036,0.015,0.0085,0.0073,0.005,0.0044,0.004,0.0117,Rocha
4,0.0762,0.0666,0.0481,0.0394,0.059,0.0649,0.1209,0.2467,0.3564,0.4459,...,0.0031,0.0054,0.0105,0.011,0.0015,0.0072,0.0048,0.0107,0.0094,Rocha


In [None]:
# Features: every column except the last one.
# NOTE(review): if the sheet carries an exported index column (e.g. "Unnamed: 0"),
# it would be included among the features here — confirm against the workbook.
X = sonar.iloc[:, :-1]

# Target: the last column, with its labels encoded as integers.
# `classes_` keeps the original label names for the reports printed later.
le = LabelEncoder()
y = le.fit_transform(sonar.iloc[:, -1])
class_names = le.classes_

# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

### Árvore de decisão tradicional

In [None]:

# Baseline model: a single decision tree fitted on the training split.
clf = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Accuracy on both splits, computed up front so the report below is pure printing.
train_acc = clf.score(X_train, y_train)
test_acc = clf.score(X_test, y_test)
rule = "--------------------------------------"

print("Classificador Árvore de Decisão:\n")
print(rule)
print(f"Acurácia da base de treinamento: {train_acc:.2f}")
print(rule)

print(rule)
print(f"Acurácia da base de teste: {test_acc:.4f}")
print(rule)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred, target_names=class_names))

# Confusion matrix: rows are the true classes, columns the predicted ones.
cnf_matrix = confusion_matrix(y_test, y_pred)
cnf_table = pd.DataFrame(
    cnf_matrix,
    index=class_names,
    columns=[f"{label}(prev)" for label in class_names],
)
print(cnf_table)


Classificador Árvore de Decisão:

--------------------------------------
Acurácia da base de treinamento: 1.00
--------------------------------------
--------------------------------------
Acurácia da base de teste: 0.7143
--------------------------------------
              precision    recall  f1-score   support

        Mina       0.77      0.77      0.77        13
       Rocha       0.62      0.62      0.62         8

    accuracy                           0.71        21
   macro avg       0.70      0.70      0.70        21
weighted avg       0.71      0.71      0.71        21

       Mina(prev)  Rocha(prev)
Mina           10            3
Rocha           3            5


### Random Forest e ExtraTrees

Número de estimadores: 100 (Random Forest) e 10 (ExtraTrees)

In [None]:
# Random forest: bagging of decision trees.
# The ensemble size is kept in one variable so the printed label below always
# describes the model that was actually fitted (the label used to be hard-coded
# to 10 while the model was built with 100 trees).
rf_n_estimators = 100

clr = RandomForestClassifier(n_estimators=rf_n_estimators, random_state=0)

clr = clr.fit(X_train, y_train)
y_pred = clr.predict(X_test)


print("Classificador Random Forest:\n RandomForestClassifier(n_estimators={})\n".format(rf_n_estimators))
print("--------------------------------------")
print("Acurácia da base de treinamento: {:.2f}".format(clr.score(X_train, y_train)))
print("--------------------------------------")

print("--------------------------------------")
print("Acurácia da base de teste: {:.4f}".format(clr.score(X_test, y_test)))
print("--------------------------------------")

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred, target_names=class_names))

# Confusion matrix: rows are the true classes, columns the predicted ones.
cnf_matrix = confusion_matrix(y_test, y_pred)
cnf_table = pd.DataFrame(data=cnf_matrix, index=class_names, columns=[x + "(prev)" for x in class_names])
print(cnf_table)


Classificador Random Forest:
 RandomForestClassifier(n_estimators=10)

--------------------------------------
Acurácia da base de treinamento: 1.00
--------------------------------------
--------------------------------------
Acurácia da base de teste: 0.9048
--------------------------------------
              precision    recall  f1-score   support

        Mina       1.00      0.85      0.92        13
       Rocha       0.80      1.00      0.89         8

    accuracy                           0.90        21
   macro avg       0.90      0.92      0.90        21
weighted avg       0.92      0.90      0.91        21

       Mina(prev)  Rocha(prev)
Mina           11            2
Rocha           0            8


In [None]:
# ExtraTreesClassifier: the candidate variables for each node are chosen at random,
# and the split point on each of those variables is also drawn at random.

cle = ExtraTreesClassifier(n_estimators=10, random_state=10).fit(X_train, y_train)
y_pred = cle.predict(X_test)

# Accuracy on both splits, computed up front so the report below is pure printing.
train_acc = cle.score(X_train, y_train)
test_acc = cle.score(X_test, y_test)
rule = "--------------------------------------"

print("Classificador Extreme Tree:\n ExtraTreesClassifier(n_estimators=10)\n")
print(rule)
print(f"Acurácia da base de treinamento: {train_acc:.2f}")
print(rule)

print(rule)
print(f"Acurácia da base de teste: {test_acc:.4f}")
print(rule)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred, target_names=class_names))

# Confusion matrix: rows are the true classes, columns the predicted ones.
cnf_matrix = confusion_matrix(y_test, y_pred)
cnf_table = pd.DataFrame(
    cnf_matrix,
    index=class_names,
    columns=[f"{label}(prev)" for label in class_names],
)
print(cnf_table)


Classificador Extreme Tree:
 ExtraTreesClassifier(n_estimators=10)

--------------------------------------
Acurácia da base de treinamento: 1.00
--------------------------------------
--------------------------------------
Acurácia da base de teste: 0.9524
--------------------------------------
              precision    recall  f1-score   support

        Mina       0.93      1.00      0.96        13
       Rocha       1.00      0.88      0.93         8

    accuracy                           0.95        21
   macro avg       0.96      0.94      0.95        21
weighted avg       0.96      0.95      0.95        21

       Mina(prev)  Rocha(prev)
Mina           13            0
Rocha           1            7


### AdaBoost

Estimador: DecisionTreeClassifier (max_depth=4)

Número de estimadores: 100.

In [None]:
# AdaBoost: boosting over shallow decision trees.
# Both hyperparameters live in variables so the printed label below always
# describes the model that was actually fitted (the label used to claim
# max_depth=1 / n_estimators=20 while the model used 4 / 100).
ada_max_depth = 4      # shallow weak learner; a depth of 1 would be a decision stump
ada_n_estimators = 100

# NOTE(review): recent scikit-learn releases deprecate the `algorithm` argument
# (and dropped "SAMME.R") — confirm against the version installed in the runtime.
ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=ada_max_depth),
                         algorithm="SAMME",
                         n_estimators=ada_n_estimators,
                         random_state=52)

ada.fit(X_train, y_train)
y_pred = ada.predict(X_test)
print("Classificador AdaBoost:\n AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth={}), algorithm=\"SAMME\", n_estimators={})\n".format(ada_max_depth, ada_n_estimators))

print("--------------------------------------")
print("Acurácia da base de treinamento: {:.4f}".format(ada.score(X_train, y_train)))
print("--------------------------------------")

print("--------------------------------------")
print("Acurácia da base de teste: {:.4f}".format(ada.score(X_test, y_test)))
print("--------------------------------------")

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred, target_names=class_names))

# Confusion matrix: rows are the true classes, columns the predicted ones.
cnf_matrix = confusion_matrix(y_test, y_pred)
cnf_table = pd.DataFrame(data=cnf_matrix, index=class_names, columns=[x + "(prev)" for x in class_names])
print(cnf_table)

Classificador AdaBoost:
 AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=20)

--------------------------------------
Acurácia da base de treinamento: 1.0000
--------------------------------------
--------------------------------------
Acurácia da base de teste: 0.9048
--------------------------------------
              precision    recall  f1-score   support

        Mina       1.00      0.85      0.92        13
       Rocha       0.80      1.00      0.89         8

    accuracy                           0.90        21
   macro avg       0.90      0.92      0.90        21
weighted avg       0.92      0.90      0.91        21

       Mina(prev)  Rocha(prev)
Mina           11            2
Rocha           0            8


In [None]:
# Extra: Tentem aplicar o algoritmo de boosting XGBoost ou GradientBoosting https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html#sklearn.ensemble.GradientBoostingClassifier
# LightGBM
# CatBoost
# Tragam as dificuldades pra próxima aula

In [None]:
# AutoML TPOT também é bem interessante! https://machinelearningmastery.com/tpot-for-automated-machine-learning-in-python/

In [None]:
# CHAT-GPT aprecie com moderação