# Random Forests 

In [None]:
!pip install scikit-learn==0.23.2

In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_digits
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, plot_confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import plot_tree

In [None]:
# Load the digits dataset; as_frame=True asks for pandas containers for the
# tabular parts of the returned bunch.
data = load_digits(as_frame=True)

# Features, labels and the raw images, kept under separate names.
X = data.data
y = data.target
images = data.images

In [None]:
# Preview the first rows of the feature table.
X.head()

In [None]:
# Summary statistics for every feature column.
X.describe()

In [None]:
# Number of samples per label, ordered by label value rather than by count.
y.value_counts().sort_index()

In [None]:
# Class names reported by the dataset.
data.target_names

In [None]:
# Split features, labels and images in a single call so the rows of every
# returned pair stay aligned. 60% of the samples go to the training set and
# the fixed seed makes the shuffle reproducible.
(
    X_train, X_test,
    y_train, y_test,
    images_train, images_test,
) = train_test_split(
    X,
    y,
    images,
    train_size=0.6,
    random_state=0,
)

In [None]:
# Show the first training images, one per axis, titled with their label.
fig, axes = plt.subplots(1, 4, figsize=(10, 4))
fig.suptitle("Dados de treino")
# zip stops at the shortest iterable, so only as many samples as there are
# axes are drawn.
for ax, image, label in zip(axes, images_train, y_train):
    ax.set_axis_off()
    ax.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    # f-string for consistency with the prediction plots further down.
    ax.set_title(f'Label: {label}')

In [None]:
# Forest of 200 trees, each limited to depth 3 and split on entropy; the seed
# is fixed so that repeated fits give the same forest.
model = RandomForestClassifier(
    n_estimators=200,
    criterion="entropy",
    max_depth=3,
    random_state=0,
)

In [None]:
# Train the forest on the training split.
model.fit(X_train, y_train)

In [None]:
# Predict a label for every sample of the held-out test split.
y_pred = model.predict(X_test)

In [None]:
# Fraction of test samples whose predicted label matches the true label.
accuracy_score(y_test, y_pred)

In [None]:
# Confusion matrix of the fitted model on the test split, drawn on an
# explicitly created axis so the figure size can be controlled.
fig, ax = plt.subplots(figsize=(11, 10))
disp = plot_confusion_matrix(model, X_test, y_test, ax=ax)

# The display draws on the axis it was given, so `fig` is the figure it uses.
# The trailing semicolon keeps the notebook from echoing the returned object.
fig.suptitle("Matriz de Confusão");

In [None]:
# Show the first test images that the model classified correctly.
fig, axes = plt.subplots(1, 4, figsize=(10, 4))
fig.suptitle("Predições corretas")
# Build the selection mask once instead of re-evaluating the comparison for
# each of the three sequences that are filtered with it.
correct = y_pred == y_test
# zip stops at the shortest iterable, so at most len(axes) samples are drawn.
for ax, image, pred, label in zip(axes, images_test[correct], y_pred[correct], y_test[correct]):
    ax.set_axis_off()
    ax.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    ax.set_title(f'Pred {pred}/Label {label}')

In [None]:
# Show the first test images that the model misclassified.
fig, axes = plt.subplots(1, 4, figsize=(10, 4))
fig.suptitle("Predições erradas")
# Build the selection mask once instead of re-evaluating the comparison for
# each of the three sequences that are filtered with it.
wrong = y_pred != y_test
# zip stops at the shortest iterable; with fewer misclassified samples than
# axes, the remaining axes simply stay empty.
for ax, image, pred, label in zip(axes, images_test[wrong], y_pred[wrong], y_test[wrong]):
    ax.set_axis_off()
    ax.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    ax.set_title(f'Pred {pred}/Label {label}')

In [None]:
# Draw the first `num_trees` trees of the forest, one per subplot row.
# This may not be the best way to view each estimator, as the rendering of
# each tree is small.
num_trees = 4
fn = data.feature_names
cn = [str(t) for t in data.target_names]
# squeeze=False makes plt.subplots always return a 2-D array of axes, so the
# cell also works for num_trees == 1 (which would otherwise yield a bare Axes
# that cannot be indexed). ravel() restores the flat, one-axis-per-tree layout.
fig, axes = plt.subplots(num_trees, 1, figsize=(16, 25), squeeze=False)
axes = axes.ravel()
for index, ax in enumerate(axes):
    plot_tree(model.estimators_[index],
              feature_names=fn,
              class_names=cn,
              filled=True,
              ax=ax,
              fontsize=9)
    ax.set_title(f'Estimator: {index}', fontsize=15)

In [None]:
# Horizontal bar chart with one bar per feature; the bar length is the
# importance the fitted forest reports for that feature.
fig, ax = plt.subplots(figsize=(9, 15))
ax.barh(y=data.feature_names, width=model.feature_importances_)