# Random Forest

* ### Random Forest Explanation

<img src="randomforest.png" width="650"/>

* ### Prediction:

    #### Classification: by majority voting

    #### Regression: by averaging

## Loading Data

In [None]:
import numpy as np
from sklearn.datasets import load_wine

In [None]:
# Fetch the wine dataset shipped with sklearn and unpack features and labels
wine = load_wine()
X, y = wine.data, wine.target

## Fitting and Testing Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Split the dataset into 80% train, 20% test.
# random_state is fixed so the split is reproducible across runs, consistent
# with the seeded split used in the hyperparameter-tuning section.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

In [None]:
############################################ Task 3 ############################################
# Train a random forest classifier using RandomForestClassifier with 20 trees
# and estimate its generalization error on the held-out test set
# ----------------------------------------- start here -----------------------------------------

# Instantiate a random forest classifier
rfc = RandomForestClassifier(...)

# Fit the random forest classifier to the training set
...

# Predict the test set labels
y_pred = ...

# Compute and print the accuracy on the test set
accuracy = ...
print("Accuracy:", accuracy)

## Multiclass Classification Metrics

* ### Precision, recall, F1 score

<img src="metrics.png"/>

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import matplotlib.pyplot as plt

In [None]:
############################################ Task 3 ############################################
# Display the confusion matrix for the predictions
# ----------------------------------------- start here -----------------------------------------

# Create the figure and axes that the confusion matrix is drawn on
fig, ax = plt.subplots(figsize=(3, 3))
# Build the display object, labelling the three classes
cmp = ...(...,display_labels=["class_0", "class_1", "class_2"],)
# Draw the matrix on the prepared axes and show the figure
cmp.plot(ax=ax)
plt.show()

* ### One-vs-Rest (OVR, also called One-vs-All) approach

In [None]:
from sklearn.metrics import classification_report

In [None]:
############################################ Task 3 ############################################
# Check precision, recall and F1-score of the predictions using the classification report
# ----------------------------------------- start here -----------------------------------------

# Print the per-class report; target_names supplies the class labels shown in it
print(...(..., ..., target_names=...))

   * ### OOB_Score (Out Of Bag Score)

In [None]:
############################################ Task 3 ############################################
# Train a random forest classifier using RandomForestClassifier with 20 trees again and
# print both the test set accuracy and the OOB accuracy
# ----------------------------------------- start here -----------------------------------------

# Instantiate rfc
# NOTE: the `...` placeholder must be replaced by a keyword argument; left as-is,
# a positional `...` after `n_estimators=20` is a SyntaxError.
rfc = RandomForestClassifier(n_estimators=20, ...)

# Fit the random forest classifier to the training set
...

# Predict the test set labels
...

# Evaluate the accuracy on the test set
acc_test = ...

# Evaluate the OOB accuracy
acc_oob = ...

# Print acc_test and acc_oob, each formatted to three decimal places
print('Test set accuracy: {:.3f}, OOB accuracy: {:.3f}'.format(acc_test, acc_oob))

## Feature Importance

In [None]:
import pandas as pd

In [None]:
############################################ Task 4 ############################################
# Study the importance of all features with the random forest classifier with 20 trees and
# reorder the features according to their estimated importance.
# ----------------------------------------- start here -----------------------------------------

# Create a pd.Series of feature importances, indexed by the wine feature names
importances = pd.Series(data=..., index=wine.feature_names)

# Sort importances
importances_sorted = ...

# Draw a horizontal bar plot of importances_sorted
importances_sorted.plot(kind='barh')
plt.title('Features Importances')
plt.show()

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
############################################ Task 4 ############################################
# Increase the number of features starting with the most important feature and
# report the cross-validation error versus the number of sorted features
# NOTE(review): the loop below stores the mean of cross_val_score (a score, not an
# error) in `scores` — confirm which quantity is meant to be reported.
# ----------------------------------------- start here -----------------------------------------

# Create a pd.Series of feature importances
importances_ = pd.Series(data=...)

# Sort importances
index_list = ...

# Create a rfc with 20 trees
rfc = RandomForestClassifier(n_estimators=20)

# scores: mean cross-validation score per iteration
# slice_: index labels of the features selected so far
scores = []
slice_ = []

for i in index_list.index:

    # Add the next feature to the running selection
    slice_.append(i)

    # Increase features gradually
    X_slice = wine.data[:,...]
    y_slice = wine.target

    # Calculate the 5-fold cross-validation scores
    score_cv = cross_val_score(..., ..., ..., cv=5)

    # Add the mean cross-validation score to the list
    scores.append(score_cv.mean())

In [None]:
# Plot the mean cross-validation score against the number of features used.
# The x-axis runs from 1 to len(scores) rather than a hard-coded 13, so it always
# matches the number of entries that were actually collected in `scores`.
plt.title("RandomForestClassifier: Varying Number of Features")
plt.plot(np.arange(1, len(scores) + 1), scores, label="Cross_val_score")
plt.xlabel("Number of Features")
plt.ylabel("Cross_val_score")
plt.legend()
plt.grid()
plt.show()

## Hyperparameter Tuning

<img src="compare.png" width="800"/>

In [None]:
# Import GridSearchCV and RandomizedSearchCV
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import time

In [None]:
# Keep only the columns for the first five entries of index_list.index,
# together with the unchanged target labels
X_slice, y_slice = wine.data[:, index_list.index[:5]], wine.target

In [None]:
# Hold out 20% of the reduced dataset for testing; the split is seeded
X_train, X_test, y_train, y_test = train_test_split(
    X_slice,
    y_slice,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Search space for the random-forest hyperparameters:
# tree counts 10..90 and maximum depths 10..40, both in steps of 10
params_rf = {
    'n_estimators': [*range(10, 100, 10)],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [*range(10, 50, 10)],
    'criterion': ['gini', 'entropy'],
}

* ### Grid search

In [None]:
############################################ Task 5 ############################################
# Perform a grid search to find the best combination of parameters.
# Train a random forest classifier for the best combination found, estimate its
# generalization error and print the computation time.
# ----------------------------------------- start here -----------------------------------------

# Record the start time so the elapsed time can be reported at the end
start = ...

# Instantiate GridSearchCV for random forest
# (5-fold cross-validation, refit enabled, n_jobs=-1)
gs_rfc = GridSearchCV(estimator=...,
                      param_grid=...,
                      scoring=...,
                      cv=5,
                      refit=True,
                      n_jobs=-1)

# Fit gs_rfc with training set
...

# Extract the best estimator
best_model = ...

# Predict test set labels
y_pred = ...

# Calculate the accuracy with test data
accuracy = ...

# Print the accuracy
print("Accuracy:", accuracy)

# Record the end time
end = ...
# Print the time used for GridSearchCV
print('Time for grid search: ', ...)

* ### Random search

In [None]:
############################################ Task 5 ############################################
# Perform a random search to find the best combination of parameters.
# Train a random forest classifier for the best combination found, estimate its
# generalization error and print the computation time.
# ----------------------------------------- start here -----------------------------------------

# Record the start time so the elapsed time can be reported at the end
start = ...

# Instantiate RandomizedSearchCV for random forest
# (5-fold cross-validation, refit enabled, n_jobs=-1)
rs_rfc = RandomizedSearchCV(estimator=...,
                            param_distributions=...,
                            scoring=...,
                            cv=5,
                            refit=True,
                            n_jobs=-1)

# Fit rs_rfc with training set
...

# Extract the best estimator
best_model = ...

# Predict test set labels
y_pred = ...

# Calculate the accuracy with test data
accuracy = ...

# Print the accuracy
print("Accuracy:", accuracy)

# Record the end time
end = ...
# Print the time used for RandomizedSearchCV
print('Time for random search: ', ...)