# Random Forest and XGBoost

### Imports

In [1]:
import pandas as pd
import numpy as np
import warnings; warnings.simplefilter('ignore')
from time import time

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn import tree
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.model_selection import train_test_split as tts
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

### Loading the digits dataset and splitting it into train and test sets

In [2]:
# Fetch the digits dataset; return_X_y=True yields the (data, target) pair
# directly.
X, y = datasets.load_digits(return_X_y=True)

# Hold out 20% of the samples for the final evaluation. The fixed seed keeps
# the split identical across runs. Note: the test labels are bound to
# `y_true` (not `y_test`), which is the name the later report cells use.
X_train, X_test, y_train, y_true = tts(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Random Forest Classifier and its parameters

In [29]:
# Baseline forest constructed with no arguments, i.e. library defaults only.
randomforestclf = RandomForestClassifier()

# Search space for the forest: 2 * 3 * 5 * 8 = 240 parameter combinations.
# Kept as a one-element list of dicts so the same object can be handed to
# both GridSearchCV (param_grid) and RandomizedSearchCV (param_distributions).
param_grid_rf = [
    {
        'bootstrap': [True, False],
        'max_depth': [1, 2, 3],
        'n_estimators': [10, 50, 100, 200, 500],
        'max_features': list(range(1, 9)),
    }
]

# Last expression of the cell: display the estimator's current parameters.
randomforestclf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

### XGBoost classifier and its parameter grid

In [10]:
# Baseline XGBoost classifier constructed with no arguments.
xgbclf = xgb.XGBClassifier()

# Search space for XGBoost: 3 * 3 * 5 = 45 parameter combinations.
# Kept as a one-element list of dicts, mirroring the random forest grid.
param_grid_xgb = [
    {
        'learning_rate': [0.1, 0.2, 1],
        'max_depth': [1, 2, 3],
        'n_estimators': [10, 50, 100, 200, 500],
    }
]

## Grid Search CV

### Calculating time for hyperparameter search on Random Forest Classifier using Grid Search CV

In [5]:
# Wall-clock timing of an exhaustive search over every combination in
# param_grid_rf, scored with macro-averaged F1. Construction of the search
# object is deliberately inside the timed region, as in the other timing cells.
start = time()
gridsearch = GridSearchCV(
    estimator=randomforestclf,
    param_grid=param_grid_rf,
    scoring='f1_macro',
)
gridsearch.fit(X_train, y_train)
end = time()

elapsed = end - start
print(f"Time taken for Random Forest Classifier using Grid Search: {elapsed}")

Time taken for Random Forest Classifier using Grid Search: 346.6907925605774


### Displaying Best Parameters for Random Forest using Grid Search CV, along with the mean score, standard deviation and fitting time for every parameter combination

In [9]:
# Summarise the fitted `gridsearch`. The name `gridsearch` is reused by the
# XGBoost grid-search cell, so this reports whichever search was fitted last.
print("Best parameters set found on development set:", end="\n\n")
print(gridsearch.best_params_, end="\n\n")
print("Grid scores on development set:", end="\n\n")

cv_results = gridsearch.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
fit_times = cv_results['mean_fit_time']

# One line per candidate: mean CV score, +/- two standard deviations,
# mean fit time, and the parameter dict itself.
row_template = "%0.3f (+/-%0.03f), %0.3f seconds for %r"
for idx, candidate in enumerate(cv_results['params']):
    print(row_template % (means[idx], stds[idx] * 2, fit_times[idx], candidate))
print()

Best parameters set found on development set:

{'bootstrap': False, 'max_depth': 3, 'max_features': 3, 'n_estimators': 200}

Grid scores on development set:

0.416 (+/-0.147), 0.018 seconds for {'bootstrap': True, 'max_depth': 1, 'max_features': 1, 'n_estimators': 10}
0.619 (+/-0.028), 0.075 seconds for {'bootstrap': True, 'max_depth': 1, 'max_features': 1, 'n_estimators': 50}
0.702 (+/-0.063), 0.147 seconds for {'bootstrap': True, 'max_depth': 1, 'max_features': 1, 'n_estimators': 100}
0.718 (+/-0.081), 0.290 seconds for {'bootstrap': True, 'max_depth': 1, 'max_features': 1, 'n_estimators': 200}
0.730 (+/-0.069), 0.736 seconds for {'bootstrap': True, 'max_depth': 1, 'max_features': 1, 'n_estimators': 500}
0.416 (+/-0.135), 0.016 seconds for {'bootstrap': True, 'max_depth': 1, 'max_features': 2, 'n_estimators': 10}
0.707 (+/-0.052), 0.078 seconds for {'bootstrap': True, 'max_depth': 1, 'max_features': 2, 'n_estimators': 50}
0.688 (+/-0.081), 0.150 seconds for {'bootstrap': True, 'max_d

### Calculating time for hyperparameter search on XGBoost Classifier using Grid Search CV

In [13]:
# Wall-clock timing of an exhaustive search over every combination in
# param_grid_xgb, scored with macro-averaged F1. This rebinds `gridsearch`,
# replacing the random forest search object from the earlier cell.
start = time()
gridsearch = GridSearchCV(
    estimator=xgbclf,
    param_grid=param_grid_xgb,
    scoring='f1_macro',
)
gridsearch.fit(X_train, y_train)
end = time()

elapsed = end - start
print(f"Time taken for XGBoost Classifier using Grid Search: {elapsed}")

Time taken for XGBoost Classifier using Grid Search: 448.2015211582184


### Displaying Best Parameters for XGBoost using Grid Search CV, along with the mean score, standard deviation and fitting time for every parameter combination

In [14]:
# Summarise the fitted `gridsearch`. The name is shared with the random
# forest grid search, so run this right after the XGBoost search cell.
print("Best parameters set found on development set for XGBoost:", end="\n\n")
print(gridsearch.best_params_, end="\n\n")
print("Grid scores on development set:", end="\n\n")

cv_results = gridsearch.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
fit_times = cv_results['mean_fit_time']

# One line per candidate: mean CV score, +/- two standard deviations,
# mean fit time, and the parameter dict itself.
row_template = "%0.3f (+/-%0.03f), %0.3f seconds for %r"
for idx, candidate in enumerate(cv_results['params']):
    print(row_template % (means[idx], stds[idx] * 2, fit_times[idx], candidate))
print()

Best parameters set found on development set for XGBoost:

{'learning_rate': 1, 'max_depth': 1, 'n_estimators': 200}

Grid scores on development set:

0.706 (+/-0.012), 0.105 seconds for {'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 10}
0.869 (+/-0.027), 0.448 seconds for {'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 50}
0.909 (+/-0.028), 0.874 seconds for {'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 100}
0.940 (+/-0.026), 1.777 seconds for {'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 200}
0.955 (+/-0.021), 4.364 seconds for {'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 500}
0.833 (+/-0.038), 0.153 seconds for {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 10}
0.925 (+/-0.027), 0.698 seconds for {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 50}
0.950 (+/-0.015), 1.381 seconds for {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 100}
0.953 (+/-0.018), 2.702 seconds for {'learning_rate': 0.1, 'max_depth': 2, 'n_estimat

## Randomized Search CV

### Calculating time for hyperparameter search on Random Forest Classifier using Randomized Search CV

In [28]:
# Number of parameter settings RandomizedSearchCV samples from param_grid_rf.
n_iter = 20

# Wall-clock timing of the randomized search, scored with macro-averaged F1.
start = time()
randomsearch = RandomizedSearchCV(
    estimator=randomforestclf,
    param_distributions=param_grid_rf,
    scoring="f1_macro",
    n_iter=n_iter,
    # Fix which candidates are drawn so timings and best parameters are
    # comparable across reruns (same seed as the train/test split). The
    # forest itself is still unseeded, so individual scores can vary slightly.
    random_state=42,
)
randomsearch.fit(X_train, y_train)
end = time()
print(f"Time taken for Randomized Search CV : {end - start}")

Time taken for Randomized Search CV : 25.610801696777344


### Displaying Best Parameters for Random Forest using Randomized Search CV, along with the mean score, standard deviation and fitting time for each sampled parameter combination, followed by the classification report on the test set

In [22]:
# Summarise the fitted `randomsearch`. The name is reused by the XGBoost
# randomized-search cell, so this reports whichever search was fitted last.
print("Best parameters set found on development set:", end="\n\n")
print(randomsearch.best_params_, end="\n\n")
print("Randomized scores on development set:", end="\n\n")

cv_results = randomsearch.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
fit_times = cv_results['mean_fit_time']

# One line per sampled candidate: mean CV score, +/- two standard deviations,
# mean fit time, and the parameter dict itself.
row_template = "%0.3f (+/-%0.03f), %0.3f seconds for %r"
for idx, candidate in enumerate(cv_results['params']):
    print(row_template % (means[idx], stds[idx] * 2, fit_times[idx], candidate))
print()

# Evaluate on the held-out split: predict through the fitted search object
# and compare against the test labels (`y_true`).
print("Classification Report", end="\n\n")
y_pred = randomsearch.predict(X_test)
print(classification_report(y_true, y_pred), end="\n\n")

Best parameters set found on development set:

{'n_estimators': 500, 'max_features': 5, 'max_depth': 3, 'bootstrap': True}

Randomized scores on development set:

0.724 (+/-0.047), 0.313 seconds for {'n_estimators': 200, 'max_features': 4, 'max_depth': 1, 'bootstrap': True}
0.844 (+/-0.047), 0.090 seconds for {'n_estimators': 50, 'max_features': 5, 'max_depth': 3, 'bootstrap': True}
0.506 (+/-0.210), 0.013 seconds for {'n_estimators': 10, 'max_features': 4, 'max_depth': 1, 'bootstrap': False}
0.867 (+/-0.018), 0.181 seconds for {'n_estimators': 100, 'max_features': 8, 'max_depth': 3, 'bootstrap': False}
0.821 (+/-0.031), 0.221 seconds for {'n_estimators': 200, 'max_features': 1, 'max_depth': 2, 'bootstrap': False}
0.612 (+/-0.058), 0.066 seconds for {'n_estimators': 50, 'max_features': 8, 'max_depth': 1, 'bootstrap': False}
0.888 (+/-0.026), 0.347 seconds for {'n_estimators': 200, 'max_features': 4, 'max_depth': 3, 'bootstrap': True}
0.666 (+/-0.062), 0.325 seconds for {'n_estimators':

### Calculating time for hyperparameter search on XGBoost Classifier using Randomized Search CV

In [26]:
# Number of parameter settings RandomizedSearchCV samples from param_grid_xgb.
n_iter = 20

# Wall-clock timing of the randomized search, scored with macro-averaged F1.
# This rebinds `randomsearch`, replacing any earlier search object.
start = time()
randomsearch = RandomizedSearchCV(
    estimator=xgbclf,
    param_distributions=param_grid_xgb,
    scoring="f1_macro",
    n_iter=n_iter,
    # Fix which candidates are drawn so timings and best parameters are
    # comparable across reruns (same seed as the train/test split).
    random_state=42,
)
randomsearch.fit(X_train, y_train)
end = time()
print(f"Time taken for Randomized Search CV : {end - start}")

Time taken for Randomized Search CV : 230.935227394104


### Displaying Best Parameters for XGBoost Classifier using Randomized Search CV, along with the mean score, standard deviation and fitting time for each sampled parameter combination, followed by the classification report on the test set

In [27]:
# Summarise the fitted `randomsearch`. The name is shared with the random
# forest randomized search, so run this right after the XGBoost search cell.
print("Best parameters set found on development set for XGBoost Classifier:", end="\n\n")
print(randomsearch.best_params_, end="\n\n")
print("Randomized scores on development set:", end="\n\n")

cv_results = randomsearch.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
fit_times = cv_results['mean_fit_time']

# One line per sampled candidate: mean CV score, +/- two standard deviations,
# mean fit time, and the parameter dict itself.
row_template = "%0.3f (+/-%0.03f), %0.3f seconds for %r"
for idx, candidate in enumerate(cv_results['params']):
    print(row_template % (means[idx], stds[idx] * 2, fit_times[idx], candidate))
print()

# Evaluate on the held-out split: predict through the fitted search object
# and compare against the test labels (`y_true`).
print("Classification Report", end="\n\n")
y_pred = randomsearch.predict(X_test)
print(classification_report(y_true, y_pred), end="\n\n")

Best parameters set found on development set for XGBoost Classifier:

{'n_estimators': 500, 'max_depth': 2, 'learning_rate': 0.2}

Randomized scores on development set:

0.946 (+/-0.018), 0.613 seconds for {'n_estimators': 50, 'max_depth': 2, 'learning_rate': 1}
0.957 (+/-0.024), 5.898 seconds for {'n_estimators': 500, 'max_depth': 3, 'learning_rate': 0.2}
0.958 (+/-0.034), 0.865 seconds for {'n_estimators': 100, 'max_depth': 1, 'learning_rate': 1}
0.954 (+/-0.021), 1.351 seconds for {'n_estimators': 100, 'max_depth': 2, 'learning_rate': 0.2}
0.949 (+/-0.027), 4.751 seconds for {'n_estimators': 500, 'max_depth': 3, 'learning_rate': 1}
0.948 (+/-0.020), 1.149 seconds for {'n_estimators': 100, 'max_depth': 2, 'learning_rate': 1}
0.907 (+/-0.021), 0.213 seconds for {'n_estimators': 10, 'max_depth': 3, 'learning_rate': 0.2}
0.953 (+/-0.018), 2.741 seconds for {'n_estimators': 200, 'max_depth': 2, 'learning_rate': 0.1}
0.940 (+/-0.026), 1.760 seconds for {'n_estimators': 200, 'max_depth': 1

## Observations:
* Hyperparameter search for Random Forest using Randomized Search CV will take less time than Grid Search as we can control the number of iterations
* However, for XGBoost, Randomized Search takes roughly 9x more time than for Random Forest with the same number of iterations (20): about 231 s versus about 26 s
* Using Randomized Search CV, we can get a different set of best parameters for different numbers of iterations
* Grid Search CV takes more time for XGBoost than for Random Forest (about 448 s versus about 347 s). Also, if a large number of iterations is used, Randomized Search takes almost the same time as Grid Search CV for both classifiers
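* After getting the means, standard deviations and fitting times for all the parameter combinations based on `f1_macro` scoring, the search reports the combination with the maximum mean score as `best_params_`; the standard deviation and fitting time are useful for choosing between combinations with similar means, but they are not part of that automatic selection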