<a href="https://colab.research.google.com/github/look4pritam/ArtificialIntelligence/blob/master/MachineLearning/RandomForest/Notebooks/HyperparameterSelection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Random Forest Classifier

In this example, we will learn to classify a dataset using a [Random Forest Classifier](https://en.wikipedia.org/wiki/Random_forest).

We will use an iris dataset for classification.

See [link](https://archive.ics.uci.edu/ml/datasets/iris) for more details.

# Set the root directory for processing.

In [1]:
import os

root_dir = '/content/'
os.chdir(root_dir)

!ls -al

total 16
drwxr-xr-x 1 root root 4096 Feb 29 14:23 .
drwxr-xr-x 1 root root 4096 Mar  5 05:57 ..
drwxr-xr-x 4 root root 4096 Feb 29 14:22 .config
drwxr-xr-x 1 root root 4096 Feb 29 14:23 sample_data


# Import required python modules.

In [2]:
import numpy as np
np.random.seed(7)

In [3]:
import pandas as pd

In [4]:
from sklearn import datasets

# Load an iris dataset.

### Define feature names.

In [5]:
feature_names = ['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width']

### Load an iris dataset from sklearn datasets.

In [6]:
iris_dataset = datasets.load_iris()

### Define input variables (X) and an output variable (y).

In [7]:
features = pd.DataFrame(iris_dataset.data, columns = feature_names)
labels = iris_dataset.target

# Create training and testing datasets.

In [8]:
from sklearn.model_selection import train_test_split

train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.2, random_state = 7)

# Select hyper-parameters and their corrosponding range.

### Number of trees in the random forest.

In [9]:
n_estimators = [5, 20, 50, 100]

### Number of features considred at every split.

In [10]:
max_features = ['auto', 'sqrt']

### Maximum number of levels allowed in each decision tree.

In [11]:
max_depth = [int(x) for x in np.linspace(10, 120, num = 12)]

### Minimum sample number to split a node.

In [12]:
min_samples_split = [2, 6, 10]

### Minimum sample number that can be stored in a leaf node.

In [13]:
min_samples_leaf = [1, 3, 4]

### Method used to sample data points.

In [14]:
bootstrap = [True, False]

In [15]:
random_grid = {
'n_estimators': n_estimators,
 'max_features': max_features,
'max_depth': max_depth,
'min_samples_split': min_samples_split,
'min_samples_leaf': min_samples_leaf,
'bootstrap': bootstrap
 }

# Import required python modules.

In [16]:
from sklearn.ensemble import RandomForestClassifier

# Create a random forest based classfier.

In [17]:
model = RandomForestClassifier(random_state=7)

# Create a randomized search for the classifier.

In [18]:
from sklearn.model_selection import RandomizedSearchCV

random_model = RandomizedSearchCV(estimator=model,
                                  param_distributions=random_grid,
                                  n_iter = 100,
                                  cv=5,
                                  verbose=2,
                                  random_state=7,
                                  n_jobs=-1)

random_model.fit(train_features,train_labels)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


# Show best parameters for the classifier.

In [19]:
print('Random grid - ', random_grid)
print('Best parameters - ', random_model.best_params_)

Random grid -  {'n_estimators': [5, 20, 50, 100], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120], 'min_samples_split': [2, 6, 10], 'min_samples_leaf': [1, 3, 4], 'bootstrap': [True, False]}
Best parameters -  {'n_estimators': 20, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 90, 'bootstrap': True}


# Create a random forest classifier using the best parameters.

In [20]:
model = RandomForestClassifier(n_estimators = 20, min_samples_split = 10, min_samples_leaf= 1, max_features = 'auto', max_depth= 80, bootstrap=True, random_state=7)
model.fit(train_features,train_labels)

  warn(


# Evaluate accuracy of the classfier.

In [21]:
from sklearn import metrics

predictions = model.predict(test_features)
print('Accuracy - ', metrics.accuracy_score(test_labels, predictions))

Accuracy -  0.9
