***Importing the Libraries***

In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state = 42)
from sklearn import datasets

***Importing the Iris Data Set***

In [3]:
iris = datasets.load_iris()
df=pd.DataFrame(iris.data)
df.columns=[['SepalLength','SepalWidth','PetalLength','PetalWidth']]
df.head()

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [4]:
# Adding the Species column
Species=pd.Series(iris.target)
df1=pd.concat([df,Species],axis=1)
df1.columns=['SepalLength','SepalWidth','PetalLength','PetalWidth','Species']
df1.head()

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,Species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [55]:
np.unique(df1['Species'])
# There are three unique values in Spercies column
# 0 for setosa
# 1 for versicolor
# 2 for virginica

array([0, 1, 2])

***Defining the Problem***

In [None]:
# Here we want to use SepalLength......Species columns to predict the value of PetalWidth
# So essentially this is a classification problem
# We will be using the Random Forest algorithm to solve this problem

***Dividing the data into Test and Train***

In [6]:
pos=[i == 'PetalWidth' for i in list(df1.columns.values)]
pos

[False, False, False, True, False]

In [7]:
from operator import not_
pos1=list(map(not_, pos))
X=list(df1.columns.values[pos1])
X

['SepalLength', 'SepalWidth', 'PetalLength', 'Species']

In [8]:
Y=list(df1.columns.values[pos])
Y

['PetalWidth']

In [32]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(df1[X], df1[Y], test_size=0.3, random_state=0)
 

***List of Paramters***

In [16]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state = 42)
print('Parameters currently in use:\n')
rf.get_params()


Parameters currently in use:



{'bootstrap': True,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 'warn',
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

***Creation of the Grid***

In [29]:
from sklearn.model_selection import RandomizedSearchCV


In [17]:
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
n_estimators

[200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]

In [18]:
Max_features=['auto', 'sqrt']
Max_features

['auto', 'sqrt']

In [19]:
Max_depth=[int(x) for x in np.linspace(10,110,num=11)]
Max_depth

[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110]

In [20]:
Min_samples_split=[2,5,10]
Min_samples_split

[2, 5, 10]

In [21]:
min_samples_leaf = [1, 2, 4]
min_samples_leaf

[1, 2, 4]

In [24]:
bootstrap=[True,False]
bootstrap

[True, False]

In [25]:
# Creating a Random Grid Search
random_grid= { 
'n_estimators': n_estimators, 
'max_features': Max_features,
'max_depth': Max_depth,
'min_samples_split': Min_samples_split,
'min_samples_leaf': min_samples_leaf ,
'bootstrap':bootstrap
}

In [26]:
# Lets look at the Grid
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [27]:
# Total Combinations: 10*2*11*3*3*2 = 3960 settings
# This means that the Grid will go through 4320 combinations and look for
# Optimised setting and Mean Square Error(mse) is minimum
10*2*11*3*3*2

3960

In [30]:
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid,n_iter = 100,
                               cv = 3, verbose=2, random_state=42, n_jobs = -1)


In [35]:
X_train, X_test, Y_train, Y_test = train_test_split(df1[X], df1[Y], test_size=0.3, random_state=0)
rf_random.fit(X_train, Y_train)


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   34.3s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  2.0min finished
  self.best_estimator_.fit(X, y, **fit_params)


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=100, n_jobs=-1,
          param_distributions={'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [36]:
# Checking the value of parameters
rf_random.best_params_


{'n_estimators': 600,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 110,
 'bootstrap': True}

***Prediction of Test Data Set***

In [37]:
y_pred=rf_random.predict(X_test)
y_pred

array([1.83090338, 1.26697857, 0.2385806 , 1.99457471, 0.2233255 ,
       2.22625835, 0.22342174, 1.4807487 , 1.57111132, 1.28726509,
       1.92108553, 1.45940101, 1.46667549, 1.43081021, 1.46690087,
       0.22395889, 1.43231093, 1.31408725, 0.17248363, 0.27697068,
       1.78813224, 1.40170916, 0.31944064, 0.18114785, 1.76762807,
       0.2155761 , 0.35031072, 1.34419288, 1.05958792, 0.22067286,
       2.15310063, 1.40044098, 0.24080351, 1.78425275, 1.97135406,
       1.16438067, 0.35328609, 1.62555068, 1.30431986, 1.2047076 ,
       2.05328367, 0.19873926, 1.98777677, 0.22597164, 0.28670124])

***R Square***

In [76]:
from sklearn.metrics import r2_score
y_true=np.array(Y_test)
RSqr=r2_score(y_pred,y_true)
RSqr

0.9255148951210833

***Adjusted R Square***

In [82]:
n=df1.shape[0]
p=X_train.shape[1]
Adj_r2 = 1-(1-RSqr)*(n-1)/(n-p-1)
Adj_r2

0.9234601336071822

***Mean Square Error***

In [83]:
sklearn.metrics.mean_squared_error(y_true, y_pred)

0.03571883420426498