# Introduction

This notebook provides a simple introduction to implementing the random forest algorithm.

As a case study, we apply a random forest classifier to the iris dataset.

# Load the dataset

In [2]:
from sklearn import datasets

# Load the iris plants dataset bundled with scikit-learn (a three-class
# classification problem). The returned object exposes the feature matrix,
# the labels and their human-readable names as attributes of `iris`.
iris = datasets.load_iris()

In [3]:
# Names of the target classes, i.e. the iris species the model will predict
print(iris.target_names)

['setosa' 'versicolor' 'virginica']


In [4]:
# Names of the measured features used as model inputs
print(iris.feature_names)

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [5]:
# Splitting arrays or matrices into random train and test subsets
from sklearn.model_selection import train_test_split

# Feature matrix X and integer class labels y
X, y = datasets.load_iris(return_X_y=True)

# Hold out 30 % of the samples for testing and train on the remaining 70 %.
# - random_state pins the shuffle so the same rows land in each subset on
#   every run (Restart & Run All yields the same split).
# - stratify=y keeps the class proportions equal in both subsets, which
#   matters for a test set this small.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

# Random forest classifier

In [7]:
# importing the random forest classifier from scikit-learn's ensemble module
from sklearn.ensemble import RandomForestClassifier

In [8]:
# metrics provides the scoring helper used at the end of this cell
from sklearn import metrics

# Create a random forest made of 100 decision trees.
# random_state fixes the bootstrap sampling and per-split feature selection,
# so fitting on the same training data always produces the same forest
# (and therefore the same predictions and feature importances).
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the forest on the training subset
clf.fit(X_train, y_train)

# Predict class labels for the held-out test subset
y_pred = clf.predict(X_test)

# Accuracy = fraction of test samples whose predicted label matches the truth
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y_test, y_pred))


ACCURACY OF THE MODEL:  0.9333333333333333


# Feature Importance

In [11]:
import pandas as pd

# Pair each importance score from the fitted forest with its feature name,
# then rank the features from most to least important.
feature_imp = (
    pd.Series(clf.feature_importances_, index=iris.feature_names)
    .sort_values(ascending=False)
)
feature_imp

petal width (cm)     0.488056
petal length (cm)    0.376865
sepal length (cm)    0.107333
sepal width (cm)     0.027746
dtype: float64

# Additional work: Observing the data

In [14]:
# Assemble the iris measurements into a DataFrame: the four feature columns
# come straight from the data matrix (in its column order), and the integer
# class label is appended as the final `species` column.
data = pd.DataFrame(
    iris.data,
    columns=['sepallength', 'sepalwidth', 'petallength', 'petalwidth'],
).assign(species=iris.target)


In [15]:
# Preview the first rows to sanity-check the columns and values
data.head()

Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth,species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
