## Importing libaries 

In [90]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

## Loading iris dataset 

In [91]:
# using the datasets sub module to load the iris dataset
iris = datasets.load_iris()

In [92]:
iris

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

## Feature names and target variable

In [93]:
# these are the target variables for the target names 
# 0 corresponds to setosa, 1 - versicolor and 2 - virginica 
# this is what we want to predict 
iris.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [94]:
# these are the names of the target variables 
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [95]:
# there are 4 features 
# the features are sepal length, sepal width, petal length and petal width
# this information is important for target prediction
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

## The dataset 

In [96]:
iris.data

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

## Assign input and output variables 

In [97]:
# the measurements for the feature variables
X = iris.data
# the values for the target variables 
Y = iris.target

## Examining the dimensions of the feature and target variables

In [98]:
# 150 rows, 150 flowers
# 4 columns that represent the 4 classes
X.shape

(150, 4)

In [99]:
# 150 rows
# 1 column that represents target variables
Y.shape

(150,)

## Let's take a look at the feature dataset and the target dataset 

In [100]:
X

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [101]:
Y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

## Building a classification model using random forest

In [102]:
# defining the classifier variable
clf = RandomForestClassifier()

In [103]:
# the classifier is being called, in this case it is the random forest classifier 
# the fit function is used to create a classification model
# 2 variables are taken as arguments, the X variable which consists of the input features
# and the Y variable with consists of the target label
# the model is performed in a supervised manner
# it learns from the class label and outputs the classification model
clf.fit(X,Y)

RandomForestClassifier()

## Feature Importance

In [104]:
# 4 input features - sepal length, sepal width, petal length and petal width
# each of them have a corresponding importance to a different degree to the classification model 
# the values are in order for the respective input features we have
# the most important feature seems to be the last variable followed by the third one, then the first one 
# and the second one after that 
# petal width, petal length, sepal length and sepal width
print(clf.feature_importances_)

[0.09427867 0.02497759 0.42427526 0.45646847]


## Making the prediction

In [105]:
# let's feed in the first data sample, the first flower
X[0]

array([5.1, 3.5, 1.4, 0.2])

In [106]:
# we're taking the values from the first flower as an array argument for
# the prediction function of the classifier
# 0 is the value of the first class
print(clf.predict([[5.1, 3.5, 1.4, 0.2]]))

[0]


In [107]:
# let's see what the first class value is from the target data
# the prediction function was able to predict this value
# based on the input features from the first flower
# 0 - setosa
Y[0]

0

In [108]:
print(clf.predict([X[0]]))

[0]


In [109]:
# probability for each of the three classes is shown below
# a 100% of the probability goes to predicting the first class
# this means setosa or 0 is the correct class for this input feature
print(clf.predict_proba(X[[0]]))

[[1. 0. 0.]]


In [110]:
clf.fit(iris.data, iris.target_names[iris.target])

RandomForestClassifier()

## Data splitting (80-20 ratio)

In [111]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [112]:
X_train.shape, Y_train.shape

((120, 4), (120,))

In [113]:
X_test.shape, Y_test.shape

((30, 4), (30,))

## Rebuilding Classification Model using Random Forest

In [114]:
clf.fit(X_train, Y_train)

RandomForestClassifier()

## Performing prediction on a single sample from the dataset

In [115]:
# using the first flower as the input
print(clf.predict([[5.1, 3.5, 1.4, 0.2]]))

[0]


In [116]:
# the probability is 100% for the first label
# 0 - setosa
# the first flower is 100% setosa
print(clf.predict_proba([[5.1, 3.5, 1.4, 0.2]]))

[[1. 0. 0.]]


## Performing prediction on the test set

In [117]:
# makes predictions for 30 flowers from the test dataset
print(clf.predict(X_test))

[1 0 1 0 2 0 1 1 2 0 2 1 1 1 0 1 2 1 0 2 2 2 1 1 2 2 0 1 0 0]


## Comparing the predictions with the actual values

In [118]:
print(Y_test)

[2 0 1 0 2 0 1 1 2 0 2 1 1 1 0 1 2 1 0 2 2 2 1 1 2 2 0 1 0 0]


## Model performance

In [119]:
print(clf.score(X_test, Y_test))

0.9666666666666667
