# Building a Classification Model for the Iris data set


In this Jupyter notebook, we will be building a classification model for the Iris data set using the random forest algorithm.

## 1. Import libraries

In [12]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
import pandas as pd
import matplotlib.pyplot as plt                    # Plot library

## 2. Load the *Iris* data set

In [23]:
# Cargamos el conjunto de datos de entrada

df = pd.read_csv('Data/iris.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'Data/iris.csv'

## 3. Input features
The ***iris*** data set contains 4 input features and 1 output variable (the class label).

### 3.1. Input features

In [53]:
print(iris.feature_names)

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [54]:
len(iris.feature_names)

4

### 3.2. Output features

In [55]:
print(iris.target_names)

['setosa' 'versicolor' 'virginica']


## 4. Glimpse of the data

### 4.1. Input features

In [7]:
iris.data

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

### 4.2. Output variable (the Class label)

In [59]:
iris.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

### 4.3. Assigning *input* and *output* variables
Let's assign the 4 input variables to X and the output variable (class label) to Y

In [60]:
X = iris.data
Y = iris.target

### 4.3. Let's examine the data dimension

In [61]:
X.shape

(150, 4)

In [62]:
Y.shape

(150,)

## 5. Build Classification Model using Random Forest

In [74]:
clf = RandomForestClassifier()

In [72]:
clf.fit(X, Y) #fit will create a classification model and take the input 2 variables input features and the Y class label.

RandomForestClassifier()

## 6. Feature Importance

In [68]:
print(clf.feature_importances_)

[0.10878728 0.02278143 0.4285634  0.43986788]


## 7. Make Prediction

In [28]:
X[0]

array([5.1, 3.5, 1.4, 0.2])

In [29]:
print(clf.predict([[5.1, 3.5, 1.4, 0.2]]))

[0]


In [32]:
print(clf.predict(X[[0]]))

[0]


In [36]:
print(clf.predict_proba(X[[-1]]))

[[0.   0.05 0.95]]


In [37]:
clf.fit(iris.data, iris.target_names[iris.target])

RandomForestClassifier()

## 8. Data split (80/20 ratio)

In [38]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [40]:
X_train.shape, Y_train.shape

((120, 4), (120,))

In [41]:
X_test.shape, Y_test.shape

((30, 4), (30,))

## 9. Rebuild the Random Forest Model

In [42]:
clf.fit(X_train, Y_train)

RandomForestClassifier()

### 9.1. Performs prediction on single sample from the data set

In [43]:
print(clf.predict([[5.1, 3.5, 1.4, 0.2]]))

[0]


In [44]:
print(clf.predict_proba([[5.1, 3.5, 1.4, 0.2]]))

[[1. 0. 0.]]


### 9.2. Performs prediction on the test set

#### *Predicted class labels*

In [45]:
print(clf.predict(X_test))

[0 0 0 0 1 0 2 1 2 2 1 2 1 0 1 1 1 0 2 2 2 1 0 2 0 2 2 2 2 2]


#### *Actual class labels*

In [46]:
print(Y_test)

[0 0 0 0 1 0 2 1 2 2 1 2 1 0 1 1 1 0 2 2 1 1 0 2 0 2 2 2 2 2]


## 10. Model Performance

In [47]:
print(clf.score(X_test, Y_test))

0.9666666666666667
