# Predicting the type of flower

In [2]:
# Importing Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# ML libraries
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [3]:
iris = pd.read_csv('http://bit.ly/IrisDataset')
iris

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [5]:
iris['species'].value_counts()

Iris-virginica     50
Iris-versicolor    50
Iris-setosa        50
Name: species, dtype: int64

In [7]:
# Change the labels to numerical values

# Change the column to category datatype

iris['species'] = iris['species'].astype('category')
iris.dtypes

sepal_length     float64
sepal_width      float64
petal_length     float64
petal_width      float64
species         category
dtype: object

In [8]:
iris['species_no'] = iris['species'].cat.codes
iris

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_no
0,5.1,3.5,1.4,0.2,Iris-setosa,0
1,4.9,3.0,1.4,0.2,Iris-setosa,0
2,4.7,3.2,1.3,0.2,Iris-setosa,0
3,4.6,3.1,1.5,0.2,Iris-setosa,0
4,5.0,3.6,1.4,0.2,Iris-setosa,0
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica,2
146,6.3,2.5,5.0,1.9,Iris-virginica,2
147,6.5,3.0,5.2,2.0,Iris-virginica,2
148,6.2,3.4,5.4,2.3,Iris-virginica,2


### Final Encoding values

->  Iris-setosa = 0

->  Iris-versicolor =  1

->  Iris-virginica = 2

In [10]:
# Missing values
iris.isna().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
species_no      0
dtype: int64

In [11]:
# Duplicates
iris.duplicated().sum()

# Duplicates might help enforcing predictions

3

# Logistic regression

In [15]:
X = iris[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']].values
y = iris['species_no'].values

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int8)

In [18]:
# Training sets and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None)

In [19]:
logistic_regressor = LogisticRegression()
logistic_regressor.fit(X_train, y_train)

LogisticRegression()

In [20]:
y_pred = logistic_regressor.predict(X_test)

In [21]:
# Comparing predicted values to OG

df_comp = pd.DataFrame({'OG' : y_test, 'Predictions' : y_pred})
df_comp

Unnamed: 0,OG,Predictions
0,1,2
1,0,0
2,2,2
3,0,0
4,1,1
5,2,2
6,1,1
7,2,2
8,0,0
9,2,2


In [24]:
# Model evaluation
from sklearn.metrics import confusion_matrix

confusion = confusion_matrix(y_test, y_pred)
confusion

array([[10,  0,  0],
       [ 0,  9,  1],
       [ 0,  0, 10]])

In our confusion matrix, we are told we have 10, 9 and 10 values that are correct

Only one value is wrong, so our model is reliable