In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv(r'S:\L2S\CODE\iris_dataset\iris.csv')
print(df.head())
print(df['species'].unique())

   sepal_length  sepal_width  petal_length  petal_width      species
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa
['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']


In [4]:
from sklearn.model_selection import train_test_split

target = 'species'
X = df.drop(target, axis=1)
y = df[target]

X_test, X_train, y_test, y_train = train_test_split(X, y, test_size=0.2)

**Model: K-Nearest Neighbor**

In [5]:
from sklearn.neighbors import KNeighborsClassifier

# define
model_knn = KNeighborsClassifier(n_neighbors=3, metric='manhattan')

# fit/train
model_knn.fit(X_train, y_train)

# predict / evaluate
model_knn.score(X_test, y_test)

0.9166666666666666

In [6]:
# cross validation
from sklearn.model_selection import GridSearchCV

# define
model_knn2 = KNeighborsClassifier()
params = {'n_neighbors': range(1,9),
          'metric': ['euclidean', 'minkowski', 'manhattan']}
grid = GridSearchCV(model_knn2, param_grid = params, cv=7)

# fit/train
grid.fit(X_train, y_train)

# score
print(grid.score(X_test, y_test))

# find the best parameter
print(grid.best_params_)
print(grid.best_estimator_)

0.9
{'metric': 'euclidean', 'n_neighbors': 3}
KNeighborsClassifier(metric='euclidean', n_neighbors=3)


In [7]:
pd.DataFrame(grid.cv_results_).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_metric,param_n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002287,0.003617,0.003428,0.003959,euclidean,1,"{'metric': 'euclidean', 'n_neighbors': 1}",1.0,0.8,1.0,0.75,1.0,1.0,0.75,0.9,0.116496,20
1,0.001143,0.0028,0.002285,0.003614,euclidean,2,"{'metric': 'euclidean', 'n_neighbors': 2}",1.0,0.8,1.0,0.75,1.0,0.75,1.0,0.9,0.116496,20
2,0.0,0.0,0.002286,0.003614,euclidean,3,"{'metric': 'euclidean', 'n_neighbors': 3}",1.0,0.8,1.0,0.75,1.0,1.0,1.0,0.935714,0.102519,1
3,0.0,0.0,0.002217,0.003508,euclidean,4,"{'metric': 'euclidean', 'n_neighbors': 4}",1.0,0.8,1.0,0.75,1.0,1.0,1.0,0.935714,0.102519,1
4,0.001223,0.002787,0.002487,0.003951,euclidean,5,"{'metric': 'euclidean', 'n_neighbors': 5}",1.0,0.8,1.0,0.75,1.0,1.0,1.0,0.935714,0.102519,1


**Model: Support Vector Machine**

In [8]:
from sklearn.svm import SVC

# define
model_svm = SVC(kernel='rbf', decision_function_shape='ovo')

# fit/train
model_svm.fit(X_train, y_train)

# predict
model_svm.score(X_test, y_test)

0.8833333333333333

In [9]:
# grid search

grid_svm = SVC()
params2 = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'decision_function_shape': ['ovo', 'ovr']
}
grid_svm = GridSearchCV(grid_svm, param_grid=params2, cv=10)

# fit/train
grid_svm.fit(X_train, y_train)

# predict
print(grid_svm.score(X_test, y_test))

# the best parameter
print(grid_svm.best_params_)

0.9333333333333333
{'decision_function_shape': 'ovo', 'kernel': 'linear'}




**Model: Neural Network**

In [28]:
from sklearn.neural_network import MLPClassifier

# define
model_mlp = MLPClassifier(hidden_layer_sizes=(50, 25, 10), activation = 'tanh', max_iter = 10000)

# fit/train
model_mlp.fit(X_train, y_train)

# score
model_mlp.score(X_test, y_test)

0.9333333333333333

In [22]:
# predict
pd.DataFrame(model_mlp.predict(X_test)).head()

Unnamed: 0,0
0,Iris-setosa
1,Iris-setosa
2,Iris-setosa
3,Iris-setosa
4,Iris-versicolor


In [23]:
# predict probability (proba)
pd.DataFrame(model_mlp.predict_proba(X_test)).head()

Unnamed: 0,0,1,2
0,0.98181,0.017725,0.000465
1,0.982877,0.016675,0.000448
2,0.982233,0.017307,0.000459
3,0.981661,0.017866,0.000474
4,0.007749,0.988825,0.003425


**Evaluation Metrics**

In [29]:
y_svm = model_svm.predict(X_test)
y_knn = model_knn.predict(X_test)
y_mlp = model_mlp.predict(X_test)

In [30]:
# confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_svm)

array([[43,  0,  0],
       [ 0, 36,  1],
       [ 0, 13, 27]], dtype=int64)

In [31]:
confusion_matrix(y_test, y_knn)

array([[43,  0,  0],
       [ 0, 36,  1],
       [ 0,  9, 31]], dtype=int64)

In [32]:
confusion_matrix(y_test, y_mlp)

array([[43,  0,  0],
       [ 0, 36,  1],
       [ 0,  7, 33]], dtype=int64)

In [48]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print(accuracy_score(y_test, y_svm))
print(precision_score(y_test, y_svm, average=None))
print(recall_score(y_test, y_svm, average=None))
print(f1_score(y_test, y_svm, average=None))

0.8833333333333333
[1.         0.73469388 0.96428571]
[1.         0.97297297 0.675     ]
[1.         0.8372093  0.79411765]


In [49]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_knn))

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        43
Iris-versicolor       0.80      0.97      0.88        37
 Iris-virginica       0.97      0.78      0.86        40

       accuracy                           0.92       120
      macro avg       0.92      0.92      0.91       120
   weighted avg       0.93      0.92      0.92       120

