# Read dataset

In [50]:
import os

import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Locate the raw Iris file inside the local data directory.
dir_data = './data/'
raw_data = os.path.join(dir_data, 'iris.data')

# Column headers: the four measurement names followed by the class name.
label = np.array([
    'sepal length in cm',
    'sepal width in cm',
    'petal length in cm',
    'petal width in cm',
    'class',
])

# Parse the comma-separated file, keeping every field as a string.
data = np.genfromtxt(raw_data, dtype=str, delimiter=",")

# Load the parsed rows into a DataFrame

In [83]:
# `arrange_data` is kept as a plain list of rows in case other cells still
# reference it; the DataFrame itself no longer needs the row-by-row copy.
arrange_data = list(data)

# The constructor accepts the parsed array directly and can name the columns
# in the same call, so no append loop or separate `df.columns` assignment is
# required.
df = pd.DataFrame(data, columns=label)

# Preview only the first rows instead of rendering the whole table.
df.head(10)

Unnamed: 0,sepal length in cm,sepal width in cm,petal length in cm,petal width in cm,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
6,4.6,3.4,1.4,0.3,Iris-setosa
7,5.0,3.4,1.5,0.2,Iris-setosa
8,4.4,2.9,1.4,0.2,Iris-setosa
9,4.9,3.1,1.5,0.1,Iris-setosa


# Mean

In [53]:
# Cast the four measurement columns from text to float32 so they can be
# aggregated; the 'class' column is not listed and therefore left as-is.
df = df.astype({
    'sepal length in cm': 'float32',
    'sepal width in cm': 'float32',
    'petal length in cm': 'float32',
    'petal width in cm': 'float32',
})

# Restrict the mean to numeric columns: 'class' holds species names, and
# pandas >= 2.0 raises a TypeError on it instead of silently skipping it.
df.mean(numeric_only=True)

sepal length in cm    5.843335
sepal width in cm     3.054000
petal length in cm    3.758667
petal width in cm     1.198667
dtype: float64

# Median

In [9]:
# Median of the measurement columns only; without numeric_only the string
# 'class' column makes pandas >= 2.0 raise a TypeError.
df.median(numeric_only=True)

sepal length in cm    5.80
sepal width in cm     3.00
petal length in cm    4.35
petal width in cm     1.30
dtype: float64

# Standard deviation

In [6]:
# Standard deviation of the measurement columns only; without numeric_only
# the string 'class' column makes pandas >= 2.0 raise an error.
df.std(numeric_only=True)

sepal length in cm    0.828066
sepal width in cm     0.433594
petal length in cm    1.764420
petal width in cm     0.763161
dtype: float64

# Split training data & testing data

In [162]:
# Features are every column except the last one ('class'); the label is the
# 'class' column itself.
iris_data = df[df.columns[:-1]]
iris_label = df['class']

# Fixed seed so the stratified split, and every score computed from it, is
# identical on each Restart & Run All. Without it the split is re-drawn at
# random on every execution.
SPLIT_SEED = 42
train_data, test_data, train_label, test_label = train_test_split(
    iris_data,
    iris_label,
    test_size=0.33,
    stratify=iris_label,
    random_state=SPLIT_SEED,
)

# Preview only the first rows of the training features.
train_data.head(10)

Unnamed: 0,sepal length in cm,sepal width in cm,petal length in cm,petal width in cm
147,6.5,3.0,5.2,2.0
101,5.8,2.7,5.1,1.9
126,6.2,2.8,4.8,1.8
108,6.7,2.5,5.8,1.8
74,6.4,2.9,4.3,1.3
92,5.8,2.6,4.0,1.2
82,5.8,2.7,3.9,1.2
10,5.4,3.7,1.5,0.2
51,6.4,3.2,4.5,1.5
93,5.0,2.3,3.3,1.0


In [163]:
# Number of neighbours used for voting. 5 is the value scikit-learn applies
# when the argument is omitted; it is spelled out here so it is visible and
# easy to tune.
N_NEIGHBORS = 5

# Fit k-nearest-neighbours on the training split, then predict the held-out
# split.
knn = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)
knn.fit(train_data, train_label)
pred = knn.predict(test_data)

# `confusion_matrix` and `classification_report` are imported in the
# notebook's top import cell rather than here.
# In the confusion matrix, rows are the true classes and columns are the
# predicted classes.
print(confusion_matrix(test_label, pred))
print(classification_report(test_label, pred))

[[17  0  0]
 [ 0 15  2]
 [ 0  1 15]]
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        17
Iris-versicolor       0.94      0.88      0.91        17
 Iris-virginica       0.88      0.94      0.91        16

      micro avg       0.94      0.94      0.94        50
      macro avg       0.94      0.94      0.94        50
   weighted avg       0.94      0.94      0.94        50



# Confusion matrix

In [None]:
  | C1 | C2 | C3
--+---------------
C1| 15 | 0  | 0
C2| 0  | 16 | 1
C3| 1  | 1  | 15