## iris data를 knn classifier로 분류하기
1. 예측력(정확도)이 제일 좋은 n_neighbors 찾기
2. SepalLength = 5, SepalWidth = 2.9, PetalLength = 1, PetalWidth = 0.2 인 것의 품종은?


---

### 데이터 불러오기

In [98]:
import pandas as pd

In [99]:
# Load the iris dataset from a CSV file addressed relative to this notebook.
iris = pd.read_csv("../Data/iris.csv")

# Show the first rows as a quick check of the columns and values.
iris.head()

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,Name
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [100]:
# Print the column names, non-null counts and dtypes of the frame.
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   SepalLength  150 non-null    float64
 1   SepalWidth   150 non-null    float64
 2   PetalLength  150 non-null    float64
 3   PetalWidth   150 non-null    float64
 4   Name         150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [101]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
iris.describe()


Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


### Feature와 Target 구분


In [102]:
# Features: the first four columns (sepal/petal measurements).
# Target: the species label stored in the 'Name' column.
iris_data = iris.iloc[:, 0:4]
iris_target = iris['Name']

# Preview both parts and confirm that their row counts match.
print(iris_data.head())
print('--------------------------------')
print(iris_target.head())
print('--------------------------------')
print(iris_data.shape)
print('--------------------------------')
print(iris_target.shape)


   SepalLength  SepalWidth  PetalLength  PetalWidth
0          5.1         3.5          1.4         0.2
1          4.9         3.0          1.4         0.2
2          4.7         3.2          1.3         0.2
3          4.6         3.1          1.5         0.2
4          5.0         3.6          1.4         0.2
--------------------------------
0    Iris-setosa
1    Iris-setosa
2    Iris-setosa
3    Iris-setosa
4    Iris-setosa
Name: Name, dtype: object
--------------------------------
(150, 4)
--------------------------------
(150,)


### train과 test 데이터 구분


In [103]:
from sklearn.model_selection import train_test_split

In [104]:
# Stratified split: every species keeps the same proportion in the train and
# test parts. No test_size is passed, so the library default applies, and the
# seed is fixed so the split is reproducible.
train_input, test_input, train_target, test_target = train_test_split(
    iris_data, iris_target, stratify=iris_target, random_state=42
)

# Report the shape of each part, separated by a dashed rule.
print(
    train_input.shape,
    '--------------------------------',
    test_input.shape,
    '--------------------------------',
    train_target.shape,
    '--------------------------------',
    test_target.shape,
    sep='\n',
)


(112, 4)
--------------------------------
(38, 4)
--------------------------------
(112,)
--------------------------------
(38,)


### 데이터 전처리: train data 기준으로 표준화(StandardScaler)


In [105]:
from sklearn.preprocessing import StandardScaler


In [106]:
# Create the scaler used to standardize the feature columns.
scaler = StandardScaler()

In [107]:
# Learn the scaling statistics from the training split only, then apply that
# same transformation to both splits so nothing from the test data leaks
# into the preprocessing.
scaler.fit(train_input)
train_scaled = scaler.transform(train_input)
test_scaled = scaler.transform(test_input)

# Preview the first five standardized rows of each split.
print(
    train_scaled[:5],
    '--------------------------------',
    test_scaled[:5],
    sep='\n',
)




[[ 1.79213839 -0.59619012  1.31568284  0.92066139]
 [ 2.14531053 -0.59619012  1.65378566  1.05085593]
 [-0.4446185  -1.50777093 -0.03672843 -0.25108947]
 [ 0.26172578 -0.59619012  0.13232298  0.13949415]
 [-0.4446185  -1.27987572  0.13232298  0.13949415]]
--------------------------------
[[-1.73958301  0.31539068 -1.38913971 -1.29264579]
 [ 0.49717388  0.54328588  0.52677627  0.53007777]
 [-0.2091704  -0.59619012  0.18867345  0.13949415]
 [-0.4446185  -1.73566613  0.13232298  0.13949415]
 [-1.73958301 -0.14039972 -1.38913971 -1.29264579]]


### ML 실행

In [108]:
from sklearn.neighbors import KNeighborsClassifier

In [109]:
# Baseline k-nearest-neighbours classifier using 9 neighbours.
kn = KNeighborsClassifier(n_neighbors=9)

In [110]:
# Train the baseline model on the standardized training features.
kn.fit(train_scaled, train_target)

### 평가

In [111]:
# Accuracy of the baseline model on the standardized test split.
kn.score(test_scaled, test_target)

0.9736842105263158

### 최적의 Hyper Parameter 찾기

In [112]:
# Search the odd values of n_neighbors (1, 3, ..., 75) and keep the best model.
#
# Each candidate is a pipeline of StandardScaler + KNeighborsClassifier:
#   * the scaler is fitted on the training split only, so every candidate is
#     trained and scored on standardized features (k-NN is distance based,
#     so unscaled columns with larger ranges would dominate the vote);
#   * the pipeline scales its input internally, so the resulting model takes
#     raw measurements directly.
# After the loop `kn` is bound to the best-scoring candidate, not to the last
# one tried.
# NOTE(review): the selection score is the test-set accuracy, so the reported
# best score is optimistic; cross-validation on the training split would be
# the stricter way to choose n_neighbors.
from sklearn.pipeline import make_pipeline

best_score = 0
best_n_neighbors = 0
best_model = None

for i in range(1, 75 + 1, 2):
    candidate = make_pipeline(
        StandardScaler(),
        KNeighborsClassifier(n_neighbors=i),
    )
    candidate.fit(train_input, train_target)
    score = candidate.score(test_input, test_target)
    # Strict '>' keeps the smallest n_neighbors among tied scores.
    if score > best_score:
        best_score = score
        best_n_neighbors = i
        best_model = candidate

# Expose the winning model under the name the following cells use.
if best_model is not None:
    kn = best_model

print(f"Best n_neighbors: {best_n_neighbors} with score: {best_score}")

Best n_neighbors: 3 with score: 0.9736842105263158


---

### SepalLength = 5, SepalWidth = 2.9, PetalLength = 1, PetalWidth = 0.2 인 것의 품종은?

In [114]:
import numpy as np

In [115]:
# Wrap the measurements in a one-row DataFrame that carries the training
# column names, so values are matched to features by name and scikit-learn
# does not warn about missing feature names.
sample = pd.DataFrame([[5, 2.9, 1, 0.2]], columns=iris_data.columns)
kn.predict(sample)



array(['Iris-setosa'], dtype=object)