In [1]:
import pandas as pd
import numpy as np
import re

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

## Mushroom

In [2]:
mush = pd.read_csv("ml_day/mushroom/mushrooms.csv")
mush

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


In [3]:
mush.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [4]:
# ? -> NaN
mush["stalk-root"].replace({"?":np.nan}, inplace=True)

In [5]:
mush["stalk-root"].isna().sum()

2480

### 1-1) stalk-root 컬럼 결측치 채우기

In [6]:
notna_list = list(set(mush.keys()) - set(["class","stalk-root"]))
print(notna_list)

['cap-color', 'stalk-shape', 'cap-shape', 'stalk-surface-above-ring', 'stalk-color-below-ring', 'gill-color', 'population', 'spore-print-color', 'gill-size', 'odor', 'gill-spacing', 'ring-number', 'veil-type', 'cap-surface', 'habitat', 'gill-attachment', 'bruises', 'veil-color', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'ring-type']


In [7]:
notna_index = mush[mush["stalk-root"].notna()].index
na_index = mush[mush["stalk-root"].isna()].index

In [8]:
X = pd.get_dummies(mush[notna_list])
y = mush["stalk-root"]

In [9]:
train = X.iloc[notna_index, :]
train_y = y[notna_index]
test = X.iloc[na_index, :]
test_y = y[na_index]

In [10]:
X_train, X_test, y_train, y_test = train_test_split(train, train_y, test_size=0.2, random_state=1001)

In [11]:
model = KNeighborsClassifier()
model.fit(X_train, y_train)
model_pred = model.predict(X_test)

In [12]:
print("모델 정확도: {:.2f}%".format(accuracy_score(model_pred, y_test)*100))

모델 정확도: 100.00%


교차 검증을 해보자

In [13]:
k_list = list(range(3,20,2))
scores=[]

for i in k_list:
    score = cross_val_score(KNeighborsClassifier(n_neighbors=i), X_train, y_train)
    scores.append(score.mean())

In [14]:
scores

[1.0,
 1.0,
 1.0,
 0.9986710963455149,
 0.9984496124031008,
 0.9982281284606866,
 0.9982281284606866,
 0.9982281284606866,
 0.9982281284606866]

k값이 3,5,7일때 100%가 나왔으므로 과적합이 제일 적을 3으로 진행하자

In [15]:
model = KNeighborsClassifier(n_neighbors=3)
model.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=3)

In [16]:
test_pred = model.predict(test)

이제 예측값으로 stalk-root컬럼의 결측값을 채워주자

In [17]:
mush.loc[na_index,"stalk-root"] = test_pred

In [18]:
mush["stalk-root"].isna().sum()

0

### 1-2) class 컬럼을 타겟으로 knn 모델만들기

위에서 원핫인코딩을 진행했기 때문에 y변수에 타겟으로하는 class컬럼만 넣어주고 위에서 나눠뒀던 데이터를 바로 쓰자

In [19]:
y = mush["class"]

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=1001)

k값 변화에따른 교차검증 시행

In [21]:
scores = []
for k in k_list:
    model = KNeighborsClassifier(n_neighbors=k)
    score = cross_val_score(model, X_train, y_train, cv=10, scoring="accuracy")
    scores.append(score.mean())

In [22]:
scores

[1.0,
 1.0,
 1.0,
 0.9998239436619718,
 0.999648196737543,
 0.9994724498131141,
 0.9994724498131141,
 0.9994724498131141,
 0.9994724498131141]

In [23]:
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(X_train,y_train)
pred = knn_model.predict(X_test)

In [26]:
print("정확도: {:.2f}%".format(accuracy_score(pred, y_test)*100))

정확도: 100.00%
