## 목표

- 버섯의 특징(특성)을 활용해서 독/식용 버섯 분류
- Decision Tree 모델 시각화 & 과대적합 제어
- 특성 중요도를 뽑아보자 : 특성 선택할 수 있는 방법 중 하나

In [1]:
# 판다스 소환
import pandas as pd
# 랜덤 샘플링 (7:3 나누기)
from sklearn.model_selection import train_test_split
# tree model
from sklearn.tree import DecisionTreeClassifier 

1. 데이터 로딩
2. 전체 컬럼, 행 크기 파악
3. 결측치 확인
4. 기술통계 -> 범주형 데이터 : 개수, 최빈값 등등..
5. 문제와 답 X,y
6. 정답 label안에 독버섯과 식용 버섯의 비율 확인
7. 범주형 -> 수치화(<원핫인코딩>, 레이블 인코딩)
8. train, test 분리
9. 모델링

###  데이터 로드

In [2]:
# Load the mushroom dataset from the local CSV file.
data = pd.read_csv('data/mushroom.csv')
# Preview the first 15 rows (head() with no argument would show 5).
data.head(15)

Unnamed: 0,poisonous,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
5,e,x,y,y,t,a,f,c,b,n,...,s,w,w,p,w,o,p,k,n,g
6,e,b,s,w,t,a,f,c,b,g,...,s,w,w,p,w,o,p,k,n,m
7,e,b,y,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,s,m
8,p,x,y,w,t,p,f,c,n,p,...,s,w,w,p,w,o,p,k,v,g
9,e,b,s,y,t,a,f,c,b,g,...,s,w,w,p,w,o,p,k,s,m


## 데이터 크기, 속성 확인

In [3]:
# Print the row count, column names, non-null counts and dtypes.
data.info()
# Every column listed in the output reports 8124 non-null values out of
# 8124 rows, so no missing-value imputation is needed.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   poisonous                 8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [4]:
# Summary statistics of the dataset.
data.describe()
# For these categorical (object) columns the summary rows are:
# count, unique (number of distinct values), top (the most frequently
# observed value, i.e. the mode) and freq (how often the mode occurs).


Unnamed: 0,poisonous,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,...,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,6,4,10,2,9,2,2,2,12,...,4,9,9,1,4,3,5,9,6,7
top,e,x,y,n,f,n,f,c,b,b,...,s,w,w,p,w,o,p,w,v,d
freq,4208,3656,3244,2284,4748,3528,7914,6812,5612,1728,...,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148


# X, y 문제, 답 분리

In [5]:
# Feature matrix: every column except the target label.
# The label is dropped by name rather than by position so that X stays
# correct even if the column order of the CSV changes, and so that the
# selection mirrors how y is picked (by the 'poisonous' column name).
X = data.drop(columns='poisonous')
X

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,x,s,n,t,p,f,c,n,k,e,...,s,w,w,p,w,o,p,k,s,u
1,x,s,y,t,a,f,c,b,k,e,...,s,w,w,p,w,o,p,n,n,g
2,b,s,w,t,l,f,c,b,n,e,...,s,w,w,p,w,o,p,n,n,m
3,x,y,w,t,p,f,c,n,n,e,...,s,w,w,p,w,o,p,k,s,u
4,x,s,g,f,n,f,w,b,k,t,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,k,s,n,f,n,a,c,b,y,e,...,s,o,o,p,o,o,p,b,c,l
8120,x,s,n,f,n,a,c,b,y,e,...,s,o,o,p,n,o,p,b,v,l
8121,f,s,n,f,n,a,c,b,n,e,...,s,o,o,p,o,o,p,b,c,l
8122,k,y,n,f,y,f,c,n,b,t,...,k,w,w,p,w,o,e,w,v,l


In [6]:
# Target vector: the 'poisonous' label column, selected by name.
y = data['poisonous']
y

0       p
1       e
2       e
3       p
4       e
       ..
8119    e
8120    e
8121    e
8122    p
8123    e
Name: poisonous, Length: 8124, dtype: object

In [7]:
# Show the dimensions of the features and of the labels, one per line.
print(X.shape, y.shape, sep='\n')

(8124, 22)
(8124,)


In [8]:
# Check the class balance of the target label.
# value_counts() returns the number of rows for each distinct label.
y.value_counts()
# e = edible, p = poisonous

e    4208
p    3916
Name: poisonous, dtype: int64

###  원핫 인코딩 : 범주형(이산형) 특성을 수치화

- 내가 가진 카테고리 값에 비례하여 컬럼이 증가한다.

In [9]:
# One-hot encode every categorical feature into 0/1 indicator columns.
# dtype is pinned to uint8 (the pre-2.0 pandas default) because the default
# became bool in pandas 2.0; without it the encoded frame would hold
# True/False instead of 0/1 on newer pandas versions.
X_one_hot = pd.get_dummies(X, dtype='uint8')
X_one_hot.head()

Unnamed: 0,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,cap-surface_s,cap-surface_y,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,0,0,0,0,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,1,0,0,0,1,...,1,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0


In [10]:
# Label encoding example: list the distinct category codes of 'habitat'.
X['habitat'].unique()

array(['u', 'g', 'm', 'd', 'p', 'w', 'l'], dtype=object)

In [11]:
# Hand-written label-encoding table: habitat code -> integer label.
habitat_dic = dict(u=2, g=1, m=3, d=4, p=5, w=7, l=6)

In [12]:
# Replace each habitat code with its integer label from habitat_dic.
# NOTE(review): the mapped Series is only displayed here; it is not
# assigned back to X.
X['habitat'].map(habitat_dic)
# Label encoding is the choice when a sequential ranking is needed.

0       2
1       1
2       3
3       2
4       1
       ..
8119    6
8120    6
8121    6
8122    6
8123    6
Name: habitat, Length: 8124, dtype: int64

- 원핫 인코딩
순서가 없을 때 (예, 국가명, 거리계산) 그리고 고유값의 개수가 많지 않으면 효율적 (할당되는 컬럼이 많아지면 공간을 많이 차지해서 비효율적임)

- 레이블 인코딩
순서의 의미가 있을 때 (예, 유치원 -> 초등학교 -> 중학교 -> 고등학교 -> 대학교 / 평사원 -> 대리 -> 과장 -> 팀장)
고유값의 개수가 많을 때?

##  모델링

X_train, X_test, y_train, y_test

In [13]:
# 7:3, random_state : 65

In [14]:
# Hold out 30% of the one-hot encoded rows for testing; the fixed seed
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_one_hot,
    y,
    test_size=0.3,
    random_state=65,
)

In [15]:
# Shapes of the train features/labels and test features/labels, one per line.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')

(5686, 117)
(5686,)
(2438, 117)
(2438,)


### 모델 객체 생성, 학습, 예측, 정확도

In [16]:
# Decision tree limited to depth 5.
# random_state is fixed because scikit-learn permutes the features at every
# split, so ties between equally good splits can otherwise yield a different
# tree (and different feature importances) on each run. 65 matches the seed
# used for the train/test split.
tree_model = DecisionTreeClassifier(max_depth=5, random_state=65)

In [17]:
 tree_model.fit(X_train, y_train) # 학습하기

DecisionTreeClassifier(max_depth=5)

In [18]:
# Predict labels for the held-out test set.
pre =tree_model.predict(X_test)

In [19]:
from sklearn import metrics


In [20]:
# Test accuracy: fraction of test predictions that match the true labels.
metrics.accuracy_score(y_test, pre)

0.9995898277276456

In [21]:
# Also compute the training accuracy for comparison.
# NOTE(review): this rebinds `pre`, which previously held the test-set
# predictions; re-running the test-accuracy cell afterwards would compare
# y_test against training predictions.
pre = tree_model.predict(X_train)

In [22]:
# Training accuracy (`pre` holds the training-set predictions when the
# cells are run in order).
metrics.accuracy_score(y_train, pre)

0.9996482588814632

In [23]:
# 매개변수의 값을 조정해볼 필요가 있다.
# 미지의 데이터가 들어와도 활용 가능한지가 중요하니까.

In [24]:
# 교차검증 cross_val_score -> titanic

## 과대적합제어
과대적합일 가능성이 높은 경우, 그 모델은 일반화되지 않은 모델이라고 판단한다.
과대적합 제어는 이런 모델을 일반화시켜 주기 위해 사용한다.

In [25]:
# Second tree restricted to a single split (max_depth=1) to observe how
# limiting depth changes accuracy.
# random_state is fixed so that ties between equally good splits are
# resolved the same way on every run; 65 matches the train/test split seed.
tree_model2 = DecisionTreeClassifier(max_depth=1, random_state=65)

In [26]:
# Train the second tree on the training split.
tree_model2.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=1)

In [27]:
# Predict test-set labels with the second tree.
pre2 =tree_model2.predict(X_test)

In [28]:
# Test accuracy of the second tree.
metrics.accuracy_score(y_test, pre2)

0.8859721082854799

In [29]:
# Predict on the training set to check the training accuracy.
pre_train = tree_model2.predict(X_train)

In [30]:
# Training accuracy of the second tree.
# accuracy_score expects (y_true, y_pred); the arguments are passed in that
# order, matching the other accuracy calls in this notebook.
metrics.accuracy_score(y_train, pre_train)

0.887091100949701

In [31]:
# The same accuracy is available without importing metrics:
# the model has a built-in score method.
# model.score(test features, true labels)
tree_model2.score(X_test,y_test)

0.8859721082854799

## 특성선택
- tree 모델의 특성 중요도

In [32]:
# Display the training feature matrix (the one-hot encoded columns).
X_train

Unnamed: 0,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,cap-surface_s,cap-surface_y,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
478,0,0,0,0,0,1,0,0,1,0,...,0,1,0,1,0,0,0,0,0,0
3991,0,0,0,0,0,1,0,0,0,1,...,0,1,0,1,0,0,0,0,0,0
7947,0,0,0,1,0,0,0,0,1,0,...,0,1,0,0,0,1,0,0,0,0
2202,0,0,1,0,0,0,1,0,0,0,...,0,1,0,1,0,0,0,0,0,0
3215,0,0,1,0,0,0,1,0,0,0,...,0,1,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2773,0,0,0,0,0,1,0,0,0,1,...,0,1,0,1,0,0,0,0,0,0
296,0,0,0,0,0,1,0,0,1,0,...,1,0,0,0,0,0,1,0,0,0
575,0,0,0,0,0,1,0,0,1,0,...,1,0,0,0,1,0,0,0,0,0
2165,0,0,0,0,0,1,1,0,0,0,...,0,0,1,1,0,0,0,0,0,0


In [33]:
# Per-feature importance scores learned by the first tree (tree_model).
fi = tree_model.feature_importances_

In [34]:
# Label each importance score with its feature name, then list the
# features from most to least important.
import_df = pd.DataFrame(data=fi, index=X_train.columns)
import_df.sort_values(0, ascending=False)

Unnamed: 0,0
odor_n,0.618496
stalk-root_c,0.179217
stalk-root_r,0.082643
spore-print-color_r,0.033528
odor_l,0.023113
...,...
gill-color_e,0.000000
gill-color_b,0.000000
gill-size_b,0.000000
gill-spacing_w,0.000000
