# 4-6 랜덤 포레스트

### 랜덤 포레스트란?

학습 데이터를 무작위로 샘플링하여 다수의 의사결정 트리를 만들고, 각 트리의 예측을 다수결로 종합하여 최종 결과를 정하는 방법입니다. 무작위(random)로 샘플링한 데이터로 만든 다수의 의사결정 트리(forest)를 사용하기 때문에 랜덤 포레스트라고 부릅니다.

주요 Point

- 샘플링할 때 sub-sample 사이즈는 항상 input sample 사이즈와 같음. (bootstrap sample이라고 하며, 중복을 허용하는 복원 추출로 [1,2,3,4,5] -> [2,2,4,3,1]와 같이 샘플링합니다)
- 각 특징(예를 들어 버섯의 색깔, 형태, 냄새 등)들도 sampling 하여 (특징이 N개일 때 보통 sqrt(N)개를 사용한다고 합니다.) 여러 개의 트리를 생성
- 각 트리별 예측을 보고 다수결에 따라 예측값 결정

![](.\1.png)

참고자료
 - scikit-learn : RandomForestClassifier http://scikit-learn.org/stable/modules/ensemble.html

 - 네이버 블로그 : http://sams.epaiai.com/220979751089

### 랜덤 포레스트 사용하기 : 버섯 분류(독버섯과 식용버섯)


- 독버섯 데이터셋 저장 : url로 부터 독버섯 데이터를 불러와 mushroom.csv로 저장합니다.

In [1]:
import urllib.request as req
# Download the UCI mushroom dataset and store it locally as mushroom.csv.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
local = "mushroom.csv"
req.urlretrieve(url, filename=local)
print("ok")

ok


- 랜덤 포레스트를 사용한 독버섯 분류 예제 1

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [56]:
# Load the data --- (※1)
# header=None: the file has no header row, so pandas labels the columns 0..22.
mr = pd.read_csv("mushroom.csv", header=None)

In [57]:
# Dump the raw frame; column 0 holds the class symbol (e / p), columns 1..22 the features.
print('mr = ', mr)

mr =       0  1  2  3  4  5  6  7  8  9  ... 13 14 15 16 17 18 19 20 21 22
0     p  x  s  n  t  p  f  c  n  k ...  s  w  w  p  w  o  p  k  s  u
1     e  x  s  y  t  a  f  c  b  k ...  s  w  w  p  w  o  p  n  n  g
2     e  b  s  w  t  l  f  c  b  n ...  s  w  w  p  w  o  p  n  n  m
3     p  x  y  w  t  p  f  c  n  n ...  s  w  w  p  w  o  p  k  s  u
4     e  x  s  g  f  n  f  w  b  k ...  s  w  w  p  w  o  e  n  a  g
5     e  x  y  y  t  a  f  c  b  n ...  s  w  w  p  w  o  p  k  n  g
6     e  b  s  w  t  a  f  c  b  g ...  s  w  w  p  w  o  p  k  n  m
7     e  b  y  w  t  l  f  c  b  n ...  s  w  w  p  w  o  p  n  s  m
8     p  x  y  w  t  p  f  c  n  p ...  s  w  w  p  w  o  p  k  v  g
9     e  b  s  y  t  a  f  c  b  g ...  s  w  w  p  w  o  p  k  s  m
10    e  x  y  y  t  l  f  c  b  g ...  s  w  w  p  w  o  p  n  n  g
11    e  x  y  y  t  a  f  c  b  n ...  s  w  w  p  w  o  p  k  s  m
12    e  b  s  y  t  a  f  c  b  w ...  s  w  w  p  w  o  p  n  s  g
13    p  x  y  w  t  p  f  c

In [5]:
# Convert the symbols in the data to numbers --- (※2)
# Column 0 is kept as the class label; every other column becomes a character code.
label, data = [], []
attr_list = []
for _, record in mr.iterrows():
    label.append(record.loc[0])
    # ord() returns the code point of a character: a ~ z -> 97 ~ 122, ? -> 63
    data.append([ord(symbol) for symbol in record.iloc[1:]])

In [42]:
# pandas 활용해서 값 변환하기1

In [28]:
# Work on an independent copy. A plain `mr_edit = mr` only binds a second name to the
# same DataFrame, so the in-place ord() conversion applied to mr_edit further down
# would silently overwrite `mr` as well.
mr_edit = mr.copy()

In [33]:
# Preview the first rows (head() shows five by default).
mr_edit.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [20]:
mr_edit.iloc[:, 0].apply(ord)   # test: character codes of the first column only (result is not stored)

0       112
1       101
2       101
3       112
4       101
5       101
6       101
7       101
8       112
9       101
10      101
11      101
12      101
13      112
14      101
15      101
16      101
17      112
18      112
19      112
20      101
21      112
22      101
23      101
24      101
25      112
26      101
27      101
28      101
29      101
       ... 
8094    101
8095    112
8096    101
8097    112
8098    112
8099    101
8100    101
8101    112
8102    101
8103    101
8104    101
8105    101
8106    101
8107    101
8108    112
8109    101
8110    101
8111    101
8112    101
8113    112
8114    112
8115    101
8116    112
8117    112
8118    112
8119    101
8120    101
8121    101
8122    112
8123    101
Name: 0, dtype: int64

In [37]:
# Number of columns: 1 class column + 22 feature columns.
len(mr_edit.columns)

23

In [39]:
# List every column label of mr_edit, one per line
# (iterating a DataFrame yields its column labels).
for column in mr_edit:
    print(column)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22


In [40]:
# Replace every symbol in mr_edit with its character code, one column at a time.
# Not re-runnable: a second pass would call ord() on integers and raise TypeError.
for column in mr_edit.columns:
    mr_edit[column] = mr_edit[column].apply(ord)
    

In [41]:
# Preview the converted frame: every cell is now an integer character code.
mr_edit.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,112,120,115,110,116,112,102,99,110,107,...,115,119,119,112,119,111,112,107,115,117
1,101,120,115,121,116,97,102,99,98,107,...,115,119,119,112,119,111,112,110,110,103
2,101,98,115,119,116,108,102,99,98,110,...,115,119,119,112,119,111,112,110,110,109
3,112,120,121,119,116,112,102,99,110,110,...,115,119,119,112,119,111,112,107,115,117
4,101,120,115,103,102,110,102,119,98,107,...,115,119,119,112,119,111,101,110,97,103


### 추가) pandas 활용하여 데이터 전환하기

In [43]:
# pandas 활용해서 값 변환하기2   pd.applymap / pd.apply

In [64]:
# DataFrame.applymap(func)  ==> applies func to every single element of the frame
# and returns a new DataFrame (mr itself is left unchanged).
# (pandas 2.1+ renames this method to DataFrame.map; applymap is kept for older versions.)

mr_edit2 = mr.applymap(ord)

In [65]:
# Preview the element-wise converted frame.
mr_edit2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,112,120,115,110,116,112,102,99,110,107,...,115,119,119,112,119,111,112,107,115,117
1,101,120,115,121,116,97,102,99,98,107,...,115,119,119,112,119,111,112,110,110,103
2,101,98,115,119,116,108,102,99,98,110,...,115,119,119,112,119,111,112,110,110,109
3,112,120,121,119,116,112,102,99,110,110,...,115,119,119,112,119,111,112,107,115,117
4,101,120,115,103,102,110,102,119,98,107,...,115,119,119,112,119,111,101,110,97,103


In [62]:
# Series.apply(func) ==> applies func to each element of a single column, so the
# frame is rebuilt column by column here.
# cf) DataFrame.apply(func, axis=1) hands func one whole row at a time instead.

mr_edit3 = pd.DataFrame()
for column in mr:
    mr_edit3[column] = mr[column].apply(ord)

In [63]:
# Preview the column-by-column converted frame.
mr_edit3.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,112,120,115,110,116,112,102,99,110,107,...,115,119,119,112,119,111,112,107,115,117
1,101,120,115,121,116,97,102,99,98,107,...,115,119,119,112,119,111,112,110,110,103
2,101,98,115,119,116,108,102,99,98,110,...,115,119,119,112,119,111,112,110,110,109
3,112,120,121,119,116,112,102,99,110,110,...,115,119,119,112,119,111,112,107,115,117
4,101,120,115,103,102,110,102,119,98,107,...,115,119,119,112,119,111,101,110,97,103


In [6]:
# Show the class labels collected from column 0 ('e' / 'p', one per row).
print('label = ')
print(label)

label = 
['p', 'e', 'e', 'p', 'e', 'e', 'e', 'e', 'p', 'e', 'e', 'e', 'e', 'p', 'e', 'e', 'e', 'p', 'p', 'p', 'e', 'p', 'e', 'e', 'e', 'p', 'e', 'e', 'e', 'e', 'e', 'p', 'e', 'e', 'e', 'e', 'e', 'p', 'e', 'e', 'e', 'e', 'e', 'p', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'p', 'p', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'p', 'e', 'e', 'p', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'p', 'e', 'e', 'e', 'e', 'e', 'p', 'e', 'p', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'p', 'e', 'e', 'p', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'p', 'e', 'e', 'e', 'e', 'p', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 

In [51]:
# Show the encoded features: one list of 22 character codes per row.
print('data = ')
print(data)

data = 
[[120, 115, 110, 116, 112, 102, 99, 110, 107, 101, 101, 115, 115, 119, 119, 112, 119, 111, 112, 107, 115, 117], [120, 115, 121, 116, 97, 102, 99, 98, 107, 101, 99, 115, 115, 119, 119, 112, 119, 111, 112, 110, 110, 103], [98, 115, 119, 116, 108, 102, 99, 98, 110, 101, 99, 115, 115, 119, 119, 112, 119, 111, 112, 110, 110, 109], [120, 121, 119, 116, 112, 102, 99, 110, 110, 101, 101, 115, 115, 119, 119, 112, 119, 111, 112, 107, 115, 117], [120, 115, 103, 102, 110, 102, 119, 98, 107, 116, 101, 115, 115, 119, 119, 112, 119, 111, 101, 110, 97, 103], [120, 121, 121, 116, 97, 102, 99, 98, 110, 101, 99, 115, 115, 119, 119, 112, 119, 111, 112, 107, 110, 103], [98, 115, 119, 116, 97, 102, 99, 98, 103, 101, 99, 115, 115, 119, 119, 112, 119, 111, 112, 107, 110, 109], [98, 121, 119, 116, 108, 102, 99, 98, 110, 101, 99, 115, 115, 119, 119, 112, 119, 111, 112, 110, 115, 109], [120, 121, 119, 116, 112, 102, 99, 110, 112, 101, 101, 115, 115, 119, 119, 112, 119, 111, 112, 107, 118, 103], [98, 115,

In [52]:
# Split into a training set and a test set --- (※3)
# No test_size is given, so train_test_split uses its default 3:1 train/test ratio.
(data_train, data_test,
 label_train, label_test) = train_test_split(data, label)

In [53]:
# Report how many rows ended up in each part of the split.
print('data_train의 수 : ', len(data_train))
print('data_test의 수 : ', len(data_test))

data_train의 수 :  6093
data_test의 수 :  2031


In [54]:
# Train the model --- (※4)
# All hyper-parameters are left at their defaults; as the last expression of the
# cell, fit() returns the estimator, so the notebook echoes its settings.
clf = RandomForestClassifier()
clf.fit(data_train, label_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

### RandomForestClassifier class
#### class sklearn.ensemble.RandomForestClassifier(n_estimators=10, criterion=’gini’, max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=’auto’, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=1, random_state=None, verbose=0, warm_start=False, class_weight=None)


- max_features : int, float, string or None, optional (default=”auto”) , If “auto”, then max_features=sqrt(n_features).
- bootstrap : boolean, optional (default=True), Whether bootstrap samples are used when building trees.

In [55]:
# Predict the class of every test row --- (※5)
predict = clf.predict(data_test)
# Evaluate the predictions --- (※6)
# accuracy_score: fraction of test rows predicted correctly.
ac_score = metrics.accuracy_score(label_test, predict)
# classification_report: per-class precision / recall / f1-score / support table.
cl_report = metrics.classification_report(label_test, predict)
print("정답률 =", ac_score)
print("리포트 =\n", cl_report)

정답률 = 1.0
리포트 =
              precision    recall  f1-score   support

          e       1.00      1.00      1.00      1048
          p       1.00      1.00      1.00       983

avg / total       1.00      1.00      1.00      2031



## 데이터를 숫자로 변경할 때 주의할 사항

버섯의 색을
빨강 = 1, 파랑 = 2, 초록 = 3, 흰색 = 4 와 같이 숫자를 할당하였을 때, 각각의 값 사이에 크기나 순서 관계가 없다는 것을 나타내기 위해서는

트레이닝(FIT)전에 아래와 같이 데이터를 변경해주어야 합니다.

빨강 1 0 0 0
파랑 0 1 0 0
초록 0 0 1 0
흰색 0 0 0 1

이것을 "one hot" encoding 이라고 합니다. [변경된 표현방식을 보면 하나의 요소만 1로 표현('hot')되어 있음]
대부분의 머신러닝에서 흔히 쓰이는 방식입니다. 단, scikit-learn의 fit 함수가 one hot encoding을 자동으로 해주는 것은 아닙니다. 독버섯 분류 예제 1에서는 각 특징을 one hot encoding 하지 않고 ord()로 얻은 숫자를 그대로 학습에 사용했으며, 아래 예제 2에서는 one hot encoding을 직접 수행합니다.

- 랜덤 포레스트를 사용한 독버섯 분류 예제 2 : one hot encoding 직접 해보기

In [66]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [67]:
# Load the data (no header row in the file, hence header=None).
mr = pd.read_csv("mushroom.csv", header=None)

In [68]:
# Expand every categorical feature into a fixed-width one-hot vector.
#
# Each of the 22 feature columns becomes ONE_HOT_WIDTH slots, so one row turns
# into 22 * 12 = 264 values. A symbol's slot is the order in which it was first
# seen in that column.
ONE_HOT_WIDTH = 12

label = []
data = []
# One bookkeeping entry per feature column, created up front so that it does not
# depend on the first row carrying the index label 0:
#   "dic": symbol -> slot index, "cnt": number of slots handed out so far.
attr_list = [{"dic": {}, "cnt": 0} for _ in range(mr.shape[1] - 1)]
for row_index, row in mr.iterrows():
    label.append(row.loc[0])
    exdata = []
    for col, v in enumerate(row.iloc[1:]):   # enumerate yields (position, value) pairs
        attr = attr_list[col]
        idx = attr["dic"].get(v)
        if idx is None:
            # First occurrence of this symbol in this column: assign the next free slot.
            idx = attr["cnt"]
            if idx >= ONE_HOT_WIDTH:
                raise ValueError(
                    "feature column %d has more than %d distinct symbols "
                    "(row %r); increase ONE_HOT_WIDTH" % (col + 1, ONE_HOT_WIDTH, row_index)
                )
            attr["dic"][v] = idx
            attr["cnt"] += 1
        # One-hot vector for this feature: all zeros except the symbol's slot.
        d = [0] * ONE_HOT_WIDTH
        d[idx] = 1
        exdata += d
    data.append(exdata)

In [69]:
# Summarise the encoded data set.
# NOTE(review): 'labe' in the first message looks like a typo for 'label'.
print('labe = ', label)
print('data 수 = ', len(data))
# NOTE(review): the caption says "first feature", but len(data[0]) is the length of
# the whole one-hot vector of the first row (all features concatenated).
print('data 중 첫번째 특징 = ', len(data[0]))
print(data[0])

labe =  ['p', 'e', 'e', 'p', 'e', 'e', 'e', 'e', 'p', 'e', 'e', 'e', 'e', 'p', 'e', 'e', 'e', 'p', 'p', 'p', 'e', 'p', 'e', 'e', 'e', 'p', 'e', 'e', 'e', 'e', 'e', 'p', 'e', 'e', 'e', 'e', 'e', 'p', 'e', 'e', 'e', 'e', 'e', 'p', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'p', 'p', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'p', 'e', 'e', 'p', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'p', 'e', 'e', 'e', 'e', 'e', 'p', 'e', 'p', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'p', 'e', 'e', 'p', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'p', 'e', 'e', 'e', 'e', 'p', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', '

In [70]:
# Split into a training set and a test set.
# No test_size is given, so train_test_split uses its default 3:1 train/test ratio.
(data_train, data_test,
 label_train, label_test) = train_test_split(data, label)

In [34]:
# Train the model on the one-hot encoded training rows (default hyper-parameters).
clf = RandomForestClassifier()
clf.fit(data_train, label_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [35]:
# Predict the class of every test row.
predict = clf.predict(data_test)
# Evaluate: accuracy_score is the fraction of test rows predicted correctly.
ac_score = metrics.accuracy_score(label_test, predict)
print("정답률 =", ac_score)

정답률 = 1.0


### 추가) get_dummies

In [75]:
# One-hot encode every column of mr in a single call; new columns are named
# "<column>_<symbol>".
# NOTE(review): column 0 (the e/p class label) is expanded as well (0_e / 0_p),
# so those columns must be removed before using this frame as training features.
mr_edit_dummies = pd.get_dummies(mr)

In [76]:
# Display the one-hot encoded frame.
mr_edit_dummies

Unnamed: 0,0_e,0_p,1_b,1_c,1_f,1_k,1_s,1_x,2_f,2_g,...,21_s,21_v,21_y,22_d,22_g,22_l,22_m,22_p,22_u,22_w
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
7,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
8,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
