### RandomForest
- 집단 학습을 기반으로 고정밀 분류, 회귀, 클러스터링 등을 수행하는 모델
- 학습 전용 데이터를 기반으로 다수의 의사결정트리를 만들고 만들어진 의사결정트리를 기반으로 다수 결과를 유도하는 모델
- Ensemble(앙상블) 기법의 지표가 됨

In [670]:
import warnings
warnings.filterwarnings('ignore')

In [671]:
import pandas as pd

In [672]:
data = pd.read_csv('../Data/iris.csv')
data.head()

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,Name
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [673]:
# Feature와 Target 분리
iris_data = data.iloc[:, :4]
iris_target = data['Name']

print(iris_data.head())
print('--------------------------------')
print(iris_target.head())
print('--------------------------------')
print(iris_data.shape)
print('--------------------------------')
print(iris_target.shape)

   SepalLength  SepalWidth  PetalLength  PetalWidth
0          5.1         3.5          1.4         0.2
1          4.9         3.0          1.4         0.2
2          4.7         3.2          1.3         0.2
3          4.6         3.1          1.5         0.2
4          5.0         3.6          1.4         0.2
--------------------------------
0    Iris-setosa
1    Iris-setosa
2    Iris-setosa
3    Iris-setosa
4    Iris-setosa
Name: Name, dtype: object
--------------------------------
(150, 4)
--------------------------------
(150,)


In [674]:
from sklearn.model_selection import train_test_split

In [675]:
# Train과 Test 분리
train_data, test_data, train_target, test_target = train_test_split(
    iris_data, 
    iris_target, 
    random_state=42,
    stratify=iris_target
)

In [676]:
# 확인
print(train_data.shape)
print(test_data.shape)
print(train_target.shape)
print(test_target.shape)

(112, 4)
(38, 4)
(112,)
(38,)


In [677]:
# Sampling 판단
print(train_data.describe())
print(test_data.describe())

       SepalLength  SepalWidth  PetalLength  PetalWidth
count   112.000000  112.000000   112.000000  112.000000
mean      5.877679    3.061607     3.765179    1.192857
std       0.853262    0.440770     1.782584    0.771533
min       4.300000    2.000000     1.100000    0.100000
25%       5.100000    2.800000     1.575000    0.300000
50%       5.800000    3.000000     4.300000    1.300000
75%       6.400000    3.300000     5.100000    1.825000
max       7.900000    4.400000     6.900000    2.500000
       SepalLength  SepalWidth  PetalLength  PetalWidth
count    38.000000   38.000000    38.000000   38.000000
mean      5.742105    3.031579     3.739474    1.215789
std       0.750363    0.416618     1.733071    0.747799
min       4.400000    2.300000     1.000000    0.200000
25%       5.125000    2.825000     1.700000    0.300000
50%       5.700000    3.000000     4.500000    1.400000
75%       6.375000    3.300000     5.075000    1.800000
max       7.300000    4.000000     6.300000    2

In [678]:
# Target 확인
print(train_target.describe())
print('--------------------------------')
print(test_target.describe())

count             112
unique              3
top       Iris-setosa
freq               38
Name: Name, dtype: object
--------------------------------
count                  38
unique                  3
top       Iris-versicolor
freq                   13
Name: Name, dtype: object


---
### RandomForest로 분류

In [679]:
from sklearn.ensemble import RandomForestClassifier

# RandomForestClassifier : 객관식
# RandomForestRegressor : 주관식

In [680]:
# 모델 생성
clf = RandomForestClassifier(n_estimators=8)    # Hyper Parameter -> n_estimators: 트리의 개수

In [681]:
# 학습시키기
clf.fit(train_data, train_target)

In [682]:
# 펼가하기
print(clf.score(train_data, train_target))
print(clf.score(test_data, test_target))

1.0
0.9473684210526315


---
### 독버섯과 관련된 데이터를 사용한 머신러닝
- 8124종류의 버섯의 특징과 독의 유무로 구성되어 있는 데이터셋
- 버섯의 특징을 기반으로 독의 유무를 판단

In [683]:
# 데이터 획득하기
import urllib.request as req
local = '../Data/mushroom.csv'
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'

req.urlretrieve(url, local)
print('OK')

OK


In [684]:
# 데이터 불러오기
mr = pd.read_csv(local, header=None)  # 컬럼이 없는 데이터는 header=None 추가
print(mr.head())
print(mr.tail())

  0  1  2  3  4  5  6  7  8  9   ... 13 14 15 16 17 18 19 20 21 22
0  p  x  s  n  t  p  f  c  n  k  ...  s  w  w  p  w  o  p  k  s  u
1  e  x  s  y  t  a  f  c  b  k  ...  s  w  w  p  w  o  p  n  n  g
2  e  b  s  w  t  l  f  c  b  n  ...  s  w  w  p  w  o  p  n  n  m
3  p  x  y  w  t  p  f  c  n  n  ...  s  w  w  p  w  o  p  k  s  u
4  e  x  s  g  f  n  f  w  b  k  ...  s  w  w  p  w  o  e  n  a  g

[5 rows x 23 columns]
     0  1  2  3  4  5  6  7  8  9   ... 13 14 15 16 17 18 19 20 21 22
8119  e  k  s  n  f  n  a  c  b  y  ...  s  o  o  p  o  o  p  b  c  l
8120  e  x  s  n  f  n  a  c  b  y  ...  s  o  o  p  n  o  p  b  v  l
8121  e  f  s  n  f  n  a  c  b  n  ...  s  o  o  p  o  o  p  b  c  l
8122  p  k  y  n  f  y  f  c  n  b  ...  k  w  w  p  w  o  e  w  v  l
8123  e  x  s  n  f  n  a  c  b  y  ...  s  o  o  p  o  o  p  o  c  l

[5 rows x 23 columns]


In [685]:
mr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       8124 non-null   object
 1   1       8124 non-null   object
 2   2       8124 non-null   object
 3   3       8124 non-null   object
 4   4       8124 non-null   object
 5   5       8124 non-null   object
 6   6       8124 non-null   object
 7   7       8124 non-null   object
 8   8       8124 non-null   object
 9   9       8124 non-null   object
 10  10      8124 non-null   object
 11  11      8124 non-null   object
 12  12      8124 non-null   object
 13  13      8124 non-null   object
 14  14      8124 non-null   object
 15  15      8124 non-null   object
 16  16      8124 non-null   object
 17  17      8124 non-null   object
 18  18      8124 non-null   object
 19  19      8124 non-null   object
 20  20      8124 non-null   object
 21  21      8124 non-null   object
 22  22      8124 non-null   

In [686]:
mr.shape    # 8124개의 데이터, 23개의 컬럼

(8124, 23)

- 한 줄이 버섯 한 개의 정보
- 첫번째 열: p(독버섯), e(식용버섯) --> Target
- 두번째 열: 버섯의 머리 모양 => b(종모양), c(원뿔), x(볼록), f(평평), k(혹), s(오목)
- 네번째 열: 버섯의 머리 색깔

> Feature Data를 숫자화 시킨다.

In [687]:
# 연습
# 알파벳 -> ASCII 코드로 변환
print(ord('x'))
# ASCII 코드 -> 알파벳으로 변환
print(chr(120))

120
x


In [688]:
# 연습: mr의 1번 컬럼을 숫자로 변경하기
for i in mr.iloc[:, 1]:
    print(i, ord(i))

x 120
x 120
b 98
x 120
x 120
x 120
b 98
b 98
x 120
b 98
x 120
x 120
b 98
x 120
x 120
s 115
f 102
x 120
x 120
x 120
b 98
x 120
b 98
b 98
b 98
f 102
x 120
x 120
f 102
x 120
b 98
x 120
x 120
x 120
b 98
x 120
s 115
x 120
x 120
b 98
b 98
x 120
x 120
x 120
x 120
x 120
x 120
x 120
x 120
f 102
x 120
x 120
b 98
x 120
x 120
b 98
f 102
b 98
x 120
x 120
s 115
b 98
b 98
b 98
b 98
f 102
x 120
f 102
x 120
x 120
f 102
b 98
f 102
x 120
b 98
f 102
x 120
f 102
x 120
f 102
x 120
x 120
f 102
x 120
x 120
x 120
b 98
x 120
f 102
s 115
x 120
b 98
x 120
x 120
x 120
x 120
f 102
x 120
b 98
x 120
x 120
b 98
f 102
x 120
b 98
x 120
x 120
b 98
b 98
x 120
x 120
s 115
x 120
x 120
x 120
x 120
s 115
x 120
x 120
s 115
x 120
x 120
f 102
f 102
x 120
x 120
b 98
f 102
x 120
b 98
b 98
b 98
f 102
x 120
f 102
x 120
f 102
x 120
x 120
b 98
x 120
b 98
s 115
f 102
x 120
x 120
f 102
x 120
b 98
b 98
x 120
x 120
x 120
s 115
x 120
x 120
b 98
x 120
b 98
b 98
b 98
b 98
x 120
f 102
x 120
f 102
b 98
b 98
x 120
b 98
x 120
b 98
x 120
b 98
f 1

In [689]:
# 적용
mr.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [690]:
target = []
data = []

for row_index, row in mr.iterrows(): # iterrows(): 한 줄씩 읽어오기
    # print(row_index, ":", row)
    # print(row.loc[0])
    # print('--------------------------------')
    target.append(row.loc[0])   # 첫번째 컬럼을 Target으로 지정(독버섯(p), 식용버섯(e))
    row_data = []
    for v in row.loc[1:]:
        row_data.append(ord(v))
    data.append(row_data)

print(len(target))
print(len(data))
print(target[0])
print(data[0])

8124
8124
p
[120, 115, 110, 116, 112, 102, 99, 110, 107, 101, 101, 115, 115, 119, 119, 112, 119, 111, 112, 107, 115, 117]


### DataFrame 만들기

In [691]:
# Target을 DataFrame으로 변환
targetTemp = pd.DataFrame(target)
targetTemp.head()


Unnamed: 0,0
0,p
1,e
2,e
3,p
4,e


In [692]:
# data를 DataFrame으로 변환하고 컬럼명을 겹치지 않게 변경
dataTemp = pd.DataFrame(data)
dataTemp.rename(columns=lambda x: x+1, inplace=True) # lambda x: x+1: 컬럼명을 1, 2, 3, ... 으로 변경, inplace=True: 원본 데이터 변경
dataTemp.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,13,14,15,16,17,18,19,20,21,22
0,120,115,110,116,112,102,99,110,107,101,...,115,119,119,112,119,111,112,107,115,117
1,120,115,121,116,97,102,99,98,107,101,...,115,119,119,112,119,111,112,110,110,103
2,98,115,119,116,108,102,99,98,110,101,...,115,119,119,112,119,111,112,110,110,109
3,120,121,119,116,112,102,99,110,110,101,...,115,119,119,112,119,111,112,107,115,117
4,120,115,103,102,110,102,119,98,107,116,...,115,119,119,112,119,111,101,110,97,103


In [693]:
# 2개의 DataFrame 합치기
mr2 = pd.concat([targetTemp, dataTemp], axis='columns') # axis='columns': 컬럼 기준으로 합치기
mr2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,p,120,115,110,116,112,102,99,110,107,...,115,119,119,112,119,111,112,107,115,117
1,e,120,115,121,116,97,102,99,98,107,...,115,119,119,112,119,111,112,110,110,103
2,e,98,115,119,116,108,102,99,98,110,...,115,119,119,112,119,111,112,110,110,109
3,p,120,121,119,116,112,102,99,110,110,...,115,119,119,112,119,111,112,107,115,117
4,e,120,115,103,102,110,102,119,98,107,...,115,119,119,112,119,111,101,110,97,103


In [694]:
# 확인
mr2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       8124 non-null   object
 1   1       8124 non-null   int64 
 2   2       8124 non-null   int64 
 3   3       8124 non-null   int64 
 4   4       8124 non-null   int64 
 5   5       8124 non-null   int64 
 6   6       8124 non-null   int64 
 7   7       8124 non-null   int64 
 8   8       8124 non-null   int64 
 9   9       8124 non-null   int64 
 10  10      8124 non-null   int64 
 11  11      8124 non-null   int64 
 12  12      8124 non-null   int64 
 13  13      8124 non-null   int64 
 14  14      8124 non-null   int64 
 15  15      8124 non-null   int64 
 16  16      8124 non-null   int64 
 17  17      8124 non-null   int64 
 18  18      8124 non-null   int64 
 19  19      8124 non-null   int64 
 20  20      8124 non-null   int64 
 21  21      8124 non-null   int64 
 22  22      8124 non-null   

### RandomForest로 실시

In [695]:
from sklearn.model_selection import train_test_split

In [696]:
target = mr2.iloc[:, 0]
data = mr2.iloc[:, 1:]

In [697]:
train_data, test_data, train_target, test_target = train_test_split(
    data,
    target,
    random_state=42,
    stratify=target
)

In [698]:
clf = RandomForestClassifier()


In [699]:
clf.fit(train_data, train_target)

In [700]:
print(clf.score(train_data, train_target))
print(clf.score(test_data, test_target))

1.0
1.0


---
### One-hot Encoding
- 숫자 데이터의 숫자로서 의미가 있으면 상관이 없지만, 위의 데이터는 분류를 위한 데이터이므리 숫자 크기가 의미가 없다.
- 이 때 사용하는 것이 One-hot Encoding이다.



In [701]:
# # 연습: 1번열의 data 종류
# # sorted(mr2[1].unique())
pd.get_dummies(data=dataTemp, columns=[1], prefix='1')  |# get_dummies: 컬럼 1의 데이터를 원핫인코딩 적용

SyntaxError: invalid syntax (4031544806.py, line 3)

In [None]:
# 적용하기: 컬럼 1~22
for i in list(range(1, 22+1)):
    dataTemp = pd.get_dummies(data=dataTemp, columns=[i], prefix=str(i))

dataTemp.head()

Unnamed: 0,1_98,1_99,1_102,1_107,1_115,1_120,2_102,2_103,2_115,2_121,...,21_115,21_118,21_121,22_100,22_103,22_108,22_109,22_112,22_117,22_119
0,False,False,False,False,False,True,False,False,True,False,...,True,False,False,False,False,False,False,False,True,False
1,False,False,False,False,False,True,False,False,True,False,...,False,False,False,False,True,False,False,False,False,False
2,True,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,True,False,False,False
3,False,False,False,False,False,True,False,False,False,True,...,True,False,False,False,False,False,False,False,True,False
4,False,False,False,False,False,True,False,False,True,False,...,False,False,False,False,True,False,False,False,False,False


In [540]:
# dataFrame 만들기
mr3 = pd.concat([targetTemp, dataTemp], axis='columns')
mr3.head()

# row: cpu / col: gpu
# column이 많을 수록 정확도가 높아짐
# column을 생성하는 걸 Feature Engineering이라고 함

Unnamed: 0,0,1_98,1_99,1_102,1_107,1_115,1_120,2_102,2_103,2_115,...,21_115,21_118,21_121,22_100,22_103,22_108,22_109,22_112,22_117,22_119
0,p,False,False,False,False,False,True,False,False,True,...,True,False,False,False,False,False,False,False,True,False
1,e,False,False,False,False,False,True,False,False,True,...,False,False,False,False,True,False,False,False,False,False
2,e,True,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,True,False,False,False
3,p,False,False,False,False,False,True,False,False,False,...,True,False,False,False,False,False,False,False,True,False
4,e,False,False,False,False,False,True,False,False,True,...,False,False,False,False,True,False,False,False,False,False


In [541]:
mr3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Columns: 118 entries, 0 to 22_119
dtypes: bool(117), object(1)
memory usage: 991.8+ KB


In [665]:
target = mr3.iloc[:, 0]
data = mr3.iloc[:, 1:]

print(target.head())
print(data.head())

0    p
1    e
2    e
3    p
4    e
Name: 0, dtype: object
    1_98   1_99  1_102  1_107  1_115  1_120  2_102  2_103  2_115  2_121  ...  \
0  False  False  False  False  False   True  False  False   True  False  ...   
1  False  False  False  False  False   True  False  False   True  False  ...   
2   True  False  False  False  False  False  False  False   True  False  ...   
3  False  False  False  False  False   True  False  False  False   True  ...   
4  False  False  False  False  False   True  False  False   True  False  ...   

   21_115  21_118  21_121  22_100  22_103  22_108  22_109  22_112  22_117  \
0    True   False   False   False   False   False   False   False    True   
1   False   False   False   False    True   False   False   False   False   
2   False   False   False   False   False   False    True   False   False   
3    True   False   False   False   False   False   False   False    True   
4   False   False   False   False    True   False   False   False   False   

In [667]:
train_data, test_data, train_target, test_target = train_test_split(
    data,
    target,
    random_state=42,
    stratify=target
)

In [None]:
clf = RandomForestClassifier(n_estimators=10)

clf.fit(train_data, train_target)

In [None]:
print(clf.score(train_data, train_target))
print(clf.score(test_data, test_target))

1.0
1.0


> 이렇게 해야 정상