# 랜덤포레스트 
    : 앙상블 기법을 사용한다. 랜덤포레스트의 출발점은 의사결정나무(ctree)인데, 의사결정나무는 과대적합이 자주 발생한다. 이를 해결하기 위해 의사결정나무를 여러 개 학습시키고 그 결과를 종합하는 방식에서 랜덤포레스트가 탄생했다.        
    앙상블 기법 : 기존에 있는 머신러닝 기법(모델)을 여러 개 결합하여 새로운 머신러닝 기법을 만들어 낸 것     

In [158]:
import pandas as pd

In [159]:
# Load the iris data set from the local Data directory
csv = pd.read_csv('../Data/iris.csv')

In [160]:
# Preview the first rows to check the column names and value types
csv.head()

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,Name
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [161]:
# Split the frame into the four measurement columns (features)
# and the species name column (target label).
csv_data = csv.loc[:, ['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth']]
csv_label = csv.loc[:, 'Name']

In [162]:
# Split into a training set and a test set.
# NOTE(review): no test_size / random_state is passed, so the library
# defaults apply and the split is presumably different on every run.
from sklearn.model_selection import train_test_split
train_data, test_data, train_label, test_label = train_test_split(csv_data , csv_label)

In [163]:
# NOTE: the two bare `.shape` expressions below are not the last statement
# of the cell and are not printed, so the recorded output only shows the
# Counter line; wrap them in print() to actually display the shapes.
train_data.shape
test_data.shape

import collections # used to count how often each label occurs
counts = collections.Counter(test_label)
print(counts)

Counter({'Iris-setosa': 14, 'Iris-versicolor': 12, 'Iris-virginica': 12})


In [164]:
# Train a model on the data and predict with it
from sklearn.ensemble import RandomForestClassifier # regression and classification are provided as separate estimators
# Created with default hyper-parameters (no arguments passed)
clf = RandomForestClassifier()

In [165]:
# Fit the random forest on the iris training split
clf.fit(train_data , train_label)

RandomForestClassifier()

In [166]:
# Predict the species for the held-out test rows
pred = clf.predict(test_data)

In [167]:
# Accuracy on the test split
clf.score(test_data, test_label)

0.9473684210526315

In [168]:
# Confusion matrix: true labels are passed first, predictions second
from sklearn import metrics
metrics.confusion_matrix(test_label , pred)

array([[14,  0,  0],
       [ 0, 10,  2],
       [ 0,  0, 12]])

In [169]:
# Per-class precision / recall / f1 summary for the iris predictions
cl_report = metrics.classification_report(test_label , pred)
print(cl_report)

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        14
Iris-versicolor       1.00      0.83      0.91        12
 Iris-virginica       0.86      1.00      0.92        12

       accuracy                           0.95        38
      macro avg       0.95      0.94      0.94        38
   weighted avg       0.95      0.95      0.95        38



---
# 독버섯과 관련된 데이터를 사용한 머신러닝

In [170]:
# Fetch the data
import urllib.request as request

# Source URL and local path the file is saved to
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
saveName = "../Data/mushroom.csv"

# Download (runs on every execution of this cell and overwrites saveName)
request.urlretrieve(url, saveName)
print('OK!')

OK!


In [171]:
# Read the downloaded file; header = None because the file has no header
# row, so the columns get integer labels (0 is the class, 1.. the features)
mr = pd.read_csv('../Data/mushroom.csv' , header = None)
mr.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


# Data 정제하기

In [172]:
# Practice: ord() maps a character to its code point, chr() maps it back
print(ord('a'), chr(97), sep='\n')

97
a


In [173]:
# Convert the single-letter category codes in the data into integers.
# Column 0 holds the class label and is kept as-is; every remaining
# column is mapped to the code point of its letter via ord().
# Built from whole columns / plain tuples instead of DataFrame.iterrows(),
# which creates a Series object for every row.
label = mr[0].tolist()
data = [
    [ord(v) for v in row]
    for row in mr.loc[:, 1:].itertuples(index=False, name=None)
]
print(data[0:20])


[[120, 115, 110, 116, 112, 102, 99, 110, 107, 101, 101, 115, 115, 119, 119, 112, 119, 111, 112, 107, 115, 117], [120, 115, 121, 116, 97, 102, 99, 98, 107, 101, 99, 115, 115, 119, 119, 112, 119, 111, 112, 110, 110, 103], [98, 115, 119, 116, 108, 102, 99, 98, 110, 101, 99, 115, 115, 119, 119, 112, 119, 111, 112, 110, 110, 109], [120, 121, 119, 116, 112, 102, 99, 110, 110, 101, 101, 115, 115, 119, 119, 112, 119, 111, 112, 107, 115, 117], [120, 115, 103, 102, 110, 102, 119, 98, 107, 116, 101, 115, 115, 119, 119, 112, 119, 111, 101, 110, 97, 103], [120, 121, 121, 116, 97, 102, 99, 98, 110, 101, 99, 115, 115, 119, 119, 112, 119, 111, 112, 107, 110, 103], [98, 115, 119, 116, 97, 102, 99, 98, 103, 101, 99, 115, 115, 119, 119, 112, 119, 111, 112, 107, 110, 109], [98, 121, 119, 116, 108, 102, 99, 98, 110, 101, 99, 115, 115, 119, 119, 112, 119, 111, 112, 110, 115, 109], [120, 121, 119, 116, 112, 102, 99, 110, 112, 101, 101, 115, 115, 119, 119, 112, 119, 111, 112, 107, 118, 103], [98, 115, 121, 11

In [174]:
# Turn the label list into a single-column DataFrame (column name 0)
labelTemp = pd.DataFrame(label)
labelTemp.head()

Unnamed: 0,0
0,p
1,e
2,e
3,p
4,e


In [175]:
# Turn the encoded data into a DataFrame and shift the column names by one
# (0.. becomes 1..) so they do not collide with labelTemp's column 0
dataTemp = pd.DataFrame(data)
dataTemp.rename(columns= lambda x : x+1, inplace= True)
dataTemp.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,13,14,15,16,17,18,19,20,21,22
0,120,115,110,116,112,102,99,110,107,101,...,115,119,119,112,119,111,112,107,115,117
1,120,115,121,116,97,102,99,98,107,101,...,115,119,119,112,119,111,112,110,110,103
2,98,115,119,116,108,102,99,98,110,101,...,115,119,119,112,119,111,112,110,110,109
3,120,121,119,116,112,102,99,110,110,101,...,115,119,119,112,119,111,112,107,115,117
4,120,115,103,102,110,102,119,98,107,116,...,115,119,119,112,119,111,101,110,97,103


In [176]:
# Join the two DataFrames side by side: label column first, then features
mr2 = pd.concat([labelTemp , dataTemp] , axis= 1) # same as axis = 'columns'
mr2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,p,120,115,110,116,112,102,99,110,107,...,115,119,119,112,119,111,112,107,115,117
1,e,120,115,121,116,97,102,99,98,107,...,115,119,119,112,119,111,112,110,110,103
2,e,98,115,119,116,108,102,99,98,110,...,115,119,119,112,119,111,112,110,110,109
3,p,120,121,119,116,112,102,99,110,110,...,115,119,119,112,119,111,112,107,115,117
4,e,120,115,103,102,110,102,119,98,107,...,115,119,119,112,119,111,101,110,97,103


In [177]:
# Measure accuracy with RandomForest on the integer-encoded data

# Column 0 is the class label, every other column is a feature
feature = mr2.iloc[:,1:]
target = mr2.iloc[:,0]

# Split into a training set and a test set
train_data, test_data, train_label, test_label = train_test_split(feature, target)
# train_data, test_data, train_label, test_label = train_test_split(data, label)

# Check how many samples of each class ended up in the test set
counts = collections.Counter(test_label)
print(counts)

Counter({'e': 1070, 'p': 961})


In [178]:
# Train (re-uses the clf object that was created for the iris example)
clf.fit(train_data , train_label)

# Accuracy on the test split
clf.score(test_data , test_label)

1.0

In [179]:
# Classification report (precision / recall / f1 per class)
pred = clf.predict(test_data)
cl_report = metrics.classification_report(test_label , pred)
print(cl_report)

              precision    recall  f1-score   support

           e       1.00      1.00      1.00      1070
           p       1.00      1.00      1.00       961

    accuracy                           1.00      2031
   macro avg       1.00      1.00      1.00      2031
weighted avg       1.00      1.00      1.00      2031



### One Hot Encoding 실시
: 숫자 데이터가 숫자로서 의미가 있으면 상관없지만, 위의 데이터는 분류(범주)를 나타내는 문자를 숫자로 바꾼 것이므로 숫자의 크기에는 의미가 없다.  
이때 사용하는 것이 One Hot Encoding이다.

In [180]:
# Check which distinct values occur in column 1
mr2[1].unique() # six distinct values

array([120,  98, 115, 102, 107,  99])

In [181]:
# Practice: one-hot encode only column 1; the result is displayed, not stored
pd.get_dummies(data = dataTemp , columns=[1] , prefix='1') # prefix : text the new column names start with


Unnamed: 0,2,3,4,5,6,7,8,9,10,11,...,19,20,21,22,1_98,1_99,1_102,1_107,1_115,1_120
0,115,110,116,112,102,99,110,107,101,101,...,112,107,115,117,0,0,0,0,0,1
1,115,121,116,97,102,99,98,107,101,99,...,112,110,110,103,0,0,0,0,0,1
2,115,119,116,108,102,99,98,110,101,99,...,112,110,110,109,1,0,0,0,0,0
3,121,119,116,112,102,99,110,110,101,101,...,112,107,115,117,0,0,0,0,0,1
4,115,103,102,110,102,119,98,107,116,101,...,101,110,97,103,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,115,110,102,110,97,99,98,121,101,63,...,112,98,99,108,0,0,0,1,0,0
8120,115,110,102,110,97,99,98,121,101,63,...,112,98,118,108,0,0,0,0,0,1
8121,115,110,102,110,97,99,98,110,101,63,...,112,98,99,108,0,0,1,0,0,0
8122,121,110,102,121,102,99,110,98,116,63,...,101,119,118,108,0,0,0,1,0,0


In [182]:
# One-hot encode every feature column in a single pass.
# Passing all column labels to get_dummies replaces each column with its
# indicator columns, so the result needs no positional slicing and no
# per-column concat; it also no longer assumes exactly 22 features.
# The explicit prefixes keep the "<column>_<code>" naming (e.g. "1_98").
feature_cols = list(dataTemp.columns)
mrOH = pd.get_dummies(
    data=dataTemp,
    columns=feature_cols,
    prefix=[str(col) for col in feature_cols],
)

mrOH.head()

Unnamed: 0,1_98,1_99,1_102,1_107,1_115,1_120,2_102,2_103,2_115,2_121,...,21_115,21_118,21_121,22_100,22_103,22_108,22_109,22_112,22_117,22_119
0,0,0,0,0,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,1,0,0,0,1,...,1,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0


In [183]:
# Measure accuracy with RandomForest on the one-hot encoded data

# Split into a training set and a test set
# (features are the one-hot frame, labels are the plain Python list `label`)
train_data, test_data, train_label, test_label = train_test_split(mrOH, label)

# Check how many samples of each class ended up in the test set
counts = collections.Counter(test_label)
print(counts)

Counter({'e': 1068, 'p': 963})


In [184]:
# Train on the one-hot encoded training split
clf.fit(train_data , train_label)

# Accuracy on the test split
clf.score(test_data , test_label)

1.0

In [185]:
# Classification report (precision / recall / f1 per class)
pred = clf.predict(test_data)
cl_report = metrics.classification_report(test_label , pred)
print(cl_report)

              precision    recall  f1-score   support

           e       1.00      1.00      1.00      1068
           p       1.00      1.00      1.00       963

    accuracy                           1.00      2031
   macro avg       1.00      1.00      1.00      2031
weighted avg       1.00      1.00      1.00      2031



---