# 트리의 앙상블

<table align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/rickiepark/hg-mldl/blob/master/5-3.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />구글 코랩에서 실행하기</a>
  </td>
</table>

## 랜덤포레스트 (Random forest)

In [5]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

plt.rc('font', family='NanumGothic')
plt.rc('axes', unicode_minus=False)

fpath = "../dataset"

In [None]:

wine = pd.read_csv('https://bit.ly/wine-date')

wine.head()


data = wine[['alcohol', 'sugar', 'pH']].to_numpy()
target = wine['class'].to_numpy()

train_input, test_input, train_target, test_target = train_test_split(data, target, 
                                                                      test_size=0.2, random_state=42)



In [6]:
# import inspect
# inspect.getfile(RandomForestClassifier)

'C:\\Users\\nbumk\\anaconda3\\envs\\mytf\\lib\\site-packages\\sklearn\\ensemble\\_forest.py'

In [4]:
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier

# n_estimators=100 trees
# random_state : int, RandomState instance or None, default=None
# Controls both the randomness of the bootstrapping of the samples and the sampling of the features
    
rf = RandomForestClassifier(n_estimators=100, 
                            criterion='gini',                            
                            n_jobs=-1, 
                            random_state=42)
# RandomForestClassifier(
#     n_estimators=100,
#     *,
#     criterion='gini',
#     max_depth=None,
#     min_samples_split=2,
#     min_samples_leaf=1,
#     min_weight_fraction_leaf=0.0,
#     max_features='auto',
#     max_leaf_nodes=None,
#     min_impurity_decrease=0.0,
#     min_impurity_split=None,
#     bootstrap=True,
#     oob_score=False, # Whether to use out-of-bag samples to estimate
#     n_jobs=None, # parallel processing
#     random_state=None,
#     verbose=0,
#     warm_start=False, # reuse the solution of the previous call to fit
#     class_weight=None,
#     ccp_alpha=0.0,
#     max_samples=None,
# )

## Cross validation
## return_train_score 훈련셋에서의 score를 반환

scores = cross_validate(rf, train_input, 
                        train_target, 
                        return_train_score=True, 
                        n_jobs=-1)

print(scores.keys())
print(np.mean(scores['train_score']), np.mean(scores['test_score']))

dict_keys(['fit_time', 'score_time', 'test_score', 'train_score'])
0.9973541965122431 0.8905151032797809


In [9]:
rf.fit(train_input, train_target)

print(rf.feature_importances_)
print(wine.columns)

[0.23167441 0.50039841 0.26792718]
Index(['alcohol', 'sugar', 'pH', 'class'], dtype='object')


In [14]:
rf = RandomForestClassifier(oob_score=True, n_jobs=-1, random_state=42)

rf.fit(train_input, train_target)
print(rf.oob_score_)

0.8934000384837406


## Gradient Boost

In [15]:
from sklearn.ensemble import GradientBoostingClassifier

## learning_rate=0.1, max_depth=3, n_estimators=100
gb = GradientBoostingClassifier(random_state=42)
scores = cross_validate(gb, train_input, train_target, return_train_score=True, n_jobs=-1)

print(np.mean(scores['train_score']), np.mean(scores['test_score']))

0.8881086892152563 0.8720430147331015


In [16]:
## learning_rate=0.1, max_depth=3, n_estimators=100

gb = GradientBoostingClassifier(n_estimators=500, learning_rate=0.2, random_state=42)
scores = cross_validate(gb, train_input, train_target, return_train_score=True, n_jobs=-1)

print(np.mean(scores['train_score']), np.mean(scores['test_score']))

0.9464595437171814 0.8780082549788999


In [17]:
gb.fit(train_input, train_target)
print(gb.feature_importances_)

[0.15872278 0.68010884 0.16116839]


## 히스토그램 기반 부스팅

In [None]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

hgb = HistGradientBoostingClassifier(random_state=42)
scores = cross_validate(hgb, train_input, train_target, return_train_score=True, n_jobs=-1)

print(np.mean(scores['train_score']), np.mean(scores['test_score']))

0.9321723946453317 0.8801241948619236


In [None]:
hgb.fit(train_input, train_target)
print(rf.feature_importances_)

[0.23167441 0.50039841 0.26792718]


In [None]:
hgb.score(test_input, test_target)

0.8723076923076923

#### XGBoost

In [None]:
from xgboost import XGBClassifier

xgb = XGBClassifier(tree_method='hist', random_state=42)
scores = cross_validate(xgb, train_input, train_target, return_train_score=True, n_jobs=-1)

print(np.mean(scores['train_score']), np.mean(scores['test_score']))

0.8827690284750664 0.8708899089361072


#### LightGBM

In [None]:
from lightgbm import LGBMClassifier

lgb = LGBMClassifier(random_state=42)
scores = cross_validate(lgb, train_input, train_target, return_train_score=True, n_jobs=-1)

print(np.mean(scores['train_score']), np.mean(scores['test_score']))

0.9338079582727165 0.8789710890649293
