In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier

# 데이터 셋 준비

In [13]:
import seaborn as sns
iris=sns.load_dataset('iris')

In [14]:
iris

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


# 모델 검증

## 1. hold-out

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
x_train, x_test, y_train, y_test = train_test_split(iris.iloc[:,0:4],iris["species"], test_size=0.2, random_state=42)

In [17]:
print("x_train : ",x_train.shape)
print("y_train : ",y_train.shape)
print("x_test : ",x_test.shape)
print("y_test : ",y_test.shape)

x_train :  (120, 4)
y_train :  (120,)
x_test :  (30, 4)
y_test :  (30,)


In [18]:
x_train

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
22,4.6,3.6,1.0,0.2
15,5.7,4.4,1.5,0.4
65,6.7,3.1,4.4,1.4
11,4.8,3.4,1.6,0.2
42,4.4,3.2,1.3,0.2
...,...,...,...,...
71,6.1,2.8,4.0,1.3
106,4.9,2.5,4.5,1.7
14,5.8,4.0,1.2,0.2
92,5.8,2.6,4.0,1.2


In [19]:
y_train

22         setosa
15         setosa
65     versicolor
11         setosa
42         setosa
          ...    
71     versicolor
106     virginica
14         setosa
92     versicolor
102     virginica
Name: species, Length: 120, dtype: object

## 2. k-fold

In [20]:
from sklearn.model_selection import KFold

from sklearn.metrics import accuracy_score

In [30]:
folds=5
kf = KFold(n_splits=folds)
n = 1
accuracy_s = []
for train_ind, test_ind in kf.split(iris):
    print(f"{n}번 째 반복")
    print("train :", iris.loc[train_ind,:].shape)
    print("test :", iris.loc[test_ind,:].shape)
    print("Setosa 비율 :", np.mean(iris.loc[train_ind,:]["species"]=="setosa"))
    
    model_rf = RandomForestClassifier(n_jobs = -1, random_state=42)
    model_rf.fit(iris.iloc[train_ind,0:4],iris.iloc[train_ind,4])
    a_score = accuracy_score(iris.iloc[test_ind,4], model_rf.predict(iris.iloc[test_ind,0:4]))
    print("정확도 : ",a_score)
    accuracy_s.append(a_score)
    
    n += 1
    
    print("\n")

1번 째 반복
train : (120, 5)
test : (30, 5)
Setosa 비율 : 0.16666666666666666
정확도 :  1.0


2번 째 반복
train : (120, 5)
test : (30, 5)
Setosa 비율 : 0.25
정확도 :  1.0


3번 째 반복
train : (120, 5)
test : (30, 5)
Setosa 비율 : 0.4166666666666667
정확도 :  0.8666666666666667


4번 째 반복
train : (120, 5)
test : (30, 5)
Setosa 비율 : 0.4166666666666667
정확도 :  0.9333333333333333


5번 째 반복
train : (120, 5)
test : (30, 5)
Setosa 비율 : 0.4166666666666667
정확도 :  0.7666666666666667




In [31]:
print(np.mean(accuracy_s))

0.9133333333333333


## 3. stratified k-fold

In [33]:
from sklearn.model_selection import StratifiedKFold

In [34]:
np.mean(iris["species"]=="setosa")

0.3333333333333333

In [35]:
folds=5
skf = StratifiedKFold(n_splits=folds)
n = 1
for train_ind, test_ind in skf.split(iris, iris["species"]):
    print(f"{n}번 째 반복")
    print("train :", iris.loc[train_ind,:].shape)
    print("test :", iris.loc[test_ind,:].shape)
    print("Setosa 비율 :", np.mean(iris.loc[train_ind,:]["species"]=="setosa"))
    
    model_rf = RandomForestClassifier(n_jobs = -1, random_state=42)
    model_rf.fit(iris.iloc[train_ind,0:4],iris.iloc[train_ind,4])
    a_score = accuracy_score(iris.iloc[test_ind,4], model_rf.predict(iris.iloc[test_ind,0:4]))
    print("정확도 : ",a_score)
    accuracy_s.append(a_score)
    
    n += 1
    
    print("\n")

1번 째 반복
train : (120, 5)
test : (30, 5)
Setosa 비율 : 0.3333333333333333
정확도 :  0.9666666666666667


2번 째 반복
train : (120, 5)
test : (30, 5)
Setosa 비율 : 0.3333333333333333
정확도 :  0.9666666666666667


3번 째 반복
train : (120, 5)
test : (30, 5)
Setosa 비율 : 0.3333333333333333
정확도 :  0.9333333333333333


4번 째 반복
train : (120, 5)
test : (30, 5)
Setosa 비율 : 0.3333333333333333
정확도 :  0.9666666666666667


5번 째 반복
train : (120, 5)
test : (30, 5)
Setosa 비율 : 0.3333333333333333
정확도 :  1.0




# 하이퍼 파라미터 튜닝

## 2. Grid Search

In [57]:
from sklearn.model_selection import GridSearchCV

In [59]:

params = { 'n_estimators' : [80, 100, 200, 300],
           'max_depth' : [10, 14, 16, 20],
           'min_samples_leaf' : [4, 8, 12],
           'min_samples_split' : [5, 10, 20]
            }

model_rf = RandomForestClassifier(n_jobs = -1, random_state=42)

grid_cv = GridSearchCV(model_rf, param_grid = params, cv = 5, n_jobs = -1)
grid_cv.fit(x_train, y_train)

print('최적 하이퍼 파라미터: ', grid_cv.best_params_)
print('최고 예측 정확도: {:.4f}'.format(grid_cv.best_score_))

최적 하이퍼 파라미터:  {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 80}
최고 예측 정확도: 0.9500


## 4. Bayesian Optimization

In [60]:
from bayes_opt import BayesianOptimization

from sklearn.metrics import accuracy_score

In [68]:
rf_parameter_bounds = { 'n_estimators' : (80, 300),
           'max_depth' :  (10, 20),
           'min_samples_leaf' : (4, 12),
           'min_samples_split' :  (5, 20)
            }

def rf_bo(n_estimators, max_depth, min_samples_leaf, min_samples_split):
    bo_params = {
        'n_estimators' : int(round(n_estimators)),
        'max_depth' : int(round(max_depth)),
        'min_samples_leaf' : int(round(min_samples_leaf)),
        'min_samples_split' : int(round(min_samples_split)),
    }
    
    rf_train_BO = RandomForestClassifier(**bo_params)

    rf_train_BO.fit(x_train,y_train)
    score = accuracy_score(y_test, rf_train_BO.predict(x_test))
    return score

In [69]:
BO_rf = BayesianOptimization(f = rf_bo, pbounds = rf_parameter_bounds,random_state = 0)

BO_rf.maximize(init_points = 5, n_iter = 10)

|   iter    |  target   | max_depth | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------
| [0m 1       [0m | [0m 1.0     [0m | [0m 15.49   [0m | [0m 9.722   [0m | [0m 14.04   [0m | [0m 199.9   [0m |
| [0m 2       [0m | [0m 1.0     [0m | [0m 14.24   [0m | [0m 9.167   [0m | [0m 11.56   [0m | [0m 276.2   [0m |
| [0m 3       [0m | [0m 1.0     [0m | [0m 19.64   [0m | [0m 7.068   [0m | [0m 16.88   [0m | [0m 196.4   [0m |
| [0m 4       [0m | [0m 1.0     [0m | [0m 15.68   [0m | [0m 11.4    [0m | [0m 6.066   [0m | [0m 99.17   [0m |
| [0m 5       [0m | [0m 1.0     [0m | [0m 10.2    [0m | [0m 10.66   [0m | [0m 16.67   [0m | [0m 271.4   [0m |
| [0m 6       [0m | [0m 1.0     [0m | [0m 10.85   [0m | [0m 7.06    [0m | [0m 19.67   [0m | [0m 80.51   [0m |
| [0m 7       [0m | [0m 1.0     [0m | [0m 10.56   [0m | [0m 11.15   [0m | [0m 6.021   [0m | [0m 299.7   [0m 

In [70]:
max_params = BO_rf.max['params']
max_params

{'max_depth': 15.488135039273248,
 'min_samples_leaf': 9.721514930979357,
 'min_samples_split': 14.041450641074658,
 'n_estimators': 199.8743002593173}

In [71]:
max_params['max_depth'] = int(max_params['max_depth'])
max_params['min_samples_leaf'] = int(max_params['min_samples_leaf'])
max_params['min_samples_split'] = int(max_params['min_samples_split'])
max_params['n_estimators'] = int(max_params['n_estimators'])
print(max_params)

{'max_depth': 15, 'min_samples_leaf': 9, 'min_samples_split': 14, 'n_estimators': 199}


In [72]:
model_BO_rf_tuend = RandomForestClassifier(**max_params)