In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score

# 데이터 셋 준비

In [2]:
import seaborn as sns
diamonds=sns.load_dataset('diamonds')

In [3]:
diamonds = diamonds[["depth","table","price","x","y","z","carat","cut"]]
diamonds

Unnamed: 0,depth,table,price,x,y,z,carat,cut
0,61.5,55.0,326,3.95,3.98,2.43,0.23,Ideal
1,59.8,61.0,326,3.89,3.84,2.31,0.21,Premium
2,56.9,65.0,327,4.05,4.07,2.31,0.23,Good
3,62.4,58.0,334,4.20,4.23,2.63,0.29,Premium
4,63.3,58.0,335,4.34,4.35,2.75,0.31,Good
...,...,...,...,...,...,...,...,...
53935,60.8,57.0,2757,5.75,5.76,3.50,0.72,Ideal
53936,63.1,55.0,2757,5.69,5.75,3.61,0.72,Good
53937,62.8,60.0,2757,5.66,5.68,3.56,0.70,Very Good
53938,61.0,58.0,2757,6.15,6.12,3.74,0.86,Premium


# 모델 검증

## 1. hold-out

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
x_train, x_test, y_train, y_test = train_test_split(diamonds.iloc[:,0:6],diamonds["cut"], test_size=0.2, random_state=42)

In [6]:
print("x_train : ",x_train.shape)
print("y_train : ",y_train.shape)
print("x_test : ",x_test.shape)
print("y_test : ",y_test.shape)

x_train :  (43152, 6)
y_train :  (43152,)
x_test :  (10788, 6)
y_test :  (10788,)


In [7]:
x_train

Unnamed: 0,depth,table,price,x,y,z
26546,58.1,64.0,16231,8.23,8.19,4.77
9159,60.0,60.0,4540,6.57,6.49,3.92
14131,62.5,58.0,5729,6.59,6.54,4.10
15757,61.5,65.0,6300,7.21,7.17,4.42
24632,62.1,57.0,12968,7.27,7.32,4.53
...,...,...,...,...,...,...
11284,62.4,59.0,4975,6.48,6.51,4.05
44732,61.0,55.0,1617,5.03,5.01,3.06
38158,60.3,58.0,1014,4.49,4.46,2.70
860,62.8,59.0,2871,6.13,6.03,3.82


In [8]:
y_train

26546         Good
9159     Very Good
14131      Premium
15757         Good
24632    Very Good
           ...    
11284    Very Good
44732        Ideal
38158    Very Good
860        Premium
15795      Premium
Name: cut, Length: 43152, dtype: category
Categories (5, object): ['Ideal', 'Premium', 'Very Good', 'Good', 'Fair']

In [9]:
from sklearn.metrics import accuracy_score

In [10]:
model_rf = RandomForestClassifier(n_jobs = -1, random_state=42)
model_rf.fit(x_train, y_train)
print(accuracy_score(model_rf.predict(x_test), y_test))

0.7839265850945495


## 2. k-fold

In [11]:
from sklearn.model_selection import KFold

In [13]:
np.mean(diamonds["cut"]=="Premium")

0.2556729699666296

In [20]:
folds=5
kf = KFold(n_splits=folds)
n = 1
acc_list = []
for train_ind, test_ind in kf.split(diamonds):
    print(f"{n}번 째 반복")
    print("train :", diamonds.loc[train_ind,:].shape)
    print("test :", diamonds.loc[test_ind,:].shape)
    print("Premium 비율 :", np.mean(diamonds.loc[train_ind,:]["cut"]=="Premium"))
    
    model_rf = RandomForestClassifier(n_jobs = -1, random_state=42)
    model_rf.fit(diamonds.iloc[train_ind,0:7],diamonds.iloc[train_ind,7])
    acc = accuracy_score(model_rf.predict(diamonds.iloc[test_ind,0:7]), diamonds.iloc[test_ind,7])

    print("accuracy_score : ",acc)
    acc_list.append(acc)
    
    
    n += 1
    
    print("\n")

1번 째 반복
train : (43152, 8)
test : (10788, 8)
Premium 비율 : 0.2557471264367816
accuracy_score :  0.7487022617723397


2번 째 반복
train : (43152, 8)
test : (10788, 8)
Premium 비율 : 0.24541156840934372
accuracy_score :  0.7415647015202076


3번 째 반복
train : (43152, 8)
test : (10788, 8)
Premium 비율 : 0.24944382647385985
accuracy_score :  0.764460511679644


4번 째 반복
train : (43152, 8)
test : (10788, 8)
Premium 비율 : 0.2600342973674453
accuracy_score :  0.8010752688172043


5번 째 반복
train : (43152, 8)
test : (10788, 8)
Premium 비율 : 0.26772803114571747
accuracy_score :  0.7440674823878384




In [21]:
np.mean(acc_list)

0.7599740452354468

## 3. stratified k-fold

In [22]:
from sklearn.model_selection import StratifiedKFold

In [23]:
np.mean(diamonds["cut"]=="Premium")

0.2556729699666296

In [25]:
folds=5
skf = StratifiedKFold(n_splits=folds)
n = 1
acc_list = []
for train_ind, test_ind in skf.split(diamonds, diamonds["cut"]):
    print(f"{n}번 째 반복")
    print("train :", diamonds.loc[train_ind,:].shape)
    print("test :", diamonds.loc[test_ind,:].shape)
    print("Premium 비율 :", np.mean(diamonds.loc[train_ind,:]["cut"]=="Premium"))
    
    model_rf = RandomForestClassifier(n_jobs = -1, random_state=42)
    model_rf.fit(diamonds.iloc[train_ind,0:7],diamonds.iloc[train_ind,7])
    acc = accuracy_score(model_rf.predict(diamonds.iloc[test_ind,0:7]), diamonds.iloc[test_ind,7])

    print("accuracy_score : ",acc)
    acc_list.append(acc)
    
    
    n += 1
    
    print("\n")

1번 째 반복
train : (43152, 8)
test : (10788, 8)
Premium 비율 : 0.2556776047460141
accuracy_score :  0.4746014089729329


2번 째 반복
train : (43152, 8)
test : (10788, 8)
Premium 비율 : 0.25565443084909156
accuracy_score :  0.2388765294771969


3번 째 반복
train : (43152, 8)
test : (10788, 8)
Premium 비율 : 0.2556776047460141
accuracy_score :  0.23053392658509456


4번 째 반복
train : (43152, 8)
test : (10788, 8)
Premium 비율 : 0.2556776047460141
accuracy_score :  0.4033185020393029


5번 째 반복
train : (43152, 8)
test : (10788, 8)
Premium 비율 : 0.2556776047460141
accuracy_score :  0.6623099740452354




In [26]:
np.mean(acc_list)

0.40192806822395255

# 하이퍼 파라미터 튜닝

## 2. Grid Search

In [27]:
from sklearn.model_selection import GridSearchCV

In [28]:
params = { 'n_estimators' : [80, 100, 200, 300], # 4가지
           'max_depth' : [10, 14, 16, 20], # 4가지
           'min_samples_leaf' : [4, 8, 12], # 3가지
           'min_samples_split' : [5, 10, 20] # 3가지
            } # 총 4*4*3*3 = 144번

model_rf = RandomForestClassifier(n_jobs = -1, random_state=42)

grid_cv = GridSearchCV(model_rf, param_grid = params, cv = 5, n_jobs = -1) #cv : k-fold
grid_cv.fit(x_train, y_train)

print('최적 하이퍼 파라미터: ', grid_cv.best_params_)
print('최고 예측 정확도: {:.4f}'.format(grid_cv.best_score_))

최적 하이퍼 파라미터:  {'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 200}
최고 예측 정확도: 0.7790


## 4. Bayesian Optimization

In [29]:
from bayes_opt import BayesianOptimization

In [30]:
rf_parameter_bounds = { 'n_estimators' : (80, 300),
           'max_depth' :  (10, 20),
           'min_samples_leaf' : (4, 12),
           'min_samples_split' :  (5, 20)
            }

def rf_bo(n_estimators, max_depth, min_samples_leaf, min_samples_split):
    bo_params = {
        'n_estimators' : int(round(n_estimators)),
        'max_depth' : int(round(max_depth)),
        'min_samples_leaf' : int(round(min_samples_leaf)),
        'min_samples_split' : int(round(min_samples_split)),
    }
    
    rf_train_BO = RandomForestClassifier(**bo_params)

    rf_train_BO.fit(x_train,y_train)
    score = accuracy_score(rf_train_BO.predict(x_test),y_test)
    return score

In [31]:
BO_rf = BayesianOptimization(f = rf_bo, pbounds = rf_parameter_bounds,random_state = 0)

BO_rf.maximize(init_points = 5, n_iter = 10)

|   iter    |  target   | max_depth | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.7714  [0m | [0m 15.49   [0m | [0m 9.722   [0m | [0m 14.04   [0m | [0m 199.9   [0m |
| [0m 2       [0m | [0m 0.7689  [0m | [0m 14.24   [0m | [0m 9.167   [0m | [0m 11.56   [0m | [0m 276.2   [0m |
| [95m 3       [0m | [95m 0.7777  [0m | [95m 19.64   [0m | [95m 7.068   [0m | [95m 16.88   [0m | [95m 196.4   [0m |
| [0m 4       [0m | [0m 0.7718  [0m | [0m 15.68   [0m | [0m 11.4    [0m | [0m 6.066   [0m | [0m 99.17   [0m |
| [0m 5       [0m | [0m 0.7544  [0m | [0m 10.2    [0m | [0m 10.66   [0m | [0m 16.67   [0m | [0m 271.4   [0m |
| [0m 6       [0m | [0m 0.7761  [0m | [0m 18.09   [0m | [0m 9.439   [0m | [0m 16.85   [0m | [0m 192.0   [0m |
| [95m 7       [0m | [95m 0.778   [0m | [95m 17.14   [0m | [95m 4.063   [0m | [95m 9.339   [0m | [95m 1

In [32]:
max_params = BO_rf.max['params']
max_params

{'max_depth': 19.846814545955667,
 'min_samples_leaf': 4.328079084165816,
 'min_samples_split': 5.179687345178966,
 'n_estimators': 290.50205763363783}

In [33]:
max_params['max_depth'] = int(max_params['max_depth'])
max_params['min_samples_leaf'] = int(max_params['min_samples_leaf'])
max_params['min_samples_split'] = int(max_params['min_samples_split'])
max_params['n_estimators'] = int(max_params['n_estimators'])
print(max_params)

{'max_depth': 19, 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 290}


In [34]:
model_BO_rf_tuend = RandomForestClassifier(**max_params)