In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

%matplotlib inline

In [2]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor

In [3]:
cancer = pd.read_csv('data/cancer.csv')
boston = pd.read_csv('data/boston.csv')

In [5]:
y = cancer['type']
X = cancer.values[:, 1:].astype('float32')

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [7]:
def 모델평가(모델, X_train, X_test, y_train, y_test):
    모델.fit(X_train, y_train)
    훈련점수 = 모델.score(X_train, y_train)
    시험점수 = 모델.score(X_test, y_test)
    return 훈련점수, 시험점수

eval_model = 모델평가

In [8]:
gbrt_clf = GradientBoostingClassifier()
eval_model(gbrt_clf, X_train, X_test, y_train, y_test)

(1.0, 0.90909090909090906)

In [11]:
gbrt_clf = GradientBoostingClassifier(max_depth=1)
eval_model(gbrt_clf, X_train, X_test, y_train, y_test)

(0.99295774647887325, 0.93006993006993011)

In [12]:
gbrt_clf = GradientBoostingClassifier(learning_rate=0.01)
eval_model(gbrt_clf, X_train, X_test, y_train, y_test)

(0.99061032863849763, 0.90209790209790208)

In [13]:
gbrt_clf = GradientBoostingClassifier(learning_rate=0.001)
eval_model(gbrt_clf, X_train, X_test, y_train, y_test)

(0.64084507042253525, 0.58741258741258739)

In [14]:
gbrt_clf = GradientBoostingClassifier(learning_rate=1.)
eval_model(gbrt_clf, X_train, X_test, y_train, y_test)

(1.0, 0.93006993006993011)

In [15]:
gbrt_clf = GradientBoostingClassifier(learning_rate=10.)
eval_model(gbrt_clf, X_train, X_test, y_train, y_test)

(0.91314553990610325, 0.87412587412587417)

그리드 탐색을 활용한 모델 튜닝

In [16]:
from sklearn.model_selection import GridSearchCV

In [17]:
param_grid = 변수_조합 = {
    'max_depth': [1, 2, 3, 4, 5],
    'learning_rate': [0.01, 0.1, 1., 10.]
}

In [18]:
grid_search = GridSearchCV(
    estimator=GradientBoostingClassifier(),
    param_grid=param_grid
)

In [20]:
grid_search.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [1, 2, 3, 4, 5], 'learning_rate': [0.01, 0.1, 1.0, 10.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [21]:
grid_search.best_params_

{'learning_rate': 0.01, 'max_depth': 3}

In [22]:
grid_search.best_score_

0.960093896713615

In [24]:
best_model = grid_search.best_estimator_

In [25]:
eval_model(best_model, X_train, X_test, y_train, y_test)

(0.99061032863849763, 0.90209790209790208)

## 교차 검증

In [26]:
from sklearn.model_selection import cross_val_score

In [27]:
iris = pd.read_csv('data/iris.data', header=None)

In [30]:
y = iris[4]
X = iris.values[:, :4].astype('float32')

In [28]:
gbrt_clf = GradientBoostingClassifier()

In [33]:
scores = cross_val_score(estimator=gbrt_clf, X=X, y=y)
scores

array([ 0.98039216,  0.92156863,  1.        ])