In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pylab as plt
%matplotlib inline

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [2]:
from sklearn.datasets.samples_generator import make_classification
# X为样本特征，y为样本类别输出， 共10000个样本，每个样本20个特征，输出有2个类别，没有冗余特征，每个类别一个簇
X, y = make_classification(n_samples=10000, n_features=20, n_redundant=0,
                             n_clusters_per_class=1, n_classes=2, flip_y=0.1)

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print (X_train.shape)
print (y_train.shape)
print (X_test.shape)
print (y_test.shape)

(7500, 20)
(7500,)
(2500, 20)
(2500,)


<h1>XGBoost 使用原生API</h1>

In [4]:
dtrain = xgb.DMatrix(X_train,y_train)
dtest = xgb.DMatrix(X_test,y_test)
param = {'max_depth':5, 'eta':0.5, 'verbosity':1, 'objective':'binary:logistic'}
raw_model = xgb.train(param, dtrain, num_boost_round=20)

In [5]:
from sklearn.metrics import accuracy_score
pred_train_raw = raw_model.predict(dtrain)
for i in range(len(pred_train_raw)):
    if pred_train_raw[i] > 0.5:
         pred_train_raw[i]=1
    else:
        pred_train_raw[i]=0               
print (accuracy_score(dtrain.get_label(), pred_train_raw))

0.9664


In [6]:
pred_test_raw = raw_model.predict(dtest)
for i in range(len(pred_test_raw)):
    if pred_test_raw[i] > 0.5:
         pred_test_raw[i]=1
    else:
        pred_test_raw[i]=0               
print (accuracy_score(dtest.get_label(), pred_test_raw))

0.9408


<h1>XGBoost 使用sklearn wrapper，仍然使用原始API的参数</h1>

In [7]:
sklearn_model_raw = xgb.XGBClassifier(**param)
sklearn_model_raw.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="error",
        eval_set=[(X_test, y_test)])

[0]	validation_0-error:0.0636
Will train until validation_0-error hasn't improved in 10 rounds.
[1]	validation_0-error:0.062
[2]	validation_0-error:0.0624
[3]	validation_0-error:0.062
[4]	validation_0-error:0.062
[5]	validation_0-error:0.062
[6]	validation_0-error:0.062
[7]	validation_0-error:0.062
[8]	validation_0-error:0.062
[9]	validation_0-error:0.0608
[10]	validation_0-error:0.0608
[11]	validation_0-error:0.0608
[12]	validation_0-error:0.0608
[13]	validation_0-error:0.0604
[14]	validation_0-error:0.0604
[15]	validation_0-error:0.0604
[16]	validation_0-error:0.0604
[17]	validation_0-error:0.0604
[18]	validation_0-error:0.0608
[19]	validation_0-error:0.0608
[20]	validation_0-error:0.06
[21]	validation_0-error:0.06
[22]	validation_0-error:0.06
[23]	validation_0-error:0.0592
[24]	validation_0-error:0.0588
[25]	validation_0-error:0.0588
[26]	validation_0-error:0.0584
[27]	validation_0-error:0.0584
[28]	validation_0-error:0.0584
[29]	validation_0-error:0.0584
[30]	validation_0-error:0.0

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, eta=0.5, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=1, verbosity=1)

<h1>XGBoost 使用sklearn wrapper，使用sklearn风格的参数(推荐)</h1>

In [8]:
sklearn_model_new = xgb.XGBClassifier(max_depth=5,learning_rate= 0.5, verbosity=1, objective='binary:logistic',random_state=1)

In [9]:
sklearn_model_new.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="error",
        eval_set=[(X_test, y_test)])

[0]	validation_0-error:0.0636
Will train until validation_0-error hasn't improved in 10 rounds.
[1]	validation_0-error:0.0624
[2]	validation_0-error:0.0604
[3]	validation_0-error:0.0592
[4]	validation_0-error:0.0592
[5]	validation_0-error:0.0584
[6]	validation_0-error:0.058
[7]	validation_0-error:0.0588
[8]	validation_0-error:0.0588
[9]	validation_0-error:0.0588
[10]	validation_0-error:0.0588
[11]	validation_0-error:0.0588
[12]	validation_0-error:0.0588
[13]	validation_0-error:0.058
[14]	validation_0-error:0.0584
[15]	validation_0-error:0.0584
[16]	validation_0-error:0.0584
Stopping. Best iteration:
[6]	validation_0-error:0.058



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.5,
       max_delta_step=0, max_depth=5, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=1, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

<h1>使用sklearn网格搜索调参</h1>

一般固定步长，先调好框架参数n_estimators，再调弱学习器参数max_depth，min_child_weight,gamma等，接着调正则化相关参数subsample，colsample_byXXX, reg_alpha以及reg_lambda,最后固定前面调好的参数，来调步长learning_rate

In [10]:
gsCv = GridSearchCV(sklearn_model_new,
                   {'max_depth': [4,5,6],
                    'n_estimators': [5,10,20]})
gsCv.fit(X_train,y_train)





GridSearchCV(cv=None, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.5,
       max_delta_step=0, max_depth=5, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=1, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [4, 5, 6], 'n_estimators': [5, 10, 20]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [11]:
print(gsCv.best_score_)
print(gsCv.best_params_)

0.9533333333333334
{'max_depth': 4, 'n_estimators': 10}


In [12]:
sklearn_model_new2 = xgb.XGBClassifier(max_depth=4,n_estimators=10,verbosity=1, objective='binary:logistic',random_state=1)
gsCv2 = GridSearchCV(sklearn_model_new2, 
                   {'learning_rate ': [0.3,0.5,0.7]})
gsCv2.fit(X_train,y_train)



GridSearchCV(cv=None, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=4, min_child_weight=1, missing=None,
       n_estimators=10, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=1, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'learning_rate ': [0.3, 0.5, 0.7]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [13]:
print(gsCv2.best_score_)
print(gsCv2.best_params_)

0.9516
{'learning_rate ': 0.3}


In [14]:
sklearn_model_new2 = xgb.XGBClassifier(max_depth=4,learning_rate= 0.3, verbosity=1, objective='binary:logistic',n_estimators=10)
sklearn_model_new2.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="error",
        eval_set=[(X_test, y_test)])

[0]	validation_0-error:0.062
Will train until validation_0-error hasn't improved in 10 rounds.
[1]	validation_0-error:0.0592
[2]	validation_0-error:0.0608
[3]	validation_0-error:0.0608
[4]	validation_0-error:0.0608
[5]	validation_0-error:0.0604
[6]	validation_0-error:0.0592
[7]	validation_0-error:0.0588
[8]	validation_0-error:0.0588
[9]	validation_0-error:0.0588


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.3,
       max_delta_step=0, max_depth=4, min_child_weight=1, missing=None,
       n_estimators=10, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

In [15]:
pred_test_new = sklearn_model_new2.predict(X_test)
print (accuracy_score(dtest.get_label(), pred_test_new))

0.9412


