In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pylab as plt
%matplotlib inline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [2]:
from sklearn.datasets import make_classification
# X为样本特征，y为样本类别输出， 共10000个样本，每个样本20个特征，输出有2个类别，没有冗余特征，每个类别一个簇
X, y = make_classification(n_samples=10000, n_features=20, n_redundant=0,
                           n_clusters_per_class=1, n_classes=2, flip_y=0.1)

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(7500, 20)
(7500,)
(2500, 20)
(2500,)


In [4]:
xgbClf = xgb.XGBClassifier(max_depth=5,learning_rate= 0.5, verbosity=1, eval_metric='error', 
                           use_label_encoder=False, objective='binary:logistic',random_state=1)
xgbClf.fit(X_train, y_train, early_stopping_rounds=10, eval_set=[(X_test, y_test)])

[0]	validation_0-error:0.09680
[1]	validation_0-error:0.09360
[2]	validation_0-error:0.09120
[3]	validation_0-error:0.09240
[4]	validation_0-error:0.09160
[5]	validation_0-error:0.09360
[6]	validation_0-error:0.09480
[7]	validation_0-error:0.09520
[8]	validation_0-error:0.09520
[9]	validation_0-error:0.09520
[10]	validation_0-error:0.09440
[11]	validation_0-error:0.09520


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='error',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.5, max_delta_step=0,
              max_depth=5, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=12,
              num_parallel_tree=1, random_state=1, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              use_label_encoder=False, validate_parameters=1, verbosity=1)

<h1>使用sklearn网格搜索调参</h1>

一般固定步长，先调好框架参数n_estimators，再调弱学习器参数max_depth，min_child_weight,gamma等，接着调正则化相关参数subsample，colsample_byXXX, reg_alpha以及reg_lambda,最后固定前面调好的参数，来调步长learning_rate

In [5]:
gsCv = GridSearchCV(xgbClf, {'max_depth': [4,5,6], 'n_estimators': [5,10,20]})
gsCv.fit(X_train,y_train)
print(gsCv.best_score_)
print(gsCv.best_params_)

0.9166666666666667
{'max_depth': 4, 'n_estimators': 5}


In [6]:
xgbClf2 = xgb.XGBClassifier(max_depth=4, n_estimators=5, verbosity=1, objective='binary:logistic', 
                            eval_metric='error', use_label_encoder=False, random_state=1)
gsCv2 = GridSearchCV(xgbClf2, {'learning_rate': [0.3,0.5,0.7]})
gsCv2.fit(X_train,y_train)
print(gsCv2.best_score_)
print(gsCv2.best_params_)

0.9176
{'learning_rate': 0.7}


In [7]:
xgbClf3 = xgb.XGBClassifier(max_depth=4, learning_rate=0.7, verbosity=1, objective='binary:logistic', 
                            eval_metric='error', n_estimators=5, use_label_encoder=False)
xgbClf3.fit(X_train, y_train, early_stopping_rounds=10, eval_set=[(X_test, y_test)])

[0]	validation_0-error:0.09200
[1]	validation_0-error:0.09120
[2]	validation_0-error:0.09200
[3]	validation_0-error:0.09040
[4]	validation_0-error:0.08960


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='error',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.7, max_delta_step=0,
              max_depth=4, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=5, n_jobs=12,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              use_label_encoder=False, validate_parameters=1, verbosity=1)

In [8]:
from sklearn.metrics import accuracy_score
pred_test_new = xgbClf3.predict(X_test)
print(accuracy_score(y_test, pred_test_new))

0.9104
