scikit-learn 梯度提升树(GBDT)调参小结

In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
import matplotlib.pylab as plt
%matplotlib inline

In [3]:
# Load the prepared training table; `target` names the binary label column
# and `IDcol` the row-identifier column.
target = 'Disbursed'
IDcol = 'ID'
train = pd.read_csv('dataset/train_modified.csv')
# Last expression of the cell: show how many rows fall in each label class.
train[target].value_counts()

0    19680
1      320
Name: Disbursed, dtype: int64

In [4]:
# Every column except the label and the row identifier is used as a feature;
# the original column order is kept.
x_columns = [col for col in train.columns if col not in (target, IDcol)]
X = train[x_columns]
y = train[target]

In [5]:
# Baseline: gradient boosting with library defaults (only the seed is fixed),
# fitted and scored on the same training data.
# NOTE(review): the class counts shown after loading (19680 vs 320) mean that
# always predicting the majority class already gives 0.984 accuracy, so the
# AUC line is the more informative of the two numbers printed below.
gbm0 = GradientBoostingClassifier(random_state=10).fit(X, y)

y_pred = gbm0.predict(X)
# Second column of predict_proba = probability of the second entry of
# gbm0.classes_ (presumably label 1 for this 0/1 target).
y_predprob = gbm0.predict_proba(X)[:, 1]

train_accuracy = metrics.accuracy_score(y.values, y_pred)
print("Accuracy : %.4g" % train_accuracy)
train_auc = metrics.roc_auc_score(y, y_predprob)
print("AUC Score (Train): %f" % train_auc)

Accuracy : 0.9852
AUC Score (Train): 0.900531


In [6]:
# Search 1: with the learning rate fixed at 0.1, pick the number of boosting
# stages. All other tree settings are held at the starting values below.
param_test1 = {'n_estimators': range(20, 81, 10)}  # 20, 30, ..., 80
gsearch1 = GridSearchCV(
    estimator=GradientBoostingClassifier(
        learning_rate=0.1,
        min_samples_split=300,
        min_samples_leaf=20,
        max_depth=8,
        max_features='sqrt',
        subsample=0.8,
        random_state=10,
    ),
    param_grid=param_test1,
    scoring='roc_auc',  # candidates are compared on cross-validated ROC AUC
    cv=5,
)
gsearch1.fit(X, y)
# Last expression of the cell: the selected setting and its CV score.
gsearch1.best_params_, gsearch1.best_score_

({'n_estimators': 40}, 0.8132610041920731)

In [7]:
# Search 2: jointly tune tree depth and the minimum node size required for a
# split, using n_estimators=40 from the first search.
param_test2 = {
    'max_depth': range(3, 14, 2),               # 3, 5, ..., 13
    'min_samples_split': range(100, 801, 200),  # 100, 300, 500, 700
}
gsearch2 = GridSearchCV(
    estimator=GradientBoostingClassifier(
        learning_rate=0.1,
        n_estimators=40,
        min_samples_leaf=20,
        max_features='sqrt',
        subsample=0.8,
        random_state=10,
    ),
    param_grid=param_test2,
    scoring='roc_auc',
    cv=5,
)
gsearch2.fit(X, y)
# Last expression of the cell: the selected setting and its CV score.
gsearch2.best_params_, gsearch2.best_score_

({'max_depth': 7, 'min_samples_split': 100}, 0.8224736407520326)

In [8]:
# Search 3: jointly tune min_samples_split and min_samples_leaf with
# n_estimators=40 and max_depth=7 held fixed.
# NOTE(review): the previous search selected min_samples_split=100, the
# smallest value it tried, but this grid starts at 800, so the region around
# that optimum is not explored here. Consider extending the range downwards
# and re-checking the downstream hard-coded values if the winner changes.
param_test3 = {
    'min_samples_split': range(800, 1900, 200),  # 800, 1000, ..., 1800
    'min_samples_leaf': range(60, 101, 10),      # 60, 70, ..., 100
}
# min_samples_leaf is intentionally not set on the base estimator: every
# candidate (and the refitted best estimator) takes its value from
# param_test3, so a fixed value here would never be used.
gsearch3 = GridSearchCV(
    estimator=GradientBoostingClassifier(
        learning_rate=0.1,
        n_estimators=40,
        max_depth=7,
        max_features='sqrt',
        subsample=0.8,
        random_state=10,
    ),
    param_grid=param_test3,
    scoring='roc_auc',
    cv=5,
)
gsearch3.fit(X, y)
# Last expression of the cell: the selected setting and its CV score.
gsearch3.best_params_, gsearch3.best_score_

({'min_samples_leaf': 100, 'min_samples_split': 800}, 0.8191191882621951)

In [9]:
# Refit on the full training data with the values chosen so far
# (n_estimators=40, max_depth=7, min_samples_leaf=100, min_samples_split=800)
# and report training-set accuracy and AUC.
gbm1 = GradientBoostingClassifier(
    learning_rate=0.1,
    n_estimators=40,
    max_depth=7,
    min_samples_leaf=100,
    min_samples_split=800,
    max_features='sqrt',
    subsample=0.8,
    random_state=10,
).fit(X, y)

y_pred = gbm1.predict(X)
y_predprob = gbm1.predict_proba(X)[:, 1]  # second column of the probability matrix

train_accuracy = metrics.accuracy_score(y.values, y_pred)
print("Accuracy : %.4g" % train_accuracy)
train_auc = metrics.roc_auc_score(y, y_predprob)
print("AUC Score (Train): %f" % train_auc)

Accuracy : 0.984
AUC Score (Train): 0.893186


In [10]:
# Search 4: tune how many features are considered at each split, now given as
# an absolute count instead of 'sqrt'; the other settings stay as tuned above.
param_test4 = {'max_features': range(7, 20, 2)}  # 7, 9, ..., 19
gsearch4 = GridSearchCV(
    estimator=GradientBoostingClassifier(
        learning_rate=0.1,
        n_estimators=40,
        max_depth=7,
        min_samples_leaf=100,
        min_samples_split=800,
        subsample=0.8,
        random_state=10,
    ),
    param_grid=param_test4,
    scoring='roc_auc',
    cv=5,
)
gsearch4.fit(X, y)
# Last expression of the cell: the selected setting and its CV score.
gsearch4.best_params_, gsearch4.best_score_

({'max_features': 7}, 0.8191191882621951)

In [11]:
# Search 5: tune the fraction of training rows sampled for each boosting
# stage, with max_features fixed at 7 from the previous search.
param_test5 = {'subsample': [0.6, 0.7, 0.75, 0.8, 0.85, 0.9]}
gsearch5 = GridSearchCV(
    estimator=GradientBoostingClassifier(
        learning_rate=0.1,
        n_estimators=40,
        max_depth=7,
        min_samples_leaf=100,
        min_samples_split=800,
        max_features=7,
        random_state=10,
    ),
    param_grid=param_test5,
    scoring='roc_auc',
    cv=5,
)
gsearch5.fit(X, y)
# Last expression of the cell: the selected setting and its CV score.
gsearch5.best_params_, gsearch5.best_score_

({'subsample': 0.8}, 0.8191191882621951)

In [12]:
# Refit with the full set of tuned values (max_features=7 and subsample=0.8
# added from the last two searches) and report training-set accuracy and AUC.
gbm2 = GradientBoostingClassifier(
    learning_rate=0.1,
    n_estimators=40,
    max_depth=7,
    min_samples_leaf=100,
    min_samples_split=800,
    max_features=7,
    subsample=0.8,
    random_state=10,
).fit(X, y)

y_pred = gbm2.predict(X)
y_predprob = gbm2.predict_proba(X)[:, 1]  # second column of the probability matrix

train_accuracy = metrics.accuracy_score(y.values, y_pred)
print("Accuracy : %.4g" % train_accuracy)
train_auc = metrics.roc_auc_score(y, y_predprob)
print("AUC Score (Train): %f" % train_auc)

Accuracy : 0.984
AUC Score (Train): 0.893186


为了提高模型的泛化能力、防止过拟合，将步长缩小为原来的 1/5（0.1 → 0.02），同时将最大迭代次数增加到原来的 5 倍（40 → 200），继续拟合我们的模型。

In [13]:
# Same tree settings as gbm2, but with the learning rate cut from 0.1 to 0.02
# and the number of boosting stages raised from 40 to 200.
gbm3 = GradientBoostingClassifier(
    learning_rate=0.02,
    n_estimators=200,
    max_depth=7,
    min_samples_leaf=100,
    min_samples_split=800,
    max_features=7,
    subsample=0.8,
    random_state=10,
).fit(X, y)

y_pred = gbm3.predict(X)
y_predprob = gbm3.predict_proba(X)[:, 1]  # second column of the probability matrix

train_accuracy = metrics.accuracy_score(y.values, y_pred)
print("Accuracy : %.4g" % train_accuracy)
train_auc = metrics.roc_auc_score(y, y_predprob)
print("AUC Score (Train): %f" % train_auc)

Accuracy : 0.984
AUC Score (Train): 0.901773


为了进一步提高模型的泛化能力、防止过拟合，再将步长减半（0.02 → 0.01），同时将最大迭代次数加倍（200 → 400），继续拟合我们的模型。

In [14]:
# Same tree settings again, with the learning rate halved to 0.01 and the
# number of boosting stages doubled to 400.
gbm4 = GradientBoostingClassifier(
    learning_rate=0.01,
    n_estimators=400,
    max_depth=7,
    min_samples_leaf=100,
    min_samples_split=800,
    max_features=7,
    subsample=0.8,
    random_state=10,
).fit(X, y)

y_pred = gbm4.predict(X)
y_predprob = gbm4.predict_proba(X)[:, 1]  # second column of the probability matrix

train_accuracy = metrics.accuracy_score(y.values, y_pred)
print("Accuracy : %.4g" % train_accuracy)
train_auc = metrics.roc_auc_score(y, y_predprob)
print("AUC Score (Train): %f" % train_auc)

Accuracy : 0.984
AUC Score (Train): 0.902672
