In [1]:
import xgboost as xgb
from sklearn import datasets
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

In [31]:
#generate random data with 5 features
X,y = datasets.make_regression( n_samples=22000, n_features=5, noise=7, bias=2, random_state=42)
y = y**2

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42)

#Create and fit, no parameter tuning
regressor = xgb.XGBRegressor(learning_rate=0.01, n_estimators=1000)
regressor.fit(X_train,y_train)

print("Regression Score: {}".format(regressor.score(X_test, y_test)))

Regression Score: 0.7993420364006706


## Tune for max_depth and min_child_weight

In [4]:
#parameter optimization with GridSearchCV
from sklearn.model_selection import GridSearchCV


#Tune for max_depth and min_child_weight
model = xgb.XGBRegressor()
clf = GridSearchCV(model, {'max_depth': [2,4,6,8],
                    'min_child_weight': range(1,6,2)}, verbose=1)
clf.fit(X,y)
print("Best Score: {}".format(clf.best_score_))
print("Best Params: {}".format(clf.best_params_))

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:   38.5s finished


Best Score: 0.9681491043391203
Best Params: {'max_depth': 8, 'min_child_weight': 5}


In [6]:
clf = GridSearchCV(model, {'max_depth': [6,8,10,12],
                    'min_child_weight': [4,5,6]}, verbose=1)
clf.fit(X,y)
print("Best Score: {}".format(clf.best_score_))
print("Best Params: {}".format(clf.best_params_))

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:  1.2min finished


Best Score: 0.9689127196193154
Best Params: {'max_depth': 12, 'min_child_weight': 5}


In [7]:
clf = GridSearchCV(model, {'max_depth': [12,14,16],
                    'min_child_weight': [5,6]}, verbose=1)
clf.fit(X,y)
print("Best Score: {}".format(clf.best_score_))
print("Best Params: {}".format(clf.best_params_))

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:   55.3s finished


Best Score: 0.9689127196193154
Best Params: {'max_depth': 12, 'min_child_weight': 5}


## Tune for Gamma

In [10]:
#lock in the 12 and 5
model = xgb.XGBRegressor(max_depth=12, min_child_weight=5)

clf = GridSearchCV(model, {'gamma': [i/10.0 for i in range(0,5)]}, verbose=1)
clf.fit(X,y)
print("Best Score: {}".format(clf.best_score_))
print("Best Params: {}".format(clf.best_params_))

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:   41.8s finished


Best Score: 0.9689127196193154
Best Params: {'gamma': 0.0}


## Tune subsample and colsample_bytree

In [11]:
#lock in the zero from before
model = xgb.XGBRegressor(max_depth=12, min_child_weight=5, gamma=0.0 )

clf = GridSearchCV(model, {
    'subsample':[i/10.0 for i in range(6,10)],
    'colsample_bytree':[i/10.0 for i in range(6,10)]
                          }, verbose=1)
clf.fit(X,y)
print("Best Score: {}".format(clf.best_score_))
print("Best Params: {}".format(clf.best_params_))

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:  1.6min finished


Best Score: 0.9567146215777724
Best Params: {'colsample_bytree': 0.8, 'subsample': 0.6}


In [14]:
clf = GridSearchCV(model, {
    'subsample':[i/10.0 for i in range(0,11)],
    'colsample_bytree':[0.8]
                          }, verbose=1)
clf.fit(X,y)
print("Best Score: {}".format(clf.best_score_))
print("Best Params: {}".format(clf.best_params_))

Fitting 3 folds for each of 11 candidates, totalling 33 fits


[Parallel(n_jobs=1)]: Done  33 out of  33 | elapsed:   57.3s finished


Best Score: 0.9623377427906697
Best Params: {'colsample_bytree': 0.8, 'subsample': 0.2}


In [15]:
#zoom in around .2 and .8
clf = GridSearchCV(model, {
    'subsample':[i/100.0 for i in range(15,25,5)],
    'colsample_bytree':[i/100.0 for i in range(75,90,5)]
                          }, verbose=1)
clf.fit(X,y)
print("Best Score: {}".format(clf.best_score_))
print("Best Params: {}".format(clf.best_params_))

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:   23.2s finished


Best Score: 0.9623377427906697
Best Params: {'colsample_bytree': 0.8, 'subsample': 0.2}


In [20]:
clf = GridSearchCV(model, {
    'subsample':[0.2,1],
    'colsample_bytree':[0.8,1]
                          }, verbose=1)
clf.fit(X,y)
print("Best Score: {}".format(clf.best_score_))
print("Best Params: {}".format(clf.best_params_))

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:   23.1s finished


Best Score: 0.9698820217728594
Best Params: {'colsample_bytree': 1, 'subsample': 0.2}


## Tune Regularization

In [22]:
#lock in the 1 and 0.2 from before
model = xgb.XGBRegressor(max_depth=12, min_child_weight=5, gamma=0.0, colsample_bytree=1, subsample=0.2)

clf = GridSearchCV(model, {
    'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
                          }, verbose=1)
clf.fit(X,y)
print("Best Score: {}".format(clf.best_score_))
print("Best Params: {}".format(clf.best_params_))

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:   22.0s finished


Best Score: 0.970178707074787
Best Params: {'reg_alpha': 100}


## Reduce learning rate and increase estimators

In [30]:
#lock in the 100 from before
regressor = xgb.XGBRegressor(learning_rate=0.01, n_estimators=1000, max_depth=12, min_child_weight=5, gamma=0.0, colsample_bytree=1, subsample=0.2, reg_alpha=100)

regressor.fit(X_train,y_train)

print("Regression Score: {}".format(regressor.score(X_test, y_test)))

Regression Score: 0.9782885672947745
