In [1]:
# Base

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Plot settings

%matplotlib inline
# Global seaborn look for every figure in the notebook:
# 'talk' plotting context, gray color palette, and the 'ticks' axes style
# with the grid color overridden to a light gray ('0.9').
sns.set_context('talk')
sns.set_palette('gray')
sns.set_style('ticks', {'grid.color' : '0.9'})

In [3]:
# Algorithms

import xgboost as xgb

In [4]:
# Model selection

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score

In [5]:
# Training inputs: the saved feature matrix (.npy) and the 'status' column
# of the labels CSV, converted to a plain Python list.
# NOTE(review): assumes both files are row-aligned — confirm against the preprocessing step.
y_train = (
    pd.read_csv('processed/wells_labels_train.csv')['status']
    .tolist()
)
x_train = np.load('processed/wells_feature_matrix.npy')

In [19]:
# Baseline classifier for the hand-tuning pass: 500 trees, a very small
# learning rate, and a fixed seed so repeated runs are comparable.
xgb_clf = xgb.XGBClassifier(
    random_state=20130810,
    learning_rate=0.001,
    n_estimators=500,
)

In [20]:
%%time
# 5-fold cross-validated accuracy of the baseline classifier,
# with the folds evaluated on 3 parallel workers.
scores = cross_val_score(
    xgb_clf, x_train, y_train,
    cv=5,
    scoring='accuracy',
    n_jobs=3,
    verbose=3,
)

[Parallel(n_jobs=3)]: Done   2 out of   5 | elapsed:  8.3min remaining: 12.4min


Wall time: 14min 47s


[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed: 14.8min finished


We begin by looking at how performance varies as we hand-tune some of the parameters. We will then put together a parameter grid that will possibly find the best combination among all the parameters. For now, we use accuracy as the metric. At the moment, the focus is on observing the contours of accuracy as we change the hyperparameters.

In [21]:
# Mean and spread (standard deviation) of the per-fold accuracies.
np.mean(scores), np.std(scores)

(0.6965491846046559, 0.0040308046525066215)

In [18]:
# Show the classifier's full parameter set, including library defaults.
# NOTE(review): the saved output below reports learning_rate=0.1 and this cell's
# execution count (18) precedes the definition cell's (19), which sets
# learning_rate=0.001 — the printout looks stale; re-run to refresh it.
print(xgb_clf)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=500,
       n_jobs=1, nthread=None, objective='multi:softprob',
       random_state=20130810, reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=None, silent=True, subsample=1)


In [29]:
# Candidate hyperparameter values sampled by the randomized search.
pgrid = dict(
    max_depth=[10, 20],
    n_estimators=[300, 500],
)

In [32]:
# Base estimator handed to the search; max_depth and n_estimators are the
# parameters varied through the search grid, the rest stay as set here.
xgb_classif = xgb.XGBClassifier(
    silent=True,
    random_state=20130810,
    learning_rate=0.1,
)

In [33]:
# Randomized search over `pgrid`: 2 sampled candidates x 5 folds = 10 fits,
# run on 3 parallel workers with a fixed seed for the candidate sampling.
# `scoring` is spelled out so the search is ranked by the same metric
# (accuracy) as the earlier cross_val_score run; with scoring=None the search
# falls back to the estimator's own `score`, which for a classifier is also
# accuracy, so the ranking is unchanged.
xgb_cv = RandomizedSearchCV(estimator=xgb_classif,
                            param_distributions=pgrid,
                            n_iter=2,
                            scoring='accuracy',
                            cv=5,
                            n_jobs=3,
                            random_state=20130810,
                            verbose=1)

In [34]:
%%time
# Run the randomized search (10 fits in total).
# The saved run took about 2h 37min of wall time, so re-run deliberately.
xgb_cv.fit(x_train, y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed: 136.7min finished


Wall time: 2h 37min 28s


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=20130810, reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=None, silent=True, subsample=1),
          fit_params=None, iid=True, n_iter=2, n_jobs=3,
          param_distributions={'max_depth': [10, 20], 'n_estimators': [300, 500]},
          pre_dispatch='2*n_jobs', random_state=20130810, refit=True,
          return_train_score=True, scoring=None, verbose=1)

In [35]:
# Best mean cross-validated score among the sampled candidates.
xgb_cv.best_score_

0.8104545454545454

In [36]:
# Estimator configured with the best-scoring parameter combination from the search.
xgb_cv.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=10, min_child_weight=1, missing=None, n_estimators=500,
       n_jobs=1, nthread=None, objective='multi:softprob',
       random_state=20130810, reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=None, silent=True, subsample=1)

### Rerun with the final parameters

In [37]:
# Final configuration, reusing the name `xgb_clf` from the baseline above.
# max_depth=10 is the depth selected by the randomized search.
# NOTE(review): n_estimators=750 is larger than the 500 selected by the search —
# confirm this extrapolation is intentional.
# `n_jobs` replaces the deprecated `nthread` argument; the thread count (3) is unchanged.
xgb_clf = xgb.XGBClassifier(max_depth=10,
                            n_estimators=750,
                            learning_rate=0.1,
                            random_state=20130810,
                            silent=True,
                            n_jobs=3)

In [38]:
# 5-fold cross-validation of the final configuration.
# The metric is named explicitly (accuracy, the same one the baseline run used)
# and the per-fold scores are kept in a variable so they can be summarized or
# compared later instead of living only in the cell output.
# NOTE(review): the estimator is configured with 3 threads while n_jobs=-1 runs
# the folds on all cores; this may oversubscribe the CPU — consider capping n_jobs.
final_scores = cross_val_score(xgb_clf,
                               x_train, y_train,
                               scoring='accuracy',
                               cv=5,
                               n_jobs=-1,
                               verbose=1)
final_scores

[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 55.9min finished


array([0.8148304 , 0.80893864, 0.81077441, 0.80496633, 0.80686984])