In [1]:
import pandas
import numpy as np
import xgboost as xgb
from sklearn import cross_validation
from sklearn import metrics
from sklearn import grid_search
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import StratifiedKFold, train_test_split, KFold
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pandas.read_csv(url, names=names)
array = dataframe.values



In [2]:
X = array[:,:-1]
y = array[:, -1]

In [3]:
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=.4, random_state=2016)

In [7]:
rfc = RandomForestClassifier()
etc = ExtraTreesClassifier()
param_grid = {'n_estimators': [300, 500], 
              'max_features': [8],
              'criterion'   : ["gini", "entropy"]
             }
model = grid_search.GridSearchCV(estimator=etc, param_grid=param_grid, n_jobs=4, cv=10, verbose=20, scoring="roc_auc")
model.fit(train_X, train_y)

Fitting 10 folds for each of 4 candidates, totalling 40 fits
[CV] max_features=8, n_estimators=300, criterion=gini ................
[CV] max_features=8, n_estimators=300, criterion=gini ................
[CV] max_features=8, n_estimators=300, criterion=gini ................
[CV] max_features=8, n_estimators=300, criterion=gini ................
[CV]  max_features=8, n_estimators=300, criterion=gini, score=0.896078 -   2.2s
[CV]  max_features=8, n_estimators=300, criterion=gini, score=0.796078 -   2.2s
[CV] max_features=8, n_estimators=300, criterion=gini ................
[CV] max_features=8, n_estimators=300, criterion=gini ................
[CV]  max_features=8, n_estimators=300, criterion=gini, score=0.851927 -   2.3s
[CV] max_features=8, n_estimators=300, criterion=gini ................


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    2.2s
[Parallel(n_jobs=4)]: Done   2 tasks      | elapsed:    2.3s
[Parallel(n_jobs=4)]: Done   3 tasks      | elapsed:    2.4s


[CV]  max_features=8, n_estimators=300, criterion=gini, score=0.863083 -   2.4s
[CV] max_features=8, n_estimators=300, criterion=gini ................


[Parallel(n_jobs=4)]: Done   4 tasks      | elapsed:    2.5s


[CV]  max_features=8, n_estimators=300, criterion=gini, score=0.906694 -   2.3s
[CV] max_features=8, n_estimators=300, criterion=gini ................
[CV]  max_features=8, n_estimators=300, criterion=gini, score=0.651116 -   2.2s
[CV] max_features=8, n_estimators=300, criterion=gini ................
[CV]  max_features=8, n_estimators=300, criterion=gini, score=0.840771 -   2.4s
[CV]  max_features=8, n_estimators=300, criterion=gini, score=0.774848 -   2.1s
[CV] max_features=8, n_estimators=500, criterion=gini ................
[CV] max_features=8, n_estimators=500, criterion=gini ................


[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    4.5s
[Parallel(n_jobs=4)]: Done   6 tasks      | elapsed:    4.6s
[Parallel(n_jobs=4)]: Done   7 tasks      | elapsed:    4.7s
[Parallel(n_jobs=4)]: Done   8 tasks      | elapsed:    4.7s


[CV]  max_features=8, n_estimators=300, criterion=gini, score=0.767241 -   3.3s
[CV] max_features=8, n_estimators=500, criterion=gini ................
[CV]  max_features=8, n_estimators=300, criterion=gini, score=0.937500 -   3.4s
[CV] max_features=8, n_estimators=500, criterion=gini ................


[Parallel(n_jobs=4)]: Done   9 tasks      | elapsed:    7.9s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    8.0s


[CV]  max_features=8, n_estimators=500, criterion=gini, score=0.784314 -   5.5s
[CV] max_features=8, n_estimators=500, criterion=gini ................
[CV]  max_features=8, n_estimators=500, criterion=gini, score=0.901961 -   5.6s
[CV] max_features=8, n_estimators=500, criterion=gini ................


[Parallel(n_jobs=4)]: Done  11 tasks      | elapsed:   10.2s
[Parallel(n_jobs=4)]: Done  12 tasks      | elapsed:   10.3s


[CV]  max_features=8, n_estimators=500, criterion=gini, score=0.866126 -   4.5s
[CV] max_features=8, n_estimators=500, criterion=gini ................
[CV]  max_features=8, n_estimators=500, criterion=gini, score=0.870183 -   4.5s
[CV] max_features=8, n_estimators=500, criterion=gini ................


[Parallel(n_jobs=4)]: Done  13 tasks      | elapsed:   12.4s
[Parallel(n_jobs=4)]: Done  14 tasks      | elapsed:   12.5s


[CV]  max_features=8, n_estimators=500, criterion=gini, score=0.821501 -   3.7s
[CV] max_features=8, n_estimators=500, criterion=gini ................
[CV]  max_features=8, n_estimators=500, criterion=gini, score=0.904665 -   3.7s
[CV] max_features=8, n_estimators=500, criterion=gini ................


[Parallel(n_jobs=4)]: Done  15 tasks      | elapsed:   13.9s
[Parallel(n_jobs=4)]: Done  16 tasks      | elapsed:   14.0s


[CV]  max_features=8, n_estimators=500, criterion=gini, score=0.639959 -   3.7s
[CV] max_features=8, n_estimators=300, criterion=entropy .............
[CV]  max_features=8, n_estimators=500, criterion=gini, score=0.758621 -   3.7s
[CV] max_features=8, n_estimators=300, criterion=entropy .............


[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:   16.1s
[Parallel(n_jobs=4)]: Done  18 tasks      | elapsed:   16.2s


[CV]  max_features=8, n_estimators=500, criterion=gini, score=0.739224 -   4.7s
[CV] max_features=8, n_estimators=300, criterion=entropy .............
[CV]  max_features=8, n_estimators=500, criterion=gini, score=0.941810 -   4.7s
[CV] max_features=8, n_estimators=300, criterion=entropy .............


[Parallel(n_jobs=4)]: Done  19 tasks      | elapsed:   18.6s
[Parallel(n_jobs=4)]: Done  20 tasks      | elapsed:   18.8s


[CV]  max_features=8, n_estimators=300, criterion=entropy, score=0.801961 -   3.4s
[CV] max_features=8, n_estimators=300, criterion=entropy .............
[CV]  max_features=8, n_estimators=300, criterion=entropy, score=0.894118 -   3.4s
[CV] max_features=8, n_estimators=300, criterion=entropy .............


[Parallel(n_jobs=4)]: Done  21 tasks      | elapsed:   19.5s
[Parallel(n_jobs=4)]: Done  22 tasks      | elapsed:   19.7s


[CV]  max_features=8, n_estimators=300, criterion=entropy, score=0.867140 -   4.0s
[CV] max_features=8, n_estimators=300, criterion=entropy .............
[CV]  max_features=8, n_estimators=300, criterion=entropy, score=0.884381 -   3.8s
[CV] max_features=8, n_estimators=300, criterion=entropy .............


[Parallel(n_jobs=4)]: Done  23 tasks      | elapsed:   22.6s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:   22.6s


[CV]  max_features=8, n_estimators=300, criterion=entropy, score=0.825558 -   3.8s
[CV] max_features=8, n_estimators=300, criterion=entropy .............


[Parallel(n_jobs=4)]: Done  25 tasks      | elapsed:   23.4s


[CV]  max_features=8, n_estimators=300, criterion=entropy, score=0.906694 -   4.0s
[CV] max_features=8, n_estimators=300, criterion=entropy .............


[Parallel(n_jobs=4)]: Done  26 tasks      | elapsed:   23.6s


[CV]  max_features=8, n_estimators=300, criterion=entropy, score=0.645030 -   2.6s
[CV] max_features=8, n_estimators=500, criterion=entropy .............
[CV]  max_features=8, n_estimators=300, criterion=entropy, score=0.764706 -   2.6s
[CV] max_features=8, n_estimators=500, criterion=entropy .............


[Parallel(n_jobs=4)]: Done  27 tasks      | elapsed:   25.2s
[Parallel(n_jobs=4)]: Done  28 tasks      | elapsed:   25.3s


[CV]  max_features=8, n_estimators=300, criterion=entropy, score=0.742457 -   2.5s
[CV] max_features=8, n_estimators=500, criterion=entropy .............


[Parallel(n_jobs=4)]: Done  29 tasks      | elapsed:   25.9s


[CV]  max_features=8, n_estimators=300, criterion=entropy, score=0.936422 -   2.8s
[CV] max_features=8, n_estimators=500, criterion=entropy .............


[Parallel(n_jobs=4)]: Done  30 tasks      | elapsed:   26.4s


[CV]  max_features=8, n_estimators=500, criterion=entropy, score=0.776471 -   4.6s
[CV] max_features=8, n_estimators=500, criterion=entropy .............
[CV]  max_features=8, n_estimators=500, criterion=entropy, score=0.900000 -   4.6s
[CV] max_features=8, n_estimators=500, criterion=entropy .............


[Parallel(n_jobs=4)]: Done  31 tasks      | elapsed:   29.8s
[Parallel(n_jobs=4)]: Done  32 tasks      | elapsed:   29.9s


[CV]  max_features=8, n_estimators=500, criterion=entropy, score=0.856998 -   4.6s
[CV] max_features=8, n_estimators=500, criterion=entropy .............


[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:   30.5s


[CV]  max_features=8, n_estimators=500, criterion=entropy, score=0.872211 -   4.6s
[CV] max_features=8, n_estimators=500, criterion=entropy .............
[CV]  max_features=8, n_estimators=500, criterion=entropy, score=0.825558 -   5.8s
[CV] max_features=8, n_estimators=500, criterion=entropy .............
[CV]  max_features=8, n_estimators=500, criterion=entropy, score=0.910751 -   5.7s
[CV] max_features=8, n_estimators=500, criterion=entropy .............


[Parallel(n_jobs=4)]: Done  36 out of  40 | elapsed:   35.6s remaining:    4.0s


[CV]  max_features=8, n_estimators=500, criterion=entropy, score=0.632860 -   6.1s
[CV]  max_features=8, n_estimators=500, criterion=entropy, score=0.748479 -   6.2s
[CV]  max_features=8, n_estimators=500, criterion=entropy, score=0.759698 -   3.6s
[CV]  max_features=8, n_estimators=500, criterion=entropy, score=0.941810 -   3.6s


[Parallel(n_jobs=4)]: Done  40 out of  40 | elapsed:   39.2s finished


GridSearchCV(cv=10, error_score='raise',
       estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'n_estimators': [300, 500], 'max_features': [8], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=20)

In [8]:
print model.best_params_
print model.best_score_

{'max_features': 8, 'n_estimators': 300, 'criterion': 'gini'}
0.828506279948


In [4]:
xgb_model = xgb.XGBClassifier()

parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['binary:logistic'],
              'learning_rate': [0.3], #so called `eta` value
              'max_depth': [5,6],
              'min_child_weight': [3],
              'silent': [1],
              'subsample': [0.7],
              'colsample_bytree': [0.7],
              'n_estimators': [50,100], #number of trees
              'seed': [1337]}

clf = grid_search.GridSearchCV(xgb_model, parameters, n_jobs=4, 
                   cv=10 ,
                   verbose=20, refit=True,scoring='roc_auc')

clf.fit(train_X, train_y)

y_test_predict = clf.predict(test_X)
actuals = test_y
score = metrics.accuracy_score(test_y, y_test_predict)
print score

Fitting 10 folds for each of 4 candidates, totalling 40 fits
[CV] colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_child_weight=3, n_estimators=50, subsample=0.7, seed=1337, objective=binary:logistic, max_depth=5 
[CV] colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_child_weight=3, n_estimators=50, subsample=0.7, seed=1337, objective=binary:logistic, max_depth=5 
[CV] colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_child_weight=3, n_estimators=50, subsample=0.7, seed=1337, objective=binary:logistic, max_depth=5 
[CV] colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_child_weight=3, n_estimators=50, subsample=0.7, seed=1337, objective=binary:logistic, max_depth=5 
[CV]  colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_child_weight=3, n_estimators=50, subsample=0.7, seed=1337, objective=binary:logistic, max_depth=5, score=0.858824 -   0.1s
[CV] colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=

[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Batch computation too fast (0.1497s.) Setting batch_size=2.
[Parallel(n_jobs=4)]: Done   2 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done   3 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done   4 tasks      | elapsed:    0.3s


[CV]  colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_child_weight=3, n_estimators=50, subsample=0.7, seed=1337, objective=binary:logistic, max_depth=5, score=0.801217 -   0.2s
[CV]  colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_child_weight=3, n_estimators=50, subsample=0.7, seed=1337, objective=binary:logistic, max_depth=5, score=0.728195 -   0.1s
[CV] colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_child_weight=3, n_estimators=100, subsample=0.7, seed=1337, objective=binary:logistic, max_depth=5 
[CV]  colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_child_weight=3, n_estimators=50, subsample=0.7, seed=1337, objective=binary:logistic, max_depth=5, score=0.868154 -   0.1s
[CV] colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_child_weight=3, n_estimators=50, subsample=0.7, seed=1337, objective=binary:logistic, max_depth=5 
[CV] colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_chi

[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done   6 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done   7 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done   8 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    0.6s


[CV] colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_child_weight=3, n_estimators=100, subsample=0.7, seed=1337, objective=binary:logistic, max_depth=5 
[CV] colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_child_weight=3, n_estimators=100, subsample=0.7, seed=1337, objective=binary:logistic, max_depth=5 
[CV]  colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_child_weight=3, n_estimators=100, subsample=0.7, seed=1337, objective=binary:logistic, max_depth=5, score=0.725490 -   0.2s
[CV]  colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_child_weight=3, n_estimators=100, subsample=0.7, seed=1337, objective=binary:logistic, max_depth=5, score=0.799189 -   0.2s
[CV] colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_child_weight=3, n_estimators=100, subsample=0.7, seed=1337, objective=binary:logistic, max_depth=5 
[CV] colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_child_weight=3, n_estima

[Parallel(n_jobs=4)]: Done  12 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done  14 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done  16 tasks      | elapsed:    0.8s


[CV]  colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_child_weight=3, n_estimators=50, subsample=0.7, seed=1337, objective=binary:logistic, max_depth=6, score=0.745098 -   0.2s
[CV] colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_child_weight=3, n_estimators=50, subsample=0.7, seed=1337, objective=binary:logistic, max_depth=6 
[CV]  colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_child_weight=3, n_estimators=50, subsample=0.7, seed=1337, objective=binary:logistic, max_depth=6, score=0.851927 -   0.3s
[CV] colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_child_weight=3, n_estimators=50, subsample=0.7, seed=1337, objective=binary:logistic, max_depth=6 
[CV]  colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_child_weight=3, n_estimators=100, subsample=0.7, seed=1337, objective=binary:logistic, max_depth=5, score=0.762931 -   0.4s
[CV] colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_chi

[Parallel(n_jobs=4)]: Done  18 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done  20 tasks      | elapsed:    1.3s
[Parallel(n_jobs=4)]: Done  22 tasks      | elapsed:    1.3s


[CV]  colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_child_weight=3, n_estimators=50, subsample=0.7, seed=1337, objective=binary:logistic, max_depth=6, score=0.771552 -   0.2s
[CV] colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_child_weight=3, n_estimators=50, subsample=0.7, seed=1337, objective=binary:logistic, max_depth=6 
[CV]  colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_child_weight=3, n_estimators=50, subsample=0.7, seed=1337, objective=binary:logistic, max_depth=6, score=0.874239 -   0.1s
[CV] colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_child_weight=3, n_estimators=100, subsample=0.7, seed=1337, objective=binary:logistic, max_depth=6 
[CV]  colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_child_weight=3, n_estimators=50, subsample=0.7, seed=1337, objective=binary:logistic, max_depth=6, score=0.709939 -   0.3s
[CV] colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_chi

[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    1.5s
[Parallel(n_jobs=4)]: Done  26 tasks      | elapsed:    1.7s


[CV]  colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_child_weight=3, n_estimators=50, subsample=0.7, seed=1337, objective=binary:logistic, max_depth=6, score=0.890086 -   0.2s
[CV] colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_child_weight=3, n_estimators=100, subsample=0.7, seed=1337, objective=binary:logistic, max_depth=6 
[CV]  colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_child_weight=3, n_estimators=50, subsample=0.7, seed=1337, objective=binary:logistic, max_depth=6, score=0.669371 -   0.1s
[CV] colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_child_weight=3, n_estimators=100, subsample=0.7, seed=1337, objective=binary:logistic, max_depth=6 
[CV]  colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_child_weight=3, n_estimators=100, subsample=0.7, seed=1337, objective=binary:logistic, max_depth=6, score=0.709804 -   0.4s


[Parallel(n_jobs=4)]: Done  30 out of  40 | elapsed:    1.8s remaining:    0.6s


[CV] colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_child_weight=3, n_estimators=100, subsample=0.7, seed=1337, objective=binary:logistic, max_depth=6 
[CV]  colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_child_weight=3, n_estimators=100, subsample=0.7, seed=1337, objective=binary:logistic, max_depth=6, score=0.860041 -   0.3s
[CV] colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_child_weight=3, n_estimators=100, subsample=0.7, seed=1337, objective=binary:logistic, max_depth=6 
[CV]  colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_child_weight=3, n_estimators=100, subsample=0.7, seed=1337, objective=binary:logistic, max_depth=6, score=0.789047 -   0.4s
[CV] colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_child_weight=3, n_estimators=100, subsample=0.7, seed=1337, objective=binary:logistic, max_depth=6 
[CV]  colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_child_weight=3, n_estim

[Parallel(n_jobs=4)]: Done  36 out of  40 | elapsed:    2.4s remaining:    0.3s


[CV]  colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_child_weight=3, n_estimators=100, subsample=0.7, seed=1337, objective=binary:logistic, max_depth=6, score=0.887931 -   0.1s
0.74025974026


[Parallel(n_jobs=4)]: Done  40 out of  40 | elapsed:    2.6s finished
