In [8]:
import pandas
import numpy as np
import xgboost as xgb
from sklearn import cross_validation
from sklearn import metrics
from sklearn import grid_search
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import StratifiedKFold, train_test_split, KFold
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pandas.read_csv(url, names=names)
array = dataframe.values

In [9]:
X = array[:,:-1]
y = array[:, -1]

In [10]:
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=.4, random_state=2016)

In [16]:
rfc = RandomForestClassifier()
etc = ExtraTreesClassifier()

In [36]:
param_grid = {'n_estimators': [300, 500], 
              'max_features': [8],
              'criterion'   : ["gini", "entropy"]
             }
model = grid_search.GridSearchCV(estimator=etc, param_grid=param_grid, n_jobs=1, cv=10, verbose=20, scoring="roc_auc")
model.fit(train_X, train_y)

Fitting 10 folds for each of 4 candidates, totalling 40 fits
[CV] max_features=8, n_estimators=300, criterion=gini ................
[CV]  max_features=8, n_estimators=300, criterion=gini, score=0.782353 -   1.5s
[CV] max_features=8, n_estimators=300, criterion=gini ................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.5s remaining:    0.0s


[CV]  max_features=8, n_estimators=300, criterion=gini, score=0.899020 -   1.2s
[CV] max_features=8, n_estimators=300, criterion=gini ................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.7s remaining:    0.0s


[CV]  max_features=8, n_estimators=300, criterion=gini, score=0.851927 -   1.3s
[CV] max_features=8, n_estimators=300, criterion=gini ................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    4.1s remaining:    0.0s


[CV]  max_features=8, n_estimators=300, criterion=gini, score=0.860041 -   1.2s
[CV] max_features=8, n_estimators=300, criterion=gini ................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    5.3s remaining:    0.0s


[CV]  max_features=8, n_estimators=300, criterion=gini, score=0.838742 -   1.2s
[CV] max_features=8, n_estimators=300, criterion=gini ................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    6.5s remaining:    0.0s


[CV]  max_features=8, n_estimators=300, criterion=gini, score=0.903651 -   1.2s
[CV] max_features=8, n_estimators=300, criterion=gini ................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    7.7s remaining:    0.0s


[CV]  max_features=8, n_estimators=300, criterion=gini, score=0.643002 -   1.2s
[CV] max_features=8, n_estimators=300, criterion=gini ................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    8.9s remaining:    0.0s


[CV]  max_features=8, n_estimators=300, criterion=gini, score=0.757606 -   1.2s
[CV] max_features=8, n_estimators=300, criterion=gini ................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   10.1s remaining:    0.0s


[CV]  max_features=8, n_estimators=300, criterion=gini, score=0.757543 -   1.3s
[CV] max_features=8, n_estimators=300, criterion=gini ................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   11.4s remaining:    0.0s


[CV]  max_features=8, n_estimators=300, criterion=gini, score=0.936422 -   1.2s
[CV] max_features=8, n_estimators=500, criterion=gini ................


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   12.6s remaining:    0.0s


[CV]  max_features=8, n_estimators=500, criterion=gini, score=0.800980 -   2.0s
[CV] max_features=8, n_estimators=500, criterion=gini ................


[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:   14.6s remaining:    0.0s


[CV]  max_features=8, n_estimators=500, criterion=gini, score=0.905882 -   2.0s
[CV] max_features=8, n_estimators=500, criterion=gini ................


[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:   16.5s remaining:    0.0s


[CV]  max_features=8, n_estimators=500, criterion=gini, score=0.866126 -   2.0s
[CV] max_features=8, n_estimators=500, criterion=gini ................


[Parallel(n_jobs=1)]: Done  13 out of  13 | elapsed:   18.6s remaining:    0.0s


[CV]  max_features=8, n_estimators=500, criterion=gini, score=0.858012 -   2.1s
[CV] max_features=8, n_estimators=500, criterion=gini ................


[Parallel(n_jobs=1)]: Done  14 out of  14 | elapsed:   20.7s remaining:    0.0s


[CV]  max_features=8, n_estimators=500, criterion=gini, score=0.828600 -   2.0s
[CV] max_features=8, n_estimators=500, criterion=gini ................


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:   22.7s remaining:    0.0s


[CV]  max_features=8, n_estimators=500, criterion=gini, score=0.906694 -   2.0s
[CV] max_features=8, n_estimators=500, criterion=gini ................


[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed:   24.6s remaining:    0.0s


[CV]  max_features=8, n_estimators=500, criterion=gini, score=0.644016 -   1.9s
[CV] max_features=8, n_estimators=500, criterion=gini ................


[Parallel(n_jobs=1)]: Done  17 out of  17 | elapsed:   26.6s remaining:    0.0s


[CV]  max_features=8, n_estimators=500, criterion=gini, score=0.768763 -   2.0s
[CV] max_features=8, n_estimators=500, criterion=gini ................


[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:   28.6s remaining:    0.0s


[CV]  max_features=8, n_estimators=500, criterion=gini, score=0.755388 -   2.0s
[CV] max_features=8, n_estimators=500, criterion=gini ................


[Parallel(n_jobs=1)]: Done  19 out of  19 | elapsed:   30.5s remaining:    0.0s


[CV]  max_features=8, n_estimators=500, criterion=gini, score=0.933190 -   2.0s
[CV] max_features=8, n_estimators=300, criterion=entropy .............
[CV]  max_features=8, n_estimators=300, criterion=entropy, score=0.800000 -   1.2s
[CV] max_features=8, n_estimators=300, criterion=entropy .............
[CV]  max_features=8, n_estimators=300, criterion=entropy, score=0.902941 -   1.3s
[CV] max_features=8, n_estimators=300, criterion=entropy .............
[CV]  max_features=8, n_estimators=300, criterion=entropy, score=0.861055 -   1.2s
[CV] max_features=8, n_estimators=300, criterion=entropy .............
[CV]  max_features=8, n_estimators=300, criterion=entropy, score=0.853955 -   1.2s
[CV] max_features=8, n_estimators=300, criterion=entropy .............
[CV]  max_features=8, n_estimators=300, criterion=entropy, score=0.823529 -   1.2s
[CV] max_features=8, n_estimators=300, criterion=entropy .............
[CV]  max_features=8, n_estimators=300, criterion=entropy, score=0.900609 -   1

[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:  1.1min finished


GridSearchCV(cv=10, error_score='raise',
       estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [300, 500], 'max_features': [8], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=20)

In [37]:
print model.best_params_
print model.best_score_

{'max_features': 8, 'n_estimators': 500, 'criterion': 'gini'}
0.826804878811


In [None]:
xgb_model = xgb.XGBClassifier()

parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['binary:logistic'],
              'learning_rate': [0.3], #so called `eta` value
              'max_depth': [5,6],
              'min_child_weight': [3],
              'silent': [1],
              'subsample': [0.7],
              'colsample_bytree': [0.7],
              'n_estimators': [50,100], #number of trees
              'seed': [1337]}

clf = grid_search.GridSearchCV(xgb_model, parameters, n_jobs=4, 
                   cv=10 ,
                   verbose=20, refit=True,scoring='roc_auc')

clf.fit(train_X, train_y)

y_test_predict = clf.predict(test_X)
actuals = test_y
score = metrics.accuracy_score(test_y, y_test_predict)
print score

Fitting 10 folds for each of 4 candidates, totalling 40 fits
[CV] colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_child_weight=3, n_estimators=50, subsample=0.7, seed=1337, objective=binary:logistic, max_depth=5 
[CV] colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_child_weight=3, n_estimators=50, subsample=0.7, seed=1337, objective=binary:logistic, max_depth=5 
[CV] colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_child_weight=3, n_estimators=50, subsample=0.7, seed=1337, objective=binary:logistic, max_depth=5 
[CV] colsample_bytree=0.7, silent=1, learning_rate=0.3, nthread=4, min_child_weight=3, n_estimators=50, subsample=0.7, seed=1337, objective=binary:logistic, max_depth=5 
