In [1]:
#Import libraries:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import metrics
#from sklearn import cross_validation, metrics   #Additional sklearn functions
from sklearn.model_selection import GridSearchCV
import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
# Default size applied to every matplotlib figure.
rcParams['figure.figsize'] = (12, 4)

# Column-name constants: `target` selects the label column of the one-hot
# label frame; `IDcol` names the identifier column.
IDcol = '#ID'
target = 'noncoding'

# Tab-separated input table, parsed with the pure-Python CSV engine.
train = pd.read_csv('param_tune.csv', sep='\t', encoding='utf-8', engine='python')

In [2]:
# One-hot encode the '#label' column: one indicator column per distinct label
# value. The column named by `target` is what the models are trained against.
lab = pd.get_dummies(data=train['#label'])


In [3]:
def modelfit(alg, dtrain, lab, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Optionally size n_estimators by cross-validation, fit `alg`, print a training report.

    The label column is selected with the module-level name `target`
    (``lab[target]``), so `target` must be defined before this is called.

    Parameters
    ----------
    alg : estimator
        Must expose get_xgb_params, get_params, set_params, fit, predict and
        predict_proba (an XGBClassifier in this notebook). Modified in place.
    dtrain : DataFrame
        Table holding the feature columns.
    lab : DataFrame
        Table holding the label column ``lab[target]``.
    predictors : list of str
        Names of the columns of `dtrain` used as features.
    useTrainCV : bool
        When true, run ``xgb.cv`` first and set the estimator's n_estimators
        to the number of rows of the returned CV result.
    cv_folds : int
        Number of folds handed to ``xgb.cv``.
    early_stopping_rounds : int
        Early-stopping patience handed to ``xgb.cv``.

    Returns
    -------
    None
        The accuracy and AUC that are printed are measured on the same data
        the model was fitted on, not on held-out data.
    """
    # Slice once and reuse; the original re-sliced these for every call below.
    features = dtrain[predictors]
    labels = lab[target]

    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(features.values, label=labels.values)
        cvresult = xgb.cv(xgb_param, xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics='auc',
                          early_stopping_rounds=early_stopping_rounds)
        # The CV history has one row per retained boosting round, so its
        # length becomes the number of trees for the final fit.
        alg.set_params(n_estimators=cvresult.shape[0])

    #Fit the algorithm on the data
    alg.fit(features, labels, eval_metric='auc')

    #Predict training set:
    dtrain_predictions = alg.predict(features)
    dtrain_predprob = alg.predict_proba(features)[:, 1]

    #Print model report:
    # Single-argument print(...) calls produce identical text whether `print`
    # is the Python 2 statement or the Python 3 function.
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(labels.values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(labels, dtrain_predprob))

    #feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    #feat_imp.plot(kind='bar', title='Feature Importances')
    #plt.ylabel('Feature Importance Score')

In [4]:
# Hand-picked feature columns used for every model fit in this notebook.
predictors = ['gravy', 'Mw', 'pI', 'GC3', 'Fickett_score']
#predictors = [x for x in train.columns if x not in [target, IDcol]]

# Baseline classifier settings. n_estimators is only an upper bound here:
# modelfit() (with its default useTrainCV=True) replaces it with the round
# count obtained from xgb.cv.
xgb1_settings = {
    'learning_rate': 0.1,
    'n_estimators': 1000,
    'max_depth': 5,
    'min_child_weight': 1,
    'gamma': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'binary:logistic',
    'nthread': 4,
    'scale_pos_weight': 1,
    'seed': 27,
}
xgb1 = XGBClassifier(**xgb1_settings)
modelfit(xgb1, train, lab, predictors)


Model Report
Accuracy : 0.9734
AUC Score (Train): 0.997271


In [5]:
import time
start = time.time()

# Coarse grid: tree depth 3,5,7,9 crossed with min_child_weight 1,3,5.
param_test1 = {
    'max_depth': range(3, 10, 2),
    'min_child_weight': range(1, 6, 2),
}
gsearch1 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=140, max_depth=5,
        min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
        objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test1, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
gsearch1.fit(train[predictors], lab[target])
# A bare `cv_results_, best_params_, best_score_` expression used to sit here.
# In the middle of a cell it is neither displayed nor stored, so it was
# removed; the print below reports the result.

elapsed_time_fl = time.time() - start

print(gsearch1.best_params_, gsearch1.best_score_)
print(elapsed_time_fl)

({'max_depth': 7, 'min_child_weight': 1}, 0.9923773076653788)
40.4301569462


In [7]:
# Re-print the result of the first grid search (gsearch1): the best parameter
# combination and its score under the 'roc_auc' scorer.
# NOTE(review): the recorded output is a tuple, which is what a Python 2 print
# statement produces for this line; under Python 3 the two values would be
# printed space-separated instead.
print(gsearch1.best_params_, gsearch1.best_score_)

({'max_depth': 7, 'min_child_weight': 1}, 0.9923773076653788)


In [6]:
# Finer grid for tree depth and min_child_weight.
param_test2 = {
    'max_depth': [6, 7, 8],
    'min_child_weight': [0, 1, 2],
}
base_clf2 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=7,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)
gsearch2 = GridSearchCV(estimator=base_clf2, param_grid=param_test2,
                        scoring='roc_auc', n_jobs=4, iid=False, cv=5)
gsearch2.fit(train[predictors], lab[target])
print(gsearch2.best_params_, gsearch2.best_score_)

({'max_depth': 8, 'min_child_weight': 0}, 0.9924982230356237)


In [7]:
import time
start = time.time()

# Probe deeper trees (8, 9, 10) with min_child_weight held at 0.
param_test2b = {'max_depth': [8, 9, 10]}
base_clf2b = XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=8,
    min_child_weight=0,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)
gsearch2b = GridSearchCV(estimator=base_clf2b, param_grid=param_test2b,
                         scoring='roc_auc', n_jobs=4, iid=False, cv=5)
gsearch2b.fit(train[predictors], lab[target])
print(gsearch2b.best_params_, gsearch2b.best_score_)

# Wall-clock seconds since the top of this cell.
elapsed_time_fl = time.time() - start
print(elapsed_time_fl)

({'max_depth': 10}, 0.992583512402588)
14.8066051006


In [8]:
import time
start = time.time()

# Grid over gamma: 0.0, 0.1, 0.2, 0.3, 0.4.
param_test3 = {'gamma': [i / 10.0 for i in range(5)]}
base_clf3 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=10,
    min_child_weight=0,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)
gsearch3 = GridSearchCV(estimator=base_clf3, param_grid=param_test3,
                        scoring='roc_auc', n_jobs=4, iid=False, cv=5)
gsearch3.fit(train[predictors], lab[target])
print(gsearch3.best_params_, gsearch3.best_score_)

# Wall-clock seconds since the top of this cell.
elapsed_time_fl = time.time() - start
print(elapsed_time_fl)

({'gamma': 0.1}, 0.9925845732446581)
27.5472807884


In [13]:
# Re-print the best score of the gamma search.
print(gsearch3.best_score_)
# NOTE(review): `start` is not reset in this cell, so this measures the time
# since whichever cell last assigned it, not the duration of a search. It also
# overwrites the previously stored elapsed_time_fl.
elapsed_time_fl = time.time() - start
print(elapsed_time_fl)

0.9925845732446581
1660.97546792


In [9]:
import time
start = time.time()

# Grid over row and column subsampling: 0.6, 0.7, 0.8, 0.9 for each.
param_test4 = {
    'subsample': [i / 10.0 for i in range(6, 10)],
    'colsample_bytree': [i / 10.0 for i in range(6, 10)],
}
gsearch4 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=177, max_depth=10,
        min_child_weight=0, gamma=0.1, subsample=0.8, colsample_bytree=0.8,
        objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test4, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
gsearch4.fit(train[predictors], lab[target])
# Was the Python 2-only statement `print a, b`. Formatting into one string
# gives the same "params score" text and also runs on Python 3.
print("%s %s" % (gsearch4.best_params_, gsearch4.best_score_))

# Wall-clock seconds since the top of this cell.
elapsed_time_fl = time.time() - start
print(elapsed_time_fl)

{'subsample': 0.9, 'colsample_bytree': 0.6} 0.9926336376905012
105.747136831


In [10]:
import time
start = time.time()

# Finer grid over row and column subsampling: 0.75, 0.80, 0.85 for each.
# NOTE(review): the estimator below is created with subsample=0.9 and
# colsample_bytree=0.6, but this grid overrides both and does not include
# either value. Confirm that 0.75-0.85 is the intended refinement range.
param_test5 = {
    'subsample': [i / 100.0 for i in range(75, 90, 5)],
    'colsample_bytree': [i / 100.0 for i in range(75, 90, 5)],
}
gsearch5 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=177, max_depth=10,
        min_child_weight=0, gamma=0.1, subsample=0.9, colsample_bytree=0.6,
        objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test5, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
gsearch5.fit(train[predictors], lab[target])

# Was the Python 2-only statement `print a, b`. Formatting into one string
# gives the same "params score" text and also runs on Python 3.
print("%s %s" % (gsearch5.best_params_, gsearch5.best_score_))

# Wall-clock seconds since the top of this cell.
elapsed_time_fl = time.time() - start
print(elapsed_time_fl)

{'subsample': 0.85, 'colsample_bytree': 0.75} 0.9926142699577086
63.0691168308
