In [4]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   #Perforing grid search

import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
from sklearn.cross_validation import train_test_split

rcParams['figure.figsize'] = 12, 4

#train = pd.read_csv('train_modified.csv')
#target = 'Disbursed'
#IDcol = 'ID'



In [5]:
def modelfit(alg, X_train, X_test, y_train, y_test, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        #xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        xgtrain = xgb.DMatrix(X_train, label=y_train)
        
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc', early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    #Fit the algorithm on the data
    #alg.fit(dtrain[predictors], dtrain['Disbursed'], eval_metric='auc')
    alg.fit(X_train, y_train, eval_metric='auc')
        
    #Predict training set:
    dtrain_predictions = alg.predict(X_test)
    dtrain_predprob = alg.predict_proba(X_test)[:,1]
        
    precision = metrics.precision_score(y_test, dtrain_predictions)
    recall = metrics.recall_score(y_test, dtrain_predictions)
    #Print model report:
    print ("\nModel Report")
    print ("Accuracy : %.4g" % metrics.accuracy_score(y_test, dtrain_predictions))
    print('precision: %.2f%%, recall: %.2f%%' % (100 * precision, 100 * recall))
    print ("AUC Score (Train): %f" % metrics.roc_auc_score(y_test, dtrain_predprob))

                    
#    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
#    feat_imp.plot(kind='bar', title='Feature Importances')
#    plt.ylabel('Feature Importance Score')

In [6]:
#Choose all predictors except target & IDcols
# load data
headers = ['CK_AGMT_BAL', 'CK_COMP_LAST_MON_BAL1', 'CK_COMP_LAST_MON_BAL2', 'CK_COMP_LAST_MON_BAL3', 'CK_YEAR_AVG_BAL', 'CK_COMP_LAST_MON_YEARAVG_BAL1', 'CK_COMP_LAST_MON_YEARAVG_BAL2', 'CK_COMP_LAST_MON_YEARAVG_BAL3', 'DK_AGMT_BAL', 'DK_COMP_LAST_MON_BAL1', 'DK_COMP_LAST_MON_BAL2', 'DK_COMP_LAST_MON_BAL3', 'DK_COMP_LAST_MON_YEARAVG_BAL1', 'DK_COMP_LAST_MON_YEARAVG_BAL2', 'DK_COMP_LAST_MON_YEARAVG_BAL3', 'FIN_BAL', 'FIN_BAL_SM', 'FIN_BAL_SQ', 'YEAR_FIN_BAL', 'YEAR_FIN_BAL_SM', 'YEAR_FIN_BAL_SQ', 'DF_AMT', 'DF_RT', 'DF_CNT', 'LST_1M_AMT', 'LST_1M_TT', 'LST_1M_CNT', 'LST_2M_AMT', 'LST_2M_TT', 'LST_2M_CNT', 'DK_SHUI', 'DK_DIAN', 'DK_MEI', 'IND_JBH', 'IND_WY', 'IND_YTZH', 'IND_YTCYL', 'IND_BGYDT', 'IND_JST', 'IND_DF', 'IND_CCARD', 'IND_FUND', 'IND_QS', 'IND_TXP', 'IND_BOND', 'IND_SX', 'IND_BX', 'IND_DLB', 'IND_SFCG', 'IND_YQT', 'IND_SJYH', 'IND_TALC', 'IND_GOLD', 'IND_JYT', 'IND_EJZH', 'IND_XJC', 'IND_YTDB', 'IND_PJC', 'IND_WBXJC', 'IND_LDXJC', 'IND_JZSF', 'IND_YQZTC', 'IND_YTSZ', 'IND_BZT', 'IND_ZBT', 'IND_JSK', 'IND_EJFH', 'IND_DLZY', 'IND_DLDF', 'IND_DLMK', 'IND_ZHQC', 'IND_DFGZ', 'IND_CKH', 'IND_HQCK', 'IND_XDCK', 'IND_DWHQ', 'IND_SG', 'IND_YQZZ', 'IND_RZRQ', 'REG_CAPT_AMT', 'A00', 'A01', 'A02', 'B00', 'B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07', 'B08', 'B09', 'B10', 'B11', 'B12', 'B13', 'B14', 'B15', 'C00', 'C01', 'C02', 'C03', 'D00', 'D01', 'D02', 'D03', 'D04', 'E00', 'E01', 'E02', 'F00', 'F01', 'F02', 'F03', 'F04', 'F05', 'F06', 'F07', 'G00', 'G01', 'G02', 'G03', 'G04', 'G05', 'G06', 'M00', 'M01', 'M02', 'M03', 'M04', 'M05', 'M06', 'M07', 'Y01', 'Z00', 'Z01', 'INTER_RATING_CD', 'O_INDUSTRY_CD', 'S_INDUSTRY_CD', 'T_INDUSTRY_CD', 'F_INDUSTRY_CD', 'CUS_SCALE_TP_CD', 'ORG_TP_CD', 'INDUSTRY_TP_CD', 'INDUSTRY_MAIN_TP_CD', 'CUS_LAYER_CD', 'CUS_CLASS_CD', 'CUS_GRADE_CD', 'LMT_STATE_CD', 'CREDIT_LINE_CD', 'AERA', 'BAS_AC_IS_BOCOM', 'OWN_GROUP_LAYER', 'OWN_GROUP_CLASS', 'OWN_GROUP_GRADE', 'STRATEGIC_CUST', 'STRATEGIC_MGR_CODE', 'APPROVED_LMT', 'APPROVED_RISK_LMT', 'BAL', 'RISK_BAL', 'ORI_BAL', 'ZT_LMT', 'LXD_LMT', 'ENTSTATUS', 'IS_SHIXIN', 'IS_ANDI', 'IS_PERFORMANCE', 'IS_GL_SHIXIN', 'IS_GL_ANDI', 'IS_GL_PERFORMANCE', 'IS_YJ']
update_pd = pd.read_csv("C:\\Dev\TSS\\Bank_COMM\\risk_update_3.csv", sep = ',', names = headers)
    
#test_pd = update_pd.fillna(0)

# let xgboost to choose best missing data
test_pd = update_pd
    
X_train, X_test, y_train, y_test = train_test_split(test_pd.iloc[:,1:172], test_pd['IS_YJ'], test_size=0.3, random_state=0)

#predictors = [x for x in train.columns if x not in [target, IDcol]]

predictors = ['CK_AGMT_BAL', 'CK_COMP_LAST_MON_BAL1', 'CK_COMP_LAST_MON_BAL2', 'CK_COMP_LAST_MON_BAL3', 'CK_YEAR_AVG_BAL', 'CK_COMP_LAST_MON_YEARAVG_BAL1', 'CK_COMP_LAST_MON_YEARAVG_BAL2', 'CK_COMP_LAST_MON_YEARAVG_BAL3', 'DK_AGMT_BAL', 'DK_COMP_LAST_MON_BAL1', 'DK_COMP_LAST_MON_BAL2', 'DK_COMP_LAST_MON_BAL3', 'DK_COMP_LAST_MON_YEARAVG_BAL1', 'DK_COMP_LAST_MON_YEARAVG_BAL2', 'DK_COMP_LAST_MON_YEARAVG_BAL3', 'FIN_BAL', 'FIN_BAL_SM', 'FIN_BAL_SQ', 'YEAR_FIN_BAL', 'YEAR_FIN_BAL_SM', 'YEAR_FIN_BAL_SQ', 'DF_AMT', 'DF_RT', 'DF_CNT', 'LST_1M_AMT', 'LST_1M_TT', 'LST_1M_CNT', 'LST_2M_AMT', 'LST_2M_TT', 'LST_2M_CNT', 'DK_SHUI', 'DK_DIAN', 'DK_MEI', 'IND_JBH', 'IND_WY', 'IND_YTZH', 'IND_YTCYL', 'IND_BGYDT', 'IND_JST', 'IND_DF', 'IND_CCARD', 'IND_FUND', 'IND_QS', 'IND_TXP', 'IND_BOND', 'IND_SX', 'IND_BX', 'IND_DLB', 'IND_SFCG', 'IND_YQT', 'IND_SJYH', 'IND_TALC', 'IND_GOLD', 'IND_JYT', 'IND_EJZH', 'IND_XJC', 'IND_YTDB', 'IND_PJC', 'IND_WBXJC', 'IND_LDXJC', 'IND_JZSF', 'IND_YQZTC', 'IND_YTSZ', 'IND_BZT', 'IND_ZBT', 'IND_JSK', 'IND_EJFH', 'IND_DLZY', 'IND_DLDF', 'IND_DLMK', 'IND_ZHQC', 'IND_DFGZ', 'IND_CKH', 'IND_HQCK', 'IND_XDCK', 'IND_DWHQ', 'IND_SG', 'IND_YQZZ', 'IND_RZRQ', 'REG_CAPT_AMT', 'A00', 'A01', 'A02', 'B00', 'B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07', 'B08', 'B09', 'B10', 'B11', 'B12', 'B13', 'B14', 'B15', 'C00', 'C01', 'C02', 'C03', 'D00', 'D01', 'D02', 'D03', 'D04', 'E00', 'E01', 'E02', 'F00', 'F01', 'F02', 'F03', 'F04', 'F05', 'F06', 'F07', 'G00', 'G01', 'G02', 'G03', 'G04', 'G05', 'G06', 'M00', 'M01', 'M02', 'M03', 'M04', 'M05', 'M06', 'M07', 'Y01', 'Z00', 'Z01', 'INTER_RATING_CD', 'O_INDUSTRY_CD', 'S_INDUSTRY_CD', 'T_INDUSTRY_CD', 'F_INDUSTRY_CD', 'CUS_SCALE_TP_CD', 'ORG_TP_CD', 'INDUSTRY_TP_CD', 'INDUSTRY_MAIN_TP_CD', 'CUS_LAYER_CD', 'CUS_CLASS_CD', 'CUS_GRADE_CD', 'LMT_STATE_CD', 'CREDIT_LINE_CD', 'AERA', 'BAS_AC_IS_BOCOM', 'OWN_GROUP_LAYER', 'OWN_GROUP_CLASS', 'OWN_GROUP_GRADE', 'STRATEGIC_CUST', 'STRATEGIC_MGR_CODE', 'APPROVED_LMT', 'APPROVED_RISK_LMT', 'BAL', 'RISK_BAL', 'ORI_BAL', 'ZT_LMT', 'LXD_LMT', 'ENTSTATUS', 'IS_SHIXIN', 'IS_ANDI', 'IS_PERFORMANCE', 'IS_GL_SHIXIN', 'IS_GL_ANDI', 'IS_GL_PERFORMANCE']

'''
Model Report
Accuracy : 0.9201
AUC Score (Train): 0.854075
precision: 56.93%, recall: 15.06%
'''

xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=8,
 scale_pos_weight=1,
 seed=27)
modelfit(xgb1, X_train, X_test, y_train, y_test, predictors)



Model Report
Accuracy : 0.9225
precision: 61.49%, recall: 17.57%
AUC Score (Train): 0.854712


  if diff:


In [37]:
param_test1 = {
# 'max_depth':range(3,10,2),
 'max_depth':[3,5,7,9],
# 'min_child_weight':range(1,6,2)
 'min_child_weight':[1,3,5,8,10,12]
}

'''
([mean: 0.85085, std: 0.00685, params: {'max_depth': 3, 'min_child_weight': 1},
  mean: 0.84975, std: 0.00706, params: {'max_depth': 3, 'min_child_weight': 3},
  mean: 0.85120, std: 0.00723, params: {'max_depth': 3, 'min_child_weight': 5},
  mean: 0.85678, std: 0.00528, params: {'max_depth': 5, 'min_child_weight': 1},
  mean: 0.85471, std: 0.00455, params: {'max_depth': 5, 'min_child_weight': 3},
  mean: 0.85658, std: 0.00650, params: {'max_depth': 5, 'min_child_weight': 5},
  mean: 0.85623, std: 0.00460, params: {'max_depth': 7, 'min_child_weight': 1},
  mean: 0.85761, std: 0.00407, params: {'max_depth': 7, 'min_child_weight': 3},
  mean: 0.85603, std: 0.00286, params: {'max_depth': 7, 'min_child_weight': 5},
  mean: 0.85615, std: 0.00562, params: {'max_depth': 9, 'min_child_weight': 1},
  mean: 0.85585, std: 0.00406, params: {'max_depth': 9, 'min_child_weight': 3},
  mean: 0.85676, std: 0.00449, params: {'max_depth': 9, 'min_child_weight': 5}],
 {'max_depth': 7, 'min_child_weight': 3},
 0.8576068961640265)
'''
gsearch1 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, n_estimators=140, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
gsearch1.fit(X_train,y_train)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_



([mean: 0.85085, std: 0.00685, params: {'max_depth': 3, 'min_child_weight': 1},
  mean: 0.84975, std: 0.00706, params: {'max_depth': 3, 'min_child_weight': 3},
  mean: 0.85120, std: 0.00723, params: {'max_depth': 3, 'min_child_weight': 5},
  mean: 0.85678, std: 0.00528, params: {'max_depth': 5, 'min_child_weight': 1},
  mean: 0.85471, std: 0.00455, params: {'max_depth': 5, 'min_child_weight': 3},
  mean: 0.85658, std: 0.00650, params: {'max_depth': 5, 'min_child_weight': 5},
  mean: 0.85623, std: 0.00460, params: {'max_depth': 7, 'min_child_weight': 1},
  mean: 0.85761, std: 0.00407, params: {'max_depth': 7, 'min_child_weight': 3},
  mean: 0.85603, std: 0.00286, params: {'max_depth': 7, 'min_child_weight': 5},
  mean: 0.85615, std: 0.00562, params: {'max_depth': 9, 'min_child_weight': 1},
  mean: 0.85585, std: 0.00406, params: {'max_depth': 9, 'min_child_weight': 3},
  mean: 0.85676, std: 0.00449, params: {'max_depth': 9, 'min_child_weight': 5}],
 {'max_depth': 7, 'min_child_weight': 3

In [38]:
param_test2 = {
 'max_depth':[6,7,8],
 'min_child_weight':[2,3,4]
}
gsearch2 = GridSearchCV(estimator = XGBClassifier( learning_rate=0.1, n_estimators=140, max_depth=5,
 min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test2, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
gsearch2.fit(X_train,y_train)
gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_

'''
([mean: 0.85661, std: 0.00402, params: {'max_depth': 6, 'min_child_weight': 2},
  mean: 0.85629, std: 0.00388, params: {'max_depth': 6, 'min_child_weight': 3},
  mean: 0.85834, std: 0.00271, params: {'max_depth': 6, 'min_child_weight': 4},
  mean: 0.85449, std: 0.00367, params: {'max_depth': 7, 'min_child_weight': 2},
  mean: 0.85761, std: 0.00407, params: {'max_depth': 7, 'min_child_weight': 3},
  mean: 0.85684, std: 0.00299, params: {'max_depth': 7, 'min_child_weight': 4},
  mean: 0.85398, std: 0.00380, params: {'max_depth': 8, 'min_child_weight': 2},
  mean: 0.85549, std: 0.00288, params: {'max_depth': 8, 'min_child_weight': 3},
  mean: 0.85701, std: 0.00285, params: {'max_depth': 8, 'min_child_weight': 4}],
 {'max_depth': 6, 'min_child_weight': 4},
 0.8583362939730854)
'''

([mean: 0.85661, std: 0.00402, params: {'max_depth': 6, 'min_child_weight': 2},
  mean: 0.85629, std: 0.00388, params: {'max_depth': 6, 'min_child_weight': 3},
  mean: 0.85834, std: 0.00271, params: {'max_depth': 6, 'min_child_weight': 4},
  mean: 0.85449, std: 0.00367, params: {'max_depth': 7, 'min_child_weight': 2},
  mean: 0.85761, std: 0.00407, params: {'max_depth': 7, 'min_child_weight': 3},
  mean: 0.85684, std: 0.00299, params: {'max_depth': 7, 'min_child_weight': 4},
  mean: 0.85398, std: 0.00380, params: {'max_depth': 8, 'min_child_weight': 2},
  mean: 0.85549, std: 0.00288, params: {'max_depth': 8, 'min_child_weight': 3},
  mean: 0.85701, std: 0.00285, params: {'max_depth': 8, 'min_child_weight': 4}],
 {'max_depth': 6, 'min_child_weight': 4},
 0.8583362939730854)

In [46]:
param_test2b = {
 'min_child_weight':[4,6]
}
gsearch2b = GridSearchCV(estimator = XGBClassifier( learning_rate=0.1, n_estimators=140, max_depth=4,
 min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test2b, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
gsearch2b.fit(X_train,y_train)
gsearch2b.grid_scores_, gsearch2b.best_params_, gsearch2b.best_score_

'''
([mean: 0.85572, std: 0.00602, params: {'min_child_weight': 4},
  mean: 0.85358, std: 0.00686, params: {'min_child_weight': 6}],
 {'min_child_weight': 4},
 0.8557219639498523)
'''

([mean: 0.85572, std: 0.00602, params: {'min_child_weight': 4},
  mean: 0.85358, std: 0.00686, params: {'min_child_weight': 6}],
 {'min_child_weight': 4},
 0.8557219639498523)

In [45]:
param_test3 = {
 'gamma':[i/10.0 for i in range(0,5)]
}
gsearch3 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=4,
 min_child_weight=6, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test3, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch3.fit(X_train,y_train)
gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_

'''
([mean: 0.85358, std: 0.00686, params: {'gamma': 0.0},
  mean: 0.85437, std: 0.00628, params: {'gamma': 0.1},
  mean: 0.85447, std: 0.00542, params: {'gamma': 0.2},
  mean: 0.85478, std: 0.00499, params: {'gamma': 0.3},
  mean: 0.85528, std: 0.00650, params: {'gamma': 0.4}],
 {'gamma': 0.4},
 0.8552776038698229)
'''

([mean: 0.85358, std: 0.00686, params: {'gamma': 0.0},
  mean: 0.85437, std: 0.00628, params: {'gamma': 0.1},
  mean: 0.85447, std: 0.00542, params: {'gamma': 0.2},
  mean: 0.85478, std: 0.00499, params: {'gamma': 0.3},
  mean: 0.85528, std: 0.00650, params: {'gamma': 0.4}],
 {'gamma': 0.4},
 0.8552776038698229)

In [47]:
xgb2 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=6,
 min_child_weight=4,
 gamma=0.4,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)
modelfit(xgb2,  X_train, X_test, y_train, y_test, predictors)

'''
Model Report
Accuracy : 0.9204
AUC Score (Train): 0.857030
precision: 57.55%, recall: 15.44%
'''


Model Report
Accuracy : 0.9204
AUC Score (Train): 0.857030
precision: 57.55%, recall: 15.44%


  if diff:


In [53]:
param_test4 = {
 'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(6,10)]
}
'''
([mean: 0.85298, std: 0.00695, params: {'colsample_bytree': 0.6, 'subsample': 0.6},
  mean: 0.85408, std: 0.00567, params: {'colsample_bytree': 0.6, 'subsample': 0.7},
  mean: 0.85724, std: 0.00275, params: {'colsample_bytree': 0.6, 'subsample': 0.8},
  mean: 0.85511, std: 0.00456, params: {'colsample_bytree': 0.6, 'subsample': 0.9},
  mean: 0.85452, std: 0.00646, params: {'colsample_bytree': 0.7, 'subsample': 0.6},
  mean: 0.85659, std: 0.00622, params: {'colsample_bytree': 0.7, 'subsample': 0.7},
  mean: 0.85723, std: 0.00502, params: {'colsample_bytree': 0.7, 'subsample': 0.8},
  mean: 0.85503, std: 0.00532, params: {'colsample_bytree': 0.7, 'subsample': 0.9},
  mean: 0.85489, std: 0.00326, params: {'colsample_bytree': 0.8, 'subsample': 0.6},
  mean: 0.85579, std: 0.00246, params: {'colsample_bytree': 0.8, 'subsample': 0.7},
  mean: 0.85808, std: 0.00623, params: {'colsample_bytree': 0.8, 'subsample': 0.8},
  mean: 0.85666, std: 0.00183, params: {'colsample_bytree': 0.8, 'subsample': 0.9},
  mean: 0.85502, std: 0.00644, params: {'colsample_bytree': 0.9, 'subsample': 0.6},
  mean: 0.85592, std: 0.00651, params: {'colsample_bytree': 0.9, 'subsample': 0.7},
  mean: 0.85480, std: 0.00369, params: {'colsample_bytree': 0.9, 'subsample': 0.8},
  mean: 0.85435, std: 0.00395, params: {'colsample_bytree': 0.9, 'subsample': 0.9}],
 {'colsample_bytree': 0.8, 'subsample': 0.8},
 0.8580817952553152)
'''
gsearch4 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=6,
 min_child_weight=4, gamma=0.4, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test4, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch4.fit(X_train,y_train)
gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_

([mean: 0.85298, std: 0.00695, params: {'colsample_bytree': 0.6, 'subsample': 0.6},
  mean: 0.85408, std: 0.00567, params: {'colsample_bytree': 0.6, 'subsample': 0.7},
  mean: 0.85724, std: 0.00275, params: {'colsample_bytree': 0.6, 'subsample': 0.8},
  mean: 0.85511, std: 0.00456, params: {'colsample_bytree': 0.6, 'subsample': 0.9},
  mean: 0.85452, std: 0.00646, params: {'colsample_bytree': 0.7, 'subsample': 0.6},
  mean: 0.85659, std: 0.00622, params: {'colsample_bytree': 0.7, 'subsample': 0.7},
  mean: 0.85723, std: 0.00502, params: {'colsample_bytree': 0.7, 'subsample': 0.8},
  mean: 0.85503, std: 0.00532, params: {'colsample_bytree': 0.7, 'subsample': 0.9},
  mean: 0.85489, std: 0.00326, params: {'colsample_bytree': 0.8, 'subsample': 0.6},
  mean: 0.85579, std: 0.00246, params: {'colsample_bytree': 0.8, 'subsample': 0.7},
  mean: 0.85808, std: 0.00623, params: {'colsample_bytree': 0.8, 'subsample': 0.8},
  mean: 0.85666, std: 0.00183, params: {'colsample_bytree': 0.8, 'subsample'

In [55]:
param_test5 = {
 'subsample':[i/100.0 for i in range(75,90,5)],
 'colsample_bytree':[i/100.0 for i in range(75,90,5)]
}
'''
([mean: 0.85652, std: 0.00395, params: {'colsample_bytree': 0.75, 'subsample': 0.75},
  mean: 0.85743, std: 0.00421, params: {'colsample_bytree': 0.75, 'subsample': 0.8},
  mean: 0.85489, std: 0.00585, params: {'colsample_bytree': 0.75, 'subsample': 0.85},
  mean: 0.85530, std: 0.00557, params: {'colsample_bytree': 0.8, 'subsample': 0.75},
  mean: 0.85808, std: 0.00623, params: {'colsample_bytree': 0.8, 'subsample': 0.8},
  mean: 0.85529, std: 0.00638, params: {'colsample_bytree': 0.8, 'subsample': 0.85},
  mean: 0.85700, std: 0.00492, params: {'colsample_bytree': 0.85, 'subsample': 0.75},
  mean: 0.85531, std: 0.00333, params: {'colsample_bytree': 0.85, 'subsample': 0.8},
  mean: 0.85322, std: 0.00490, params: {'colsample_bytree': 0.85, 'subsample': 0.85}],
 {'colsample_bytree': 0.8, 'subsample': 0.8},
 0.8580817952553152)
'''

gsearch5 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=6,
 min_child_weight=4, gamma=0.4, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test5, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch5.fit(X_train,y_train)
gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_

([mean: 0.85652, std: 0.00395, params: {'colsample_bytree': 0.75, 'subsample': 0.75},
  mean: 0.85743, std: 0.00421, params: {'colsample_bytree': 0.75, 'subsample': 0.8},
  mean: 0.85489, std: 0.00585, params: {'colsample_bytree': 0.75, 'subsample': 0.85},
  mean: 0.85530, std: 0.00557, params: {'colsample_bytree': 0.8, 'subsample': 0.75},
  mean: 0.85808, std: 0.00623, params: {'colsample_bytree': 0.8, 'subsample': 0.8},
  mean: 0.85529, std: 0.00638, params: {'colsample_bytree': 0.8, 'subsample': 0.85},
  mean: 0.85700, std: 0.00492, params: {'colsample_bytree': 0.85, 'subsample': 0.75},
  mean: 0.85531, std: 0.00333, params: {'colsample_bytree': 0.85, 'subsample': 0.8},
  mean: 0.85322, std: 0.00490, params: {'colsample_bytree': 0.85, 'subsample': 0.85}],
 {'colsample_bytree': 0.8, 'subsample': 0.8},
 0.8580817952553152)

In [56]:
param_test6 = {
 'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
}

'''
([mean: 0.85808, std: 0.00623, params: {'reg_alpha': 1e-05},
  mean: 0.85604, std: 0.00551, params: {'reg_alpha': 0.01},
  mean: 0.85773, std: 0.00358, params: {'reg_alpha': 0.1},
  mean: 0.85798, std: 0.00677, params: {'reg_alpha': 1},
  mean: 0.79182, std: 0.01117, params: {'reg_alpha': 100}],
 {'reg_alpha': 1e-05},
 0.858081795255315)
'''

gsearch6 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=6,
 min_child_weight=4, gamma=0.4, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test6, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch6.fit(X_train,y_train)
gsearch6.grid_scores_, gsearch6.best_params_, gsearch6.best_score_

([mean: 0.85808, std: 0.00623, params: {'reg_alpha': 1e-05},
  mean: 0.85604, std: 0.00551, params: {'reg_alpha': 0.01},
  mean: 0.85773, std: 0.00358, params: {'reg_alpha': 0.1},
  mean: 0.85798, std: 0.00677, params: {'reg_alpha': 1},
  mean: 0.79182, std: 0.01117, params: {'reg_alpha': 100}],
 {'reg_alpha': 1e-05},
 0.858081795255315)

In [57]:
param_test7 = {
 'reg_alpha':[1e-7, 1e-6, 1e-5]
}
'''
([mean: 0.85808, std: 0.00623, params: {'reg_alpha': 1e-07},
  mean: 0.85808, std: 0.00623, params: {'reg_alpha': 1e-06},
  mean: 0.85808, std: 0.00623, params: {'reg_alpha': 1e-05}],
 {'reg_alpha': 1e-06},
 0.8580817952553152)
'''
gsearch7 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=6,
 min_child_weight=4, gamma=0.4, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test7, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch7.fit(X_train,y_train)
gsearch7.grid_scores_, gsearch7.best_params_, gsearch7.best_score_

([mean: 0.85808, std: 0.00623, params: {'reg_alpha': 1e-07},
  mean: 0.85808, std: 0.00623, params: {'reg_alpha': 1e-06},
  mean: 0.85808, std: 0.00623, params: {'reg_alpha': 1e-05}],
 {'reg_alpha': 1e-06},
 0.8580817952553152)

In [60]:
xgb3 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=6,
 min_child_weight=4,
 gamma=0.4,
 subsample=0.8,
 colsample_bytree=0.8,
 reg_alpha=1e-06,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)

'''
Model Report
Accuracy : 0.9204
AUC Score (Train): 0.857030
precision: 57.55%, recall: 15.44%
'''

modelfit(xgb3, X_train, X_test, y_train, y_test, predictors)


Model Report
Accuracy : 0.9204
AUC Score (Train): 0.857030
precision: 57.55%, recall: 15.44%


  if diff:


In [61]:
xgb4 = XGBClassifier(
 learning_rate =0.01,
 n_estimators=5000,
 max_depth=6,
 min_child_weight=4,
 gamma=0.4,
 subsample=0.8,
 colsample_bytree=0.8,
 reg_alpha=1e-06,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)

'''
Model Report
Accuracy : 0.9211
AUC Score (Train): 0.859515
precision: 59.40%, recall: 15.25%
'''
modelfit(xgb4, X_train, X_test, y_train, y_test, predictors)

  if diff:



Model Report
Accuracy : 0.9211
AUC Score (Train): 0.859515
precision: 59.40%, recall: 15.25%
