In [None]:

import warnings
# Silences every warning for the rest of the session (including library
# deprecation warnings), not just warnings raised by this notebook.
warnings.filterwarnings("ignore")
import pandas as pd
from pandas import Series,DataFrame
import numpy as np
from matplotlib import cm
import matplotlib.pyplot as plt
import seaborn as sns
#import xgboost as xgb
sns.set_style('whitegrid') 
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

import xgboost as xgb
from sklearn import pipeline, grid_search
from sklearn import cross_validation, metrics

import random
# Seeds only Python's built-in `random` module; numpy's RNG is not seeded here.
# The models in main() receive their own seed explicitly (seed=2016).
random.seed(2016)

import time



In [None]:
def main(input1='../data/preprocessing_train_df.csv', input2='../data/preprocessing_test_df.csv',
         output='../data/submission_xgb1.csv'):
    """Train an XGBoost classifier, grid-search its hyper-parameters and write a submission CSV.

    Steps:
      1. Load the preprocessed train/test CSV files.
      2. Use ``xgb.cv`` with early stopping to choose the number of boosting
         rounds for a baseline classifier, fit it on the full training set and
         print a training-set report plus a feature-importance bar plot.
      3. Run a 5-fold accuracy grid search over an XGBClassifier and print the
         per-candidate scores, the best parameters and the best CV score.
      4. Predict the test set with the grid search and write the submission.

    Parameters
    ----------
    input1 : str
        Path of the preprocessed training CSV. After loading with
        ``index_col=0``, column 1 is used as the label and columns 2+ as the
        features (positional layout).
    input2 : str
        Path of the preprocessed test CSV. It must contain a ``PassengerId``
        column; after loading, columns 1+ are used as the features.
    output : str
        Path of the submission CSV to write (columns ``PassengerId`` and
        ``Survived``). Defaults to the location that was previously hard-coded.

    Returns
    -------
    None
    """
    start_time = time.time()

    def elapsed_minutes():
        # Wall-clock minutes since main() started, rounded for the progress messages.
        return round(((time.time() - start_time) / 60), 2)

    # Load the preprocessed data; the first CSV column becomes the index.
    train_df = pd.read_csv(input1, index_col=0)
    test_df = pd.read_csv(input2, index_col=0)

    # `.values` returns the same ndarray as the deprecated `as_matrix()` and
    # is available on both old and new pandas releases.
    x_train = train_df.iloc[:, 2:].values
    y_train = train_df.iloc[:, 1]

    id_test = test_df['PassengerId']
    x_test = test_df.iloc[:, 1:].values

    print('--- Features Set: %s minutes ---' % elapsed_minutes())

    # Step 1: fix the learning rate and pick the number of estimators via CV.
    clf = xgb.XGBClassifier(
        n_estimators=1000,
        learning_rate=0.1,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        seed=2016,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
    )

    xgb_param = clf.get_xgb_params()
    xgtrain = xgb.DMatrix(x_train, y_train)
    cv_result = xgb.cv(xgb_param, xgtrain, num_boost_round=clf.get_params()['n_estimators'],
                       nfold=5, early_stopping_rounds=50, metrics='auc')

    # With early stopping the CV result has one row per boosting round kept,
    # so its length is used as the tuned number of trees.
    clf.set_params(n_estimators=cv_result.shape[0])
    print(cv_result.shape[0])

    # Fit the baseline classifier on the full training set with that tree count.
    clf.fit(x_train, y_train, eval_metric='auc')

    # Predictions on the training set itself: these scores measure fit, not
    # generalisation.
    y_train_predictions = clf.predict(x_train)
    y_train_predprob = clf.predict_proba(x_train)[:, 1]

    # Model report. Single-argument print(...) behaves identically under
    # Python 2 and Python 3.
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(y_train.values, y_train_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(y_train, y_train_predprob))

    # The sklearn wrapper exposed the fitted booster as `booster()` in old
    # xgboost releases and as `get_booster()` in newer ones; support both.
    booster = clf.get_booster() if hasattr(clf, 'get_booster') else clf.booster()
    feat_imp = pd.Series(booster.get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')

    # Step 2: grid search for XGBClassifier.
    # The grid tunes learning_rate, max_depth, min_child_weight, gamma and
    # reg_alpha; subsample and colsample_bytree are pinned to a single value.
    param_grid1 = {
        'learning_rate': [0.03, 0.04, 0.05],
        'max_depth': [6, 7, 8],
        'min_child_weight': [1, 3, 5],
        'gamma': [i/10. for i in range(0, 8)],
        'subsample': [0.8],
        'colsample_bytree': [0.8],
        'reg_alpha': [0, 1, 10]
    }

    # learning_rate and gamma given to the base estimator are overridden by
    # the grid values above. n_estimators is fixed at 300 here, independent of
    # the CV-selected count used for the baseline classifier.
    gsearch1 = grid_search.GridSearchCV(
        estimator=xgb.XGBClassifier(learning_rate=0.1, n_estimators=300,
                                    gamma=0, subsample=0.8, colsample_bytree=0.8,
                                    seed=2016, objective='binary:logistic', nthread=4,
                                    scale_pos_weight=1),
        param_grid=param_grid1, scoring='accuracy', cv=5, n_jobs=5, verbose=20)
    gsearch1.fit(x_train, y_train)

    print('--- Grid Search Completed: %s minutes ---' % elapsed_minutes())

    print('Param grid score:')
    print(gsearch1.grid_scores_)
    print('Best Params:')
    print(gsearch1.best_params_)
    print('Best CV Score:')
    print(gsearch1.best_score_)

    # Predict the test set with the fitted grid search and write the submission.
    y_pred = gsearch1.predict(x_test)
    pd.DataFrame({'PassengerId': id_test, 'Survived': y_pred}).to_csv(output, index=False)

    print('--- Submission Generated: %s minutes ---' % elapsed_minutes())


# The usual entry-point guard is left disabled, so main() is called
# unconditionally whenever this cell is executed.
#if __name__ == '__main__':


# Runs the whole pipeline with the default input paths.
main()


--- Features Set: 0.0 minutes ---
48

Model Report
Accuracy : 0.908
AUC Score (Train): 0.963986
Fitting 5 folds for each of 648 candidates, totalling 3240 fits
[CV] reg_alpha=0, colsample_bytree=0.8, learning_rate=0.03, min_child_weight=1, subsample=0.8, max_depth=6, gamma=0.0 
[CV] reg_alpha=0, colsample_bytree=0.8, learning_rate=0.03, min_child_weight=1, subsample=0.8, max_depth=6, gamma=0.0 
[CV] reg_alpha=0, colsample_bytree=0.8, learning_rate=0.03, min_child_weight=1, subsample=0.8, max_depth=6, gamma=0.0 
[CV] reg_alpha=0, colsample_bytree=0.8, learning_rate=0.03, min_child_weight=1, subsample=0.8, max_depth=6, gamma=0.0 
[CV] reg_alpha=0, colsample_bytree=0.8, learning_rate=0.03, min_child_weight=1, subsample=0.8, max_depth=6, gamma=0.0 
