In [1]:
import re
import warnings

import graphviz as gv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from IPython.core.display import display
from matplotlib.pylab import rcParams
from sklearn import model_selection, metrics
from sklearn import preprocessing
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import roc_curve, auc
from xgboost.sklearn import XGBClassifier

# sklearn.cross_validation was removed in scikit-learn 0.20. Fall back to the
# model_selection class so this cell still runs on current releases. Note the
# two classes have different constructor signatures.
try:
    from sklearn.cross_validation import StratifiedKFold
except ImportError:
    from sklearn.model_selection import StratifiedKFold

warnings.filterwarnings('ignore')
%matplotlib inline
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999



In [2]:
def predict(alg, data, cv_folds=5, threshold=0.25):
    """Cross-validate ``alg`` on ``data`` and report per-fold quality.

    For each stratified fold the feature columns are min-max scaled (the
    scaler is fitted on the training part only), ``alg`` is refitted, and the
    positive-class probabilities for the held-out part are thresholded into
    0/1 predictions. Accuracy, ROC AUC, confusion counts, precision and
    recall are printed, and a ROC curve is displayed and written to
    ``lda.png``.

    Parameters
    ----------
    alg : estimator
        Object whose ``fit(X, y)`` returns the fitted model and whose fitted
        model provides ``predict_proba(X)``.
    data : pandas.DataFrame
        Must contain a ``'label'`` column; every other column is used as a
        feature. Missing values are replaced with 0 before scaling.
        Assumes labels are binary 0/1 -- TODO confirm against the caller.
    cv_folds : int, default 5
        Number of stratified, shuffled folds.
    threshold : float, default 0.25
        A sample is predicted positive when its positive-class probability
        is strictly greater than this value.

    Returns
    -------
    None
        Results are printed and plotted only.
    """
    labels = data['label']
    # Use the model_selection splitter: the old sklearn.cross_validation API
    # (labels passed to the constructor, ``n_folds=``) was removed in
    # scikit-learn 0.20.
    splitter = model_selection.StratifiedKFold(n_splits=cv_folds, shuffle=True)

    for i, (train, test) in enumerate(splitter.split(data, labels)):
        # NOTE(review): the trailing [2:] is a positional *row* slice, so the
        # first two rows of each train/test partition are discarded. If the
        # intent was to skip two leading columns this should be
        # ``data.iloc[train, 2:]`` -- confirm before changing.
        data_train = data.iloc[train, :][2:]
        data_test = data.iloc[test, :][2:]

        print('Fit XGBoost')

        labels_train = np.array(data_train['label'])
        labels_test = np.array(data_test['label'])

        features_train = data_train.drop(['label'], axis=1).fillna(0)
        features_test = data_test.drop(['label'], axis=1).fillna(0)

        # Fit the scaler on the training part only so no information from the
        # held-out part leaks into the model.
        scaler = preprocessing.MinMaxScaler().fit(features_train)
        x_train = scaler.transform(features_train)
        x_test = scaler.transform(features_test)

        # The scaler already returns plain ndarrays; no np.matrix wrapping.
        model = alg.fit(x_train, labels_train)
        pred_prob = model.predict_proba(x_test)[:, 1]
        predictions = (pred_prob > threshold).astype(int)
        print(predictions)
        print(labels_test)

        # Confusion counts via boolean masks (any truthy label is positive).
        actual_pos = labels_test.astype(bool)
        predicted_pos = predictions.astype(bool)
        tp = int(np.sum(predicted_pos & actual_pos))
        fp = int(np.sum(predicted_pos & ~actual_pos))
        fn = int(np.sum(~predicted_pos & actual_pos))
        tn = int(np.sum(~predicted_pos & ~actual_pos))

        print("\nModel quality")
        print("Accuracy : %.4g" % metrics.accuracy_score(labels_test, predictions))
        print("AUC Score : %f" % metrics.roc_auc_score(labels_test, pred_prob))
        print("TP:" + str(tp) + ", TN:" + str(tn) + ", FP:" + str(fp) + ", FN:" + str(fn))
        print("Precision Score : %f" % metrics.precision_score(labels_test, predictions))
        print("Recall Score : %f" % metrics.recall_score(labels_test, predictions))

        fpr, tpr, _ = roc_curve(labels_test, pred_prob)
        roc_auc = auc(fpr, tpr)

        # One explicit figure per fold, closed afterwards so figures do not
        # accumulate in memory.
        fig, ax = plt.subplots()
        ax.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))
        ax.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
        ax.set_xlim([-0.05, 1.05])
        ax.set_ylim([-0.05, 1.05])
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title('Receiver operating characteristic example')
        ax.legend(loc="lower right")
        # NOTE(review): every fold writes to the same path, so only the last
        # fold's curve survives on disk.
        fig.savefig('lda.png', format='png', dpi=100)
        plt.show()
        plt.close(fig)
        

In [4]:
# Hyper-parameters for the gradient-boosted tree classifier.
XGB_PARAMS = {
    'learning_rate': 0.01,
    'n_estimators': 200,
    'max_depth': 6,
    'subsample': 0.7,
    'colsample_bytree': 1,
    'objective': 'binary:logistic',
    'nthread': 4,
    'seed': 15,
}

# NOTE(review): assigning the estimator to `xgb` rebinds the alias created by
# `import xgboost as xgb`; after this cell runs, `xgb` is the classifier and
# no longer the module. The name is kept because other cells may rely on it.
xgb = XGBClassifier(**XGB_PARAMS)