In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import datasets
from sklearn.model_selection import train_test_split

In [2]:
cancer = datasets.load_breast_cancer()
X = cancer.data
y = cancer.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
Dtrain = xgb.DMatrix(X_train,y_train)
Dtest = xgb.DMatrix(X_test,y_test)
params = {
    'objective': 'binary:logistic',
    'booster': 'gbtree',
    'eval_metric': 'auc',
    'eta': 0.1,
    'min_child_weight': 1,
}
watchlist = [(Dtrain, 'train'),(Dtest, 'test')]
num_round = 50



In [3]:
bst = xgb.train(params,Dtrain, num_round,watchlist)

[0]	train-auc:0.98800	test-auc:0.99381
[1]	train-auc:0.99542	test-auc:0.99381
[2]	train-auc:0.99555	test-auc:0.99492
[3]	train-auc:0.99554	test-auc:0.99460
[4]	train-auc:0.99565	test-auc:0.99460
[5]	train-auc:0.99575	test-auc:0.99428
[6]	train-auc:0.99590	test-auc:0.99428
[7]	train-auc:0.99848	test-auc:0.99428
[8]	train-auc:0.99853	test-auc:0.99397
[9]	train-auc:0.99848	test-auc:0.99365
[10]	train-auc:0.99883	test-auc:0.99428
[11]	train-auc:0.99929	test-auc:0.99460
[12]	train-auc:0.99931	test-auc:0.99428
[13]	train-auc:0.99952	test-auc:0.99428
[14]	train-auc:0.99964	test-auc:0.99492
[15]	train-auc:0.99960	test-auc:0.99428
[16]	train-auc:0.99971	test-auc:0.99397
[17]	train-auc:0.99967	test-auc:0.99428
[18]	train-auc:0.99967	test-auc:0.99460
[19]	train-auc:0.99971	test-auc:0.99492
[20]	train-auc:0.99975	test-auc:0.99524
[21]	train-auc:0.99979	test-auc:0.99555
[22]	train-auc:0.99981	test-auc:0.99555
[23]	train-auc:0.99987	test-auc:0.99524
[24]	train-auc:0.99985	test-auc:0.99555
[25]	train



In [4]:
Dtest.save_binary('../data/cancer/cancer.buffer')
Dtest2 = xgb.DMatrix('../data/cancer/cancer.buffer')
pred = bst.predict(Dtest2)
print(pred)

[0.01220779 0.96091884 0.9912094  0.98322254 0.98909354 0.9948566
 0.9916618  0.99161434 0.9879659  0.9954991  0.49994504 0.9839542
 0.99421483 0.16757697 0.39245218 0.08739442 0.9812754  0.00920953
 0.00924445 0.00902297 0.01922355 0.01143483 0.9892077  0.99545443
 0.01688493 0.9950158  0.99390393 0.04599945 0.995369   0.00856125
 0.99091095 0.01673225 0.9821117  0.01180545 0.99390393 0.00977522
 0.9923086  0.02249466 0.9947984  0.00998342 0.43411916 0.9929885
 0.20828968 0.99372816 0.90863013 0.00924445 0.99390393 0.9882189
 0.9925817  0.00902984 0.01421841 0.01973597 0.01031474 0.99545443
 0.99524176 0.9942059  0.98052067 0.9642527  0.9793353  0.00902297
 0.00980202 0.01201961 0.99524176 0.99438465 0.01024733 0.84502715
 0.00998479 0.00856125 0.00902297 0.98991066 0.70446074 0.00856125
 0.9954991  0.5332857  0.01229395 0.9845501  0.9944824  0.95591223
 0.99390393 0.9948566  0.0218269  0.00949073 0.00856125 0.99036247
 0.01301393 0.99325764 0.9915821  0.9954991  0.00928118 0.00856125

In [5]:
pred_train = bst.predict(Dtrain,output_margin=True)
pred_test = bst.predict(Dtest,output_margin=True)

Dtrain.set_base_margin(pred_train)
Dtest.set_base_margin(pred_test)

bst2 = xgb.train(params,Dtrain,num_round,watchlist)
# final_test = bst.predict(Dtest)
# print(final_test)

[0]	train-auc:1.00000	test-auc:0.99714
[1]	train-auc:1.00000	test-auc:0.99714
[2]	train-auc:1.00000	test-auc:0.99714
[3]	train-auc:1.00000	test-auc:0.99714
[4]	train-auc:1.00000	test-auc:0.99714
[5]	train-auc:1.00000	test-auc:0.99714
[6]	train-auc:1.00000	test-auc:0.99714
[7]	train-auc:1.00000	test-auc:0.99714
[8]	train-auc:1.00000	test-auc:0.99714
[9]	train-auc:1.00000	test-auc:0.99714
[10]	train-auc:1.00000	test-auc:0.99714
[11]	train-auc:1.00000	test-auc:0.99714
[12]	train-auc:1.00000	test-auc:0.99714
[13]	train-auc:1.00000	test-auc:0.99714
[14]	train-auc:1.00000	test-auc:0.99714
[15]	train-auc:1.00000	test-auc:0.99714
[16]	train-auc:1.00000	test-auc:0.99714
[17]	train-auc:1.00000	test-auc:0.99714
[18]	train-auc:1.00000	test-auc:0.99714
[19]	train-auc:1.00000	test-auc:0.99714
[20]	train-auc:1.00000	test-auc:0.99714
[21]	train-auc:1.00000	test-auc:0.99714
[22]	train-auc:1.00000	test-auc:0.99714
[23]	train-auc:1.00000	test-auc:0.99746
[24]	train-auc:1.00000	test-auc:0.99714
[25]	train

In [6]:
def logregobj(preds,Dtrain):
    labels = Dtrain.get_label()
    preds = 1.0 / (1.0 + np.exp(-preds))
    grad = preds - labels
    hessian = preds * (1.0 - preds)
    return grad, hessian

def evalerror(preds,Dtrain):
    labels = Dtrain.get_label()
    return 'error', float(np.sum((preds > 0) != labels)) / len(labels)

In [7]:
bst3 = xgb.train(params,Dtrain,num_round,watchlist,obj = logregobj, custom_metric=evalerror)

[0]	train-auc:1.00000	train-error:0.00000	test-auc:0.99714	test-error:0.02632
[1]	train-auc:1.00000	train-error:0.00000	test-auc:0.99714	test-error:0.02632
[2]	train-auc:1.00000	train-error:0.00000	test-auc:0.99714	test-error:0.02632
[3]	train-auc:1.00000	train-error:0.00000	test-auc:0.99714	test-error:0.01754
[4]	train-auc:1.00000	train-error:0.00000	test-auc:0.99714	test-error:0.02632
[5]	train-auc:1.00000	train-error:0.00000	test-auc:0.99714	test-error:0.02632
[6]	train-auc:1.00000	train-error:0.00000	test-auc:0.99714	test-error:0.02632
[7]	train-auc:1.00000	train-error:0.00000	test-auc:0.99714	test-error:0.02632
[8]	train-auc:1.00000	train-error:0.00000	test-auc:0.99714	test-error:0.02632
[9]	train-auc:1.00000	train-error:0.00000	test-auc:0.99714	test-error:0.01754
[10]	train-auc:1.00000	train-error:0.00000	test-auc:0.99714	test-error:0.02632
[11]	train-auc:1.00000	train-error:0.00000	test-auc:0.99714	test-error:0.01754
[12]	train-auc:1.00000	train-error:0.00000	test-auc:0.99714	te

In [8]:
res = xgb.cv(params,Dtrain,num_round,nfold=5,seed=24,callbacks=[xgb.callback.EvaluationMonitor(period=1),
                                                                xgb.callback.EarlyStopping(rounds = 10)])

[0]	train-auc:1.00000	test-auc:1.00000
[1]	train-auc:1.00000	test-auc:1.00000
[2]	train-auc:1.00000	test-auc:1.00000
[3]	train-auc:1.00000	test-auc:1.00000
[4]	train-auc:1.00000	test-auc:1.00000
[5]	train-auc:1.00000	test-auc:1.00000
[6]	train-auc:1.00000	test-auc:1.00000
[7]	train-auc:1.00000	test-auc:1.00000
[8]	train-auc:1.00000	test-auc:1.00000
[9]	train-auc:1.00000	test-auc:1.00000


In [9]:
print(res)

   train-auc-mean  train-auc-std  test-auc-mean  test-auc-std
0             1.0            0.0            1.0           0.0


In [10]:
def preproc(Dtrain,Dtest,params):
    labels = Dtrain.get_label()
    ratio = np.sum(labels == 1) / np.sum(labels == 0)
    params['scale_pos_weight'] = ratio
    return Dtrain, Dtest, params

bst4 = xgb.cv(params,Dtrain,num_round,nfold=5, seed= 24, fpreproc=preproc,callbacks=[xgb.callback.EvaluationMonitor(period=1),
                                                                                     xgb.callback.EarlyStopping(rounds = 10)])

[0]	train-auc:1.00000	test-auc:1.00000
[1]	train-auc:1.00000	test-auc:1.00000
[2]	train-auc:1.00000	test-auc:1.00000
[3]	train-auc:1.00000	test-auc:1.00000
[4]	train-auc:1.00000	test-auc:1.00000
[5]	train-auc:1.00000	test-auc:1.00000
[6]	train-auc:1.00000	test-auc:1.00000
[7]	train-auc:1.00000	test-auc:1.00000
[8]	train-auc:1.00000	test-auc:1.00000
[9]	train-auc:1.00000	test-auc:1.00000
[10]	train-auc:1.00000	test-auc:1.00000


In [11]:
from sklearn.metrics import roc_auc_score
label = Dtest.get_label()
preds_limit = bst.predict(Dtest,iteration_range=(0,10))
preds_without_limit = bst.predict(Dtest)
roc_auc_limit = roc_auc_score(label, preds_limit)
roc_auc_without_limit = roc_auc_score(label, preds_without_limit)
print('with limitation :' + str(roc_auc_limit))
print('without limitation :' + str(roc_auc_without_limit))

with limitation :0.9961892664337885
without limitation :0.9971419498253413


In [12]:
leaf_index = bst.predict(Dtest,iteration_range=(0,10),pred_leaf=True)
print(leaf_index.shape)
print(leaf_index)

(114, 10)
[[14. 14. 20. ... 21. 14. 14.]
 [15. 20. 22. ... 25.  7.  7.]
 [15. 20. 22. ... 24.  7.  7.]
 ...
 [14. 14. 20. ... 21. 14. 14.]
 [14. 14. 20. ... 21. 14. 14.]
 [15. 14. 11. ... 21.  9.  9.]]


In [13]:
pred_contrib = bst.predict(Dtest,pred_contribs=True)
print(pred_contrib)

[[ 8.9322776e-03 -8.8935807e-02  0.0000000e+00 ... -4.2217791e-02
  -1.9941615e-02 -4.3351498e+00]
 [ 2.3657329e-02 -3.5012072e-01  0.0000000e+00 ...  6.9230676e-02
  -1.0002808e-02  3.2604959e+00]
 [ 2.6324231e-02  3.0666345e-01  0.0000000e+00 ... -2.0713320e-02
  -1.2381128e-03  4.7834830e+00]
 ...
 [-1.1065914e-01 -4.8282631e-03  0.0000000e+00 ... -1.6415002e-02
   2.0076953e-02 -4.6424165e+00]
 [-1.1065914e-01 -1.7543592e-01  0.0000000e+00 ... -2.2523830e-02
   1.4675630e-02 -4.6287436e+00]
 [-3.6605293e-01  2.1792153e-01  0.0000000e+00 ...  5.5335082e-02
   2.1469144e-02  2.0032582e+00]]


In [14]:
pred_interaction = bst.predict(Dtest,pred_interactions=True)
print(pred_interaction)

[[[ 1.7732359e-02  0.0000000e+00  0.0000000e+00 ...  2.7462840e-04
    0.0000000e+00  0.0000000e+00]
  [ 0.0000000e+00 -1.0500293e-01  0.0000000e+00 ... -1.7413683e-03
   -2.3978008e-03  0.0000000e+00]
  [ 0.0000000e+00  0.0000000e+00  0.0000000e+00 ...  0.0000000e+00
    0.0000000e+00  0.0000000e+00]
  ...
  [ 2.7463073e-04 -1.7413720e-03  0.0000000e+00 ... -6.1221205e-02
    0.0000000e+00  0.0000000e+00]
  [ 0.0000000e+00 -2.3978055e-03  0.0000000e+00 ...  0.0000000e+00
   -3.5066724e-02  0.0000000e+00]
  [ 0.0000000e+00  0.0000000e+00  0.0000000e+00 ...  0.0000000e+00
    0.0000000e+00 -4.3351498e+00]]

 [[ 1.8436644e-02  0.0000000e+00  0.0000000e+00 ...  7.7557191e-04
    0.0000000e+00  0.0000000e+00]
  [ 0.0000000e+00 -3.0756357e-01  0.0000000e+00 ...  1.8322900e-02
   -1.5691128e-03  0.0000000e+00]
  [ 0.0000000e+00  0.0000000e+00  0.0000000e+00 ...  0.0000000e+00
    0.0000000e+00  0.0000000e+00]
  ...
  [ 7.7557191e-04  1.8322900e-02  0.0000000e+00 ...  3.5795201e-02
    0.0000