In [1]:
# Imports
import numpy as np
import xgboost as xgb

### Loading data

In [2]:
# Load the train/test splits from text files into DMatrix objects.
# The `?format=libsvm` URI suffix names the text format explicitly; recent
# XGBoost releases no longer guess it from the file name.
dtrain = xgb.DMatrix('./data/agaricus.txt.train?format=libsvm')
dtest = xgb.DMatrix('./data/agaricus.txt.test?format=libsvm')

In [3]:
# Collect (rows, columns) for each split first, then report them.
train_shape = (dtrain.num_row(), dtrain.num_col())
test_shape = (dtest.num_row(), dtest.num_col())

print("Train dataset contains {0} rows and {1} columns".format(*train_shape))
print("Test dataset contains {0} rows and {1} columns".format(*test_shape))

Train dataset contains 6513 rows and 127 columns
Test dataset contains 1611 rows and 127 columns


In [4]:
# Show the distinct label values found in each split.
for header, dmat in (
    ("Train possible labels: ", dtrain),
    ("\nTest possible labels: ", dtest),
):
    print(header)
    print(np.unique(dmat.get_label()))

Train possible labels: 
[0. 1.]

Test possible labels: 
[0. 1.]


In [5]:
# Algorithm parameters passed to xgb.train
params = {
    'objective': 'binary:logistic',
    'max_depth': 2,
    # 'silent' is deprecated; 'verbosity': 0 is its documented replacement
    'verbosity': 0,  # No output
    'eta': 1,  # Fast and furious
    # Pin the evaluation metric so the watchlist reports classification
    # error regardless of the library's default metric for this objective.
    'eval_metric': 'error',
}

# Number of boosting rounds
num_rounds = 5

### Train

In [6]:
# Fit a booster on the training matrix for `num_rounds` boosting rounds.
bst = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=num_rounds,
)

In [7]:
# Observe training: every (DMatrix, name) pair in the watchlist is evaluated
# and reported per boosting round (native interface only).
watchlist = [(dtest, 'test'), (dtrain, 'train')]

# `evals` is passed by keyword: newer XGBoost releases deprecate passing it
# positionally.
bst = xgb.train(params, dtrain, num_boost_round=num_rounds, evals=watchlist)

[0]	test-error:0.042831	train-error:0.046522
[1]	test-error:0.021726	train-error:0.022263
[2]	test-error:0.006207	train-error:0.007063
[3]	test-error:0.018001	train-error:0.0152
[4]	test-error:0.006207	train-error:0.007063


### Predict

In [8]:
# Run the trained booster on the test DMatrix. The bare name on the last
# line makes the resulting array the cell's displayed output.
preds_prob = bst.predict(dtest)
preds_prob

array([0.08073306, 0.92217326, 0.08073306, ..., 0.98059034, 0.01182149,
       0.98059034], dtype=float32)

In [9]:
# Compare thresholded predictions with the true test labels.
labels = dtest.get_label()
preds = preds_prob > 0.5 # threshold

# Vectorized element-wise comparison replaces the per-index Python loop;
# int() keeps `correct` a plain Python integer, as before.
correct = int(np.count_nonzero(labels == preds))

print('Predicted correctly: {0}/{1}'.format(correct, len(preds)))
print('Error: {0:.4f}'.format(1-correct/len(preds)))

Predicted correctly: 1601/1611
Error: 0.0062
