In [2]:
# This walkthrough uses the Iris dataset that ships with scikit-learn.
# After loading, X holds the data and y holds the class labels.

from sklearn import datasets

iris = datasets.load_iris()
X, y = iris.data, iris.target

In [9]:
# Hold out 20% of the samples as a test set and keep the other 80% for training.
# random_state is pinned so the split is the same on every run.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# XGBoost can build its DMatrix container straight from numpy arrays.
# dtrain and dtest created here are the matrices that the training and
# prediction cells further down consume, so this cell must actually run.
import xgboost as xgb

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [11]:
# If you want to use svmlight for less memory consumption, first dump the numpy
# arrays into svmlight (libsvm) text files and then pass the file name to DMatrix.
# The "?format=libsvm" suffix states the text format explicitly; recent XGBoost
# releases refuse to guess the format of a text input file.

import xgboost as xgb
from sklearn.datasets import dump_svmlight_file

dump_svmlight_file(X_train, y_train, 'dtrain.svm', zero_based=True)
dump_svmlight_file(X_test, y_test, 'dtest.svm', zero_based=True)
dtrain_svm = xgb.DMatrix('dtrain.svm?format=libsvm')
dtest_svm = xgb.DMatrix('dtest.svm?format=libsvm')

[11:49:35] 120x4 matrix with 480 entries loaded from dtrain.svm
[11:49:35] 30x4 matrix with 120 entries loaded from dtest.svm


In [13]:
# XGBoost is configured through a plain dict of parameters.
param = dict(
    max_depth=3,                 # deepest any single tree may grow
    eta=0.3,                     # step size applied at each training iteration
    silent=1,                    # quiet logging mode
    objective='multi:softprob',  # multiclass objective used for training
    num_class=3,                 # number of classes present in this dataset
)

# How many training iterations to run.
num_round = 20

In [15]:
# Everything is set up: train the booster on dtrain for num_round iterations.

bst = xgb.train(params=param, dtrain=dtrain, num_boost_round=num_round)

In [16]:
# Write the trained model to a text file in human-readable form for inspection.

bst.dump_model(fout='dump.raw.txt')

In [17]:
# Run the trained booster on the held-out test matrix.

preds = bst.predict(data=dtest)

In [20]:
import numpy as np

# Each row of preds carries one score per class; the predicted label is the
# index of the largest score in that row. argmax along axis 1 computes this
# for every row in a single vectorised call instead of a Python-level loop.
best_preds = np.argmax(preds, axis=1)

In [23]:
# Score the predictions: macro-averaged precision against the true test labels.

from sklearn.metrics import precision_score

print(
    precision_score(
        y_true=y_test,
        y_pred=best_preds,
        average='macro',
    )
)

1.0


In [24]:
# Now save the model for later use.
# joblib used to be reachable as sklearn.externals.joblib, but that vendored
# alias is gone from newer scikit-learn releases. Prefer the standalone joblib
# package and fall back to the vendored copy only on old installations.

try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

joblib.dump(bst, 'bst_model.pkl', compress=True)

['bst_model.pkl']

In [25]:
# If you want to load it again later (loading unpickles the file, so only load files you trust)

# bst = joblib.load('bst_model.pkl')