In [10]:
library(xgboost)
data(iris)

In [16]:
# Convert the Species factor to an integer class starting at 0
# This is picky, but it's a requirement for XGBoost
species = iris$Species
label = as.integer(iris$Species)-1
iris$Species = NULL

In [18]:
n = nrow(iris)
train.index = sample(n,floor(0.75*n))
train.data = as.matrix(iris[train.index,])
train.label = label[train.index]
test.data = as.matrix(iris[-train.index,])
test.label = label[-train.index]

In [19]:
# Transform the two data sets into xgb.Matrix
xgb.train = xgb.DMatrix(data=train.data,label=train.label)
xgb.test = xgb.DMatrix(data=test.data,label=test.label)

In [20]:
# Define the parameters for multinomial classification
num_class = length(levels(species))
params = list(
  booster="gbtree",
  eta=0.001,
  max_depth=5,
  gamma=3,
  subsample=0.75,
  colsample_bytree=1,
  objective="multi:softprob",
  eval_metric="mlogloss",
  num_class=num_class
)

In [26]:
length(unique(species))
length(levels(species))

In [33]:
# Train the XGBoost classifer
xgb.fit=xgb.train(
  params=params,
  data=xgb.train,
  nrounds=10000,
  nthreads=1,
  early_stopping_rounds=10,
  watchlist=list(val1=xgb.train,val2=xgb.test),
  verbose=0
)

# Review the final model and results
xgb.fit

##### xgb.Booster
raw: 3.7 Mb 
call:
  xgb.train(params = params, data = xgb.train, nrounds = 10000, 
    watchlist = list(val1 = xgb.train, val2 = xgb.test), verbose = 0, 
    early_stopping_rounds = 10, nthreads = 1)
params (as set within xgb.train):
  booster = "gbtree", eta = "0.001", max_depth = "5", gamma = "3", subsample = "0.75", colsample_bytree = "1", objective = "multi:softprob", eval_metric = "mlogloss", num_class = "3", nthreads = "1", silent = "1"
xgb.attributes:
  best_iteration, best_msg, best_ntreelimit, best_score, niter
callbacks:
  cb.evaluation.log()
  cb.early.stop(stopping_rounds = early_stopping_rounds, maximize = maximize, 
    verbose = verbose)
# of features: 4 
niter: 3309
best_iteration : 3299 
best_ntreelimit : 3299 
best_score : 0.159404 
nfeatures : 4 
evaluation_log:
    iter val1_mlogloss val2_mlogloss
       1      1.097433      1.097470
       2      1.096202      1.096206
---                                 
    3308      0.192697      0.159404
    

In [34]:
# Predict outcomes with the test data
xgb.pred = predict(xgb.fit,test.data,reshape=T)
xgb.pred = as.data.frame(xgb.pred)
colnames(xgb.pred) = levels(species)

In [38]:
head(xgb.pred)

setosa,versicolor,virginica,prediction,label
0.8932503,0.06230251,0.04444717,setosa,setosa
0.8938478,0.06167523,0.0444769,setosa,setosa
0.8938478,0.06167523,0.0444769,setosa,setosa
0.8932503,0.06230251,0.04444717,setosa,setosa
0.8932503,0.06230251,0.04444717,setosa,setosa
0.894077,0.06143461,0.04448831,setosa,setosa


In [35]:
# Use the predicted label with the highest probability
xgb.pred$prediction = apply(xgb.pred,1,function(x) colnames(xgb.pred)[which.max(x)])
xgb.pred$label = levels(species)[test.label+1]

In [36]:
# Calculate the final accuracy
result = sum(xgb.pred$prediction==xgb.pred$label)/nrow(xgb.pred)
print(paste("Final Accuracy =",sprintf("%1.2f%%", 100*result)))

[1] "Final Accuracy = 100.00%"
