In [None]:
require(xgboost)
library(caret)
library(readr)

# load raw data
train = read_csv('train.csv')
test = read_csv('test.csv')

# Create the response variable
y = train$Hazard

# Create the predictor data set and encode categorical variables using caret library.
mtrain = train[,-c(1,2)]
mtest = test[,-c(1)]
dummies <- dummyVars(~ ., data = mtrain)
mtrain = predict(dummies, newdata = mtrain)
mtest = predict(dummies, newdata = mtest)

# Set necessary parameters and use parallel threads
param <- list("objective" = "reg:linear", "nthread" = 8, "verbose"=0)

# Fit the model
xgb.fit = xgboost(param=param, data = mtrain, label = y, nrounds=1500, eta = .01, max_depth = 7, 
                  min_child_weight = 5, scale_pos_weight = 1.0, subsample=0.8) 

# Predict Hazard for the test set
submission <- data.frame(Id=test$Id)
submission$Hazard <- predict(xgb.fit, mtest)
write_csv(submission, "xgbboost_r_benchmark.csv")

# Let’s see what the model looks like.
model <- xgb.dump(xgb.fit, with.stats = T)
model[1:10]

# Bar graph representing each feature by a horizontal bar. 
# The longer the bar, the more important is the feature. 
# Features are classified by importance and clustered by importance.
names <- dimnames(mtrain)[[2]]
importance_matrix <- xgb.importance(names, model = xgb.fit)
xgb.plot.importance(importance_matrix[1:10,])

Loading required package: xgboost
Parsed with column specification:
cols(
  .default = col_integer(),
  T1_V4 = col_character(),
  T1_V5 = col_character(),
  T1_V6 = col_character(),
  T1_V7 = col_character(),
  T1_V8 = col_character(),
  T1_V9 = col_character(),
  T1_V11 = col_character(),
  T1_V12 = col_character(),
  T1_V15 = col_character(),
  T1_V16 = col_character(),
  T1_V17 = col_character(),
  T2_V3 = col_character(),
  T2_V5 = col_character(),
  T2_V11 = col_character(),
  T2_V12 = col_character(),
  T2_V13 = col_character()
)
See spec(...) for full column specifications.
Parsed with column specification:
cols(
  .default = col_integer(),
  T1_V4 = col_character(),
  T1_V5 = col_character(),
  T1_V6 = col_character(),
  T1_V7 = col_character(),
  T1_V8 = col_character(),
  T1_V9 = col_character(),
  T1_V11 = col_character(),
  T1_V12 = col_character(),
  T1_V15 = col_character(),
  T1_V16 = col_character(),
  T1_V17 = col_character(),
  T2_V3 = col_character(),
  T2_V5 = col_

In [2]:
install.packages('xgboost')


  There is a binary version available but the source version is later:
        binary  source needs_compilation
xgboost  0.6-4 0.6.4.1              TRUE



installing the source package ‘xgboost’

