# XGBoost

* Performs xgboost on training data. 
* Iterates over parameters with cross validation
* Currently ignoring date parameters due to large number of factors. Waiting for preprocessing steps to improve. 
* Warning: takes a long time to cross validate

In [75]:
# Libraries
library(xgboost)
library(dplyr)
library(Matrix)
library(data.table)
library(Ckmeans.1d.dp)
library(e1071)
library(caret)

# Set Seed
set.seed(1066)

# Name of Run
NAME <- "eg_1"

** Currently remove date features because of large number of factors **  

In [63]:
# Read data
# remove id and date_first_booking as they are not relevant
# CURRENTLY REMOVES DATE PARAMETERS AS WELL
dat_raw <- readRDS("../Data/users_PP.RDS") %>%
    na.omit()

dat <- dat_raw %>%
    select(-c(id,dataset,age_cln,age_cln2)) %>%
    data.table(keep.rownames = F)

In [64]:
# One-hot encoding  
# https://cran.r-project.org/web/packages/xgboost/vignettes/discoverYourData.html
sparse_dat <- sparse.model.matrix(country_destination ~ . -1, data = dat)

# Find the training set
sparse_tr <- sparse_dat[dat_raw$dataset == "train",]
tr <- dat[dat_raw$dataset == "train",]

In [65]:
# Parameter search using Cross validation
# http://stats.stackexchange.com/questions/171043/how-to-tune-hyperparameters-of-xgboost-trees
# Currently using low number of rounds to test

# set up the cross-validated hyper-parameter search
xgb_grid_1 = expand.grid(
    nrounds = 10,                    # Iterations building each XGB model (100)
    max_depth = c(2, 3, 4, 5),       # Maximum tree depth c(2, 4, 6, 8, 10)
    eta = c(0.01, 0.005, 0.001),     # Learning rate c(0.01, 0.001, 0.0001)
    gamma = 1,                       # Min loss reduction required to make a partition on leaf node [0:inf]
    colsample_bytree = 0.3,          # proportion of features used in each tree c(0.3, 0.5, 0.7)
    min_child_weight = 1
)

# trainControl creates settings for caret::train
xgb_trcontrol_1 = trainControl(
    method = "cv",          # Cross validation
    number = 3,             # number of folds (5)
    verboseIter = TRUE,
    returnData = FALSE,
    returnResamp = "all",   # How many summary stats to save # save losses across all models
    allowParallel = TRUE
)

In [None]:
# Train XGboost
# "Kappa" metric used for evaluation
# xgb_train_1 = train(
#     x = sparse_tr,
#     y = tr$country_destination,
#     trControl = xgb_trcontrol_1,
#     tuneGrid = xgb_grid_1,
#     method = "xgbTree", 
#     metric = "Kappa"
# )

In [78]:
xgb <- xgboost(data = sparse_tr, 
               label = as.numeric(tr$country_destination) - 1, 
               eta = 0.1,
               max_depth = 9, 
               nround=25, 
               subsample = 0.5,
               colsample_bytree = 0.5,
               eval_metric = "merror",
               objective = "multi:softprob",
               num_class = 12,
               nthread = 3
)

[0]	train-merror:0.464063
[1]	train-merror:0.459373
[2]	train-merror:0.454256
[3]	train-merror:0.451848
[4]	train-merror:0.449214
[5]	train-merror:0.448111
[6]	train-merror:0.447270
[7]	train-merror:0.446568
[8]	train-merror:0.446041
[9]	train-merror:0.445189
[10]	train-merror:0.444449
[11]	train-merror:0.443721
[12]	train-merror:0.443521
[13]	train-merror:0.442567
[14]	train-merror:0.442179
[15]	train-merror:0.440661
[16]	train-merror:0.439921
[17]	train-merror:0.439056
[18]	train-merror:0.438855
[19]	train-merror:0.437864
[20]	train-merror:0.437476
[21]	train-merror:0.436711
[22]	train-merror:0.435971
[23]	train-merror:0.435770
[24]	train-merror:0.435168


In [80]:
saveRDS(xgb, paste0("./Models/xgb_model", NAME, ".RDS"))

In [68]:
# Evaluating importance of features to the model
# importance <- xgb.importance(sparse_tr@Dimnames[[2]], 
#                              #model = xgb$finalModel, 
#                              model = xgb,
#                              data = sparse_tr, 
#                              label = as.numeric(tr$country_destination)
#                             )
# xgb.plot.importance(importance_matrix = importance)

In [69]:
# str(xgb$finalModel)
# xgb$results$Kappa

In [70]:
# scatter plot of the Kappa against max_depth and eta
# ggplot(xgb$results, aes(x = as.factor(eta), y = max_depth, size = Kappa, color = Kappa)) + 
#    geom_point() + 
#    theme_bw() + 
#    scale_size_continuous(guide = "none")

# Predictions
We use the "predictions" function to evaluate our model on both the training set and set set. We see from the below that the probabilities lead to NDF and US always being predicted. The accuracy at this point is also quite low. 

In [81]:
dataset <- dat_raw$dataset
target <- dat$country_destination
# save(xgb, sparse_dat, dataset, target, file = "test.RData")

In [82]:
source("Predictions.R")
# pred <- predictions(xgb$finalModel, sparse_dat, dat_raw$dataset, dat$country_destination)
pred <- predictions(xgb, sparse_dat, dat_raw$dataset, dat$country_destination)

pred$pred_tr %>% table()
pred$acc_tr

pred$pred_ts %>% table()
pred$acc_ts

 logi [1:133869] TRUE TRUE TRUE TRUE FALSE TRUE ...
NULL
   AU    CA    DE    ES    FR    GB    IT   NDF    NL other    PT    US 
  279   657   535  1069  2267  1104  1241 37778   373  4632   102 29700 


.
  NDF other    US 
48703     1 31033 

.
  NDF    US 
16064 10301 

## Submission
https://www.kaggle.com/indradenbakker/airbnb-recruiting-new-user-bookings/rscript-0-86547/discussion  
As per the example script above this submission file currently just takes the top 5 predictions in order as its submission file.

In [None]:
# Generate predictions on competition test set. 
# compare prediction to results
source("Generate_submission.R")
sparse_test <- sparse_dat[dat_raw$set == "test_external",]
id <- as.character(dat_raw[dat_raw$dataset == "test_external", "id"])

str(sparse_test)
#final <- submission(xgb$finalModel, sparse_test, id, paste0("xgb", NAME))
final <- submission(xgb, sparse_test, id, paste0("xgb", NAME))

Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
  ..@ i       : int [1:3448523] 0 1 2 3 4 5 6 7 8 9 ...
  ..@ p       : int [1:166] 0 133869 133874 134989 136694 148427 179555 209911 229539 241537 ...
  ..@ Dim     : int [1:2] 133869 165
  ..@ Dimnames:List of 2
  .. ..$ : chr [1:133869] "1" "2" "3" "4" ...
  .. ..$ : chr [1:165] "X" "age_bucket0-4" "age_bucket100+" "age_bucket15-19" ...
  ..@ x       : num [1:3448523] 2 3 4 7 8 9 11 12 14 15 ...
  ..@ factors : list()


In [88]:
#name <- paste0("xgb_", NAME)
#save(xgb, sparse_test, id, name, file = "test.RData")

str(final)

ERROR: Error in str(final): object 'final' not found
