# XGBoost

* Performs xgboost on training data. 
* Iterates over parameters with cross validation
* Currently ignoring date parameters due to large number of factors. Waiting for preprocessing steps to improve. 
* Warning: takes a long time to cross validate

In [1]:
# Libraries
library(xgboost)
library(dplyr)
library(Matrix)
library(data.table)
library(Ckmeans.1d.dp)
library(e1071)
library(caret)

# Set Seed
set.seed(1066)

NAME <- "eg_1" # Name of Run (used for save file names)
DATAPATH <- "../Data/users_PP.RDS" # Path to preprocessed data


Attaching package: 'dplyr'

The following object is masked from 'package:xgboost':

    slice

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union


Attaching package: 'data.table'

The following objects are masked from 'package:dplyr':

    between, last

Loading required package: lattice
Loading required package: ggplot2


** Currently remove date features because of large number of factors **  

In [2]:
# Read data
# remove id and date_first_booking as they are not relevant
# CURRENTLY REMOVES DATE PARAMETERS AS WELL
dat_raw <- readRDS(DATAPATH) %>%
    na.omit()

# Remove unwanted features
features_rm <- colnames(dat_raw) %in% c("id", "dataset", "first_browser", "age_cln", "age_cln2", "date_first_booking", "X")
dat <- dat_raw[, !features_rm] %>%
    data.table(keep.rownames = F)

In [3]:
# One-hot encoding  
# https://cran.r-project.org/web/packages/xgboost/vignettes/discoverYourData.html
sparse_dat <- sparse.model.matrix(country_destination ~ . -1, data = dat)

# Find the training set
sparse_tr <- sparse_dat[dat_raw$dataset == "train",]
tr <- dat[dat_raw$dataset == "train",]

In [52]:
# One hot with data frames instead
df_all <- dat
# one-hot-encoding features
ohe_feats = c('gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser')
dummies <- dummyVars(~ gender + signup_method + signup_flow + language + affiliate_channel + affiliate_provider + 
                     first_affiliate_tracked + signup_app + first_device_type, data = df_all)
df_all_ohe <- as.data.frame(predict(dummies, newdata = df_all))
df_all_combined <- cbind(df_all[,(colnames(df_all) %in% ohe_feats)], df_all_ohe)

ERROR: Error in data.frame(..., check.names = FALSE): arguments imply differing number of rows: 29, 133869


In [71]:
ii <- colnames(df_all) %in% ohe_feats
str(df_all)

Classes 'data.table' and 'data.frame':	133869 obs. of  29 variables:
 $ age_bucket             : Factor w/ 20 levels "0-4","100+","15-19",..: 7 12 8 9 9 11 7 9 7 7 ...
 $ dac_year               : int  2011 2010 2011 2010 2010 2010 2010 2010 2010 2010 ...
 $ dac_month              : int  5 9 12 1 1 1 1 1 1 1 ...
 $ dac_day                : int  25 28 5 2 3 4 4 5 5 7 ...
 $ dac_week               : int  21 39 49 0 1 1 1 1 1 1 ...
 $ dac_yearweek           : int  201121 201039 201149 201000 201001 201001 201001 201001 201001 201001 ...
 $ dac_yearmonth          : int  201105 201009 201112 201001 201001 201001 201001 201001 201001 201001 ...
 $ dac_yearmonthday       : int  20110525 20100928 20111205 20100102 20100103 20100104 20100104 20100105 20100105 20100107 ...
 $ dac_yearmonthweek      : int  20110521 20100939 20111249 20100100 20100101 20100101 20100101 20100101 20100101 20100101 ...
 $ tfa_year               : int  2009 2009 2009 2010 2010 2010 2010 2010 2010 2010 ...
 $ tfa_month 

In [None]:
# Train xgboost with specific settings
model <- xgboost(data = sparse_tr, 
               label = as.numeric(tr$country_destination) - 1, 
               eta = 0.1,
               max_depth = 9, 
               nround=25, 
               subsample = 0.5,
               colsample_bytree = 0.5,
               eval_metric = "merror",
               objective = "multi:softprob",
               num_class = 12,
               nthread = 3
)

In [None]:
# Parameter search using Cross validation
# http://stats.stackexchange.com/questions/171043/how-to-tune-hyperparameters-of-xgboost-trees
# Currently using low number of rounds to test

# set up the cross-validated hyper-parameter search
xgb_grid_1 = expand.grid(
    nrounds = 10,                    # Iterations building each XGB model (100)
    max_depth = c(2, 3, 4, 5),       # Maximum tree depth c(2, 4, 6, 8, 10)
    eta = c(0.01, 0.005, 0.001),     # Learning rate c(0.01, 0.001, 0.0001)
    gamma = 1,                       # Min loss reduction required to make a partition on leaf node [0:inf]
    colsample_bytree = 0.3,          # proportion of features used in each tree c(0.3, 0.5, 0.7)
    min_child_weight = 1
)

# trainControl creates settings for caret::train
xgb_trcontrol_1 = trainControl(
    method = "cv",          # Cross validation
    number = 3,             # number of folds (5)
    verboseIter = TRUE,
    returnData = FALSE,
    returnResamp = "all",   # How many summary stats to save # save losses across all models
    allowParallel = TRUE
)

In [None]:
# Train XGboost
# "Kappa" metric used for evaluation
xgb_train_1 = train(
    x = sparse_tr,
    y = tr$country_destination,
    trControl = xgb_trcontrol_1,
    tuneGrid = xgb_grid_1,
    method = "xgbTree", 
    metric = "Kappa"
)

saveRDS(xgb_train_1, paste0("./Models/", NAME, ".RDS"))
model <- xgb_train_1$finalModel

In [None]:
# Plots performance metrics if caret was used 
if(!is.null(xgb_train_1$results)){

# Evaluating importance of features to the model
importance <- xgb.importance(sparse_tr@Dimnames[[2]], 
                             model = model, 
                             data = sparse_tr, 
                             label = as.numeric(tr$country_destination)
                            )
xgb.plot.importance(importance_matrix = head(importance,30))
}

In [None]:
if(!is.null(xgb_train_1$results)){
    # scatter plot of the Kappa against max_depth and eta
    ggplot(xgb_train_1$results, aes(x = as.factor(eta), y = max_depth, size = Kappa, color = Kappa)) + 
        geom_point() + 
        theme_bw() + 
        scale_size_continuous(guide = "none")
}

# Predictions
We use the "predictions" function to evaluate our model on both the training set and set set. We see from the below that the probabilities lead to NDF and US always being predicted. The accuracy at this point is also quite low. 

In [None]:
source("Predictions.R")
pred <- predictions(model, sparse_dat, dat_raw$dataset, dat$country_destination)

pred$pred_tr %>% table()
pred$acc_tr

pred$pred_ts %>% table()
pred$acc_ts

## Submission
https://www.kaggle.com/indradenbakker/airbnb-recruiting-new-user-bookings/rscript-0-86547/discussion  
As per the example script above this submission file currently just takes the top 5 predictions in order as its submission file.

In [None]:
# Generate predictions on competition test set. 
# compare prediction to results
source("Generate_submission.R")
sparse_test <- sparse_dat[dat_raw$dataset == "test_external",]
id <- dat_raw[dat_raw$dataset == "test_external", "id"]
final <- submission(model, sparse_test, id, NAME)

head(final$df,20)
head(final$file,20)

In [None]:
save(model, sparse_dat, dat_raw, dat, file = "test.RData")