# XGBoost

* Performs xgboost on training data. 
* Performs on a prebuilt set of parameters to quickly iterate and test for improvements. 

** Dataset must be in the following format:  **
1. Test and Training data must be combined into one data frame
2. A variable called "dataset" with factors "test" and "train" must denote which  dataset each observation comes from. 
3. Preferrably no NA values (replace with a character or -1 etc.) 
4. Any features which are numeric are treated as numeric, all other features treated as factors. (numeric means decisions can be made continuously)

In [2]:
# Libraries
library(xgboost)
library(dplyr)
library(Matrix)
library(data.table)
library(Ckmeans.1d.dp)
library(e1071)
library(caret)
library(car)
library(readr)

# Set Seed
set.seed(1066)

NAME <- "BASIC_1" # Name of Run (used for save file names)
DATAPATH <- "../Data/users_FE.RDa" # Path to preprocessed data
COMPUTE_IMPORTANCE <- FALSE # Toggle computing importance or not because it is computationally expensive


Attaching package: 'dplyr'

The following object is masked from 'package:xgboost':

    slice

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union


Attaching package: 'data.table'

The following objects are masked from 'package:dplyr':

    between, last

Loading required package: lattice
Loading required package: ggplot2


In [3]:
# Read data
df_all <- read_rds("../Data/users_FE.RDa")

# Convert all factors into characters (so NAs can be replaced)
i <- sapply(df_all, is.factor)
df_all[i] <- lapply(df_all[i], as.character)
# Ensure there are no NA values (makes sparse matrix method fail)
df_all[is.na(df_all)] <- -1

# Extract dataset index and output labels from data
labels <- df_all$country_destination
set <- df_all$dataset

# Remove unwanted features if present
features_rm <- colnames(df_all) %in% c("id", "dataset", "first_browser", "age_cln", "age_cln2", "date_first_booking, X")
dat <- df_all[, !features_rm] %>%
    data.table(keep.rownames = F)

In [4]:
# One - hot encoding 
sparse_dat <- suppressWarnings( # Suppress warnings used to prevent warning messages about factor conversion
    sparse.model.matrix( ~ . -1, data = df_all[,-1])
)

# Split into training and test set
sparse_tr <- sparse_dat[set == "train",]
sparse_ts <- sparse_dat[set == "test",]

In [5]:
# Parameter search using Cross validation
# http://stats.stackexchange.com/questions/171043/how-to-tune-hyperparameters-of-xgboost-trees
# Currently using low number of rounds to test

# set up the cross-validated hyper-parameter search
xgb_grid_1 = expand.grid(
    nrounds = 10,                    # Iterations building each XGB model (100)
    max_depth = c(2, 3, 4, 5),       # Maximum tree depth c(2, 4, 6, 8, 10)
    eta = c(0.01, 0.005, 0.001),     # Learning rate c(0.01, 0.001, 0.0001)
    gamma = 1,                       # Min loss reduction required to make a partition on leaf node [0:inf]
    colsample_bytree = 0.3,          # proportion of features used in each tree c(0.3, 0.5, 0.7)
    min_child_weight = 1
)

# trainControl creates settings for caret::train
xgb_trcontrol_1 = trainControl(
    method = "cv",          # Cross validation
    number = 3,             # number of folds (5)
    verboseIter = TRUE,
    returnData = FALSE,
    returnResamp = "all",   # How many summary stats to save # save losses across all models
    allowParallel = TRUE
)

In [6]:
# XGB requires labels to be numeric indexed at 0. 
#y <- recode(labels,"'NDF'=0; 'US'=1; 'other'=2; 'FR'=3; 'CA'=4; 'GB'=5; 'ES'=6; 'IT'=7; 'PT'=8; 'NL'=9; 'DE'=10; 'AU'=11")

# train xgboost
# "Kappa" metric used for evaluation
 xgb = train(
     x = sparse_tr,
     y = labels[set == "train"],
     trControl = xgb_trcontrol_1,
     tuneGrid = xgb_grid_1,
     method = "xgbTree", 
     metric = "Kappa"
 )

model <- xgb$finalModel

Loading required package: plyr
------------------------------------------------------------------------------
You have loaded plyr after dplyr - this is likely to cause problems.
If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
library(plyr); library(dplyr)
------------------------------------------------------------------------------

Attaching package: 'plyr'

The following objects are masked from 'package:dplyr':

    arrange, count, desc, failwith, id, mutate, rename, summarise,
    summarize



+ Fold1: eta=0.001, max_depth=2, gamma=1, colsample_bytree=0.3, min_child_weight=1, nrounds=10 
- Fold1: eta=0.001, max_depth=2, gamma=1, colsample_bytree=0.3, min_child_weight=1, nrounds=10 
+ Fold1: eta=0.001, max_depth=3, gamma=1, colsample_bytree=0.3, min_child_weight=1, nrounds=10 
- Fold1: eta=0.001, max_depth=3, gamma=1, colsample_bytree=0.3, min_child_weight=1, nrounds=10 
+ Fold1: eta=0.001, max_depth=4, gamma=1, colsample_bytree=0.3, min_child_weight=1, nrounds=10 
- Fold1: eta=0.001, max_depth=4, gamma=1, colsample_bytree=0.3, min_child_weight=1, nrounds=10 
+ Fold1: eta=0.001, max_depth=5, gamma=1, colsample_bytree=0.3, min_child_weight=1, nrounds=10 
- Fold1: eta=0.001, max_depth=5, gamma=1, colsample_bytree=0.3, min_child_weight=1, nrounds=10 
+ Fold1: eta=0.005, max_depth=2, gamma=1, colsample_bytree=0.3, min_child_weight=1, nrounds=10 
- Fold1: eta=0.005, max_depth=2, gamma=1, colsample_bytree=0.3, min_child_weight=1, nrounds=10 
+ Fold1: eta=0.005, max_depth=3, gamma=1

In [7]:
# Plots performance metrics if caret was used 
if(COMPUTE_IMPORTANCE){
# Evaluating importance of features to the model
importance <- xgb.importance(sparse_tr@Dimnames[[2]], 
                             model = model, 
                             data = sparse_tr, 
                             label = labels[set == "train"]
                            )
xgb.plot.importance(importance_matrix = head(importance,30))
}

## Submission
https://www.kaggle.com/indradenbakker/airbnb-recruiting-new-user-bookings/rscript-0-86547/discussion  
As per the example script above this submission file currently just takes the top 5 predictions in order as its submission file.

In [8]:
str(model)
#model <- xgb$finalModel

List of 6
 $ handle     :Class 'xgb.Booster.handle' <externalptr> 
 $ raw        : raw [1:39380] 00 00 00 3f ...
 $ xNames     : chr [1:382] "age_cln" "age_cln2" "age_bucket-1" "age_bucket0-4" ...
 $ problemType: chr "Classification"
 $ tuneValue  :'data.frame':	1 obs. of  6 variables:
  ..$ nrounds         : num 10
  ..$ max_depth       : num 2
  ..$ eta             : num 0.001
  ..$ gamma           : num 1
  ..$ colsample_bytree: num 0.3
  ..$ min_child_weight: num 1
 $ obsLevels  : chr [1:12] "AU" "CA" "DE" "ES" ...
 - attr(*, "class")= chr "xgb.Booster"


In [9]:
# Generate predictions on competition test set. 
# compare prediction to results
source("Generate_submission.R")
final <- submission(model, sparse_ts, df_all[set == "test", "id"], NAME)

head(final$df,20)
head(final$file,20)

Unnamed: 0,V1,V2,V3,V4,V5,id
V1,NDF,US,other,FR,IT,5uwns89zht
V2,NDF,US,other,FR,IT,jtl0dijy2j
V3,NDF,US,other,FR,IT,xx0ulgorjt
V4,NDF,US,other,FR,IT,6c6puo6ix0
V5,NDF,US,other,FR,IT,czqhjk3yfe
V6,NDF,US,other,FR,IT,szx28ujmhf
V7,NDF,US,other,FR,IT,guenkfjcbq
V8,NDF,US,other,FR,IT,tkpq0mlugk
V9,NDF,US,other,FR,IT,3xtgd5p9dn
V10,NDF,US,other,FR,IT,md9aj22l5a


Unnamed: 0,id,country
1,5uwns89zht,NDF
2,5uwns89zht,US
3,5uwns89zht,other
4,5uwns89zht,FR
5,5uwns89zht,IT
6,jtl0dijy2j,NDF
7,jtl0dijy2j,US
8,jtl0dijy2j,other
9,jtl0dijy2j,FR
10,jtl0dijy2j,IT
