# XGBoost

* Performs xgboost on training data. 
* Performs on a prebuilt set of parameters to quickly iterate and test for improvements. 

** Dataset must be in the following format:  **
1. Test and Training data must be combined into one data frame
2. A variable called "dataset" with factors "test" and "train" must denote which  dataset each observation comes from. 
3. Preferrably no NA values (replace with a character or -1 etc.) 
4. Any features which are numeric are treated as numeric, all other features treated as factors. (numeric means decisions can be made continuously)

In [1]:
# Libraries
library(xgboost)
library(dplyr)
library(Matrix)
library(data.table)
library(Ckmeans.1d.dp)
library(e1071)
library(caret)
library(car)
library(readr)

# Set Seed
set.seed(1066)

NAME <- "BASIC_1" # Name of Run (used for save file names)
DATAPATH <- "../Data/users_FE.RDa" # Path to preprocessed data
COMPUTE_IMPORTANCE <- FALSE # Toggle computing importance or not because it is computationally expensive


Attaching package: 'dplyr'

The following object is masked from 'package:xgboost':

    slice

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union


Attaching package: 'data.table'

The following objects are masked from 'package:dplyr':

    between, last

Loading required package: lattice
Loading required package: ggplot2


In [2]:
# Read data
df_all <- read_rds("../Data/users_FE.RDa")

# Convert all factors into characters (so NAs can be replaced)
i <- sapply(df_all, is.factor)
df_all[i] <- lapply(df_all[i], as.character)
# Ensure there are no NA values (makes sparse matrix method fail)
df_all[is.na(df_all)] <- -1

# Extract dataset index and output labels from data
labels <- df_all$country_destination
set <- df_all$dataset

# Remove unwanted features if present
features_rm <- colnames(dat_all) %in% c("id", "dataset", "first_browser", "age_cln", "age_cln2", "date_first_booking, X")
dat <- dat_all[, !features_rm] %>%
    data.table(keep.rownames = F)

ERROR: Error in is.data.frame(x): object 'dat_all' not found


ERROR: Error in eval(expr, envir, enclos): object 'dat_all' not found


In [3]:
# One - hot encoding 
sparse_dat <- suppressWarnings( # Suppress warnings used to prevent warning messages about factor conversion
    sparse.model.matrix( ~ . -1, data = df_all[,-1])
)

# Split into training and test set
sparse_tr <- sparse_dat[set == "train",]
sparse_ts <- sparse_dat[set == "test",]

In [4]:
# XGB requires labels to be numeric indexed at 0. 
y <- recode(labels,"'NDF'=0; 'US'=1; 'other'=2; 'FR'=3; 'CA'=4; 'GB'=5; 'ES'=6; 'IT'=7; 'PT'=8; 'NL'=9; 'DE'=10; 'AU'=11")

# train xgboost
model <- xgboost(data = sparse_tr, #data.matrix(X[,-1]), 
               label = y[set == "train"], 
               eta = 0.1,
               max_depth = 9, 
               nround=25, 
               subsample = 0.5,
               colsample_bytree = 0.5,
               eval_metric = "merror",
               objective = "multi:softprob",
               num_class = 12,
               nthread = 3
)

[0]	train-merror:0.051159
[1]	train-merror:0.062483
[2]	train-merror:0.011586
[3]	train-merror:0.000918
[4]	train-merror:0.000000
[5]	train-merror:0.000000
[6]	train-merror:0.000000
[7]	train-merror:0.000000
[8]	train-merror:0.000000
[9]	train-merror:0.000000
[10]	train-merror:0.000000
[11]	train-merror:0.000000
[12]	train-merror:0.000000
[13]	train-merror:0.000000
[14]	train-merror:0.000000
[15]	train-merror:0.000000
[16]	train-merror:0.000000
[17]	train-merror:0.000000
[18]	train-merror:0.000000
[19]	train-merror:0.000000
[20]	train-merror:0.000000
[21]	train-merror:0.000000
[22]	train-merror:0.000000
[23]	train-merror:0.000000
[24]	train-merror:0.000000


In [6]:
# Plots performance metrics if caret was used 
if(COMPUTE_IMPORTANCE){
# Evaluating importance of features to the model
importance <- xgb.importance(sparse_tr@Dimnames[[2]], 
                             model = model, 
                             data = sparse_tr, 
                             label = labels[set == "train"]
                            )
xgb.plot.importance(importance_matrix = head(importance,30))
}

## Submission
https://www.kaggle.com/indradenbakker/airbnb-recruiting-new-user-bookings/rscript-0-86547/discussion  
As per the example script above this submission file currently just takes the top 5 predictions in order as its submission file.

In [10]:
# Generate predictions on competition test set. 
# compare prediction to results
source("Generate_submission.R")
final <- submission(model, sparse_ts, df_all[set == "test", "id"], NAME)

head(final$df,20)
head(final$file,20)

Unnamed: 0,V1,V2,V3,V4,V5,id
V1,NDF,US,other,FR,DE,5uwns89zht
V2,NDF,US,other,FR,DE,jtl0dijy2j
V3,NDF,US,other,FR,DE,xx0ulgorjt
V4,NDF,US,other,FR,DE,6c6puo6ix0
V5,NDF,US,other,FR,DE,czqhjk3yfe
V6,NDF,US,other,FR,DE,szx28ujmhf
V7,NDF,US,other,FR,DE,guenkfjcbq
V8,NDF,US,other,FR,DE,tkpq0mlugk
V9,NDF,US,other,FR,DE,3xtgd5p9dn
V10,NDF,US,other,FR,DE,md9aj22l5a


Unnamed: 0,id,country
1,5uwns89zht,NDF
2,5uwns89zht,US
3,5uwns89zht,other
4,5uwns89zht,FR
5,5uwns89zht,DE
6,jtl0dijy2j,NDF
7,jtl0dijy2j,US
8,jtl0dijy2j,other
9,jtl0dijy2j,FR
10,jtl0dijy2j,DE


In [11]:
str(model)

List of 2
 $ handle:Class 'xgb.Booster.handle' <externalptr> 
 $ raw   : raw [1:580892] 00 00 00 3f ...
 - attr(*, "class")= chr "xgb.Booster"
