# XGBoost

* Trains an XGBoost model on the training data.
* Uses a prebuilt set of parameters so that changes can be iterated on and tested for improvements quickly.

**Dataset must be in the following format:**
1. Test and Training data must be combined into one data frame
2. A variable called "dataset" with factor levels "test" and "train" must denote which dataset each observation comes from.
3. Preferably no NA values (replace with a character or -1 etc.)
4. Any features which are numeric are treated as numeric; all other features are treated as factors. (Numeric means decisions can be made continuously.)

In [None]:
# Libraries
library(xgboost)
library(dplyr)
library(Matrix)
library(data.table)
library(Ckmeans.1d.dp)
library(e1071)
library(caret)
library(car)
library(readr)

# Seed R's random number generator
set.seed(1066)

# Name of this run (used for save file names)
NAME <- "BASIC_1"

# Path to the preprocessed data
DATAPATH <- "../Data/users_FE.RDa"

# Whether to compute feature importance; off by default because it is
# computationally expensive
COMPUTE_IMPORTANCE <- FALSE

In [None]:
# Read the preprocessed data from the configured path
df_all <- read_rds(DATAPATH)

# Convert all factor columns to character so that NA values can be replaced
# (assigning a new value into a factor would require it to be a level)
is_factor_col <- vapply(df_all, is.factor, logical(1))
df_all[is_factor_col] <- lapply(df_all[is_factor_col], as.character)

# Ensure there are no NA values (they make the sparse matrix method fail)
df_all[is.na(df_all)] <- -1

# Extract the output labels and the train/test indicator before any
# columns are dropped
labels <- df_all$country_destination
set <- df_all$dataset

# Remove unwanted features if present. Each column name must be its own
# string; names that do not exist in the data are simply ignored by %in%.
# NOTE(review): "country_destination" is not in this list, so the label
# column remains in `dat` - confirm that this is intended.
unwanted_features <- c(
    "id", "dataset", "first_browser", "age_cln", "age_cln2",
    "date_first_booking", "X"
)
features_rm <- colnames(df_all) %in% unwanted_features
dat <- df_all[, !features_rm, drop = FALSE] %>%
    data.table(keep.rownames = FALSE)

In [None]:
# One-hot encoding: build a sparse model matrix with no intercept column
# ("- 1") from every column of df_all except the first one.
# suppressWarnings() hides the warning messages about factor conversion.
# NOTE(review): the input is df_all[, -1], not a feature-filtered table, so
# columns such as "dataset" and "country_destination" (both read from df_all
# earlier in this notebook) appear to be encoded as features - confirm that
# this is intended.
sparse_dat <- suppressWarnings(
    sparse.model.matrix(~ . - 1, data = df_all[, -1])
)

# Split the encoded rows into the training set and the test set using the
# train/test indicator vector
sparse_tr <- sparse_dat[set == "train", ]
sparse_ts <- sparse_dat[set == "test", ]

In [None]:
# XGBoost requires class labels to be numeric and indexed from 0, so map
# each destination code to an index. car::recode is called explicitly:
# dplyr (also attached) exports its own recode() with a different syntax,
# and which one is found would otherwise depend on library() order.
y <- car::recode(
    labels,
    "'NDF'=0; 'US'=1; 'other'=2; 'FR'=3; 'CA'=4; 'GB'=5; 'ES'=6; 'IT'=7; 'PT'=8; 'NL'=9; 'DE'=10; 'AU'=11"
)

# Train xgboost on the training rows only.
# `nrounds` is spelled out in full rather than relying on partial argument
# matching of `nround`.
model <- xgboost(
    data = sparse_tr,
    label = y[set == "train"],
    eta = 0.1,
    max_depth = 9,
    nrounds = 25,
    subsample = 0.5,
    colsample_bytree = 0.5,
    eval_metric = "merror",
    objective = "multi:softprob",
    num_class = 12, # one class per destination code recoded above
    nthread = 3
)

In [None]:
# Print the internal structure of the sparse training matrix for inspection
str(sparse_tr)

In [None]:
# Optionally compute and plot the importance of features to the model.
# Guarded by COMPUTE_IMPORTANCE because it is computationally expensive.
if (COMPUTE_IMPORTANCE) {
    # Feature names are the column names of the one-hot encoded training
    # matrix. Arguments are passed by name, and the deprecated `data` /
    # `label` arguments are omitted.
    importance <- xgb.importance(
        feature_names = colnames(sparse_tr),
        model = model
    )

    # Plot only the first 30 rows of the importance table
    xgb.plot.importance(importance_matrix = head(importance, 30))
}

## Submission
https://www.kaggle.com/indradenbakker/airbnb-recruiting-new-user-bookings/rscript-0-86547/discussion  
As in the example script above, the submission currently just takes the top 5 predictions, in order, for each user.

In [None]:
# Generate predictions on the competition test set.
# Generate_submission.R presumably defines submission() - verify against
# that script. It is called with the trained model, the encoded test rows,
# the ids of the test observations and the run name.
source("Generate_submission.R")
final <- submission(
    model,
    sparse_ts,
    df_all[set == "test", "id"],
    NAME
)

# Preview the first 20 entries of each returned component
head(final$df, 20)
head(final$file, 20)