# XGBoost

* Performs xgboost on training data. 
* Performs on a prebuilt set of parameters to quickly iterate and test for improvements. 

In [1]:
# Libraries
library(xgboost)
library(dplyr)
library(Matrix)
library(data.table)
library(Ckmeans.1d.dp)
library(e1071)
library(caret)
library(car)
library(readr)
library(stringr)

# Set Seed
set.seed(1066)

NAME <- "BASIC_1" # Name of Run (used for save file names)
DATAPATH <- "../Data/users_FE.RDa" # Path to preprocessed data
COMPUTE_IMPORTANCE <- FALSE # Toggle computing importance or not because it is computationally expensive


Attaching package: 'dplyr'

The following object is masked from 'package:xgboost':

    slice

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union


Attaching package: 'data.table'

The following objects are masked from 'package:dplyr':

    between, last

Loading required package: lattice
Loading required package: ggplot2


** Currently remove date features because of large number of factors **  

In [2]:
# load data
df_train = read_csv("../Data/train_users_2.csv")
df_test = read_csv("../Data/test_users.csv")
labels = df_train['country_destination']
df_train = df_train[-grep('country_destination', colnames(df_train))]

# combine train and test data
df_all = rbind(df_train,df_test)
# remove date_first_booking
df_all = df_all[-c(which(colnames(df_all) %in% c('date_first_booking')))]
# replace missing values
df_all[is.na(df_all)] <- -1

# split date_account_created in year, month and day
dac = as.data.frame(str_split_fixed(df_all$date_account_created, '-', 3))
df_all['dac_year'] = dac[,1]
df_all['dac_month'] = dac[,2]
df_all['dac_day'] = dac[,3]
df_all = df_all[,-c(which(colnames(df_all) %in% c('date_account_created')))]

# split timestamp_first_active in year, month and day
df_all['tfa_year'] = substring(as.character(df_all$timestamp_first_active), 1, 4)
df_all['tfa_month'] = substring(as.character(df_all$timestamp_first_active), 5, 6)
df_all['tfa_day'] = substring(as.character(df_all$timestamp_first_active), 7, 8)
df_all = df_all[,-c(which(colnames(df_all) %in% c('timestamp_first_active')))]

# clean Age by removing values
df_all[df_all$age < 14 | df_all$age > 100,'age'] <- -1

In [16]:
df_raw <- read_csv("../Data/users_PP.csv") 
labels <- df_raw$country_destination

df_train <- filter(df_raw, dataset == "train")
df_test <- filter(df_raw, dataset == "test")

df_all <- df_raw %>% select(-X, - dataset, -age_cln, -age_cln2)

dim(df_all)
dim(df_raw)

In [17]:
# One-hot encoding  
# https://cran.r-project.org/web/packages/xgboost/vignettes/discoverYourData.html
sparse_dat <- sparse.model.matrix(country_destination ~ . -1, data = df_all)

dim(sparse_dat)
# Find the training set
sparse_tr <- sparse_dat[df_all$id %in% df_train$id,]
sparse_ts <- sparse_dat[df_all$id %in% df_test$id,]

In sparse.model.matrix(country_destination ~ . - 1, data = df_all): variable 'first_browser' converted to a factor

ERROR: Error in intI(i, n = x@Dim[1], dn[[1]], give.dn = FALSE): logical subscript too long (275547, should be 133869)


ERROR: Error in intI(i, n = x@Dim[1], dn[[1]], give.dn = FALSE): logical subscript too long (275547, should be 133869)


In [4]:
# split train and test
y <- recode(labels$country_destination,
            "'NDF'=0; 'US'=1; 'other'=2; 'FR'=3; 'CA'=4; 'GB'=5; 'ES'=6; 'IT'=7; 'PT'=8; 'NL'=9; 'DE'=10; 'AU'=11")

# train xgboost
xgb <- xgboost(data = sparse_tr, #data.matrix(X[,-1]), 
               label = y, 
               eta = 0.1,
               max_depth = 9, 
               nround=25, 
               subsample = 0.5,
               colsample_bytree = 0.5,
               eval_metric = "merror",
               objective = "multi:softprob",
               num_class = 12,
               nthread = 3
)

[0]	train-merror:0.410670
[1]	train-merror:0.383802
[2]	train-merror:0.389298
[3]	train-merror:0.384205
[4]	train-merror:0.382744
[5]	train-merror:0.383067
[6]	train-merror:0.381362
[7]	train-merror:0.378101
[8]	train-merror:0.374957
[9]	train-merror:0.373083
[10]	train-merror:0.371870
[11]	train-merror:0.369794
[12]	train-merror:0.366529
[13]	train-merror:0.365808
[14]	train-merror:0.364416
[15]	train-merror:0.362308
[16]	train-merror:0.361947
[17]	train-merror:0.361137
[18]	train-merror:0.360832
[19]	train-merror:0.360237
[20]	train-merror:0.360298
[21]	train-merror:0.359535
[22]	train-merror:0.358883
[23]	train-merror:0.357979
[24]	train-merror:0.357707


## Submission
https://www.kaggle.com/indradenbakker/airbnb-recruiting-new-user-bookings/rscript-0-86547/discussion  
As per the example script above this submission file currently just takes the top 5 predictions in order as its submission file.

In [5]:
# Generate predictions on competition test set. 
# compare prediction to results
source("Generate_submission.R")
final <- submission(xgb, sparse_ts, df_test$id, "temp")

head(final$df,20)
head(final$file,20)

Unnamed: 0,V1,V2,V3,V4,V5,id
V1,NDF,US,other,ES,FR,5uwns89zht
V2,NDF,US,other,FR,IT,jtl0dijy2j
V3,NDF,US,other,FR,IT,xx0ulgorjt
V4,NDF,US,other,FR,IT,6c6puo6ix0
V5,NDF,US,other,FR,IT,czqhjk3yfe
V6,US,NDF,other,FR,IT,szx28ujmhf
V7,NDF,US,other,FR,ES,guenkfjcbq
V8,NDF,US,other,FR,IT,tkpq0mlugk
V9,NDF,US,other,FR,IT,3xtgd5p9dn
V10,NDF,US,other,FR,IT,md9aj22l5a


Unnamed: 0,id,country
1,5uwns89zht,NDF
2,5uwns89zht,US
3,5uwns89zht,other
4,5uwns89zht,ES
5,5uwns89zht,FR
6,jtl0dijy2j,NDF
7,jtl0dijy2j,US
8,jtl0dijy2j,other
9,jtl0dijy2j,FR
10,jtl0dijy2j,IT
