In [33]:
# install.packages('pROC')
# install.packages('xgboost')

In [3]:
library(data.table)
library(caret)
library(pROC)
library(plyr)
library(boot)
library(tidyverse)
library(xgboost)

In [4]:
batch_size = 500000
split = 0.7

In [5]:
add_batch_and_factor <- function(data){
    
    data$batch = ceiling((data$time * 10000) / batch_size) - 1
    
    data <- transform(
        data,
        time=as.numeric(time),
        signal=as.numeric(signal),
        open_channels=as.integer(open_channels),
        batch=as.factor(batch)
    )

    return(data)
}

In [6]:
train_data = read.csv('train_clean.csv')

In [7]:
train_data = add_batch_and_factor(train_data)
sapply(train_data, class)

In [8]:
train_data[sample(nrow(train_data), 5), ]

Unnamed: 0,time,signal,open_channels,batch
1251238,125.1238,-1.3132,1,2
1977393,197.7393,-1.4323,1,3
1578636,157.8636,0.6499,3,3
1270814,127.0814,-1.3927,1,2
2236367,223.6367,2.7507,7,4


In [9]:
unique(train_data$batch)
label_levels = train_data$open_channels

In [10]:
train_data_features = train_data[c('signal', 'batch', 'time')]
train_data_labels = train_data['open_channels']

In [11]:
train_data_features_matrix = data.matrix(train_data_features)
train_data_labels_matrix =  data.matrix(train_data_labels)

In [12]:
numberOfTrainingSamples <- round(length(train_data) * split)

# training data
train_data <- train_data_features_matrix[1:numberOfTrainingSamples,]
train_labels <- train_data_labels_matrix[1:numberOfTrainingSamples]

# testing data
test_data <- train_data_features_matrix[-(1:numberOfTrainingSamples),]
test_labels <- train_data_labels_matrix[-(1:numberOfTrainingSamples)]

In [13]:
xgb.train <- xgb.DMatrix(data = train_data, label = train_labels)
xgb.test <- xgb.DMatrix(data = test_data, label = test_labels)

In [14]:
params = list(
  booster="gbtree",
  eta=0.001,
  max_depth=5,
  gamma=3,
  subsample=0.75,
  colsample_bytree=1,
  objective="multi:softprob",
  eval_metric="mlogloss",
  num_class=length(unique(label_levels))
)

In [15]:
xgb.fit=xgb.train(
    params=params,
    data=xgb.train,
    nrounds=10000,
    nthreads=1,
    early_stopping_rounds=10,
    watchlist=list(val1=xgb.train,val2=xgb.test),
    verbose=0
)

In [16]:
xgb.fit

##### xgb.Booster
raw: 127.9 Kb 
call:
  xgb.train(params = params, data = xgb.train, nrounds = 10000, 
    watchlist = list(val1 = xgb.train, val2 = xgb.test), verbose = 0, 
    early_stopping_rounds = 10, nthreads = 1)
params (as set within xgb.train):
  booster = "gbtree", eta = "0.001", max_depth = "5", gamma = "3", subsample = "0.75", colsample_bytree = "1", objective = "multi:softprob", eval_metric = "mlogloss", num_class = "11", nthreads = "1", silent = "1"
xgb.attributes:
  best_iteration, best_msg, best_ntreelimit, best_score, niter
callbacks:
  cb.evaluation.log()
  cb.early.stop(stopping_rounds = early_stopping_rounds, maximize = maximize, 
    verbose = verbose)
# of features: 3 
niter: 63
best_iteration : 53 
best_ntreelimit : 53 
best_score : 2.37085 
nfeatures : 3 
evaluation_log:
    iter val1_mlogloss val2_mlogloss
       1      2.397052      2.383855
       2      2.396229      2.383826
---                                 
      62      2.311935      2.380293
      63

In [20]:
# Predict outcomes with the test data
xgb.pred = predict(xgb.fit,test_data,reshape=T)
xgb.pred = as.data.frame(xgb.pred)
colnames(xgb.pred) = unique(label_levels)

In [21]:
head(xgb.pred)

0,1,3,2,10,9,8,7,6,5,4
0.09787406,0.09022623,0.09021507,0.09020449,0.09021686,0.09015597,0.09026237,0.09020745,0.09021912,0.09020568,0.09021274
0.09787406,0.09022623,0.09021507,0.09020449,0.09021686,0.09015597,0.09026237,0.09020745,0.09021912,0.09020568,0.09021274
0.09787406,0.09022623,0.09021507,0.09020449,0.09021686,0.09015597,0.09026237,0.09020745,0.09021912,0.09020568,0.09021274
0.09787406,0.09022623,0.09021507,0.09020449,0.09021686,0.09015597,0.09026237,0.09020745,0.09021912,0.09020568,0.09021274
0.09787406,0.09022623,0.09021507,0.09020449,0.09021686,0.09015597,0.09026237,0.09020745,0.09021912,0.09020568,0.09021274
0.09787406,0.09022623,0.09021507,0.09020449,0.09021686,0.09015597,0.09026237,0.09020745,0.09021912,0.09020568,0.09021274


In [22]:
# Use the predicted label with the highest probability
xgb.pred$prediction = apply(xgb.pred,1,function(x) colnames(xgb.pred)[which.max(x)])
xgb.pred$label = unique(label_levels)[test_labels+1]

In [23]:
result = sum(xgb.pred$prediction==xgb.pred$label)/nrow(xgb.pred)
print(paste("Final Accuracy =",sprintf("%1.2f%%", 100*result)))

[1] "Final Accuracy = 24.80%"
