In [33]:
# install.packages('pROC')
# install.packages('xgboost')

In [110]:
library(data.table)
library(caret)
library(pROC)
library(plyr)
library(boot)
library(tidyverse)
library(xgboost)

In [118]:
batch_size = 500000
split = 0.7

In [139]:
add_batch_and_factor <- function(data){
    
    data$batch = ceiling((data$time * 10000) / batch_size) - 1
    
    data <- transform(
        data,
        time=as.numeric(time),
        signal=as.numeric(signal),
        open_channels=as.integer(open_channels),
        batch=as.factor(batch)
    )

    return(data)
}

In [149]:
train_data = read.csv('train_clean.csv')

In [150]:
train_data = add_batch_and_factor(train_data)
sapply(train_data, class)

In [151]:
train_data[sample(nrow(train_data), 5), ]

Unnamed: 0,time,signal,open_channels,batch
4690005,469.0005,4.99116,8,9
523963,52.3963,-2.441979,0,1
2718650,271.865,-0.3969,2,5
1152873,115.2873,-1.2114,1,2
4938580,493.858,2.981873,7,9


In [152]:
unique(train_data$batch)
label_levels = train_data$open_channels

In [153]:
train_data_features = train_data[c('signal', 'batch', 'time')]
train_data_labels = train_data['open_channels']

In [154]:
train_data_features_matrix = data.matrix(train_data_features)
train_data_labels_matrix =  data.matrix(train_data_labels)

In [155]:
numberOfTrainingSamples <- round(length(train_data) * split)

# training data
train_data <- train_data_features_matrix[1:numberOfTrainingSamples,]
train_labels <- train_data_labels_matrix[1:numberOfTrainingSamples]

# testing data
test_data <- train_data_features_matrix[-(1:numberOfTrainingSamples),]
test_labels <- train_data_labels_matrix[-(1:numberOfTrainingSamples)]

In [172]:
xgb.train <- xgb.DMatrix(data = train_data, label = train_labels)
xgb.test <- xgb.DMatrix(data = test_data, label = test_labels)

In [176]:
params = list(
  booster="gbtree",
  eta=0.001,
  max_depth=5,
  gamma=3,
  subsample=0.75,
  colsample_bytree=1,
  objective="multi:softprob",
  eval_metric="mlogloss",
  num_class=length(unique(label_levels))
)

In [177]:
xgb.fit=xgb.train(
    params=params,
    data=xgb.train,
    nrounds=10000,
    nthreads=1,
    early_stopping_rounds=10,
    watchlist=list(val1=xgb.train,val2=xgb.test),
    verbose=0
)

In [178]:
xgb.fit

##### xgb.Booster
raw: 134 Kb 
call:
  xgb.train(params = params, data = xgb.train, nrounds = 10000, 
    watchlist = list(val1 = xgb.train, val2 = xgb.test), verbose = 0, 
    early_stopping_rounds = 10, nthreads = 1)
params (as set within xgb.train):
  booster = "gbtree", eta = "0.001", max_depth = "5", gamma = "3", subsample = "0.75", colsample_bytree = "1", objective = "multi:softprob", eval_metric = "mlogloss", num_class = "11", nthreads = "1", silent = "1"
xgb.attributes:
  best_iteration, best_msg, best_ntreelimit, best_score, niter
callbacks:
  cb.evaluation.log()
  cb.early.stop(stopping_rounds = early_stopping_rounds, maximize = maximize, 
    verbose = verbose)
# of features: 3 
niter: 66
best_iteration : 56 
best_ntreelimit : 56 
best_score : 2.370809 
nfeatures : 3 
evaluation_log:
    iter val1_mlogloss val2_mlogloss
       1      2.396124      2.383826
       2      2.395287      2.383817
---                                 
      65      2.308259      2.380141
      66 