In [1]:
# install.packages('pROC')
# install.packages('xgboost')

In [22]:
library(data.table)
library(caret)
library(pROC)
library(plyr)
library(boot)
library(tidyverse)
library(xgboost)

In [35]:
batch_size = 500000
split = 0.7

In [57]:
add_batch_and_factor <- function(data){
    
    data$batch = ceiling((data$time * 10000) / batch_size)
    
    data <- transform(
        data,
        time=as.numeric(time),
        signal=as.numeric(signal),
        open_channels=as.numeric(open_channels),
        batch=as.factor(batch)
    )

    return(data)
}

In [58]:
data = read.csv('train_clean.csv')

In [80]:
data = add_batch_and_factor(data)
data = data[data$batch == 1]
sapply(data, class)

ERROR: Error in `$<-.data.frame`(`*tmp*`, "batch", value = numeric(0)): replacement has 0 rows, data has 5000000


In [74]:
data[sample(nrow(data), 5), ]

2025526
904433
3351142
4128343
4000294


In [75]:
label_levels = unique(data$open_channels)
label_levels

NULL

In [62]:
train_data_features = data[c('time', 'signal')]
train_data_labels = data['open_channels']

In [63]:
train_data_features_matrix = data.matrix(train_data_features)
train_data_labels_matrix =  data.matrix(train_data_labels)

In [64]:
numberOfTrainingSamples <- round(length(data) * split)

# training data
train_data <- train_data_features_matrix[1:numberOfTrainingSamples,]
train_labels <- train_data_labels_matrix[1:numberOfTrainingSamples]

# testing data
test_data <- train_data_features_matrix[-(1:numberOfTrainingSamples),]
test_labels <- train_data_labels_matrix[-(1:numberOfTrainingSamples)]

In [65]:
xgb.train <- xgb.DMatrix(data = train_data, label = train_labels)
xgb.test <- xgb.DMatrix(data = test_data, label = test_labels)

In [66]:
params = list(
  booster="gbtree",
  eta=0.001,
  max_depth=5,
  gamma=3,
  subsample=0.75,
  colsample_bytree=1,
  objective="multi:softprob",
  eval_metric="mlogloss",
  num_class=length(label_levels)
)

In [67]:
xgb.fit=xgb.train(
    params=params,
    data=xgb.train,
    nrounds=10000,
    nthreads=1,
    early_stopping_rounds=10,
    watchlist=list(val1=xgb.train,val2=xgb.test),
    verbose=0
)

In [68]:
xgb.fit

##### xgb.Booster
raw: 119.9 Kb 
call:
  xgb.train(params = params, data = xgb.train, nrounds = 10000, 
    watchlist = list(val1 = xgb.train, val2 = xgb.test), verbose = 0, 
    early_stopping_rounds = 10, nthreads = 1)
params (as set within xgb.train):
  booster = "gbtree", eta = "0.001", max_depth = "5", gamma = "3", subsample = "0.75", colsample_bytree = "1", objective = "multi:softprob", eval_metric = "mlogloss", num_class = "11", nthreads = "1", silent = "1"
xgb.attributes:
  best_iteration, best_msg, best_ntreelimit, best_score, niter
callbacks:
  cb.evaluation.log()
  cb.early.stop(stopping_rounds = early_stopping_rounds, maximize = maximize, 
    verbose = verbose)
# of features: 2 
niter: 59
best_iteration : 49 
best_ntreelimit : 49 
best_score : 2.370807 
nfeatures : 2 
evaluation_log:
    iter val1_mlogloss val2_mlogloss
       1      2.396112      2.383826
       2      2.394732      2.383814
---                                 
      58      2.311654      2.380288
      5

In [69]:
# Predict outcomes with the test data
xgb.pred = predict(xgb.fit,test_data,reshape=T)
xgb.pred = as.data.frame(xgb.pred)
colnames(xgb.pred) = label_levels

In [70]:
xgb.pred[sample(nrow(xgb.pred), 5), ]

Unnamed: 0,0,1,3,2,10,9,8,7,6,5,4
4970596,0.09811977,0.09023055,0.09017438,0.09020131,0.09020481,0.0901709,0.09021422,0.09016385,0.09016264,0.09018956,0.09016795
3031942,0.09811977,0.09023055,0.09017438,0.09020131,0.09020481,0.0901709,0.09021422,0.09016385,0.09016264,0.09018956,0.09016795
3930986,0.09811977,0.09023055,0.09017438,0.09020131,0.09020481,0.0901709,0.09021422,0.09016385,0.09016264,0.09018956,0.09016795
3068827,0.09811977,0.09023055,0.09017438,0.09020131,0.09020481,0.0901709,0.09021422,0.09016385,0.09016264,0.09018956,0.09016795
1200528,0.09811977,0.09023055,0.09017438,0.09020131,0.09020481,0.0901709,0.09021422,0.09016385,0.09016264,0.09018956,0.09016795


In [71]:
# Use the predicted label with the highest probability
xgb.pred$prediction = apply(xgb.pred,1,function(x) colnames(xgb.pred)[which.max(x)])
xgb.pred$label = label_levels[test_labels+1]

In [72]:
result = sum(xgb.pred$prediction==xgb.pred$label)/nrow(xgb.pred)
print(paste("Final Accuracy =",sprintf("%1.2f%%", 100*result)))

[1] "Final Accuracy = 24.80%"
