In [1]:
# install.packages('pROC')
# install.packages('xgboost')

In [22]:
library(data.table)
library(caret)
library(pROC)
library(plyr)
library(boot)
library(tidyverse)
library(xgboost)

In [35]:
batch_size = 500000
split = 0.7

In [81]:
add_batch_and_factor <- function(data){
    
    data$batch = ceiling((data$time * 10000) / batch_size)
    
    data <- transform(
        data,
        time=as.numeric(time),
        signal=as.numeric(signal),
        open_channels=as.numeric(open_channels),
        batch=as.factor(batch)
    )

    return(data)
}

In [82]:
data = read.csv('train_clean.csv')

In [83]:
data = add_batch_and_factor(data)
sapply(data, class)

In [88]:
data = data[data$batch == 1, ]

In [89]:
data[sample(nrow(data), 5), ]

Unnamed: 0,time,signal,open_channels,batch
131526,13.1526,-2.6117,0,1
113785,11.3785,-2.9132,0,1
359368,35.9368,-2.6331,0,1
14304,1.4304,-2.7789,0,1
292274,29.2274,-2.3129,0,1


In [90]:
label_levels = unique(data$open_channels)
label_levels

In [91]:
train_data_features = data[c('time', 'signal')]
train_data_labels = data['open_channels']

In [92]:
train_data_features_matrix = data.matrix(train_data_features)
train_data_labels_matrix =  data.matrix(train_data_labels)

In [93]:
numberOfTrainingSamples <- round(length(data) * split)

# training data
train_data <- train_data_features_matrix[1:numberOfTrainingSamples,]
train_labels <- train_data_labels_matrix[1:numberOfTrainingSamples]

# testing data
test_data <- train_data_features_matrix[-(1:numberOfTrainingSamples),]
test_labels <- train_data_labels_matrix[-(1:numberOfTrainingSamples)]

In [94]:
xgb.train <- xgb.DMatrix(data = train_data, label = train_labels)
xgb.test <- xgb.DMatrix(data = test_data, label = test_labels)

In [95]:
params = list(
  booster="gbtree",
  eta=0.001,
  max_depth=5,
  gamma=3,
  subsample=0.75,
  colsample_bytree=1,
  objective="multi:softprob",
  eval_metric="mlogloss",
  num_class=length(label_levels)
)

In [96]:
xgb.fit=xgb.train(
    params=params,
    data=xgb.train,
    nrounds=10000,
    nthreads=1,
    early_stopping_rounds=10,
    watchlist=list(val1=xgb.train,val2=xgb.test),
    verbose=0
)

In [97]:
xgb.fit

##### xgb.Booster
raw: 1.7 Mb 
call:
  xgb.train(params = params, data = xgb.train, nrounds = 10000, 
    watchlist = list(val1 = xgb.train, val2 = xgb.test), verbose = 0, 
    early_stopping_rounds = 10, nthreads = 1)
params (as set within xgb.train):
  booster = "gbtree", eta = "0.001", max_depth = "5", gamma = "3", subsample = "0.75", colsample_bytree = "1", objective = "multi:softprob", eval_metric = "mlogloss", num_class = "2", nthreads = "1", silent = "1"
xgb.attributes:
  best_iteration, best_msg, best_ntreelimit, best_score, niter
callbacks:
  cb.evaluation.log()
  cb.early.stop(stopping_rounds = early_stopping_rounds, maximize = maximize, 
    verbose = verbose)
# of features: 2 
niter: 4609
best_iteration : 4599 
best_ntreelimit : 4599 
best_score : 0.164249 
nfeatures : 2 
evaluation_log:
    iter val1_mlogloss val2_mlogloss
       1      0.692597      0.692588
       2      0.692098      0.691789
---                                 
    4608      0.088781      0.164269
    

In [98]:
# Predict outcomes with the test data
xgb.pred = predict(xgb.fit,test_data,reshape=T)
xgb.pred = as.data.frame(xgb.pred)
colnames(xgb.pred) = label_levels

In [99]:
xgb.pred[sample(nrow(xgb.pred), 5), ]

Unnamed: 0,0,1
84909,0.9148567,0.08514336
227858,0.9148567,0.08514336
492302,0.9148567,0.08514336
413639,0.9148567,0.08514336
334193,0.9148567,0.08514336


In [100]:
# Use the predicted label with the highest probability
xgb.pred$prediction = apply(xgb.pred,1,function(x) colnames(xgb.pred)[which.max(x)])
xgb.pred$label = label_levels[test_labels+1]

In [101]:
result = sum(xgb.pred$prediction==xgb.pred$label)/nrow(xgb.pred)
print(paste("Final Accuracy =",sprintf("%1.2f%%", 100*result)))

[1] "Final Accuracy = 96.83%"
