In [23]:
# install.packages('pROC')
# install.packages('xgboost')

In [46]:
library(data.table)
library(caret)
library(pROC)
library(plyr)
library(boot)
library(tidyverse)
library(xgboost)

In [47]:
batch_size = 500000
split = 0.7

In [48]:
add_batch_and_factor <- function(data){
    
    data$batch = ceiling((data$time * 10000) / batch_size)
    
    data <- transform(
        data,
        time=as.numeric(time),
        signal=as.numeric(signal),
        open_channels=as.numeric(open_channels),
        batch=as.factor(batch)
    )

    return(data)
}

In [101]:
apply_algorithm <- function(raw_data, bath){
    
    data = raw_data[raw_data$batch == batch, ]
    
    if ((0 %in% data$open_channels) == FALSE) {
        data$open_channels = data$open_channels - 1
    }

    label_levels = unique(data$open_channels)
    
    params = list(
      booster="gbtree",
      eta=0.001,
      max_depth=5,
      gamma=3,
      subsample=0.75,
      colsample_bytree=1,
      objective="multi:softprob",
      eval_metric="mlogloss",
      num_class=length(label_levels)
    )

    train_data_features = data[features]
    train_data_labels = data[label]

    train_data_features_matrix = data.matrix(train_data_features)
    train_data_labels_matrix =  data.matrix(train_data_labels)

    numberOfTrainingSamples <- round(length(data) * split)

    # training data
    train_data <- train_data_features_matrix[1:numberOfTrainingSamples,]
    train_labels <- train_data_labels_matrix[1:numberOfTrainingSamples]

    # testing data
    test_data <- train_data_features_matrix[-(1:numberOfTrainingSamples),]
    test_labels <- train_data_labels_matrix[-(1:numberOfTrainingSamples)]

    xgb.train <- xgb.DMatrix(data = train_data, label = train_labels)
    xgb.test <- xgb.DMatrix(data = test_data, label = test_labels)

    xgb.fit=xgb.train(
        params=params,
        data=xgb.train,
        nrounds=10000,
        nthreads=1,
        early_stopping_rounds=10,
        watchlist=list(val1=xgb.train,val2=xgb.test),
        verbose=0
    )

    xgb.pred = predict(xgb.fit,test_data,reshape=T)
    xgb.pred = as.data.frame(xgb.pred)
    colnames(xgb.pred) = label_levels

    xgb.pred$prediction = apply(xgb.pred,1,function(x) colnames(xgb.pred)[which.max(x)])
    xgb.pred$label = label_levels[test_labels+1]

    result = sum(xgb.pred$prediction==xgb.pred$label)/nrow(xgb.pred)
    print(paste("Final Accuracy =",sprintf("%1.2f%%", 100*result)))

}

In [50]:
raw_data = read.csv('train_clean.csv')

In [51]:
raw_data = add_batch_and_factor(raw_data)
sapply(raw_data, class)

In [102]:
features = c('time', 'signal')
label = 'open_channels'

raw_data = add_batch_and_factor(raw_data)
batches = unique(raw_data$batch)
    
for (batch in batches){
    apply_algorithm(raw_data, batch)
}

[1] "Final Accuracy = 96.83%"
[1] "Final Accuracy = 96.13%"
[1] "Final Accuracy = 74.80%"
[1] "Final Accuracy = 36.88%"
[1] "Final Accuracy = 3.65%"
[1] "Final Accuracy = 18.63%"
[1] "Final Accuracy = 75.36%"
[1] "Final Accuracy = 36.34%"
[1] "Final Accuracy = 18.60%"
[1] "Final Accuracy = 3.50%"
