In [142]:
library(data.table)
library(ggplot2)
library(TSrepr)
library(TSdist)
library(dtw)

In [143]:
current_folder="D:/Datasets/IE48B_project"

dist_path=sprintf('%s/distances/project',current_folder)

# Dataset

In [144]:
data=fread("bulk_imbalance.csv")

## Feature Adding

In [145]:
data[, lag_24:=shift(net, 24)]
data[, lag_168:=shift(net, 168)]
data <- data[seq(-168,-1),]

data$weekday=wday(data$date)
data[, is_wday:=ifelse(weekday%in%c(2,3,4,5,6), 1, 0)]
data[, is_weekend:=ifelse(weekday%in%c(1,7), 1, 0)]
data$weekday=NULL

In [146]:
feat=fread("2022-01-22_weather.csv")

In [147]:
wide_feat=dcast(feat, date + hour ~ variable+ lat + lon , value.var="value")
wide_feat=wide_feat[date<Sys.Date(),]

In [148]:
str(wide_feat)

Classes 'data.table' and 'data.frame':	26880 obs. of  44 variables:
 $ date                           : IDate, format: "2018-12-31" "2018-12-31" ...
 $ hour                           : int  0 1 2 3 4 5 6 7 8 9 ...
 $ DSWRF_surface_36.5_32.5        : num  0 0 0 0 0 ...
 $ DSWRF_surface_37_35.5          : num  0 0 0 0 0 ...
 $ DSWRF_surface_38_32.5          : num  0 0 0 0 0 ...
 $ DSWRF_surface_38.5_27          : num  0 0 0 0 0 0 0 0 0 0 ...
 $ DSWRF_surface_39.75_30.5       : num  0 0 0 0 0 0 0 0 0 0 ...
 $ DSWRF_surface_40_33            : num  0 0 0 0 0 0 0 0 0 0 ...
 $ DSWRF_surface_41_28.75         : num  0 0 0 0 0 0 0 0 0 0 ...
 $ RH_2.m.above.ground_36.5_32.5  : num  83.3 82 80.7 79.4 78.8 ...
 $ RH_2.m.above.ground_37_35.5    : num  76.1 74.6 73.1 71.6 71.3 71 70.7 70.9 71.1 71.3 ...
 $ RH_2.m.above.ground_38_32.5    : num  93.2 92.9 92.7 92.4 92.1 ...
 $ RH_2.m.above.ground_38.5_27    : num  75.2 76 76.9 77.7 77.6 ...
 $ RH_2.m.above.ground_39.75_30.5 : num  88.7 88.5 88.4 88.2 8

# PCA

### DSWRF

In [149]:
str(wide_feat)

Classes 'data.table' and 'data.frame':	26880 obs. of  44 variables:
 $ date                           : IDate, format: "2018-12-31" "2018-12-31" ...
 $ hour                           : int  0 1 2 3 4 5 6 7 8 9 ...
 $ DSWRF_surface_36.5_32.5        : num  0 0 0 0 0 ...
 $ DSWRF_surface_37_35.5          : num  0 0 0 0 0 ...
 $ DSWRF_surface_38_32.5          : num  0 0 0 0 0 ...
 $ DSWRF_surface_38.5_27          : num  0 0 0 0 0 0 0 0 0 0 ...
 $ DSWRF_surface_39.75_30.5       : num  0 0 0 0 0 0 0 0 0 0 ...
 $ DSWRF_surface_40_33            : num  0 0 0 0 0 0 0 0 0 0 ...
 $ DSWRF_surface_41_28.75         : num  0 0 0 0 0 0 0 0 0 0 ...
 $ RH_2.m.above.ground_36.5_32.5  : num  83.3 82 80.7 79.4 78.8 ...
 $ RH_2.m.above.ground_37_35.5    : num  76.1 74.6 73.1 71.6 71.3 71 70.7 70.9 71.1 71.3 ...
 $ RH_2.m.above.ground_38_32.5    : num  93.2 92.9 92.7 92.4 92.1 ...
 $ RH_2.m.above.ground_38.5_27    : num  75.2 76 76.9 77.7 77.6 ...
 $ RH_2.m.above.ground_39.75_30.5 : num  88.7 88.5 88.4 88.2 8

In [150]:
pca_DSWRF=princomp(wide_feat[,seq(1:7)+2,with=F])
summary(pca_DSWRF,loadings=T)

Importance of components:
                            Comp.1     Comp.2       Comp.3       Comp.4
Standard deviation     698.6375896 91.9068787 70.626832627 56.342454516
Proportion of Variance   0.9529455  0.0164915  0.009738759  0.006197773
Cumulative Proportion    0.9529455  0.9694370  0.979175749  0.985373523
                             Comp.5       Comp.6       Comp.7
Standard deviation     51.822317537 50.568720451 47.422244122
Proportion of Variance  0.005243217  0.004992615  0.004390645
Cumulative Proportion   0.990616739  0.995609355  1.000000000

Loadings:
                         Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
DSWRF_surface_36.5_32.5   0.403  0.385  0.426  0.131  0.234  0.648  0.123
DSWRF_surface_37_35.5     0.379  0.390  0.280 -0.601 -0.253 -0.446       
DSWRF_surface_38_32.5     0.389  0.291 -0.221  0.557  0.372 -0.516       
DSWRF_surface_38.5_27     0.378 -0.389  0.345  0.415 -0.586        -0.263
DSWRF_surface_39.75_30.5  0.370 -0.253 -0.371        -0.2

### RH_2.m.above

In [151]:
pca_RH=princomp(wide_feat[,seq(1:7)+9,with=F])
summary(pca_RH,loadings=T)

Importance of components:
                           Comp.1     Comp.2      Comp.3      Comp.4     Comp.5
Standard deviation     44.5869745 17.3207688 15.80892988 13.08547168 9.34503795
Proportion of Variance  0.6820781  0.1029325  0.08574782  0.05874851 0.02996265
Cumulative Proportion   0.6820781  0.7850106  0.87075843  0.92950694 0.95946959
                           Comp.6     Comp.7
Standard deviation     7.86807903 7.49826985
Proportion of Variance 0.02124005 0.01929036
Cumulative Proportion  0.98070964 1.00000000

Loadings:
                               Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
RH_2.m.above.ground_36.5_32.5   0.418         0.779  0.284  0.345         0.128
RH_2.m.above.ground_37_35.5     0.359  0.892        -0.172 -0.193              
RH_2.m.above.ground_38_32.5     0.420 -0.333  0.211 -0.265 -0.539        -0.545
RH_2.m.above.ground_38.5_27     0.350        -0.306  0.675 -0.240 -0.515       
RH_2.m.above.ground_39.75_30.5  0.429 -0.109 -0.383 -0.242  0.6

### TCDC_low

In [152]:
pca_TCDC=princomp(wide_feat[,seq(1:7)+16,with=F])
summary(pca_TCDC,loadings=T)

Importance of components:
                           Comp.1     Comp.2     Comp.3      Comp.4      Comp.5
Standard deviation     58.1096267 32.3675786 27.1512582 22.73640511 21.70951712
Proportion of Variance  0.4883719  0.1515217  0.1066189  0.07476495  0.06816396
Cumulative Proportion   0.4883719  0.6398936  0.7465125  0.82127750  0.88944145
                            Comp.6      Comp.7
Standard deviation     20.50490703 18.54667120
Proportion of Variance  0.06080931  0.04974924
Cumulative Proportion   0.95025076  1.00000000

Loadings:
                                Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
TCDC_low.cloud.layer_36.5_32.5   0.392  0.377  0.342  0.332  0.321       
TCDC_low.cloud.layer_37_35.5     0.290  0.459  0.425 -0.430 -0.545 -0.111
TCDC_low.cloud.layer_38_32.5     0.346  0.322 -0.176 -0.148  0.564  0.269
TCDC_low.cloud.layer_38.5_27     0.224 -0.166  0.236  0.732 -0.317  0.203
TCDC_low.cloud.layer_39.75_30.5  0.451 -0.167 -0.478 -0.162 -0.351  0.558
TCDC_low.cl

### TMP_2.m.above.ground

In [153]:
pca_TMP=princomp(wide_feat[,seq(1:7)+23,with=F])
summary(pca_TMP,loadings=T)

Importance of components:
                           Comp.1     Comp.2     Comp.3     Comp.4     Comp.5
Standard deviation     22.4644769 2.96560779 2.50402491 2.11736945 1.68558070
Proportion of Variance  0.9520682 0.01659216 0.01182913 0.00845802 0.00536012
Cumulative Proportion   0.9520682 0.96866036 0.98048949 0.98894751 0.99430763
                            Comp.6      Comp.7
Standard deviation     1.373985149 1.062760528
Proportion of Variance 0.003561554 0.002130818
Cumulative Proportion  0.997869182 1.000000000

Loadings:
                                Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
TMP_2.m.above.ground_36.5_32.5   0.383  0.393  0.171  0.168  0.627  0.488
TMP_2.m.above.ground_37_35.5     0.384  0.608  0.370        -0.554 -0.179
TMP_2.m.above.ground_38_32.5     0.403        -0.439  0.293  0.224 -0.466
TMP_2.m.above.ground_38.5_27     0.369 -0.296  0.335 -0.661  0.313 -0.355
TMP_2.m.above.ground_39.75_30.5  0.391 -0.251 -0.296 -0.333 -0.331  0.607
TMP_2.m.above.groun

### w Values

In [154]:
pca_w=princomp(wide_feat[,seq(1:14)+30,with=F])
summary(pca_w,loadings=T)

Importance of components:
                            Comp.1      Comp.2     Comp.3     Comp.4     Comp.5
Standard deviation     129.5758022 111.7800382 92.1862191 85.4215184 81.9200746
Proportion of Variance   0.2695446   0.2005909  0.1364315  0.1171433  0.1077367
Cumulative Proportion    0.2695446   0.4701355  0.6065671  0.7237104  0.8314470
                            Comp.6      Comp.7       Comp.8       Comp.9
Standard deviation     75.63668587 68.93272795 3.4434155012 2.184001e+00
Proportion of Variance  0.09184336  0.07628405 0.0001903538 7.657529e-05
Cumulative Proportion   0.92329037  0.99957442 0.9997647738 9.998413e-01
                            Comp.10      Comp.11      Comp.12      Comp.13
Standard deviation     1.670075e+00 1.622508e+00 1.402831e+00 1.137171e+00
Proportion of Variance 4.477695e-05 4.226265e-05 3.159321e-05 2.076034e-05
Cumulative Proportion  9.998861e-01 9.999284e-01 9.999600e-01 9.999807e-01
                            Comp.14
Standard deviation     1.0

## New Feature Dataset

In [155]:
new_feat=data.table(date=wide_feat$date, hour=wide_feat$hour, DSWRF=pca_DSWRF$scores[,1], RH=pca_RH$scores[,1],
                    TCDC1 = pca_TCDC$scores[,1], TCDC2 = pca_TCDC$scores[, 2], TMP = pca_TMP$scores[, 1], w1 = pca_w$scores[,1],
                    w2 = pca_w$scores[,2], w3 = pca_w$scores[,3])

In [156]:
head(new_feat)

date,hour,DSWRF,RH,TCDC1,TCDC2,TMP,w1,w2,w3
2018-12-31,0,-569.9611,64.3523,48.64323,6.501146,-40.09289,81.52956,-73.72133,-23.73527
2018-12-31,1,-569.9611,63.22281,42.236239,6.855581,-40.50658,85.82155,-83.52298,-32.7048
2018-12-31,2,-569.9611,62.09332,35.829249,7.210017,-40.92027,90.11354,-93.32463,-41.67434
2018-12-31,3,-569.9611,60.96383,29.422258,7.564453,-41.33397,94.40553,-103.12628,-50.64387
2018-12-31,4,-569.9611,59.88503,13.947211,-2.686073,-42.00791,97.00861,-70.70537,-14.21063
2018-12-31,5,-569.9611,58.80623,-1.527836,-12.936599,-42.68185,99.61169,-38.28447,22.22261


In [157]:
all_dt=merge(x = data, y=new_feat, by=c("date", "hour"), all.x=TRUE)

In [158]:
head(all_dt)

date,hour,net,upRegulationZeroCoded,upRegulationOneCoded,upRegulationTwoCoded,downRegulationZeroCoded,downRegulationOneCoded,downRegulationTwoCoded,upRegulationDelivered,...,is_wday,is_weekend,DSWRF,RH,TCDC1,TCDC2,TMP,w1,w2,w3
2019-01-08,0,-464.4,0.0,0,0,761.616,0,0,0.0,...,1,0,-569.9611,61.86299,98.70077,30.45218,-42.4428,-19.35854,-126.337284,-125.995139
2019-01-08,1,-8.662,245.833,0,0,259.067,0,0,206.833,...,1,0,-569.9611,62.38798,95.0083,31.00809,-42.51784,-26.14478,-69.761402,-65.321967
2019-01-08,2,70.445,274.748,0,0,17.1,0,0,85.145,...,1,0,-569.9611,62.91296,91.31583,31.56401,-42.59289,-32.93102,-13.18552,-4.648795
2019-01-08,3,60.549,352.0,0,0,0.0,0,0,60.549,...,1,0,-569.9611,63.43795,87.62336,32.11992,-42.66794,-39.71726,43.390362,56.024377
2019-01-08,4,43.14,352.0,0,0,0.0,0,0,43.14,...,1,0,-569.9611,67.10149,86.17535,23.69623,-42.78473,-48.66498,20.914823,19.575963
2019-01-08,5,25.49,212.0,0,0,0.0,0,0,25.49,...,1,0,-569.9611,70.76503,84.72734,15.27253,-42.90151,-57.61271,-1.560716,-16.872452


## Splitting Dataset

In [168]:
train_start_date="2021-11-15"

In [169]:
test_date_start="2021-12-01"
test_date_end="2021-12-14"

In [170]:
train=all_dt[(date>=train_start_date) & (date<test_date_start),]
test=all_dt[(date>=test_date_start) & (date<=test_date_end),]

In [171]:
train=train[,-c("date", 'net', 'upRegulationZeroCoded', 'upRegulationOneCoded', 'upRegulationTwoCoded',
                'downRegulationZeroCoded', 'downRegulationOneCoded', 'downRegulationTwoCoded', 'upRegulationDelivered',
                'downRegulationDelivered')]
test=test[,-c("date", 'net', 'upRegulationZeroCoded', 'upRegulationOneCoded', 'upRegulationTwoCoded',
                'downRegulationZeroCoded', 'downRegulationOneCoded', 'downRegulationTwoCoded', 'upRegulationDelivered',
                'downRegulationDelivered')]

In [172]:
str(train)

Classes 'data.table' and 'data.frame':	384 obs. of  14 variables:
 $ hour            : int  0 1 2 3 4 5 6 7 8 9 ...
 $ system_direction: chr  "Negative" "Positive" "Positive" "Positive" ...
 $ lag_24          : num  864 757 163 505 623 ...
 $ lag_168         : num  188 491 300 0 0 ...
 $ is_wday         : num  1 1 1 1 1 1 1 1 1 1 ...
 $ is_weekend      : num  0 0 0 0 0 0 0 0 0 0 ...
 $ DSWRF           : num  -570 -570 -570 -570 -570 ...
 $ RH              : num  -35.5 -33.8 -33.9 -32 -30.6 ...
 $ TCDC1           : num  -46.5 -46.5 -46.5 -46.5 -46.5 ...
 $ TCDC2           : num  2.7 2.7 2.7 2.7 2.77 ...
 $ TMP             : num  -10.1 -11.1 -11.3 -12.2 -12.5 ...
 $ w1              : num  75 77.7 76.3 46.4 52.4 ...
 $ w2              : num  93.1 94.2 107.8 59.4 14.2 ...
 $ w3              : num  50.776 58.044 52.624 22.2 0.197 ...
 - attr(*, ".internal.selfref")=<externalptr> 


### Train 

In [173]:
trainclass=train$system_direction
traindata=as.matrix(train[, -c(2),with=F])

traindata=scale(traindata)

### Test

In [174]:
testclass=test$system_direction
testdata=as.matrix(test[, -c(2),with=F])

testdata=scale(testdata)

### Dataset Specs

In [175]:
tlength=ncol(traindata)
n_series_train=nrow(traindata)
n_series_test=nrow(testdata)

In [27]:
## Alternative strategies
# knn k=1 Euclidean distance
# knn k=5 Euclidean distance
# knn k=1 DTW distance + no window
# knn k=5 DTW distance + no window
# knn k=1 DTW distance + window.type='sakoechiba' + window.size=10
# knn k=1 DTW distance + window.type='sakoechiba' + window.size=20
# knn k=1 LCSS + epsilon=0.05 + no window
# knn k=5 LCSS + epsilon=0.1  + no window 
# knn k=1 ERP + gap penalty=1 + no window
# knn k=1 ERP + gap penalty=0.5 + no window
# knn k=5 ERP + gap penalty=0.5 + no window


# calculate distances and store them to save time
large_number=10000
dist_euc=as.matrix(dist(traindata))
diag(dist_euc)=large_number
fwrite(dist_euc,sprintf('%s_euc_raw_dist.csv',dist_path),col.names=F)

dist_dtw=as.matrix(dtwDist(traindata))
diag(dist_dtw)=large_number
fwrite(dist_dtw,sprintf('%s_dtw_raw_dist.csv',dist_path),col.names=F)

dist_dtw1=as.matrix(dtwDist(traindata,window.type='sakoechiba',window.size=10))
diag(dist_dtw1)=large_number
fwrite(dist_dtw1,sprintf('%s_dtw_raw_dist_sakoe_10.csv',dist_path),col.names=F)

dist_dtw2=as.matrix(dtwDist(traindata,window.type='sakoechiba',window.size=20))
diag(dist_dtw2)=large_number
fwrite(dist_dtw2,sprintf('%s_dtw_raw_dist_sakoe_20.csv',dist_path),col.names=F)  

x being coerced from class: matrix to data.table
x being coerced from class: matrix to data.table
x being coerced from class: matrix to data.table
x being coerced from class: matrix to data.table


In [28]:
# computation of LCSS is from TSdist package
dist_lcss=TSDatabaseDistances(traindata,distance='lcss',epsilon=0.05)
dist_lcss=as.matrix(dist_lcss)
diag(dist_lcss)=large_number
fwrite(dist_lcss,sprintf('%s_lcss_raw_epsilon_005.csv',dist_path),col.names=F)  
                   
dist_lcss=TSDatabaseDistances(traindata,distance='lcss',epsilon=0.1)
dist_lcss=as.matrix(dist_lcss)
diag(dist_lcss)=large_number
fwrite(dist_lcss,sprintf('%s_lcss_raw_epsilon_01.csv',dist_path),col.names=F) 

# computation of ERP is from TSdist package
dist_erp=TSDatabaseDistances(traindata,distance='erp',g=0.5)
dist_erp=as.matrix(dist_erp)
diag(dist_erp)=large_number
fwrite(dist_erp,sprintf('%s_erp_raw_gap_005.csv',dist_path),col.names=F)  
                   
dist_erp=TSDatabaseDistances(traindata,distance='erp',g=1)
dist_erp=as.matrix(dist_erp)
diag(dist_erp)=large_number
fwrite(dist_erp,sprintf('%s_erp_raw_gap_1.csv',dist_path),col.names=F) 

x being coerced from class: matrix to data.table
x being coerced from class: matrix to data.table
x being coerced from class: matrix to data.table
x being coerced from class: matrix to data.table


## Main Function

In [28]:
nn_classify_cv=function(dist_matrix,train_class,test_indices,k=1){
    
    test_distances_to_train=dist_matrix[test_indices,]
    test_distances_to_train=test_distances_to_train[,-test_indices]
    train_class=train_class[-test_indices]
    #print(str(test_distances_to_train))
    ordered_indices=apply(test_distances_to_train,1,order)
    if(k==1){
        nearest_class=as.numeric(trainclass[as.numeric(ordered_indices[1,])])
        nearest_class=data.table(id=test_indices,nearest_class)
    } else {
        nearest_class=apply(ordered_indices[1:k,],2,function(x) {trainclass[x]})
        nearest_class=data.table(id=test_indices,t(nearest_class))
    }
    
    long_nn_class=melt(nearest_class,'id')

    class_counts=long_nn_class[,.N,list(id,value)]
    class_counts[,predicted_prob:=N/k]
    wide_class_prob_predictions=dcast(class_counts,id~value,value.var='predicted_prob')
    wide_class_prob_predictions[is.na(wide_class_prob_predictions)]=0
    class_predictions=class_counts[,list(predicted=value[which.max(N)]),by=list(id)]
    
    
    return(list(prediction=class_predictions,prob_estimates=wide_class_prob_predictions))
    
}

In [29]:
result=nn_classify_cv(dist_lcss,trainclass,1:3,k=5)
str(result)

ERROR: Error in nn_classify_cv(dist_lcss, trainclass, 1:3, k = 5): 'dist_lcss' nesnesi bulunamadı


In [30]:
# cv indices start here
require(TunePareto)

set.seed(13429)
nof_rep=10
n_fold=10
cv_indices=generateCVRuns(trainclass, ntimes =nof_rep, nfold = n_fold, 
                          leaveOneOut = FALSE, stratified = TRUE)

str(cv_indices)

dist_folder=sprintf('%s/distances/',current_folder)
dist_files=list.files(dist_folder, full.names=T)


Loading required package: TunePareto


List of 10
 $ Run  1 :List of 10
  ..$ Fold  1 : int [1:39] 373 268 20 176 151 191 181 353 228 95 ...
  ..$ Fold  2 : int [1:39] 169 153 29 184 185 339 46 324 118 183 ...
  ..$ Fold  3 : int [1:39] 189 310 372 53 380 382 21 262 200 371 ...
  ..$ Fold  4 : int [1:39] 175 366 172 160 7 375 376 128 111 162 ...
  ..$ Fold  5 : int [1:38] 279 321 361 182 22 383 149 112 120 304 ...
  ..$ Fold  6 : int [1:38] 282 267 381 27 32 269 378 90 218 308 ...
  ..$ Fold  7 : int [1:38] 45 271 263 25 264 28 174 329 302 327 ...
  ..$ Fold  8 : int [1:38] 332 170 31 24 281 363 1 104 14 296 ...
  ..$ Fold  9 : int [1:38] 374 150 30 154 23 379 152 278 257 220 ...
  ..$ Fold  10: int [1:38] 365 173 280 270 171 214 377 156 223 130 ...
 $ Run  2 :List of 10
  ..$ Fold  1 : int [1:39] 173 21 365 321 185 281 214 111 183 255 ...
  ..$ Fold  2 : int [1:39] 149 264 174 375 150 267 24 148 259 342 ...
  ..$ Fold  3 : int [1:39] 377 169 20 171 191 181 184 228 348 71 ...
  ..$ Fold  4 : int [1:39] 361 383 23 28 45 282 

In [31]:
list.files(dist_folder)

In [32]:
k_levels=c(2,5,10)
approach_file=list.files(dist_folder)
result=vector('list',length(dist_files)*nof_rep*n_fold*length(k_levels))
iter=1
for(m in 1:length(dist_files)){ #
    print(dist_files[m])
    dist_mat=as.matrix(fread(dist_files[m],header=FALSE))
    for(i in 1:nof_rep){
        this_fold=cv_indices[[i]]
        for(j in 1:n_fold){
            test_indices=this_fold[[j]]
            for(k in 1:length(k_levels)){
                current_k=k_levels[k]
                current_fold=nn_classify_cv(dist_mat,trainclass,test_indices,k=current_k)
                accuracy=sum(trainclass[test_indices]==current_fold$prediction$predicted)/length(test_indices)
                tmp=data.table(approach=approach_file[m],repid=i,foldid=j,
                               k=current_k,acc=accuracy)
                result[[iter]]=tmp
                iter=iter+1
                
            }
            
        }
    
    }   
    
}


[1] "D:/Datasets/IE48B_project/distances/project_dtw_raw_dist.csv"
[1] "D:/Datasets/IE48B_project/distances/project_dtw_raw_dist_sakoe_10.csv"
[1] "D:/Datasets/IE48B_project/distances/project_dtw_raw_dist_sakoe_20.csv"
[1] "D:/Datasets/IE48B_project/distances/project_erp_raw_gap_005.csv"
[1] "D:/Datasets/IE48B_project/distances/project_erp_raw_gap_1.csv"
[1] "D:/Datasets/IE48B_project/distances/project_euc_raw_dist.csv"
[1] "D:/Datasets/IE48B_project/distances/project_lcss_raw_epsilon_005.csv"
[1] "D:/Datasets/IE48B_project/distances/project_lcss_raw_epsilon_01.csv"


In [33]:
overall_results=rbindlist(result)
overall_results[,list(avg_acc=mean(acc),sdev_acc=sd(acc),result_count=.N),by=list(approach,k)]

approach,k,avg_acc,sdev_acc,result_count
project_dtw_raw_dist.csv,2,0.6430769,0.06964976,100
project_dtw_raw_dist.csv,5,0.6802294,0.05341675,100
project_dtw_raw_dist.csv,10,0.7200742,0.03903751,100
project_dtw_raw_dist_sakoe_10.csv,2,0.6430769,0.06964976,100
project_dtw_raw_dist_sakoe_10.csv,5,0.6802294,0.05341675,100
project_dtw_raw_dist_sakoe_10.csv,10,0.7200742,0.03903751,100
project_dtw_raw_dist_sakoe_20.csv,2,0.6430769,0.06964976,100
project_dtw_raw_dist_sakoe_20.csv,5,0.6802294,0.05341675,100
project_dtw_raw_dist_sakoe_20.csv,10,0.7200742,0.03903751,100
project_erp_raw_gap_005.csv,2,0.6399798,0.06650581,100


In [69]:
head(res_dt[order(avg_acc,decreasing = TRUE)], 8)

approach,k,avg_acc,sdev_acc,result_count
project_lcss_raw_epsilon_01.csv,10,0.7333198,0.01551811,100
project_lcss_raw_epsilon_005.csv,10,0.7319771,0.01909082,100
project_dtw_raw_dist.csv,10,0.7200742,0.03903751,100
project_dtw_raw_dist_sakoe_10.csv,10,0.7200742,0.03903751,100
project_dtw_raw_dist_sakoe_20.csv,10,0.7200742,0.03903751,100
project_erp_raw_gap_1.csv,10,0.7198178,0.0387583,100
project_erp_raw_gap_005.csv,10,0.7182456,0.0407253,100
project_euc_raw_dist.csv,10,0.7133131,0.04276158,100


In [None]:
require(ggplot2)
ggplot(overall_results,aes(x=paste0(approach,'_when k:',k), y=acc)) +
geom_boxplot()+ xlab("Models")+ylab("Box Plot of Accuracy")+
coord_flip()

## Prediction Part

In [176]:
all_dt=all_dt[(date<="2021-12-14")& (date>="2021-01-01"),]

In [177]:
all_dt=all_dt[,-c("date", 'net', 'upRegulationZeroCoded', 'upRegulationOneCoded', 'upRegulationTwoCoded',
                'downRegulationZeroCoded', 'downRegulationOneCoded', 'downRegulationTwoCoded', 'upRegulationDelivered',
                'downRegulationDelivered')]

In [178]:
allclass=all_dt$system_direction
alldata=as.matrix(all_dt[, -c(2),with=F])

alldata=scale(alldata)

### Distance Matrix

In [179]:
large_number=10000
dist_euc_all=as.matrix(dist(alldata))
diag(dist_euc_all)=large_number

In [180]:
test_indices=(nrow(alldata)-335):nrow(alldata)# 2 weeks 
k=10 
dist_matrix_all=dist_euc_all
test_distances_all=dist_matrix_all[test_indices,]

In [181]:
ordered_indices=apply(test_distances_all,1,order)
nearest_class=apply(ordered_indices[1:k,],2,function(x) {allclass[x]})
nearest_class=data.table(id=test_indices,t(nearest_class))

#nearest_class
long_nn_class=melt(nearest_class,'id')

In [182]:
class_counts=long_nn_class[,.N,list(id,value)]
class_counts[,predicted_prob:=N/k]
wide_class_prob_predictions=dcast(class_counts,id~value,value.var='predicted_prob')

In [183]:
wide_class_prob_predictions[is.na(wide_class_prob_predictions)]=0


class_predictions=class_counts[,list(predicted=value[which.max(N)]),by=list(id)]

## Obtained Performance

In [188]:
library(caret)

In [190]:
confusionMatrix(data = as.factor(class_predictions$predicted), reference = as.factor(all_dt[(nrow(all_dt)-335):nrow(all_dt)]$system_direction), mode = "prec_recall")

Confusion Matrix and Statistics

          Reference
Prediction Negative Neutral Positive
  Negative       40       7       30
  Neutral         2       5        2
  Positive       28      17      205

Overall Statistics
                                          
               Accuracy : 0.744           
                 95% CI : (0.6939, 0.7899)
    No Information Rate : 0.7054          
    P-Value [Acc > NIR] : 0.065910        
                                          
                  Kappa : 0.3979          
                                          
 Mcnemar's Test P-Value : 0.002103        

Statistics by Class:

                     Class: Negative Class: Neutral Class: Positive
Precision                     0.5195        0.55556          0.8200
Recall                        0.5714        0.17241          0.8650
F1                            0.5442        0.26316          0.8419
Prevalence                    0.2083        0.08631          0.7054
Detection Rate               