In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
# Detectors included in the comparison. The clean-up cells in this notebook
# keep only the score/metric folders whose name appears in this list.
# Entries that are commented out are currently excluded.
detector_names = [
    "AutoEncoder (AE)",
    "DenoisingAutoEncoder (DAE)",
    # "DeepAnT",
    # "Random Black Forest (RR)",
    "PCC",
    "HBOS",
    # "Torsk",
    # "EncDec-AD",
    # "Hybrid KNN",
    "CBLOF",
    "COPOD",
    "RobustPCA",
    "LOF",
]

In [4]:
# Collect, for every detector listed in `detector_names`, the score files it
# produced, then keep only the time series that every such detector has scored.
#
# WARNING: destructive. Every sub-directory of `path_to_scores` whose name is
# not in `detector_names` is permanently deleted from disk.
from collections import defaultdict
import shutil

path_to_scores = './scores/OBSEA/'
scores_per_detector = defaultdict(list)

for detector in os.listdir(path_to_scores):
    detector_dir = os.path.join(path_to_scores, detector)
    if not os.path.isdir(detector_dir):
        # Plain files sitting next to the detector folders are left alone:
        # shutil.rmtree raises NotADirectoryError when given a file.
        continue
    if detector not in detector_names:
        shutil.rmtree(detector_dir)
    else:
        scores_per_detector[detector] = os.listdir(os.path.join(detector_dir, 'score'))

if not scores_per_detector:
    # set.intersection() with no operands fails with an opaque TypeError.
    # Stop here with an actionable message instead: continuing with an empty
    # list would make the later clean-up cells delete every '.out' file.
    raise ValueError(
        'No directory under ' + path_to_scores + ' matches detector_names; '
        'nothing to intersect.'
    )

# File names present in the score folder of *every* retained detector,
# in lexicographic order.
timeseries_executed_by_all = sorted(
    set.intersection(*(set(fnames) for fnames in scores_per_detector.values()))
)

In [5]:
# Remove every '.out' score file whose time series is missing from
# `timeseries_executed_by_all`, i.e. was not scored by all detectors.
# Files without '.out' in their name are never touched.
for detector in os.listdir(path_to_scores):
    if detector in detector_names:
        score_dir = path_to_scores + detector + '/score'
        for fname in os.listdir(score_dir):
            if fname not in timeseries_executed_by_all and '.out' in fname:
                os.remove(score_dir + '/' + fname)

In [6]:
# Remove every '.out' data file whose name is missing from
# `timeseries_executed_by_all`, i.e. was not scored by all detectors.
# Files without '.out' in their name are never touched.
path_to_data = './data/OBSEA/'
for fname in os.listdir(path_to_data):
    looks_like_series = '.out' in fname
    if looks_like_series and fname not in timeseries_executed_by_all:
        os.remove(path_to_data + fname)

In [7]:
# Rescale the scores of every retained detector to the [0, 1] range.
# Each '.out' file is read, min-max scaled, and overwritten in place.
from sklearn.preprocessing import MinMaxScaler

for detector in os.listdir(path_to_scores):
    if detector not in detector_names:
        continue
    score_dir = os.path.join(path_to_scores, detector, 'score')
    for fname in os.listdir(score_dir):
        if '.out' not in fname:
            continue
        score_path = os.path.join(score_dir, fname)

        # The files are expected to hold one value per line, which the default
        # whitespace splitting already handles. An explicit delimiter='\n' is
        # rejected by newer NumPy releases (newline is not a valid delimiter).
        score = np.loadtxt(score_path)
        if score.size == 0:
            # Nothing to rescale; MinMaxScaler raises on an empty array.
            continue

        # MinMaxScaler expects a 2-D (n_samples, n_features) array.
        score = np.reshape(score, (-1, 1))
        # NaNs become 0.0 so they do not poison the min/max computation.
        score = np.nan_to_num(score, nan=0.0)
        score = MinMaxScaler().fit_transform(score)
        # Single column, so each value is written on its own line.
        np.savetxt(score_path, score, fmt='%f')


In [9]:
# Restrict each retained detector's metric table to the time series listed in
# `timeseries_executed_by_all`, and overwrite the CSV with the filtered table.
#
# WARNING: destructive. Every sub-directory of `path_to_metrics` whose name is
# not in `detector_names` is permanently deleted from disk.
import shutil
import pandas as pd

metric_name = 'PR_AUC'
path_to_metrics = './metrics/'
# Row labels in the metric CSVs are prefixed with the dataset name.
index_metric = ['OBSEA/' + x for x in timeseries_executed_by_all]

for detector in os.listdir(path_to_metrics):
    detector_dir = os.path.join(path_to_metrics, detector)
    if not os.path.isdir(detector_dir):
        # Plain files in the metrics folder are left alone: shutil.rmtree
        # raises NotADirectoryError when given a file.
        continue
    if detector not in detector_names:
        shutil.rmtree(detector_dir)
        continue

    metric_path = os.path.join(detector_dir, metric_name + '.csv')
    # The first CSV column has no header, so pandas names it 'Unnamed: 0'.
    metric = pd.read_csv(metric_path).set_index('Unnamed: 0')
    # .loc with a list raises KeyError if any expected series is missing,
    # which surfaces an inconsistent metrics file instead of hiding it.
    metric = metric.loc[index_metric]
    metric.to_csv(metric_path)

AutoEncoder (AE)
CBLOF
COPOD
DenoisingAutoEncoder (DAE)
HBOS
LOF
PCC
RobustPCA


In [17]:
# Scratch check: reload one metric table, filter it to `index_metric`, and dump
# the result to 'test.csv' for inspection.
# NOTE(review): `detector` is not set in this cell; it appears to be the value
# left over from the last iteration of an earlier loop - confirm this is intended.
metric_csv = path_to_metrics + detector + '/' + metric_name + '.csv'
metric = pd.read_csv(metric_csv).set_index('Unnamed: 0').loc[index_metric]
metric.to_csv('test.csv')

In [15]:
# Display the row labels (index) of the current `metric` table.
metric.index

Index(['OBSEA/2020-01-01.out', 'OBSEA/2020-01-02.out', 'OBSEA/2020-01-03.out',
       'OBSEA/2020-01-04.out', 'OBSEA/2020-01-05.out', 'OBSEA/2020-01-06.out',
       'OBSEA/2020-01-07.out', 'OBSEA/2020-01-08.out', 'OBSEA/2020-01-09.out',
       'OBSEA/2020-01-10.out',
       ...
       'OBSEA/2023-10-29.out', 'OBSEA/2023-10-30.out', 'OBSEA/2023-10-31.out',
       'OBSEA/2023-11-01.out', 'OBSEA/2023-11-02.out', 'OBSEA/2023-11-03.out',
       'OBSEA/2023-11-04.out', 'OBSEA/2023-11-05.out', 'OBSEA/2023-11-06.out',
       'OBSEA/2023-11-07.out'],
      dtype='object', name='Unnamed: 0', length=1012)