In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
mapping_alg = pd.read_csv("../01_parameters_optim/algorithm_mapping.csv")
mapping_alg

Unnamed: 0,official_name,cite_key,folder,ref_name
0,NoveltySVR,~\cite{ma2003online},novelty_svr,NoveltySVR
1,PS-SVM,~\cite{ma2003time},phasespace_svm,PhaseSpace-SVM
2,EmsembleGI,~\cite{gao2020ensemble},ensemble_gi,EmsembleGI
3,GrammarViz,~\cite{senin2015time},grammarviz3,GrammarViz
4,HOT SAX,~\cite{keogh2005hot},hotsax,HOT SAX
5,TSBitmap,~\cite{wei2005assumption},ts_bitmap,TSBitmap
6,NormA-SJ,~\cite{boniol2021unsupervised},norma,NormA
7,SAND,~\cite{boniol2021sand},sand,SAND
8,Series2Graph,~\cite{boniol2022series2graph},series2graph,Series2Graph
9,STAMP,~\cite{yeh2016matrix},stamp,STAMP


In [3]:
# This function is to remove periods when the sensor is considered turned off.
# This can happen, for example, when the sensor is out-of-battery or turned off by humans to avoid polluted water (see Section 2.2.2).
# As agreed with all of our domain experts, such periods should not be measured in terms of precision.

def remove_long_shutdown(numbers, num_consecutive, missing_label):
    chunks = []
    current_chunk = []

    i = 0
    while i < len(numbers)-1:
        num = numbers[i]
        if num != missing_label:
            current_chunk.append(i)
        else:
            j = i+1
            while j < len(numbers):
                if numbers[j] == missing_label:
                    j += 1
                else:
                    break

            if j-i < num_consecutive:
                current_chunk += range(i,min(j+1, len(numbers)))# numbers[i:j+1]
            else:
                chunks.append(current_chunk)
                current_chunk = []

            i=j         

        i+= 1

    # Append the last chunk
    if current_chunk:
        chunks.append(current_chunk)

    to_ret = []
    # Print the chunks
    for i, chunk in enumerate(chunks, 1):
        to_ret += chunk
        
    return to_ret

In [4]:
from scipy.sparse import csc_matrix, hstack
import numpy as np
# code taken from https://github.com/HPI-Information-Systems/TimeEval-algorithms
def post_grammarviz(algorithm_parameter, args) -> np.ndarray:
    if isinstance(algorithm_parameter, np.ndarray):
        results = pd.DataFrame(algorithm_parameter, columns=["index", "score", "length"])
        results = results.set_index("index")
    else:
        results = pd.read_csv(algorithm_parameter, header=None, index_col=0, names=["index", "score", "length"])
    anomalies = results[results["score"] > .0]

    # use scipy sparse matrix to save memory
    matrix = csc_matrix((len(results), 1), dtype=np.float64)
    counts = np.zeros(len(results))
    for i, row in anomalies.iterrows():
        idx = int(row.name)
        length = int(row["length"])
        tmp = np.zeros(len(results))
        tmp[idx:idx + length] = np.repeat([row["score"]], repeats=length)
        tmp = tmp.reshape(-1, 1)
        matrix = hstack([matrix, tmp])
        counts[idx:idx + length] += 1
    sums = matrix.sum(axis=1)
    counts = counts.reshape(-1, 1)
    scores = np.zeros_like(sums)
    np.divide(sums, counts, out=scores, where=counts != 0)
    # returns the completely flattened array (from `[[1.2], [2.3]]` to `[1.2, 2.3]`)
    return scores.A1  # type: ignore

In [5]:
import os
import pandas as pd
import numpy as np
import sklearn
from sklearn import metrics
import traceback
import math
from scipy.sparse import csc_matrix, hstack

# code taken from https://github.com/HPI-Information-Systems/TimeEval-algorithms
def post_grammarviz(algorithm_parameter):
    if isinstance(algorithm_parameter, np.ndarray):
        results = pd.DataFrame(algorithm_parameter, columns=["index", "score", "length"])
        results = results.set_index("index")
    else:
        results = pd.read_csv(algorithm_parameter, header=None, index_col=0, names=["index", "score", "length"])
    anomalies = results[results["score"] > .0]

    # use scipy sparse matrix to save memory
    matrix = csc_matrix((len(results), 1), dtype=np.float64)
    counts = np.zeros(len(results))
    for i, row in anomalies.iterrows():
        idx = int(row.name)
        length = int(row["length"])
        tmp = np.zeros(len(results))
        tmp[idx:idx + length] = np.repeat([row["score"]], repeats=length)
        tmp = tmp.reshape(-1, 1)
        matrix = hstack([matrix, tmp])
        counts[idx:idx + length] += 1
    sums = matrix.sum(axis=1)
    counts = counts.reshape(-1, 1)
    scores = np.zeros_like(sums)
    np.divide(sums, counts, out=scores, where=counts != 0)
    # returns the completely flattened array (from `[[1.2], [2.3]]` to `[1.2, 2.3]`)
    return scores.A1  # type: ignore

def calculate_roc(path_to_result, label_file, data_source, feasibility=None, validation=None):
    try:
        roc_scores = dict()
        roc_scores_file = dict()
        pr_scores = dict()
        pr_scores_file = dict()
        
        i = 0
        for result_file in os.listdir(path_to_result):
            print(f"{i}/{len(os.listdir(path_to_result))}: {result_file}                   ", end="\r")
            i += 1
            
            if data_source in result_file and result_file.endswith(".ts"):
                alg_name = result_file[:result_file.find(data_source)]
                alg_name = '_'.join(alg_name.split('_')[1:])[:-1]
                try:
                    official_name = mapping_alg[mapping_alg.folder == alg_name].official_name.values[0]
                    roc_auc, pr_auc = 0, 0
                    
                    if official_name == "GrammarViz":
                        result_alg = post_grammarviz(path_to_result + result_file)
                    else:
                        result_alg = np.loadtxt(path_to_result + result_file)
                        
                    result_alg = pd.DataFrame(result_alg, columns=["alg_anomaly_score"])
                    result_alg["alg_anomaly_score"] = result_alg["alg_anomaly_score"].astype(float)
                    result_alg = result_alg.fillna(0)
                    result_alg.replace(np.inf, 1, inplace=True)
                    result_alg.replace(-np.inf, 1, inplace=True)

                    label = pd.read_csv(label_file)
                    label = label[-len(result_alg):]
                    label = label.reset_index(drop=True)

                    if feasibility:
                        label_feasibility = pd.read_csv(feasibility)

                    total = pd.concat([result_alg, label], axis=1, join='inner')
                    if feasibility:
                        total = total[:len(total)-len(label_feasibility)]

                    total = total.reset_index(drop=True)
                    total = pd.concat([result_alg, label], axis=1, join='inner')
                    observed_values = total.value.values.tolist()
                    observed_values = [int(x) for x in observed_values]
                    to_keep_comparision = remove_long_shutdown(observed_values, THRESHOLD_INDICATING_SHUTDOWN, MISSING_VALUE)
                    total=total[total.index.isin(to_keep_comparision)]
                    
                    try:
                        roc_auc = metrics.roc_auc_score(total.is_anomaly, total.alg_anomaly_score)
                    except:
                        roc_auc = 0

                    try:
                        y, x, _ = metrics.precision_recall_curve(total.is_anomaly, total.alg_anomaly_score)
                        pr_auc = metrics.auc(x, y)
                    except:
                        pr_auc = 0
                    
                    if official_name not in roc_scores.keys():
                        roc_scores[official_name] = roc_auc
                        pr_scores[official_name] = pr_auc
                        roc_scores_file[official_name] = pr_scores_file[official_name] = result_file
                    else:
                        if roc_auc > roc_scores[official_name]:
                            roc_scores[official_name] = roc_auc
                            roc_scores_file[official_name] = result_file
                        if pr_auc > pr_scores[official_name]:
                            pr_scores[official_name] = pr_auc
                            pr_scores_file[official_name] = result_file
                        
                except:
                    #traceback.print_exc()
                    try:
                        if official_name not in roc_scores.keys():
                            roc_scores[official_name] = max(0, roc_auc)
                            pr_scores[official_name] = max(0, pr_auc)
                        else:
                            if roc_auc > roc_scores[official_name]:
                                roc_scores[official_name] = max(0, roc_auc)
                            if pr_auc > pr_scores[official_name]:
                                pr_scores[official_name] =max(0, pr_auc)
                    except:
                        pass
        
        return roc_scores, pr_scores, roc_scores_file, pr_scores
    except:
        pass

In [6]:
THRESHOLD_INDICATING_SHUTDOWN = 30
MISSING_VALUE = -999
OPERATION_VAL_RANGE = (713.682, 763.826)

path_to_result = "./Tide_pressure/"
label_file = "../../01_data/01_label/Tide_pressure.csv"
feasibility = "../../01_data/01_label/Tide_pressure.bechmark_stage.csv"
data_source = "Tide_Pressure"
calculate_roc(path_to_result, label_file, data_source, feasibility)

2276/2277: adapad_ts_bitmap_Tide_Pressure_00001.ts                              

({'ARIMA': 0.7860207229842742,
  'Bagel': 0.7957785424149357,
  'Donut': 0.7957098861398599,
  'DSPOT': 0.5187919522220085,
  'DWT-MLEAD': 0.6462584234848741,
  'EmsembleGI': 0.4259096776277822,
  'FFT': 0.5,
  'RForest': 0.8133695509128575,
  'XGBoosting': 0.761326249513887,
  'GrammarViz': 0.8472931776564172,
  'HealthESN': 0.7585371065071871,
  'HOT SAX': 0.5283501937640147,
  'IE-CAE': 0.36212687587888504,
  'Left STAMPi': 0.5019817881770495,
  'MedianMethod': 0.7746419514143524,
  'NormA-SJ': 0.5463036522203931,
  'NoveltySVR': 0.8996571867432814,
  'NumentaHTM': 0.7110416119051429,
  'OceanWNN': 0.7965840483939415,
  'PCI': 0.8680000088107926,
  'PS-SVM': 0.7884301076532012,
  'PST': 0.6932504188797664,
  'SAND': 0.8706817938186416,
  'SARIMA': 0.8549934433018153,
  'Series2Graph': 0.7253775057527133,
  'SR-CNN': 0.379717145041351,
  'SR': 0.8448213832063354,
  'SSA': 0,
  'STAMP': 0.5,
  'STOMP': 0.5247726670786614,
  'Sub-Fast-MCD': 0.8147588800544872,
  'Sub-IF': 0.70417769432

In [11]:
THRESHOLD_INDICATING_SHUTDOWN = 10
MISSING_VALUE = -999
OPERATION_VAL_RANGE = (-2, 32)

path_to_result = "./Seawater_temperature/"
label_file = "../../01_data/01_label/Seawater_temperature.csv"
feasibility = None
data_source = "C3_Temperature"
calculate_roc(path_to_result, label_file, data_source, feasibility)

2014/2014: grammarviz                   re_00000.ts                              

({'ARIMA': 0.5847758081136776,
  'Bagel': 0.7362847754216146,
  'Donut': 0.8569213990520775,
  'DSPOT': 0.4714939396152807,
  'DWT-MLEAD': 0.3411991967665034,
  'FFT': 0.5,
  'RForest': 0.9125540995803336,
  'XGBoosting': 0.912700688541211,
  'GrammarViz': 0.7170488414763989,
  'HealthESN': 0.8415157424606958,
  'IE-CAE': 0.6814439471533674,
  'Left STAMPi': 0.35587894441500095,
  'MedianMethod': 0.46428033301958543,
  'NormA-SJ': 0.3544871794871795,
  'NumentaHTM': 0.5046736215146653,
  'OceanWNN': 0.9127108163693076,
  'PCI': 0.5249028567895508,
  'PS-SVM': 0.7256234473209118,
  'PST': 0.533425480100247,
  'SAND': 0.45938873242881917,
  'Series2Graph': 0.6928483595372135,
  'SR': 0.5636396408558407,
  'SR-CNN': 0.6006786382809493,
  'SSA': 0.5195171137503506,
  'STAMP': 0.5,
  'STOMP': 0.283084689233631,
  'Sub-Fast-MCD': 0.8035627761327695,
  'Sub-IF': 0.7530192118232037,
  'Sub-LOF': 0.5307594204397464,
  'S-H-ESD': 0.5643279403452636,
  'Triple ES': 0.4663128718452334,
  'TSBitmap