In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
mapping_alg = pd.read_csv("algorithm_mapping.csv")
mapping_alg

Unnamed: 0,official_name,cite_key,folder
0,NoveltySVR,~\cite{ma2003online},novelty_svr
1,PS-SVM,~\cite{ma2003time},phasespace_svm
2,EmsembleGI,~\cite{gao2020ensemble},ensemble_gi
3,GrammarViz,~\cite{senin2015time},grammarviz3
4,HOT SAX,~\cite{keogh2005hot},hotsax
5,TSBitmap,~\cite{wei2005assumption},ts_bitmap
6,NormA-SJ,~\cite{boniol2021unsupervised},norma
7,SAND,~\cite{boniol2021sand},sand
8,Series2Graph,~\cite{boniol2022series2graph},series2graph
9,STAMP,~\cite{yeh2016matrix},stamp


In [3]:
def remove_long_shutdown(numbers, num_consecutive, missing_label):
    chunks = []
    current_chunk = []

    i = 0
    while i < len(numbers)-1:
        num = numbers[i]
        if num != missing_label:
            current_chunk.append(i)
        else:
            j = i+1
            while j < len(numbers):
                if numbers[j] == missing_label:
                    j += 1
                else:
                    break

            if j-i < num_consecutive:
                current_chunk += range(i,min(j+1, len(numbers)))# numbers[i:j+1]
            else:
                chunks.append(current_chunk)
                current_chunk = []

            i=j         

        i+= 1

    # Append the last chunk
    if current_chunk:
        chunks.append(current_chunk)

    to_ret = []
    # Print the chunks
    for i, chunk in enumerate(chunks, 1):
        to_ret += chunk
        
    return to_ret

In [4]:
from scipy.sparse import csc_matrix, hstack
import numpy as np
# code taken from https://github.com/HPI-Information-Systems/TimeEval-algorithms
def post_grammarviz(algorithm_parameter):
    if isinstance(algorithm_parameter, np.ndarray):
        results = pd.DataFrame(algorithm_parameter, columns=["index", "score", "length"])
        results = results.set_index("index")
    else:
        results = pd.read_csv(algorithm_parameter, header=None, index_col=0, names=["index", "score", "length"])
    anomalies = results[results["score"] > .0]

    # use scipy sparse matrix to save memory
    matrix = csc_matrix((len(results), 1), dtype=np.float64)
    counts = np.zeros(len(results))
    for i, row in anomalies.iterrows():
        idx = int(row.name)
        length = int(row["length"])
        tmp = np.zeros(len(results))
        tmp[idx:idx + length] = np.repeat([row["score"]], repeats=length)
        tmp = tmp.reshape(-1, 1)
        matrix = hstack([matrix, tmp])
        counts[idx:idx + length] += 1
    sums = matrix.sum(axis=1)
    counts = counts.reshape(-1, 1)
    scores = np.zeros_like(sums)
    np.divide(sums, counts, out=scores, where=counts != 0)
    # returns the completely flattened array (from `[[1.2], [2.3]]` to `[1.2, 2.3]`)
    return scores.A1  # type: ignore

In [6]:
import os
import pandas as pd
import numpy as np
import sklearn
from sklearn import metrics
import traceback
import math
from scipy.sparse import csc_matrix, hstack

# code taken from https://github.com/HPI-Information-Systems/TimeEval-algorithms
def post_grammarviz(algorithm_parameter):
    if isinstance(algorithm_parameter, np.ndarray):
        results = pd.DataFrame(algorithm_parameter, columns=["index", "score", "length"])
        results = results.set_index("index")
    else:
        results = pd.read_csv(algorithm_parameter, header=None, index_col=0, names=["index", "score", "length"])
    anomalies = results[results["score"] > .0]

    # use scipy sparse matrix to save memory
    matrix = csc_matrix((len(results), 1), dtype=np.float64)
    counts = np.zeros(len(results))
    for i, row in anomalies.iterrows():
        idx = int(row.name)
        length = int(row["length"])
        tmp = np.zeros(len(results))
        tmp[idx:idx + length] = np.repeat([row["score"]], repeats=length)
        tmp = tmp.reshape(-1, 1)
        matrix = hstack([matrix, tmp])
        counts[idx:idx + length] += 1
    sums = matrix.sum(axis=1)
    counts = counts.reshape(-1, 1)
    scores = np.zeros_like(sums)
    np.divide(sums, counts, out=scores, where=counts != 0)
    # returns the completely flattened array (from `[[1.2], [2.3]]` to `[1.2, 2.3]`)
    return scores.A1  # type: ignore

def calculate_roc(path_to_result, label_file, data_source, feasibility=None, validation=None):
    try:
        roc_scores = dict()
        roc_scores_file = dict()
        pr_scores = dict()
        pr_scores_file = dict()
        
        i = 0
        for result_file in os.listdir(path_to_result):
            print(f"{i}/{len(os.listdir(path_to_result))}: {result_file}                   ", end="\r")
            i += 1
            
            if data_source in result_file and result_file.endswith(".ts"):
                alg_name = result_file[:result_file.find(data_source)]
                alg_name = '_'.join(alg_name.split('_')[1:])[:-1]
                try:
                    official_name = mapping_alg[mapping_alg.folder == alg_name].official_name.values[0]
                    roc_auc, pr_auc = 0, 0
                    
                    if official_name == "GrammarViz":
                        result_alg = post_grammarviz(path_to_result + result_file)
                    else:
                        result_alg = np.loadtxt(path_to_result + result_file)
                        
                    result_alg = pd.DataFrame(result_alg, columns=["alg_anomaly_score"])
                    result_alg["alg_anomaly_score"] = result_alg["alg_anomaly_score"].astype(float)
                    result_alg = result_alg.fillna(0)
                    result_alg.replace(np.inf, 1, inplace=True)
                    result_alg.replace(-np.inf, 1, inplace=True)

                    label = pd.read_csv(label_file)
                    label = label[-len(result_alg):]
                    label = label.reset_index(drop=True)

                    if feasibility:
                        label_feasibility = pd.read_csv(feasibility)

                    total = pd.concat([result_alg, label], axis=1, join='inner')
                    if feasibility:
                        total = total[:len(total)-len(label_feasibility)]

                    total = total.reset_index(drop=True)
                    total = pd.concat([result_alg, label], axis=1, join='inner')
                    observed_values = total.value.values.tolist()
                    observed_values = [int(x) for x in observed_values]
                    to_keep_comparision = remove_long_shutdown(observed_values, THRESHOLD_INDICATING_SHUTDOWN, MISSING_VALUE)
                    total=total[total.index.isin(to_keep_comparision)]
                    
                    try:
                        roc_auc = metrics.roc_auc_score(total.is_anomaly, total.alg_anomaly_score)
                    except:
                        roc_auc = 0

                    try:
                        y, x, _ = metrics.precision_recall_curve(total.is_anomaly, total.alg_anomaly_score)
                        pr_auc = metrics.auc(x, y)
                    except:
                        pr_auc = 0
                    
                    if official_name not in roc_scores.keys():
                        roc_scores[official_name] = roc_auc
                        pr_scores[official_name] = pr_auc
                        roc_scores_file[official_name] = pr_scores_file[official_name] = result_file
                    else:
                        if roc_auc > roc_scores[official_name]:
                            roc_scores[official_name] = roc_auc
                            roc_scores_file[official_name] = result_file
                        if pr_auc > pr_scores[official_name]:
                            pr_scores[official_name] = pr_auc
                            pr_scores_file[official_name] = result_file
                        
                except:
                    #traceback.print_exc()
                    try:
                        if official_name not in roc_scores.keys():
                            roc_scores[official_name] = max(0, roc_auc)
                            pr_scores[official_name] = max(0, pr_auc)
                        else:
                            if roc_auc > roc_scores[official_name]:
                                roc_scores[official_name] = max(0, roc_auc)
                            if pr_auc > pr_scores[official_name]:
                                pr_scores[official_name] =max(0, pr_auc)
                    except:
                        pass
        
        return roc_scores, pr_scores
    except:
        pass

In [9]:
THRESHOLD_INDICATING_SHUTDOWN = 30
MISSING_VALUE = -999
OPERATION_VAL_RANGE = (713.682, 763.826)

path_to_result = "./experiments/"
label_file = "../../01_data/01_label/Tide_pressure.csv"
feasibility = "../../01_data/01_label/Tide_pressure.bechmark_stage.csv"
validation = None
data_source = "Tide_Pressure"
calculate_roc(path_to_result, label_file, data_source, feasibility,validation)

197/198: adapad_subsequence_fast_mcd_Tide_Pressure_increased_00000.ts                   

({'Bagel': 0.7407473406139005,
  'Donut': 0.7911282125190542,
  'RForest': 0.8066295907280518,
  'XGBoosting': 0.8085833485283902,
  'HealthESN': 0.7704684949961799,
  'IE-CAE': 0.414203434936776,
  'OceanWNN': 0.772076031543431,
  'SR-CNN': 0.3793695834645634,
  'Sub-Fast-MCD': 0.8283779156972777},
 {'Bagel': 0.4793123512255489,
  'Donut': 0.5735143263891832,
  'RForest': 0.6131089646048136,
  'XGBoosting': 0.6132794167545834,
  'HealthESN': 0.15188696643360797,
  'IE-CAE': 0.005316275183668032,
  'OceanWNN': 0.6144810954957423,
  'SR-CNN': 0.021933582116022192,
  'Sub-Fast-MCD': 0.059507383203809994})