# Code to obtain scores (plain vanilla)

This code only serves as a tutorial on how to set up and run the different algorithms. There is no need to run it if the performance_evaluation, gridsearch and ensemble notebooks work.

## Install necessary libraries

In [12]:
!pip install docker
!pip install timeeval --no-deps
!pip install durations numpyencoder distributed prts

    scikit-learn (>=0.24.*)
                  ~~~~~~~^[0m[33m
    scikit-learn (>=0.24.*)
                  ~~~~~~~^[0m[33m
[0m

## Import libraries

In [13]:
import json
import os
import shutil
import subprocess

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc
from timeeval.utils.window import ReverseWindowing

## Check if Docker is installed and running (if not, install it from https://docs.docker.com/engine/install/ and open the app so that Docker can run in the background)

In [1]:
def verify_docker():
    """Ensure the Docker CLI is installed and the Docker daemon is reachable.

    Returns:
        None. The function only signals problems by raising.

    Raises:
        EnvironmentError: If the ``docker`` executable is not on the PATH, or
            if ``docker info`` exits with a non-zero status (the CLI is
            present but the daemon is not running or not reachable).
    """
    if not shutil.which("docker"):
        raise EnvironmentError("Docker is not installed or not in your PATH. Please install Docker.")

    # `docker info` has to talk to the daemon, so a non-zero exit status means
    # the CLI exists but Docker itself is not running in the background.
    daemon_check = subprocess.run(["docker", "info"], capture_output=True, text=True)
    if daemon_check.returncode != 0:
        raise EnvironmentError(
            "Docker is installed but the Docker daemon is not running. "
            "Please start Docker and try again."
        )

## Pull Docker images

In [None]:
def pull_docker_images(images=None):
    """Pull the Docker images of the algorithms used in this notebook.

    Pulling is best-effort: a failure for one image is printed and the
    remaining images are still attempted.

    Args:
        images: Optional iterable of Docker image references to pull. When
            omitted, the CBLOF image is pulled.

    Returns:
        None. Progress and failures are reported via ``print``.
    """
    if images is None:
        images = [
            "ghcr.io/timeeval/cblof:0.3.0"
        ]

    # Pull (not build) each image, requesting the same linux/amd64 platform
    # that run_algorithm passes to `docker run`.
    for image in images:
        try:
            subprocess.run(["docker", "pull", "--platform", "linux/amd64", image], check=True)
            print(f"Successfully pulled {image}.")
        except (subprocess.CalledProcessError, OSError) as e:
            # OSError covers the docker executable itself being missing, so
            # that case is reported like any other failed pull.
            print(f"Failed to pull {image}: {e}")


# Pull the image(s) listed in pull_docker_images when this cell runs.
pull_docker_images()

## List the algorithms you wish to run with their respective Docker images (can be found at https://github.com/TimeEval/TimeEval-algorithms)

In [14]:
# Maps each algorithm name accepted by run_algorithm to the Docker image it is run with.
ALGORITHMS = {"CBLOF": "ghcr.io/timeeval/cblof:0.3.0"}

## General function allowing you to run the algorithms

In [15]:
def run_algorithm(algorithm_name, data_path, execution_type="execute", custom_parameters=None):
    """Run an algorithm's Docker image on a dataset and return its anomaly scores.

    The current working directory's ``data`` folder is mounted read-only at
    ``/data`` and its ``results`` folder read-write at ``/results``.

    Args:
        algorithm_name: Key of the ALGORITHMS dictionary selecting the image.
        data_path: Path of the dataset relative to ``./data``.
        execution_type: Value sent as ``executionType``. ``"train"`` writes
            the model to ``/results/trained_model.pkl``; any other value
            reads the model from that path.
        custom_parameters: Dict sent as ``customParameters``; ``None`` is
            sent as an empty dict.

    Returns:
        The contents of ``./results/anomaly_scores.csv`` as loaded by
        ``np.loadtxt``. ``None`` is returned when ``execution_type`` is
        ``"train"`` and the container produced no scores file.

    Raises:
        ValueError: If ``algorithm_name`` is not a key of ALGORITHMS.
        RuntimeError: If the Docker command exits with a non-zero status.
    """
    # Check if Docker image of algorithm was specified
    if algorithm_name not in ALGORITHMS:
        raise ValueError(f"Algorithm '{algorithm_name}' not found in ALGORITHMS dictionary.")

    cwd = os.getcwd()
    results_dir = os.path.join(cwd, "results")
    scores_path = os.path.join(results_dir, "anomaly_scores.csv")

    # Create the results folder up front; otherwise Docker creates the missing
    # bind-mount source itself (typically root-owned on Linux).
    os.makedirs(results_dir, exist_ok=True)

    # Drop scores left over from an earlier run so that a run which writes no
    # scores can never silently return stale results.
    if os.path.exists(scores_path):
        os.remove(scores_path)

    # Construct configuration
    is_training = execution_type == "train"
    config = {
        "executionType": execution_type,
        "dataInput": f"/data/{data_path}",
        "dataOutput": "/results/anomaly_scores.csv",
        "modelInput": "/results/model.pkl" if is_training else "/results/trained_model.pkl",
        "modelOutput": "/results/trained_model.pkl" if is_training else "/results/model.pkl",
        "customParameters": custom_parameters or {}
    }

    config_json = json.dumps(config)

    # Build Docker command
    cmd = [
        "docker", "run", "--rm",
        "-v", f"{cwd}/data:/data:ro",
        "-v", f"{cwd}/results:/results:rw",
        "--platform", "linux/amd64",  # Ensures compatibility on ARM machines (might need to be adjusted depending on the architecture)!
        ALGORITHMS[algorithm_name],
        "execute-algorithm",
        config_json
    ]

    # Execute Docker command
    result = subprocess.run(cmd, capture_output=True, text=True)

    # Check for errors
    if result.returncode != 0:
        print(f"Error running {algorithm_name}: {result.stderr}")
        raise RuntimeError(f"Execution failed for {algorithm_name}")

    # A training run that wrote no scores has nothing to load.
    if is_training and not os.path.exists(scores_path):
        return None

    # Load and return anomaly scores
    scores = np.loadtxt(scores_path, delimiter=",")
    return scores

## Function that evaluates the algorithm based on the produced anomaly scores

In [16]:
def evaluate_algorithm(scores, data_path):
    """Compute AUC-ROC and AUC-PR of anomaly scores against labelled test data.

    Args:
        scores: One anomaly score per row of the test data.
        data_path: Path of the test CSV as passed to ``pd.read_csv``. Note
            that, unlike in run_algorithm, it is not resolved relative to
            ``./data``. The file must contain an ``is_anomaly`` column.

    Returns:
        Tuple ``(auc_roc, auc_pr)``.

    Raises:
        ValueError: If the CSV has no ``is_anomaly`` column, or if the number
            of scores differs from the number of labelled rows.
    """
    # Load test data
    data_test = pd.read_csv(data_path)

    # Ensure 'is_anomaly' column exists in the data
    if 'is_anomaly' not in data_test.columns:
        raise ValueError(f"The test data at {data_path} must contain an 'is_anomaly' column.")

    # Extract the anomaly labels
    anomalies = data_test['is_anomaly']

    # Fail early with a readable message when scores and labels do not line up
    # (e.g. windowed scores that were not post-processed back to point scores).
    scores = np.atleast_1d(scores)
    if len(scores) != len(anomalies):
        raise ValueError(
            f"Length mismatch: got {len(scores)} scores but {len(anomalies)} labels in {data_path}."
        )

    # Calculate AUC-ROC and AUC-PR
    auc_roc = roc_auc_score(anomalies, scores)
    precision, recall, _ = precision_recall_curve(anomalies, scores)
    auc_pr = auc(recall, precision)
    return auc_roc, auc_pr

In [17]:
# Post-processing for STAMP
def post_stamp(scores: np.ndarray, args: dict) -> np.ndarray:
    """Post-process STAMP scores with ReverseWindowing.

    The window size is read from ``args["hyper_params"]["anomaly_window_size"]``
    (30 when absent) and raised to 4, with a printed warning, if it is smaller.
    """
    hyper_params = args.get("hyper_params", {})
    anomaly_window = hyper_params.get("anomaly_window_size", 30)
    if anomaly_window < 4:
        print("WARN: window_size must be at least 4. Dynamically fixing it by setting window_size to 4")
        anomaly_window = 4
    reverser = ReverseWindowing(window_size=anomaly_window)
    return reverser.fit_transform(scores)

# Post-processing for STOMP
def post_stomp(scores: np.ndarray, args: dict) -> np.ndarray:
    """Post-process STOMP scores with ReverseWindowing.

    The window size is read from ``args["hyper_params"]["anomaly_window_size"]``
    (30 when absent) and raised to 4, with a printed warning, if it is smaller.
    """
    hyper_params = args.get("hyper_params", {})
    anomaly_window = hyper_params.get("anomaly_window_size", 30)
    if anomaly_window < 4:
        print("WARN: window_size must be at least 4. Dynamically fixing it by setting window_size to 4")
        anomaly_window = 4
    reverser = ReverseWindowing(window_size=anomaly_window)
    return reverser.fit_transform(scores)

# Post-processing for Sub-LOF
def post_sLOF(scores: np.ndarray, args: dict) -> np.ndarray:
    """Post-process Sub-LOF scores with ReverseWindowing.

    The window size is read from ``args["hyper_params"]["window_size"]``
    (100 when absent) and used as-is.
    """
    hyper_params = args.get("hyper_params", {})
    lof_window = hyper_params.get("window_size", 100)
    reverser = ReverseWindowing(window_size=lof_window)
    return reverser.fit_transform(scores)

# Post-processing for VALMOD
def post_valmod(scores: np.ndarray, args: dict) -> np.ndarray:
    """Post-process VALMOD scores with ReverseWindowing.

    The window size is read from
    ``args["hyper_params"]["min_anomaly_window_size"]`` (30 when absent) and
    silently raised to 4 if it is smaller.
    """
    hyper_params = args.get("hyper_params", {})
    smallest_window = hyper_params.get("min_anomaly_window_size", 30)
    if smallest_window < 4:
        smallest_window = 4
    reverser = ReverseWindowing(window_size=smallest_window)
    return reverser.fit_transform(scores)

## Example run for CBLOF

In [18]:
# Run CBLOF on an example dataset
try:
    # Score the GutenTAG example test series with the CBLOF image.
    anomaly_scores = run_algorithm(
        "CBLOF",
        "GutenTAG/cbf-trend-quadratic/test.csv",
        execution_type="execute",
        custom_parameters=dict(n_clusters=50, alpha=0.9, beta=5, use_weights=False),
    )
    print("Anomaly scores:", anomaly_scores)
except Exception as err:
    # Tutorial cell: report the failure message instead of a full traceback.
    print(f"Error: {err}")

Anomaly scores: [0.72792853 0.68567176 0.78575383 ... 1.01723518 1.22767215 1.0791532 ]
