# Yahoo Webscope S5 Dataset

In [11]:
import pandas as pd
import os
import shutil
from typing import Final
from pathlib import Path
from collections.abc import Callable
from config import data_raw_folder, data_processed_folder
from timeeval import Datasets

In [12]:
# Collection identifier plus the raw-input and processed-output locations,
# both derived from the folders imported from the project config.
dataset_collection_name = "WebscopeS5"
source_folder = Path(data_raw_folder).joinpath("Webscope-S5/data")
target_folder = Path(data_processed_folder)

print(
    f"Looking for source datasets in {source_folder.absolute()} and\n"
    f"saving processed datasets in {target_folder.absolute()}"
)

Looking for source datasets in /home/projects/akita/data/benchmark-data/data-raw/Webscope-S5/data and
saving processed datasets in /home/projects/akita/data/benchmark-data/data-processed


Helper functions for file handling and transformations

In [13]:
def list_regular_folders(path: str) -> list[str]:
    """Return the non-hidden sub-directories directly below ``path``.

    Args:
        path: Directory whose entries are inspected (not recursive).

    Returns:
        ``path`` joined with each entry name, for every entry that is a
        directory and whose name does not start with a dot. The order is the
        one produced by ``os.listdir``.
    """
    folders = []
    for entry in os.listdir(path):
        # dot-prefixed entries are treated as hidden and ignored
        if entry.startswith("."):
            continue
        candidate = os.path.join(path, entry)
        if os.path.isdir(candidate):
            folders.append(candidate)
    return folders

def calc_size(filename: str) -> int:
    """Count the data rows of a text file, i.e. every line after the header.

    Args:
        filename: Path of the file to scan; its first line is treated as the
            header and is not counted.

    Returns:
        Number of lines following the first one. A file that is empty or
        contains only the header yields 0.
    """
    with open(filename, 'r') as f:
        # Skip the header. The default value keeps a completely empty file
        # from leaking StopIteration out of this function.
        next(f, None)
        return sum(1 for _ in f)

def transform_datetime_column(source: str, target: str, column: str = "timestamp") -> None:
    """Convert a numeric time column of a CSV file to datetimes and save the result.

    The values of ``column`` are passed to ``pd.to_datetime`` with ``unit='h'``,
    so each number is read as a count of hours. All other columns are written
    back unchanged.

    Args:
        source: Path of the CSV file to read.
        target: Path of the CSV file to write (without the pandas index).
        column: Name of the column to convert. Defaults to ``"timestamp"``.

    Raises:
        KeyError: If ``column`` is not present in the source file.
    """
    df = pd.read_csv(source)
    # Honour the `column` argument; it used to be ignored in favour of a
    # hard-coded "timestamp".
    df[column] = pd.to_datetime(df[column], unit='h')
    df.to_csv(target, index=False)

def transform_copy(source: str, target: str) -> None:
    """Copy ``source`` to ``target`` unchanged, reporting failures instead of raising.

    Args:
        source: Path of the file to copy.
        target: Destination path passed to ``shutil.copy``.

    Copy failures are best-effort: the error is printed and the function
    returns normally.
    """
    try:
        shutil.copy(source, target)
    # `except e:` referenced an undefined name, so every copy failure turned
    # into a NameError. OSError also covers shutil.Error / SameFileError.
    except OSError as e:
        print(f"Unable to copy file. {e}")

In [14]:
# Metadata values that are identical for every dataset of this collection.
datetime_index = True
input_type = "univariate"
train_is_normal = False

# Processed files are written to <target_folder>/<input_type>/<collection name>;
# create that directory and report whether it was already there.
dataset_subfolder = os.path.join(input_type, dataset_collection_name)
target_subfolder = os.path.join(target_folder, dataset_subfolder)
try:
    os.makedirs(target_subfolder)
except FileExistsError:
    print(f"Directories {target_subfolder} already exist")
else:
    print(f"Created directories {target_subfolder}")

# Metadata manager rooted at the processed-data folder, plus a lookup from
# benchmark folder name to its full path below the source folder.
dm = Datasets(target_folder)
benchmark_paths = {
    os.path.basename(folder): folder
    for folder in list_regular_folders(source_folder)
}
benchmark_paths

Directories /home/projects/akita/data/benchmark-data/data-processed/univariate/WebscopeS5 already exist


{'A2Benchmark': '/home/projects/akita/data/benchmark-data/data-raw/Webscope-S5/data/A2Benchmark',
 'A3Benchmark': '/home/projects/akita/data/benchmark-data/data-raw/Webscope-S5/data/A3Benchmark',
 'A4Benchmark': '/home/projects/akita/data/benchmark-data/data-raw/Webscope-S5/data/A4Benchmark',
 'A1Benchmark': '/home/projects/akita/data/benchmark-data/data-raw/Webscope-S5/data/A1Benchmark'}

## A1Benchmark and A2Benchmark datasets

In [15]:
# shared by the datasets from A1 and A2
train_type = "unsupervised"

for benchmark, dataset_type in [("A1Benchmark", "real"), ("A2Benchmark", "synthetic")]:
    # dataset transformation: A1 files get their timestamp column converted to
    # datetimes, A2 files are copied unchanged
    transform_file: Callable[[str, str], None] = transform_datetime_column if benchmark == "A1Benchmark" else transform_copy

    for f in os.listdir(benchmark_paths[benchmark]):
        source_file = os.path.join(benchmark_paths[benchmark], f)
        # Skip sub-directories and JSON files. The previous test `f == ".json"`
        # only matched a file named exactly ".json", so JSON files in the
        # benchmark folder were processed as if they were CSV datasets.
        if os.path.isfile(source_file) and not f.endswith(".json"):
            # get file number for target filename (source names look like "real_6.csv")
            dataset_number = os.path.splitext(os.path.basename(f))[0].split("_")[1]
            filename = f"{benchmark}-{dataset_number}.test.csv"

            # save metadata; the test path is stored relative to the processed-data root
            dataset_name = filename.split(".")[0]
            path = os.path.join(dataset_subfolder, filename)
            target_filepath = os.path.join(target_subfolder, filename)
            dataset_length = calc_size(source_file)
            dm.add_dataset((dataset_collection_name, dataset_name),
                train_path = None,
                test_path = path,
                dataset_type = dataset_type,
                datetime_index = datetime_index,
                split_at = None,
                train_type = train_type,
                train_is_normal = train_is_normal,
                input_type = input_type,
                dataset_length = dataset_length
            )
            # transform file
            transform_file(source_file, target_filepath)
            print(f"Processed source dataset {source_file} -> {target_filepath}")
        else:
            print(f"Skipped source {source_file}")

# save metadata of benchmark
dm.save()

Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Webscope-S5/data/A1Benchmark/real_6.csv -> /home/projects/akita/data/benchmark-data/data-processed/univariate/WebscopeS5/A1Benchmark-6.test.csv
Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Webscope-S5/data/A1Benchmark/real_3.csv -> /home/projects/akita/data/benchmark-data/data-processed/univariate/WebscopeS5/A1Benchmark-3.test.csv
Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Webscope-S5/data/A1Benchmark/real_40.csv -> /home/projects/akita/data/benchmark-data/data-processed/univariate/WebscopeS5/A1Benchmark-40.test.csv
Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Webscope-S5/data/A1Benchmark/real_20.csv -> /home/projects/akita/data/benchmark-data/data-processed/univariate/WebscopeS5/A1Benchmark-20.test.csv
Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Webscope-S5/data/A1Benchmark/real_4.csv -> /h

## A3Benchmark and A4Benchmark datasets

In [19]:
# Metadata values applied to every dataset from A3 and A4.
input_type = "univariate"
datetime_index = True
train_is_normal = False
train_type = "unsupervised"
dataset_type = "synthetic"

# dataset transformation
def transform_clean(source: str, target: str) -> None:
    """Reduce a source CSV to ``timestamp``, ``value`` and ``is_anomaly`` columns.

    Args:
        source: Path of the CSV file to read; the code accesses its
            ``timestamps``, ``value``, ``anomaly`` and ``changepoint`` columns.
        target: Path of the CSV file to write (without the pandas index).
    """
    raw = pd.read_csv(source)
    cleaned = raw.assign(
        # "timestamps" holds unix timestamps in seconds
        timestamp=pd.to_datetime(raw["timestamps"], unit='s'),
        # a point counts as anomalous if it is an anomaly or a changepoint
        is_anomaly=raw["anomaly"] | raw["changepoint"],
    )
    # keep only the three output columns; any other source column
    # (e.g. seasonality and trend information) is dropped
    cleaned = cleaned.loc[:, ["timestamp", "value", "is_anomaly"]]
    cleaned.to_csv(target, index=False)

for benchmark in ("A3Benchmark", "A4Benchmark"):
    # every file of these two benchmarks goes through the same cleaning step
    transform_file: Callable[[str, str], None] = transform_clean

    for f in os.listdir(benchmark_paths[benchmark]):
        source_file = os.path.join(benchmark_paths[benchmark], f)

        # guard clause: ignore anything that is not a regular file, and the
        # files whose name ends in "all.csv"
        if not os.path.isfile(source_file) or f.endswith("all.csv"):
            print(f"Skipped source {source_file}")
            continue

        # source names look like "A3Benchmark-TS76.csv"; take the part after
        # the dash and drop its first two characters to get the number
        dataset_number = os.path.splitext(f)[0].split("-")[1][2:]
        filename = f"{benchmark}-{dataset_number}.test.csv"

        # metadata: the test path is stored relative to the processed-data root
        dataset_name = filename.partition(".")[0]
        path = os.path.join(dataset_subfolder, filename)
        target_filepath = os.path.join(target_subfolder, filename)
        dataset_length = calc_size(source_file)
        dm.add_dataset(
            (dataset_collection_name, dataset_name),
            train_path=None,
            test_path=path,
            dataset_type=dataset_type,
            datetime_index=datetime_index,
            split_at=None,
            train_type=train_type,
            train_is_normal=train_is_normal,
            input_type=input_type,
            dataset_length=dataset_length,
        )

        # write the cleaned file to its target location
        transform_file(source_file, target_filepath)
        print(f"Processed source dataset {source_file} -> {target_filepath}")

# persist the metadata collected for both benchmarks
dm.save()

Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Webscope-S5/data/A3Benchmark/A3Benchmark-TS76.csv -> /home/projects/akita/data/benchmark-data/data-processed/univariate/WebscopeS5/A3Benchmark-76.test.csv
Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Webscope-S5/data/A3Benchmark/A3Benchmark-TS67.csv -> /home/projects/akita/data/benchmark-data/data-processed/univariate/WebscopeS5/A3Benchmark-67.test.csv
Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Webscope-S5/data/A3Benchmark/A3Benchmark-TS97.csv -> /home/projects/akita/data/benchmark-data/data-processed/univariate/WebscopeS5/A3Benchmark-97.test.csv
Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Webscope-S5/data/A3Benchmark/A3Benchmark-TS98.csv -> /home/projects/akita/data/benchmark-data/data-processed/univariate/WebscopeS5/A3Benchmark-98.test.csv
Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Websc

In [20]:
# Refresh the metadata manager before inspecting it.
# NOTE(review): presumably this re-reads the index written by dm.save() - confirm against the Datasets API.
dm.refresh()
# Display only the rows of this collection: the first index level (collection_name)
# is sliced from the collection name to itself, and slice(None) keeps every
# dataset_name on the second level.
dm.df().loc[(slice(dataset_collection_name, dataset_collection_name), slice(None))]

Unnamed: 0_level_0,Unnamed: 1_level_0,train_path,test_path,dataset_type,datetime_index,split_at,train_type,train_is_normal,input_type,length
collection_name,dataset_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
WebscopeS5,A1Benchmark-1,,univariate/WebscopeS5/A1Benchmark-1.test.csv,real,True,,unsupervised,False,univariate,1420
WebscopeS5,A1Benchmark-10,,univariate/WebscopeS5/A1Benchmark-10.test.csv,real,True,,unsupervised,False,univariate,1439
WebscopeS5,A1Benchmark-11,,univariate/WebscopeS5/A1Benchmark-11.test.csv,real,True,,unsupervised,False,univariate,1439
WebscopeS5,A1Benchmark-12,,univariate/WebscopeS5/A1Benchmark-12.test.csv,real,True,,unsupervised,False,univariate,1439
WebscopeS5,A1Benchmark-13,,univariate/WebscopeS5/A1Benchmark-13.test.csv,real,True,,unsupervised,False,univariate,1439
WebscopeS5,...,...,...,...,...,...,...,...,...,...
WebscopeS5,A4Benchmark-95,,univariate/WebscopeS5/A4Benchmark-95.test.csv,synthetic,True,,unsupervised,False,univariate,1680
WebscopeS5,A4Benchmark-96,,univariate/WebscopeS5/A4Benchmark-96.test.csv,synthetic,True,,unsupervised,False,univariate,1680
WebscopeS5,A4Benchmark-97,,univariate/WebscopeS5/A4Benchmark-97.test.csv,synthetic,True,,unsupervised,False,univariate,1680
WebscopeS5,A4Benchmark-98,,univariate/WebscopeS5/A4Benchmark-98.test.csv,synthetic,True,,unsupervised,False,univariate,1680


In [21]:
# Summary over the whole metadata frame: number of registered datasets and the
# sum of their "length" column.
points = int(dm._df["length"].sum())
print(f"Datasets\t{dm._df.shape[0]}")
print(f"Combined length\t{points}")

Datasets	733
Combined length	123542562


## Testing code

In [None]:
# Scratch cell for trying out the metadata API; it has no execution count, i.e. it
# was not run as part of the notebook.
# NOTE(review): `DatasetMetadata` and `split_at` are not defined or imported in any
# cell visible here, so this cell looks like it would fail on a fresh kernel -
# confirm whether it is still needed or should be removed.
# It reuses `dataset_name`, `path` and `dataset_length` as left behind by the last
# iteration of the processing loops above.
with DatasetMetadata(".") as dm:
    # first variant: one dataset passed as positional arguments
    dm.add_datasets(
        dataset_name,
        dataset_collection_name,
        None,
        path,
        "real",
        datetime_index,
        split_at,
        train_type,
        train_is_normal,
        input_type,
        dataset_length
    )
dm.refresh()
# second variant: a list of dict records; the two entries differ only in
# "dataset_name" and "train_type"
data = [{
    "dataset_name": dataset_name,
    "collection_name": dataset_collection_name,
    "test_path": path,
    "dataset_type": dataset_type,
    "datetime_index": datetime_index,
    "split_at": split_at,
    "train_type": "unknown",
    "train_is_normal": train_is_normal,
    "input_type": input_type,
    "length": dataset_length
},{
    "dataset_name": "A1Benchmark-18",
    "collection_name": dataset_collection_name,
    "test_path": path,
    "dataset_type": dataset_type,
    "datetime_index": datetime_index,
    "split_at": split_at,
    "train_type": train_type,
    "train_is_normal": train_is_normal,
    "input_type": input_type,
    "length": dataset_length
}]
dm.add_datasets(data)
# NOTE(review): reads the private `_df` attribute to display the resulting frame
dm._df