# GutenTAG data generator

This notebook generates new synthetic time-series datasets with GutenTAG. Many aspects are configurable: you can choose the base oscillation, the number and type of anomalies, where they appear in the time series, and more.

## Install necessary libraries

In [10]:
!pip install gutenTAG

[31mERROR: Could not find a version that satisfies the requirement gutenTAG (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for gutenTAG[0m[31m
[0m

## Import libraries

In [11]:
from gutenTAG import GutenTAG, TrainingType
import random
import json
import os

## Create the folders to save the datasets and their configuration info

In [12]:
# `output_folder` receives the generated CSV files; `infos_folder` (nested
# inside it) receives the per-dataset configuration dumps.
output_folder = "data/generated_data"
infos_folder = "data/generated_data/info_generated_data"

# Create both folders; already existing folders are left untouched.
for folder in (output_folder, infos_folder):
    os.makedirs(folder, exist_ok=True)

## Fixed configurations

In [13]:
# Size of the generated collection and of each time series.
num_datasets = 20
length = 10000

# Every time series uses the same base oscillation; anomalies are placed at
# one of these positions.
base_oscillations = [{"kind": "cylinder-bell-funnel"}]
anomalies_positions = ["beginning", "middle", "end"]

# Inclusive bounds for the number of anomalies drawn per dataset.
min_anomalies_per_dataset = 1
max_anomalies_per_dataset = 5

# Counters updated during generation and inspected at the end of the notebook.
stats_anomaly_position = dict.fromkeys(anomalies_positions, 0)
stats_anomaly_kind = dict.fromkeys(("amplitude", "pattern", "platform", "trend"), 0)

# One seed per dataset, derived from a fixed master seed for reproducibility.
random.seed(42)
random_seeds = [random.randint(0, 50 * num_datasets) for _ in range(num_datasets)]

In [14]:
# Reset the counters in place so that re-running this cell does not count the
# same anomalies a second time.
for counter in (stats_anomaly_position, stats_anomaly_kind):
    for key in counter:
        counter[key] = 0

for i in range(num_datasets):
    # Re-seed per dataset so that each dataset depends only on its own seed.
    random.seed(random_seeds[i])
    name = "dataset_" + str(i)

    anomalies = []
    for _ in range(random.randint(min_anomalies_per_dataset, max_anomalies_per_dataset)):
        # All candidate kinds are built (and their parameters drawn) on every
        # iteration even though only one is kept: this keeps the sequence of
        # random draws, and therefore the generated configurations, stable.
        anomalies_kinds = [
            {"kind": "amplitude", "amplitude_factor": random.uniform(1.5, 5.0)},
            {"kind": "pattern", "cbf_pattern_factor": random.uniform(5.0, 20.0)},
            {"kind": "platform", "value": random.uniform(0.0, 5.0)},
            {"kind": "trend", "oscillation": {"kind": "cylinder-bell-funnel"}},
        ]
        # The draw order (position, length, kind) must not change, otherwise
        # the same seeds would produce different configurations.
        position = anomalies_positions[random.randint(0, len(anomalies_positions) - 1)]
        anomaly_length = 5 * random.randint(1, 20)  # multiple of 5 between 5 and 100
        kind = anomalies_kinds[random.randint(0, len(anomalies_kinds) - 1)]
        anomalies.append({"position": position, "length": anomaly_length, "kinds": [kind]})

    config = {
        "timeseries": [
            {
                "name": name,
                "length": length,
                "base-oscillations": base_oscillations,
                "anomalies": anomalies
            }
        ]
    }
    gutentag = GutenTAG(seed=random.randint(0, 100))
    gutentag.load_config_dict(config)

    # call generate() to create the datasets (in-memory)
    datasets = gutentag.generate(return_timeseries=True)

    # we only defined a single test time series
    assert len(datasets) == 1
    d = datasets[0]
    assert d.name == name
    assert d.training_type == TrainingType.TEST

    # the data points are stored at
    df = d.timeseries

    # Add a 'timestamp' column as the first column
    df.insert(0, 'timestamp', range(len(df)))

    # Data stats
    for anomaly in config['timeseries'][0]['anomalies']:
        stats_anomaly_position[anomaly['position']] += 1
        stats_anomaly_kind[anomaly['kinds'][0]['kind']] += 1

    # Save dataset info (JSON content) to a TXT file
    file_path = os.path.join(infos_folder, f"{name}.txt")
    with open(file_path, "w") as file:
        json.dump(config["timeseries"][0], file, indent=4)

    # Save the dataset to a CSV file
    output_file = os.path.join(output_folder, f"{name}.test.csv")
    df.to_csv(output_file, index=False)

Initializing addons: 0it [00:00, ?it/s]
Generating datasets:   0%|          | 0/1 [00:00<?, ?it/s]
Finalizing addons: 0it [00:00, ?it/s]
Initializing addons: 0it [00:00, ?it/s]
Generating datasets:   0%|          | 0/1 [00:00<?, ?it/s]
Finalizing addons: 0it [00:00, ?it/s]
Initializing addons: 0it [00:00, ?it/s]
Generating datasets:   0%|          | 0/1 [00:00<?, ?it/s]
Finalizing addons: 0it [00:00, ?it/s]
Initializing addons: 0it [00:00, ?it/s]
Generating datasets:   0%|          | 0/1 [00:00<?, ?it/s]
Finalizing addons: 0it [00:00, ?it/s]
Initializing addons: 0it [00:00, ?it/s]
Generating datasets:   0%|          | 0/1 [00:00<?, ?it/s]
Finalizing addons: 0it [00:00, ?it/s]
Initializing addons: 0it [00:00, ?it/s]
Generating datasets:   0%|          | 0/1 [00:00<?, ?it/s]
Finalizing addons: 0it [00:00, ?it/s]
Initializing addons: 0it [00:00, ?it/s]
Generating datasets:   0%|          | 0/1 [00:00<?, ?it/s]
Finalizing addons: 0it [00:00, ?it/s]
Initializing addons: 0it [00:00, ?it/s]
G

## Stats

In [15]:
# Number of generated anomalies per position, as counted by the generation loop.
stats_anomaly_position

{'beginning': 17, 'middle': 17, 'end': 20}

In [16]:
# Number of generated anomalies per kind, as counted by the generation loop.
stats_anomaly_kind

{'amplitude': 18, 'pattern': 14, 'platform': 9, 'trend': 13}