In [1]:
%load_ext autoreload
%autoreload 2
from pathlib import Path
import pandas as pd
import sys
sys.path.append('../../')
from omegaconf import OmegaConf
from dacite import from_dict

from tsbench.tslib.traindataset_generator import TimeSeriesTrainDatasetGeneratorConfig, TimeSeriesTrainDatasetGenerator
from tsbench.data.tslibwrapper import TsLibDatasetGenerator
from tsbench.data import get_datasetgenerator
from tsbench.ml_utils.config import NameAndKwargs

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
import numpy as np
from omegaconf import OmegaConf
from repo import REPO


# The tslibwrapper

The config is basically the same as for the tslib `TimeSeriesTrainDatasetGenerator`. We just need to wrap the config in a `NameAndKwargs` dataclass and specify the name `tslib` as the dataset. See `tsbench/data/__init__.py`.

In [3]:
# Minimal dataset config: no normalizer is configured here (see the TODO inside
# the YAML), so the time series are not normalized.
dataset_cfg_yaml = """
name: tslib
kwargs:
  pipeline:
    dataset:
      name: csvloader
      kwargs:
        data_file: ../../datafiles/har_with_smartphones/train.csv
        meta_columns: [subject, Activity]
    windowing:
      window_size: 20 # each time series for the model will have length 20
      stride: 5 # each time series will be shifted by 5
    # TODO add normalization of the features
    # store normalizer values in a file and load them in the normalizer
    target_generator:
      name: csv_classification
      kwargs:
        class_column: Activity
        class_labels: ['STANDING', 'SITTING', 'LAYING', 'WALKING', 'WALKING_DOWNSTAIRS', 'WALKING_UPSTAIRS']

  split:
    name: random_split
    kwargs:
      lengths: [0.8, 0.2] # train, val
"""

# Parse the YAML and turn it into a plain container so dacite can build the
# NameAndKwargs dataclass from it. Each stage gets its own name; only the final
# dataclass is bound to `cfg`, which is what the following cells use.
dataset_cfg_dict = OmegaConf.to_container(OmegaConf.create(dataset_cfg_yaml))
cfg = from_dict(data_class=NameAndKwargs, data=dataset_cfg_dict)

Now we again explore some basic properties of the class `TsLibDataset`. There are fewer comments this time, because it is pretty much the same as before. Feel free to explore.

In [4]:
# Look up the dataset generator for `cfg` and build the dataset; the progress
# bars and the dropped-timesteps message below are printed during this step.
ds_gen = get_datasetgenerator(cfg)
ds_gen.generate_dataset()

Sample: 100%|████████████████████████████████████████████████████████████████| 126/126 [00:00<00:00, 414.64it/s]
Sample: 100%|████████████████████████████████████████████████████████████████| 126/126 [00:00<00:00, 423.94it/s]
Generating window index: 100%|██████████████████████████████████████████████| 126/126 [00:00<00:00, 7669.50it/s]
Total number of dropped timesteps due to windowing: 272


In [5]:
# Inspect the input dimension, output dimension and context length reported by the generator.
ds_gen.input_dim, ds_gen.output_dim, ds_gen.context_length

((561,), (6, 1), 20)

In [6]:
# Keep a handle on the training split so it can be inspected on its own.
train_ds = ds_gen.train_split

In [7]:
# Display the object's repr to see which dataset class backs the training split.
train_ds

<tsbench.data.tslibwrapper.TsLibDataset at 0x7f9848117040>

In [8]:
# Fetch the first sample; it is a pair of tensors (presumably the windowed
# features and the class index of the window -- confirm in TsLibDataset).
train_ds[0]

(tensor([[-0.4581, -0.1030, -0.3039,  ..., -0.9861, -0.9724, -0.9806],
         [-0.4550, -0.1039, -0.3058,  ..., -0.9945, -0.9772, -0.9829],
         [ 0.1062, -0.3395, -0.5171,  ..., -0.9321, -0.8889, -0.9153],
         ...,
         [-0.6517, -0.0929, -0.1359,  ..., -0.9110, -0.7530, -0.5462],
         [-0.7156, -0.0548, -0.1137,  ..., -0.9908, -0.9478, -0.9370],
         [-0.6989, -0.0670, -0.1167,  ..., -0.9908, -0.9776, -0.9826]]),
 tensor(1))

In [9]:
# Show the metrics object the generator exposes for training.
ds_gen.train_metrics

MetricCollection
)

In [10]:
# Show the metrics object the generator exposes for validation.
ds_gen.validation_metrics

MetricCollection
)

# Big Config and running jobs

The ultimate goal is to create a config file that drives the training. An example of such a config is shown below.

Before running the cell below, make sure that you:
- have `repo.py` in the same directory as your notebook
- have a `configs` directory in the repository

In [11]:
# Full experiment config: run settings, seeds, a disabled example sweep, and the
# experiment_data / model / loss / trainer / data sections used for training.
config_yaml = """
run_config:
  exec_type: parallel
  hostname: yourname
  gpu_ids: [0]
  runs_per_gpu: 4
  use_cuda_visible_devices: True
  shuffle_configs: True

seeds: [0]

# Example sweep (disabled). Note that in this config the optimizer settings
# live under trainer.optimizer.kwargs, not under model.
# sweep:
#   type: grid
#   axes:
#   - parameter: trainer.optimizer.kwargs.lr
#     vals: [1e-3]
#   - parameter: data.dl_kwargs.batch_size
#     vals: [256]
#   - parameter: trainer.optimizer.kwargs.weight_decay
#     vals: [0.0, 1e-2, 1e-1]

start_num: 0
config:
  experiment_data:
    entity: fslgroup
    project_name: tsbench
    experiment_tag: 'tslib-${config.model.kwargs.block.kwargs.sequence_mix.name}'
    experiment_type: lr_${start_num}
    experiment_name: ${config.experiment_data.experiment_tag}--${config.experiment_data.experiment_type}
    experiment_dir: null
    # output_dir: ./outputs
    experiment_notes:
    seed: 0
    gpu_id: 0

  model:
    name: sequence_transformer
    kwargs:
      num_layers: 3
      embedding_dim: 64
      dropout: 0.0
      bias: True
      block:
        name: prenorm_block
        kwargs:
          feedforward:
            name: ff
          sequence_mix:
            name: causalselfattention
            kwargs:
              num_heads: 4
              use_flash: True

      encoder:
        name: linear
      decoder:
        name: sequence
        kwargs:
          agg_mode: pool
          use_lengths: False

  loss:
    name: crossentropy_sequence

  trainer:
    n_steps: 100 #20000 #20
    val_every: 10
    save_every: 25e3
    early_stopping_patience: 50000 #250 #1000
    num_workers: 4
    gradient_clip_norm: 10.0 #1.0
    training_strategy:
      enable_mixed_precision: True
      precision: bfloat16
      use_torch_compile: False
    gradient_accumulation_steps: 1 #16
    optimizer:
      name: AdamW
      kwargs:
        lr: 1e-3
        weight_decay: 0.1
        betas: [0.9, 0.99]
    lr_scheduler_step: step
    lr_scheduler: # this scheduler has warmup and is then constant
      name: warmup_cosine_annealing
      kwargs:
        warmup_steps: 100
        decay_until_step: 100 #1500 #3000
        min_lr: ${config.trainer.optimizer.kwargs.lr} # same as initial lr

  data:
    name: tslib
    kwargs:
      pipeline:
        dataset:
          name: csvloader
          kwargs:
            data_file: ../../datafiles/har_with_smartphones/train.csv
            meta_columns: [subject, Activity]
        windowing:
          window_size: 20 # each time series for the model will have length 20
          stride: 5 # each time series will be shifted by 5
        normalizer:
          normalizer_file: ../../datafiles/har_with_smartphones/normalizer.json
        target_generator:
          name: csv_classification
          kwargs:
            class_column: Activity
            class_labels: ['STANDING', 'SITTING', 'LAYING', 'WALKING', 'WALKING_DOWNSTAIRS', 'WALKING_UPSTAIRS']
        cache_processed_dataset: True

      split:
        name: random_split
        kwargs:
          lengths: [0.8, 0.2] # train, val

    stateful_train_dataset: True
    dl_kwargs:
      batch_size: 256 #32
      shuffle: True
"""

# Bound to its own name so the dataset `cfg` defined earlier in the notebook is
# not overwritten and the dataset cells can still be re-run after this one.
experiment_cfg = OmegaConf.create(config_yaml)

# Registers the experiment and returns the launch command printed below
# (override=True presumably replaces an existing experiment config -- confirm in repo.py).
run_command = REPO.create_experiment(experiment_cfg, override=True)
print(run_command)

CUDA_VISIBLE_DEVICES=0 python run.py --config-name tslib-causalselfattention--lr_0.yaml


The command printed above can be copied and pasted into your terminal to start the job.

<hr>
Some short explanations about config sections.

- **sweep**: Specifies that a hyperparameter sweep will be performed (commented out in the example above).

  - *type*: Specifies the type of sweep; in this case, it is a grid search.
      Available types are `grid`, `line`, `random`, and `random_grid`. See `~/tsbench/tsbench/ml_utils/run_utils/sweep.py` to learn about them.

  - *axes*: Describes the hyperparameters to be tuned and their possible values.

    - *parameter*: Indicates the hyperparameter being tuned.

    - *vals*: Specifies the different values that the hyperparameter will take during the sweep.

- **run_config:** Specifies the configuration for the execution, including parallelization, the host name, GPU settings, and config shuffling.

- **seeds:** List of random seeds.

- **start_num:** Starting number for the experiment.

- **config:** The main configuration section.

  - *experiment_data:* Information about the experiment, including entity, project name, tag, type, name, notes, seed, and GPU id.

  - *model:* Configuration for the model architecture, specifying the type, layers, dimensions, and other relevant parameters.

  - *loss:* Specifies the loss function for the model.

  - *trainer:* Configuration for the training process, including steps, validation frequency, save frequency, early stopping, optimizer, learning rate scheduler, and other training-related parameters.

  - *data:* Configuration for the dataset, including dataset type, data loading, windowing, normalization, target generation, and dataset splitting.

    - *pipeline:* Configuration for the data processing pipeline, including dataset loading, windowing, normalization, target generation, and caching.

    - *split:* Configuration for dataset splitting, specifying the split type and lengths.

    - *stateful_train_dataset:* Flag indicating whether the training dataset is stateful.

    - *dl_kwargs:* Configuration for the data loader, including batch size and shuffling.