In [1]:
%load_ext autoreload
%autoreload 2
import torch
import numpy as np
import pandas as pd
from omegaconf import OmegaConf
# plotting
# mycode
from repo import REPO
import sys
sys.path.append('../..')

# Experiments on sCIFAR

## Setup Transformer with Attention

Train a Transformer that uses causal self-attention as its sequence-mixing layer on sequential CIFAR-10 (sCIFAR-10).

In [2]:
# Experiment definition: sequence transformer with causal self-attention as the
# sequence mixer, trained on sCIFAR-10.
#
# The YAML below is parsed with OmegaConf. `${...}` entries are OmegaConf
# interpolations: the experiment tag/name are derived from the sequence-mix name
# and `start_num`, `min_lr` mirrors the optimizer learning rate, and
# `${oc.env:DATA_DIR}` reads the DATA_DIR environment variable when resolved.
#
# The commented-out `sweep` template addresses parameters by their path below
# `config:` (presumably -- `data.dl_kwargs.batch_size` matches that layout;
# confirm against REPO). The optimizer entries therefore point at
# `trainer.optimizer.kwargs.*`, which is where the optimizer settings live.
config_yaml = """
run_config:
  exec_type: parallel
  hostname: fat3
  gpu_ids: [4]
  runs_per_gpu: 4
  use_cuda_visible_devices: True
  shuffle_configs: True

seeds: [0,13,7]

# sweep:
#   type: grid
#   axes: 
#   - parameter: trainer.optimizer.kwargs.lr
#     vals: [1e-3]
#   - parameter: data.dl_kwargs.batch_size
#     vals: [256]
#   - parameter: trainer.optimizer.kwargs.weight_decay
#     vals: [0.0, 1e-2, 1e-1]

start_num: 0
config:
  experiment_data:
    entity: fslgroup
    project_name: tsbench
    experiment_tag: 'sCF10-${config.model.kwargs.block.kwargs.sequence_mix.name}'
    experiment_type: lr_${start_num}
    experiment_name: ${config.experiment_data.experiment_tag}--${config.experiment_data.experiment_type}
    experiment_dir: null
    # output_dir: ${oc.env:OUTPUT_DIR}
    experiment_notes: 
    seed: 0
    gpu_id: 0
   
  model:
    name: sequence_transformer
    kwargs:
      num_layers: 3
      embedding_dim: 64
      dropout: 0.0
      bias: True
      block:
        name: prenorm_block
        kwargs:
          feedforward:
            name: ff
          sequence_mix:
            name: causalselfattention
            kwargs:
              num_heads: 4
              use_flash: True

      encoder:
        name: linear
      decoder:
        name: sequence
        kwargs:
          agg_mode: pool
          use_lengths: False

  loss:
    name: crossentropy_sequence
  
  trainer:
    n_steps: 80000 #20000 #20
    val_every: 50
    save_every: 25e3 
    early_stopping_patience: 50000 #250 #1000
    num_workers: 4
    gradient_clip_norm: 10.0 #1.0
    training_strategy:
      enable_mixed_precision: True
      precision: bfloat16
      use_torch_compile: False
    gradient_accumulation_steps: 1 #16
    optimizer:
      name: AdamW
      kwargs:
        lr: 1e-3
        weight_decay: 0.1
        betas: [0.9, 0.99]
    lr_scheduler_step: step
    lr_scheduler: # this scheduler has warumup and is then constant
      name: warmup_cosine_annealing
      kwargs:
        warmup_steps: 100
        decay_until_step: 100 #1500 #3000
        min_lr: ${config.trainer.optimizer.kwargs.lr} # same as initial lr

  data: 
    name: scifar10
    stateful_train_dataset: True
    kwargs:
      data_dir: ${oc.env:DATA_DIR}
      mode: grayscale
      tokenize: False
      augment: False
      split:
        train_val_split: 0.9

    dl_kwargs:
      batch_size: 256 #32
      shuffle: True
"""
# Parse the YAML text into an OmegaConf config object.
cfg = OmegaConf.create(config_yaml)
# Hand the config to the project helper. In the recorded run the returned value
# is the shell command that launches run.py for this experiment.
# `override=True` presumably replaces a previously created experiment of the
# same name -- TODO confirm against REPO.create_experiment.
run_command = REPO.create_experiment(cfg, override=True)
print(run_command)

CUDA_VISIBLE_DEVICES=4 python run.py --config-name sCF10-causalselfattention--lr_0.yaml


## Setup Transformer with LSTM

Train a Transformer-style model that uses an LSTM instead of attention as its sequence-mixing layer on sequential CIFAR-10 (sCIFAR-10).

In [3]:
# Experiment definition: sequence transformer whose sequence mixer is an LSTM
# (inside a pre-norm block), trained on sCIFAR-10.
#
# The YAML below is parsed with OmegaConf. `${...}` entries are OmegaConf
# interpolations: the experiment tag/name are derived from the sequence-mix name
# and `start_num`, `min_lr` mirrors the optimizer learning rate, and
# `${oc.env:DATA_DIR}` reads the DATA_DIR environment variable when resolved.
#
# The commented-out `sweep` template addresses parameters by their path below
# `config:` (presumably -- `data.dl_kwargs.batch_size` matches that layout;
# confirm against REPO). The optimizer entries therefore point at
# `trainer.optimizer.kwargs.*`, which is where the optimizer settings live.
#
# NOTE(review): the `kwargs:` key under `sequence_mix` is left empty, which YAML
# parses as null rather than an empty mapping -- confirm the model builder
# accepts that.
# Precision is float32 in this cell; the YAML's own TODO says float16 is not
# working for this model.
config_yaml = """
run_config:
  exec_type: parallel
  hostname: fat3
  gpu_ids: [4]
  runs_per_gpu: 4
  use_cuda_visible_devices: True
  shuffle_configs: True

seeds: [0,13,7]

# sweep:
#   type: grid
#   axes: 
#   - parameter: trainer.optimizer.kwargs.lr
#     vals: [1e-3]
#   - parameter: data.dl_kwargs.batch_size
#     vals: [256]
#   - parameter: trainer.optimizer.kwargs.weight_decay
#     vals: [0.0, 1e-2, 1e-1]

start_num: 0
config:
  experiment_data:
    entity: fslgroup
    project_name: tsbench
    experiment_tag: 'sCF10-${config.model.kwargs.block.kwargs.sequence_mix.name}'
    experiment_type: lr_${start_num}
    experiment_name: ${config.experiment_data.experiment_tag}--${config.experiment_data.experiment_type}
    experiment_dir: null
    # output_dir: ${oc.env:OUTPUT_DIR}
    experiment_notes: 
    seed: 0
    gpu_id: 0
   
  model:
    name: sequence_transformer
    kwargs:
      num_layers: 3
      embedding_dim: 64
      dropout: 0.0
      bias: True
      block:
        name: prenorm_block
        kwargs:
          feedforward:
            name: ff
          sequence_mix:
            name: lstm
            kwargs:

      encoder:
        name: linear
      decoder:
        name: sequence
        kwargs:
          agg_mode: pool
          use_lengths: False

  loss:
    name: crossentropy_sequence
            
  trainer:
    n_steps: 80000 #20000 #20
    val_every: 50
    save_every: 25e3 
    early_stopping_patience: 50000 #250 #1000
    num_workers: 4
    gradient_clip_norm: 10.0 #1.0
    training_strategy:
      enable_mixed_precision: True
      precision: float32 # TODO float16 not working
      use_torch_compile: False
    gradient_accumulation_steps: 1 #16
    optimizer:
      name: AdamW
      kwargs:
        lr: 1e-3
        weight_decay: 0.1
        betas: [0.9, 0.99]
    lr_scheduler_step: step
    lr_scheduler: # this scheduler has warumup and is then constant
      name: warmup_cosine_annealing
      kwargs:
        warmup_steps: 100
        decay_until_step: 100 #1500 #3000
        min_lr: ${config.trainer.optimizer.kwargs.lr} # same as initial lr

  data: 
    name: scifar10
    stateful_train_dataset: True
    kwargs:
      data_dir: ${oc.env:DATA_DIR}
      mode: grayscale
      tokenize: False
      augment: False
      split:
        train_val_split: 0.9

    dl_kwargs:
      batch_size: 256 #32
      shuffle: True
"""
# Parse the YAML text into an OmegaConf config object.
cfg = OmegaConf.create(config_yaml)
# Hand the config to the project helper. In the recorded run the returned value
# is the shell command that launches run.py for this experiment.
# `override=True` presumably replaces a previously created experiment of the
# same name -- TODO confirm against REPO.create_experiment.
run_command = REPO.create_experiment(cfg, override=True)
print(run_command)

CUDA_VISIBLE_DEVICES=4 python run.py --config-name sCF10-lstm--lr_0.yaml


## Setup Multilayer LSTM

Train a multilayer LSTM on sequential CIFAR-10 (sCIFAR-10).

In [4]:
# Experiment definition: multilayer LSTM model (`sequence_lstmmultilayer`),
# trained on sCIFAR-10.
#
# The YAML below is parsed with OmegaConf. `${...}` entries are OmegaConf
# interpolations: the experiment tag/name are derived from the model name and
# `start_num`, `min_lr` mirrors the optimizer learning rate, and
# `${oc.env:DATA_DIR}` reads the DATA_DIR environment variable when resolved.
#
# The commented-out `sweep` template addresses parameters by their path below
# `config:` (presumably -- `data.dl_kwargs.batch_size` matches that layout;
# confirm against REPO). The optimizer entries therefore point at
# `trainer.optimizer.kwargs.*`, which is where the optimizer settings live.
#
# Precision is float32 in this cell; the YAML's own TODO says float16 is not
# working for this model.
config_yaml = """
run_config:
  exec_type: parallel
  hostname: fat3
  gpu_ids: [4]
  runs_per_gpu: 4
  use_cuda_visible_devices: True
  shuffle_configs: True

seeds: [0,13,7]

# sweep:
#   type: grid
#   axes: 
#   - parameter: trainer.optimizer.kwargs.lr
#     vals: [1e-3]
#   - parameter: data.dl_kwargs.batch_size
#     vals: [256]
#   - parameter: trainer.optimizer.kwargs.weight_decay
#     vals: [0.0, 1e-2, 1e-1]

start_num: 0
config:
  experiment_data:
    entity: fslgroup
    project_name: tsbench
    experiment_tag: 'sCF10-${config.model.name}'
    experiment_type: lr_${start_num}
    experiment_name: ${config.experiment_data.experiment_tag}--${config.experiment_data.experiment_type}
    experiment_dir: null
    # output_dir: ${oc.env:OUTPUT_DIR}
    experiment_notes: 
    seed: 0
    gpu_id: 0
   
  model:
    name: sequence_lstmmultilayer
    kwargs:
      num_layers: 3
      embedding_dim: 64
      dropout: 0.0
      bias: True

      encoder:
        name: linear
      decoder:
        name: sequence
        kwargs:
          agg_mode: pool
          use_lengths: False
  
  loss:
    name: crossentropy_sequence          

  trainer:
    n_steps: 80000 #20000 #20
    val_every: 50
    save_every: 25e3 
    early_stopping_patience: 50000 #250 #1000
    num_workers: 4
    gradient_clip_norm: 10.0 #1.0
    training_strategy:
      enable_mixed_precision: True
      precision: float32 # TODO float16 not working
      use_torch_compile: False
    gradient_accumulation_steps: 1 #16
    optimizer:
      name: AdamW
      kwargs:
        lr: 1e-3
        weight_decay: 0.1
        betas: [0.9, 0.99]
    lr_scheduler_step: step
    lr_scheduler: # this scheduler has warumup and is then constant
      name: warmup_cosine_annealing
      kwargs:
        warmup_steps: 100
        decay_until_step: 100 #1500 #3000
        min_lr: ${config.trainer.optimizer.kwargs.lr} # same as initial lr

  data: 
    name: scifar10
    stateful_train_dataset: True
    kwargs:
      data_dir: ${oc.env:DATA_DIR}
      mode: grayscale
      tokenize: False
      augment: False
      split:
        train_val_split: 0.9

    dl_kwargs:
      batch_size: 256 #32
      shuffle: True
"""
# Parse the YAML text into an OmegaConf config object.
cfg = OmegaConf.create(config_yaml)
# Hand the config to the project helper. In the recorded run the returned value
# is the shell command that launches run.py for this experiment.
# `override=True` presumably replaces a previously created experiment of the
# same name -- TODO confirm against REPO.create_experiment.
run_command = REPO.create_experiment(cfg, override=True)
print(run_command)

CUDA_VISIBLE_DEVICES=4 python run.py --config-name sCF10-sequence_lstmmultilayer--lr_0.yaml
