In [1]:
# Load IPython's autoreload extension (it is configured in the next cell).
%load_ext autoreload

In [2]:
# Autoreload mode 2: re-import modules before each cell runs, so edits to the
# project source are picked up without restarting the kernel.
%autoreload 2

import copy

import hydra
import numpy as np
import torch
from omegaconf import OmegaConf

from JacobianODE.jacobians.jacobian_utils import initialize_config, make_trajectories, postprocess_data, normalize_data, create_dataloaders, setup_wandb, make_model, log_training_info, train_model

In [3]:
# FILL THIS IN FOR YOUR OWN USE: replace this path with a directory on your own system.
# The value is assigned to cfg.training.logger.save_dir further down in this notebook.
save_dir = "/orcd/data/ekmiller/001/eisenaj/JacobianODE/lightning"

## Setup

In [4]:
# Load the config: compose the Hydra config named "config" from the project's
# conf directory (given here as a relative config_path).
with hydra.initialize(version_base="1.3", config_path="../JacobianODE/jacobians/conf/"):
    cfg = hydra.compose(config_name="config")

# Print the composed config as YAML so it can be inspected below the cell
print(OmegaConf.to_yaml(cfg))

logger: wandb
training:
  batch_size: 16
  run_number: 0
  lightning:
    _target_: null
    direct: true
    loss_func: mse
    alpha_hal: 0.1
    l2_penalty: 0
    l1_penalty: 0.0
    obs_noise_scale: 0
    final_obs_noise_scale: 0
    y0_noise_scale: 0
    noise_annealing: false
    log_interval: 100
    alpha_teacher_forcing: 1
    teacher_forcing_annealing: true
    gamma_teacher_forcing: 0.999
    teacher_forcing_update_interval: 5
    teacher_forcing_steps: 1
    min_alpha_teacher_forcing: 0
    alpha_validation: 0
    obs_noise_scale_validation: 0
    loss_func_validation: mse
    traj_init_steps_validation: 15
    inner_N_validation: 20
    data_type: null
    jacobianODEint_kwargs:
      traj_init_steps: 15
      inner_path: line
      inner_N: 20
      interp_pts: 4
    gradient_clip_val: 1.0
    gradient_clip_algorithm: norm
    optimizer: AdamW
    optimizer_kwargs:
      lr: 0.0001
      weight_decay: 0.0001
    use_scheduler: true
    min_lr: 1.0e-06
    k_scale: 1
    j

In [5]:
# Per-run overrides of the composed config: set the postprocessing obs_noise
# and the Lightning loop_closure_weight for this notebook run.
cfg.data.postprocessing.obs_noise = 0.01
cfg.training.lightning.loop_closure_weight = 0.001

In [6]:
# Point the training logger at the user-chosen save_dir defined near the top of the notebook.
cfg.training.logger.save_dir = save_dir

In [7]:
# initial setup
# Select PyTorch's 'high' float32 matmul precision setting (see the PyTorch docs
# for the exact speed/precision trade-off this enables on the current hardware).
torch.set_float32_matmul_precision('high')
# Report how many CUDA devices PyTorch can see.
num_gpus = torch.cuda.device_count()
print(f"Number of available GPUs: {num_gpus}")

Number of available GPUs: 1


In [8]:
# initialize the config
# NOTE(review): this rebinds `cfg` to the result, so re-running the cell feeds the
# already-initialized config back in -- confirm initialize_config is safe to apply twice.
cfg = initialize_config(cfg)

## Generate data

In [9]:
# Seed NumPy and PyTorch from the configured random state before generating trajectories.
np.random.seed(cfg.data.flow.random_state)
torch.random.manual_seed(cfg.data.flow.random_state)
eq, sol, dt = make_trajectories(cfg)

values_raw = sol['values']

# Select which solution to use for noise: for the 'wmtask' data type, when the
# configured model is not the 'final' one, trajectories are additionally generated
# with model_to_load='final' and their values are kept as the noise reference.
# In every other case no separate noise reference is used (None).
if cfg.data.data_type == 'wmtask' and cfg.data.trajectory_params.model_to_load != 'final':
    # Deep-copy the config before overriding model_to_load. DictConfig.copy() is a
    # shallow copy whose nested nodes are shared with `cfg`, so the override below
    # would otherwise silently change `cfg` for every later cell.
    temp_cfg = copy.deepcopy(cfg)
    temp_cfg.data.trajectory_params.model_to_load = 'final'
    _, sol_noise, _ = make_trajectories(temp_cfg)
    raw_values_noise = sol_noise['values']
else:
    raw_values_noise = None

## Postprocess the data and create the dataloaders

In [10]:
# Postprocess the raw trajectory values; the optional noise-reference values are passed along.
values = postprocess_data(cfg, values_raw, raw_values_to_use_for_noise=raw_values_noise)

# Start from identity statistics and replace them only when normalization is enabled.
mu, sigma = 0, 1
if cfg.data.postprocessing.normalize:
    values, mu, sigma = normalize_data(values)

# Build the dataloaders (and the accompanying `trajs` object) from the processed values.
train_dataloader, val_dataloader, test_dataloader, trajs = create_dataloaders(cfg, values)

## Set up Weights and Biases

In [11]:
# Set up Weights & Biases; the returned run name and project are reused below
# when the model is built and when training is launched.
name, project = setup_wandb(cfg, trajs, raw_values_to_use_for_noise=raw_values_noise)

## Make model

In [12]:
# Make model
# When the configured model target is a NeuralODE variant, store the dt returned by
# make_trajectories in the model params, converted to a plain Python float.
if 'NeuralODE' in cfg.model.params._target_:
    cfg.model.params.dt = float(dt)

In [13]:
# Base point handed to make_model: the mean of the training sequences over their
# first two dimensions when use_base_deriv_pt is enabled, otherwise None.
x0 = (
    trajs['train_trajs'].sequence.mean(dim=(0, 1))
    if cfg.training.lightning.use_base_deriv_pt
    else None
)

In [14]:
# Seed PyTorch with the data seed offset by the run number before the model is built.
torch.random.manual_seed(cfg.data.flow.random_state + cfg.training.run_number)

# Arguments common to both ways of building the model.
model_kwargs = dict(project=project, mu=mu, sigma=sigma, verbose=True)
if cfg.data.train_test_params.delay_embedding_params.n_delays > 1:
    # With more than one delay, eq is passed as None and x0 is left at make_model's default.
    model_kwargs.update(eq=None)
else:
    model_kwargs.update(eq=eq, x0=x0)

lit_model = make_model(cfg, dt, **model_kwargs)

In [15]:
# Log training information: summarizes the training set and the model's parameter
# count (printed below the cell); log=None is passed here.
log_training_info(train_dataloader, trajs, lit_model, log=None)

Number of training trajectory examples: 30.550k
Number of training trajectory points: 31.200k
Number of training data points: 93.600k
Total number of model parameters: 6578.185k


## Train model

In [16]:
# Launch training with the Lightning model and the train/validation dataloaders,
# using the W&B run name and project returned by setup_wandb.
train_model(cfg, lit_model, train_dataloader, val_dataloader, name, project)

[34m[1mwandb[0m: Currently logged in as: [33madamjeisen[0m ([33mchaotic-consciousness[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


/orcd/data/ekmiller/001/eisenaj/miniforge3/envs/jacobianode/lib/python3.11/site-packages/lightning/fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /orcd/data/ekmiller/001/eisenaj/miniforge3/envs/jaco ...
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/1
----------------------------------------------------------------------------------------------------
distributed_backend=nccl
All distributed processes registered. Starting with 1 processes
----------------------------------------------------------------------------------------------------

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/orcd/data/ekmiller/001/eisenaj/miniforge3/envs/jacobianode/lib/python3.11/site-packages/lightning/pytorch/core/optimiz

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

0,1
alpha teacher forcing,█▇▇▆▅▅▄▄▄▃▃▃▂▂▂▂▂▁▁▁▁▁
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loop_closure train mse,█▂█▁▂▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁
loop_closure train_loss,█▂█▁▂▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁
mean val loss,█▄▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
percent_improvement,█▄▅▄▃▃▂▂▁▃▂▁▂▁▂▁▁▁▁▁▁
total train loss,█▃▆▂▂▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▂
train jac loss,█▆▄▄▃▄▃▂▂▂▂▂▂▁▂▂▁▁▁▁▁▁
train jac norm,▂▁▃▄▃▄▆▆▇▆▆▅▇▆▇▆▆▆▇▆▇█
train jac r2_score,▁▃▄▅▅▅▆▇▇▇▇▆▇█▇▇██████

0,1
alpha teacher forcing,0.15346
epoch,21.0
loop_closure train mse,0.00056
loop_closure train_loss,0.56256
mean val loss,0.08627
percent_improvement,0.0
total train loss,0.04765
train jac loss,1.6541
train jac norm,20.66783
train jac r2_score,0.96858
