In [12]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Run VAE models systematically

## Imports

In [None]:
import pandas as pd
import numpy as np


## Create table of all VAE model training settings

Parameters for:
- Biological dataset generation
- Training data
    - Input
    - Output 
- Model architecture
- Training hyperparameters

### Initial parameters

In [None]:
# Model-architecture settings; each key becomes one column of the
# hyperparameter table built further down.
hpos_architecture = {
    "seed_arch": 1,
    "hidden_size": 32,
    # Layer lists for the encoder and the decoder.
    "enc_layers": [64, 64, 64],
    "dec_layers": [64, 64, 64],
    "model": "CVAE",
}

# Training hyperparameters; each key becomes one column of the
# hyperparameter table built further down.
hpos_training = {
    "batch_size": 128,
    "epochs": 1500,
    # Learning rate and its schedule.
    "learning_rate": 0.01,
    "learning_rate_sched": "cosine_decay",
    # Feature toggles and their companion values.
    "use_dropout": False,
    "use_l2_reg": False,
    "use_warmup": True,
    "warmup_epochs": 20,
    "l2_reg_alpha": 0.01,
    "print_every": 15,
    "loss_func": "mse",
}

# Optimisation settings; each key becomes one column of the
# hyperparameter table built further down.
hpos_optimization = {
    "seed_opt": 1,
    "opt_method": "adam",
    # Metric that is tracked and the direction in which it should move.
    "opt_metric": "mean_absolute_error",
    "opt_mode": "min",
    "opt_patience": 100,
    "opt_factor": 0.5,
    "opt_min_lr": 1e-6,
    "opt_min_delta": 1e-4,
}

# Dataset settings; each key becomes one column of the hyperparameter
# table built further down.
hpos_dataset = {
    "seed_dataset": 1,
    "include_diffs": False,
    "objective_col": "adaptability",
    "output_species": ["RNA_2"],
    # 'total_ds' is not set here; its placeholder lives in info_to_be_recorded.
    "total_ds_max": 3e6,
    "train_split": 0.8,
    "x_type": "energies",
    # XY transforms: NaN removal
    "rem_x_nans": True,
    "rem_y_nans": True,
    "rem_sensitivity_nans": True,
    "rem_precision_nans": True,
    # XY transforms: scaling and encoding
    "scale_x_minmax": True,
    "scale_y_minmax": True,
    "use_x_logscale": False,
    "use_y_logscale": False,
    "use_y_categorical": False,
    "use_x_neg": True,
}

# Settings for the biological dataset: source files plus simulation parameters.
hpos_biological = {
    # Training data: ensemble config and tabulated mutation info from the same run folder.
    "filenames_train_config": [
        "EvoScaper/data/raw/summarise_simulation/2024_12_05_210221/ensemble_config.json",
    ],
    "filenames_train_table": [
        "EvoScaper/data/raw/summarise_simulation/2024_12_05_210221/tabulated_mutation_info.csv",
    ],
    # Verification data: same two file kinds, taken from a different run folder.
    "filenames_verify_config": [
        "EvoScaper/data/raw/summarise_simulation/2024_11_21_160955/ensemble_config.json",
    ],
    "filenames_verify_table": [
        "EvoScaper/data/raw/summarise_simulation/2024_11_21_160955/tabulated_mutation_info.csv",
    ],
    "n_species": 3,
    "sequence_length": 20,
    "signal_function": "step_function",
    "signal_target": 2,
    "starting_copynumbers_input": [200],
    "starting_copynumbers_output": [200],
    "starting_copynumbers_other": [200],
    "association_binding_rate": 1_000_000,
    "include_prod_deg": False,
}

# Fields that have no value yet; every key starts out with the same
# 'TO_BE_RECORDED' placeholder string.
info_to_be_recorded = dict.fromkeys(
    [
        "filename_saved_model",
        "total_ds",
        "n_batches",
        "R2_train",
        "R2_test",
        "conditionality_fidelity",
    ],
    "TO_BE_RECORDED",
)



In [None]:
# Build the settings table: each dict becomes a one-row frame (keys as
# columns) and the frames are joined side by side.
# NOTE(review): hpos_biological and info_to_be_recorded are not included
# here — confirm that leaving them out of the table is intended.
df_hpos = pd.concat(
    [
        pd.DataFrame.from_dict(settings, orient="index").transpose()
        for settings in (
            hpos_architecture,
            hpos_training,
            hpos_optimization,
            hpos_dataset,
        )
    ],
    axis=1,
)
# A key that appears in more than one dict would produce a duplicated column.
assert not df_hpos.columns.duplicated().any(), "Change some column names, there are duplicates"
df_hpos

Unnamed: 0,seed_arch,hidden_size,enc_layers,dec_layers,model,batch_size,epochs,learning_rate,learning_rate_sched,use_dropout,...,total_ds,total_ds_max,train_split,x_type,scale_x_minmax,scale_y_minmax,use_x_logscale,use_y_logscale,use_y_categorical,use_x_neg
0,1,32,"[64, 64, 64]","[64, 64, 64]",CVAE,128,1500,0.01,cosine_decay,False,...,,3000000.0,0.8,energies,True,True,False,False,False,True


### All parameters

In [None]:
# Values to try for selected hyperparameters; every key here also appears
# in one of the initial-parameter dicts.
hpos_to_vary = {
    "hidden_size": [32, 64, 128, 256, 512],
    "objective_col": ["adaptability", "sensitivity_wrt_species-6"],
    "total_ds_max": [1e4, 5e4, 1e5, 5e5, 1e6, 5e6],
    "x_type": ["energies", "binding_rates_dissociation"],
}

# Use table to create dataset for training