In [1]:
%matplotlib notebook
%load_ext autoreload
%pwd

'/ocean/projects/asc170022p/mtragoza/lung-project/notebooks'

In [2]:
import sys
import pandas as pd
sys.path.append('../../param_search')
import param_search as ps

## Setup experiment

[[Setup](#Setup-experiment)] [[Submit](#Submit-jobs)] [[Monitor](#Monitor-jobs)] [[Analyze](#Analyze-results)]

In [3]:
# define a job template and name format
template = '''\
#!/bin/bash
#SBATCH --job-name={job_name}
#SBATCH --account=bio170034p
#SBATCH --partition=BatComputer
#SBATCH --gres=gpu:rtx6000:1
#SBATCH -x v034
#SBATCH --time=48:00:00
#SBATCH -o %J.stdout
#SBATCH -e %J.stderr
#SBATCH --mail-type=all

# {set_name}

hostname
pwd
module load anaconda3
conda activate /ocean/projects/asc170022p/mtragoza/mambaforge/envs/lung-project
nvidia-smi

python ../../../train.py \\
    --data_root ../../../data/Emory-4DCT \\
    --mesh_radius {mesh_radius} \\
    --interp_radius {interp_radius} \\
    --interp_sigma {interp_sigma} \\
    --output_func {output_func} \\
    --batch_size {batch_size} \\
    --save_prefix {job_name} 

echo Done
'''
name_format = 'train__{set_name}__{mesh_radius}__{interp_radius}__{interp_sigma}__{output_func}__{batch_size}'

In [4]:
param_space = ps.ParamSpace(
    set_name='mesh_radius',
    mesh_radius=[20, 10, 5],
    interp_radius=[20],
    interp_sigma=[10],
    output_func=['exp'],
    batch_size=[4],
) + \
ps.ParamSpace(
    set_name='interp_radius',
    mesh_radius=[20],
    interp_radius=[20, 10, 5],
    interp_sigma=[10],
    output_func=['exp'],
    batch_size=[4],
) + \
ps.ParamSpace(
    set_name='interp_sigma',
    mesh_radius=[20],
    interp_radius=[20],
    interp_sigma=[20, 10, 5],
    output_func=['exp'],
    batch_size=[4], 
) + \
ps.ParamSpace(
    set_name='output_func',
    mesh_radius=[20],
    interp_radius=[20],
    interp_sigma=[10],
    output_func=['exp', 'splus', 'relu', 'id'],
    batch_size=[4],    
) + \
ps.ParamSpace(
    set_name='batch_size',
    mesh_radius=[20],
    interp_radius=[20],
    interp_sigma=[10],
    output_func=['exp'],
    batch_size=[2, 4, 8, 16],
)

for p in param_space:
    print(name_format.format(**p))

print(len(param_space))

train__mesh_radius__20__20__10__exp__4
train__mesh_radius__10__20__10__exp__4
train__mesh_radius__5__20__10__exp__4
train__interp_radius__20__20__10__exp__4
train__interp_radius__20__10__10__exp__4
train__interp_radius__20__5__10__exp__4
train__interp_sigma__20__20__20__exp__4
train__interp_sigma__20__20__10__exp__4
train__interp_sigma__20__20__5__exp__4
train__output_func__20__20__10__exp__4
train__output_func__20__20__10__splus__4
train__output_func__20__20__10__relu__4
train__output_func__20__20__10__id__4
train__batch_size__20__20__10__exp__2
train__batch_size__20__20__10__exp__4
train__batch_size__20__20__10__exp__8
train__batch_size__20__20__10__exp__16
17


## Submit jobs

[[Setup](#Setup-experiment)] [[Submit](#Submit-jobs)] [[Monitor](#Monitor-jobs)] [[Analyze](#Analyze-results)]

In [5]:
expt_name = ['2024-09-28__initial'][-1]

In [6]:
do_submit = False
if do_submit:
    jobs = ps.submit(template, name_format, param_space, work_dir=expt_name)
    jobs.to_csv(f'{expt_name}.jobs')

## Monitor jobs

[[Setup](#Setup-experiment)] [[Submit](#Submit-jobs)] [[Monitor](#Monitor-jobs)] [[Analyze](#Analyze-results)]

In [7]:
jobs = pd.read_csv(f'{expt_name}.jobs', index_col=0)
status = ps.status(jobs, parse_stderr=True)
status

Unnamed: 0_level_0,index,set_name,mesh_radius,interp_radius,interp_sigma,output_func,batch_size,job_name,partition,job_state,node_id,runtime,work_dir,array_idx,stdout,stderr
job_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
26046335,0,mesh_radius,20,20,10,exp,4,train__mesh_radius__20__20__10__exp__4,BatComputer,,,,/ocean/projects/asc170022p/mtragoza/lung-proje...,,dv004.ib.bridges2.psc.edu\n/ocean/projects/asc...,
26046336,1,mesh_radius,10,20,10,exp,4,train__mesh_radius__10__20__10__exp__4,BatComputer,,,,/ocean/projects/asc170022p/mtragoza/lung-proje...,,dv004.ib.bridges2.psc.edu\n/ocean/projects/asc...,torch.OutOfMemoryError: CUDA out of memory. Tr...
26046337,2,mesh_radius,5,20,10,exp,4,train__mesh_radius__5__20__10__exp__4,BatComputer,,,,/ocean/projects/asc170022p/mtragoza/lung-proje...,,dv004.ib.bridges2.psc.edu\n/ocean/projects/asc...,torch.OutOfMemoryError: CUDA out of memory. Tr...
26046338,3,interp_radius,20,20,10,exp,4,train__interp_radius__20__20__10__exp__4,BatComputer,,,,/ocean/projects/asc170022p/mtragoza/lung-proje...,,dv004.ib.bridges2.psc.edu\n/ocean/projects/asc...,
26046339,4,interp_radius,20,10,10,exp,4,train__interp_radius__20__10__10__exp__4,BatComputer,,,,/ocean/projects/asc170022p/mtragoza/lung-proje...,,dv004.ib.bridges2.psc.edu\n/ocean/projects/asc...,
26046340,5,interp_radius,20,5,10,exp,4,train__interp_radius__20__5__10__exp__4,BatComputer,,,,/ocean/projects/asc170022p/mtragoza/lung-proje...,,dv004.ib.bridges2.psc.edu\n/ocean/projects/asc...,
26046341,6,interp_sigma,20,20,20,exp,4,train__interp_sigma__20__20__20__exp__4,BatComputer,,,,/ocean/projects/asc170022p/mtragoza/lung-proje...,,dv004.ib.bridges2.psc.edu\n/ocean/projects/asc...,
26046342,7,interp_sigma,20,20,10,exp,4,train__interp_sigma__20__20__10__exp__4,BatComputer,,,,/ocean/projects/asc170022p/mtragoza/lung-proje...,,dv004.ib.bridges2.psc.edu\n/ocean/projects/asc...,
26046343,8,interp_sigma,20,20,5,exp,4,train__interp_sigma__20__20__5__exp__4,BatComputer,,,,/ocean/projects/asc170022p/mtragoza/lung-proje...,,dv004.ib.bridges2.psc.edu\n/ocean/projects/asc...,
26046344,9,output_func,20,20,10,exp,4,train__output_func__20__20__10__exp__4,BatComputer,,,,/ocean/projects/asc170022p/mtragoza/lung-proje...,,dv004.ib.bridges2.psc.edu\n/ocean/projects/asc...,


In [9]:
status['job_state'] = status['job_state'].fillna('DONE')
status['stderr'] = status['stderr'].fillna('N/A')
status.groupby(['job_state', 'set_name', 'mesh_radius', 'batch_size', 'stderr'])[['job_name']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,job_name
job_state,set_name,mesh_radius,batch_size,stderr,Unnamed: 5_level_1
DONE,batch_size,20,2,,1
DONE,batch_size,20,4,,1
DONE,batch_size,20,8,,1
DONE,batch_size,20,16,"torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 MiB. GPU 0 has a total capacity of 47.54 GiB of which 1.12 MiB is free. Including non-PyTorch memory, this process has 47.53 GiB memory in use. Of the allocated memory 47.18 GiB is allocated by PyTorch, and 42.12 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)",1
DONE,interp_radius,20,4,,3
DONE,interp_sigma,20,4,,3
DONE,mesh_radius,5,4,"torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 47.54 GiB of which 5.12 MiB is free. Including non-PyTorch memory, this process has 47.52 GiB memory in use. Of the allocated memory 47.10 GiB is allocated by PyTorch, and 120.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)",1
DONE,mesh_radius,10,4,"torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 47.54 GiB of which 13.12 MiB is free. Including non-PyTorch memory, this process has 47.51 GiB memory in use. Of the allocated memory 47.11 GiB is allocated by PyTorch, and 96.58 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)",1
DONE,mesh_radius,20,4,,1
DONE,output_func,20,4,,4


In [10]:
print(status.iloc[0].stderr)




## Analyze results

[[Setup](#Setup-experiment)] [[Submit](#Submit-jobs)] [[Monitor](#Monitor-jobs)] [[Analyze](#Analyze-results)]

In [13]:
m = ps.metrics(jobs, sep=',')
m

train__mesh_radius__10__20__10__exp__4 No objects to concatenate
train__mesh_radius__5__20__10__exp__4 No objects to concatenate
train__batch_size__20__20__10__exp__16 No objects to concatenate


Unnamed: 0,set_name,mesh_radius,interp_radius,interp_sigma,output_func,batch_size,job_name,job_id,partition,job_state,...,phase,rep,loss,u_pred_norm,u_true_norm,mu_pred_norm,mu_anat_corr,mu_950_corr,mu_900_corr,mu_850_corr
0,mesh_radius,20,20,10,exp,4,train__mesh_radius__20__20__10__exp__4,26046335,BatComputer,PENDING,...,train,dofs,0.455107,0.499079,0.494619,258.193688,0.134226,,,
1,mesh_radius,20,20,10,exp,4,train__mesh_radius__20__20__10__exp__4,26046335,BatComputer,PENDING,...,train,dofs,0.285941,0.226873,0.241047,227.318826,0.176998,,,
2,mesh_radius,20,20,10,exp,4,train__mesh_radius__20__20__10__exp__4,26046335,BatComputer,PENDING,...,train,dofs,1.029229,0.284123,0.285008,221.187263,0.218370,,,
3,mesh_radius,20,20,10,exp,4,train__mesh_radius__20__20__10__exp__4,26046335,BatComputer,PENDING,...,train,dofs,0.119984,0.749092,0.803618,223.452481,-0.030528,,,
4,mesh_radius,20,20,10,exp,4,train__mesh_radius__20__20__10__exp__4,26046335,BatComputer,PENDING,...,train,dofs,0.111630,1.068400,1.130876,218.714569,0.133545,,,0.075494
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128795,batch_size,20,20,10,exp,8,train__batch_size__20__20__10__exp__8,26046350,BatComputer,PENDING,...,train,dofs,0.299818,0.508015,0.555034,162.175417,0.322562,,,
128796,batch_size,20,20,10,exp,8,train__batch_size__20__20__10__exp__8,26046350,BatComputer,PENDING,...,train,dofs,0.413335,0.522164,0.564462,140.331727,0.397820,,,
128797,batch_size,20,20,10,exp,8,train__batch_size__20__20__10__exp__8,26046350,BatComputer,PENDING,...,train,dofs,1.003933,0.328874,0.338564,135.717830,0.315501,,,
128798,batch_size,20,20,10,exp,8,train__batch_size__20__20__10__exp__8,26046350,BatComputer,PENDING,...,test,dofs,0.170711,1.168170,1.227881,119.248306,0.161223,,,


In [15]:
d = m.groupby(['job_name'])[['epoch']].max()
d

Unnamed: 0_level_0,epoch
job_name,Unnamed: 1_level_1
train__batch_size__20__20__10__exp__2,100
train__batch_size__20__20__10__exp__4,100
train__batch_size__20__20__10__exp__8,100
train__interp_radius__20__10__10__exp__4,100
train__interp_radius__20__20__10__exp__4,100
train__interp_radius__20__5__10__exp__4,100
train__interp_sigma__20__20__10__exp__4,100
train__interp_sigma__20__20__20__exp__4,100
train__interp_sigma__20__20__5__exp__4,100
train__mesh_radius__20__20__10__exp__4,100


In [16]:
unfinished_jobs = d[d.epoch < 100]
unfinished_jobs

Unnamed: 0_level_0,epoch
job_name,Unnamed: 1_level_1


In [None]:
%autoreload
fig = ps.plot(
    m[(m.phase == 'train') & (m.epoch > 99)],
    x=['image_size', 'n_nodes', 'batch_size'],
    y=['u_loss', 'mu_loss'],
    legend=True,
    legend_kws=dict(bbox_to_anchor=(0, -0.2)),
    tight=True
)

In [None]:
%autoreload
fig = ps.plot(
    m[(m.phase == 'train') & (m.epoch > 99) & (m.batch_size == 64)],
    x=['image_size', 'n_nodes'],
    y=['u_loss', 'mu_loss'],
    legend=True,
    legend_kws=dict(bbox_to_anchor=(0, -0.2)),
    tight=True
)

In [None]:
%autoreload
m['nodes_per_pixel'] = m['n_nodes'] / m['image_size']
fig = ps.plot(
    m[(m.phase == 'train') & (m.epoch > 90)],
    x=['nodes_per_pixel', 'batch_size'],
    y=['u_loss', 'mu_loss'],
    legend=True,
    legend_kws=dict(bbox_to_anchor=(0, -0.2)),
    tight=True
)

In [None]:
m[(m.phase == 'train') & (m.epoch > 90)].groupby(['n_nodes', 'image_size', 'nodes_per_pixel', 'batch_size'])[['u_loss', 'mu_loss']].mean()

In [None]:
m.columns

In [None]:
%autoreload
fig = ps.plot(
    m[(m.phase == 'train')],
    x=['image_size', 'n_nodes', 'batch_size'],
    y=['t_model', 't_loss', 't_grad', 't_optim'],
    legend=True,
    legend_kws=dict(bbox_to_anchor=(0, -0.2)),
    tight=True
)