In [1]:
# Interactive matplotlib backend for the figures produced by the plotting cells.
%matplotlib notebook
# Load the autoreload extension; the plotting cells invoke %autoreload before calling ps.plot.
%load_ext autoreload
# Show the working directory: the relative paths used in this notebook resolve against it.
%pwd

'/ocean/projects/asc170022p/mtragoza/lung-project/notebooks'

In [3]:
import os
import sys

import pandas as pd

sys.path.append('../../param_search')
import param_search as ps

## Setup experiment

[[Setup](#Setup-experiment)] [[Submit](#Submit-jobs)] [[Monitor](#Monitor-jobs)] [[Analyze](#Analyze-results)]

In [4]:
# Job script template and job-name format, both written with str.format fields.
# Fields: {job_name} plus the parameter names {anat_range}, {dummy_targets},
# {conv_channels} and {output_func}.
# - The backslash right after the opening quotes suppresses the leading newline,
#   so the shebang is the first line of the rendered script.
# - Each '\\' is a single backslash in the rendered script, i.e. a shell line
#   continuation for the python command.
# - '{dummy_targets:d}' formats the boolean as an integer (False -> 0, True -> 1).
# NOTE(review): {job_name} is presumably produced from name_format at submission
# time, and the '../../../' paths presumably resolve from a per-job working
# directory -- confirm both against param_search.
template = '''\
#!/bin/bash
#SBATCH --job-name={job_name}
#SBATCH --account=bio170034p
#SBATCH --partition=BatComputer
#SBATCH --gres=gpu:rtx6000:1
#SBATCH -x v034
#SBATCH --time=48:00:00
#SBATCH -o %J.stdout
#SBATCH -e %J.stderr
#SBATCH --mail-type=all

hostname
pwd
module load anaconda3
conda activate /ocean/projects/asc170022p/mtragoza/mambaforge/envs/lung-project
nvidia-smi

python ../../../train.py \\
    --data_name phantom \\
    --data_root ../../../data/phantom_lung3_{anat_range}_{dummy_targets:d} \\
    --rho_value 0 \\
    --conv_channels {conv_channels} \\
    --output_func {output_func} \\
    --save_prefix {job_name} 

echo Done
'''
# Job names are built from the same parameter fields, joined by double underscores.
name_format = 'train__phantom__{anat_range}__{dummy_targets:d}__{conv_channels}__{output_func}'

In [23]:
# Parameter space of the earlier sweep, kept under its own name for reference.
# It used to be assigned to `param_space` and was immediately overwritten by the
# assignment below, which made it unclear which space was actually active.
sweep_param_space = ps.ParamSpace(
    anat_range=[250],
    dummy_targets=[False, True],
    conv_channels=[8, 16, 32],
    output_func=['relu', 'exp'],
)

# Active parameter space: the one read by the submission and monitoring cells.
param_space = ps.ParamSpace(
    anat_range=[250, 125, 0],
    dummy_targets=[False, True],
    conv_channels=[32],
    output_func=['relu'],
)

# Preview the job name of every parameter combination, then the number of combinations.
for params in param_space:
    print(name_format.format(**params))

print(len(param_space))

train__phantom__250__0__32__relu
train__phantom__250__1__32__relu
train__phantom__125__0__32__relu
train__phantom__125__1__32__relu
train__phantom__0__0__32__relu
train__phantom__0__1__32__relu
6


## Submit jobs

[[Setup](#Setup-experiment)] [[Submit](#Submit-jobs)] [[Monitor](#Monitor-jobs)] [[Analyze](#Analyze-results)]

In [24]:
# Every experiment launched from this notebook, in chronological order.
# An experiment name serves both as the submission work_dir and as the stem of
# the experiment's '.jobs' file.
expt_names = [
    '2024-11-22__phantom__250',
    '2024-11-22__phantom__250__resub',
    '2024-11-30__phantom__relu',
]
# The most recent experiment is the one the submission cell acts on.
expt_name = next(reversed(expt_names))
expt_name

'2024-11-30__phantom__relu'

In [25]:
# Submit one job per parameter combination and save the resulting job table.
# Guard: if the job table of this experiment already exists, the experiment was
# submitted before. Re-running this cell (e.g. via Restart & Run All) would
# otherwise submit every job again and overwrite the recorded job table.
# To submit again, pick a new experiment name instead.
do_submit = True
jobs_file = f'{expt_name}.jobs'
if do_submit and os.path.exists(jobs_file):
    print(f'{jobs_file} already exists; skipping submission to avoid duplicate jobs')
elif do_submit:
    jobs = ps.submit(template, name_format, param_space, work_dir=expt_name)
    jobs.to_csv(jobs_file)

100%|██████████| 6/6 [00:00<00:00, 83.04it/s]


  .replace('', float('nan')).map(pd.to_numeric)


## Monitor jobs

[[Setup](#Setup-experiment)] [[Submit](#Submit-jobs)] [[Monitor](#Monitor-jobs)] [[Analyze](#Analyze-results)]

In [11]:
# Load the job table saved for every experiment and tag each row with its experiment.
# The comprehension keeps its own loop name, so the notebook-level `expt_name`
# chosen in the submission section is no longer overwritten by this cell, and
# `jobs` is only ever bound to a DataFrame.
job_tables = [
    pd.read_csv(f'{name}.jobs', index_col=0).assign(expt_name=name)
    for name in expt_names
]
jobs = pd.concat(job_tables)
# NOTE(review): ps.status presumably looks up the current queue state of each job
# and, with parse_stderr=True, extracts the error from its stderr file -- confirm
# against param_search.
status = ps.status(jobs, parse_stderr=True)
status

Unnamed: 0_level_0,index,anat_range,dummy_targets,conv_channels,output_func,job_name,partition,job_state,node_id,runtime,work_dir,array_idx,expt_name,stdout,stderr
job_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
26914456,0,250,False,8,relu,train__phantom__250__0__8__relu,BatComputer,,,,/ocean/projects/asc170022p/mtragoza/lung-proje...,,2024-11-22__phantom__250,dv004.ib.bridges2.psc.edu\n/ocean/projects/asc...,
26914457,1,250,False,8,exp,train__phantom__250__0__8__exp,BatComputer,,,,/ocean/projects/asc170022p/mtragoza/lung-proje...,,2024-11-22__phantom__250,dv004.ib.bridges2.psc.edu\n/ocean/projects/asc...,
26914458,2,250,False,16,relu,train__phantom__250__0__16__relu,BatComputer,,,,/ocean/projects/asc170022p/mtragoza/lung-proje...,,2024-11-22__phantom__250,dv004.ib.bridges2.psc.edu\n/ocean/projects/asc...,
26914459,3,250,False,16,exp,train__phantom__250__0__16__exp,BatComputer,,,,/ocean/projects/asc170022p/mtragoza/lung-proje...,,2024-11-22__phantom__250,dv004.ib.bridges2.psc.edu\n/ocean/projects/asc...,RuntimeError: Destination file should exist at...
26914460,4,250,False,32,relu,train__phantom__250__0__32__relu,BatComputer,,,,/ocean/projects/asc170022p/mtragoza/lung-proje...,,2024-11-22__phantom__250,dv004.ib.bridges2.psc.edu\n/ocean/projects/asc...,RuntimeError: Destination file should exist at...
26914461,5,250,False,32,exp,train__phantom__250__0__32__exp,BatComputer,,,,/ocean/projects/asc170022p/mtragoza/lung-proje...,,2024-11-22__phantom__250,dv004.ib.bridges2.psc.edu\n/ocean/projects/asc...,
26914462,6,250,True,8,relu,train__phantom__250__1__8__relu,BatComputer,,,,/ocean/projects/asc170022p/mtragoza/lung-proje...,,2024-11-22__phantom__250,dv004.ib.bridges2.psc.edu\n/ocean/projects/asc...,
26914463,7,250,True,8,exp,train__phantom__250__1__8__exp,BatComputer,,,,/ocean/projects/asc170022p/mtragoza/lung-proje...,,2024-11-22__phantom__250,dv004.ib.bridges2.psc.edu\n/ocean/projects/asc...,
26914464,8,250,True,16,relu,train__phantom__250__1__16__relu,BatComputer,,,,/ocean/projects/asc170022p/mtragoza/lung-proje...,,2024-11-22__phantom__250,dv004.ib.bridges2.psc.edu\n/ocean/projects/asc...,
26914465,9,250,True,16,exp,train__phantom__250__1__16__exp,BatComputer,,,,/ocean/projects/asc170022p/mtragoza/lung-proje...,,2024-11-22__phantom__250,dv004.ib.bridges2.psc.edu\n/ocean/projects/asc...,


In [12]:
# Replace missing values with explicit labels so that those rows are kept by the
# groupby below, then count the jobs per (job_state, stderr) combination.
# NOTE(review): a missing job_state is labelled 'DONE', presumably because the
# job is no longer known to the scheduler -- confirm.
status = status.assign(
    job_state=status['job_state'].fillna('DONE'),
    stderr=status['stderr'].fillna('N/A'),
)
status.groupby(['job_state', 'stderr'])[['job_name']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,job_name
job_state,stderr,Unnamed: 2_level_1
DONE,,12
DONE,RuntimeError: Destination file should exist at this point!,2


In [67]:
# Select the jobs whose stderr reports an error, keeping the parameter columns
# plus the job name and working directory.
# "No error" can appear as an empty string, as a missing value, or as the 'N/A'
# placeholder written by the status-summary cell; all three are excluded so the
# result does not depend on whether that cell has been run.
no_error_markers = ['', 'N/A']
has_error = ~status['stderr'].fillna('').isin(no_error_markers)
error_jobs = status[has_error].reset_index()[list(param_space.keys()) + ['job_name', 'work_dir']]
error_jobs

Unnamed: 0,anat_range,dummy_targets,conv_channels,output_func,job_name,work_dir
0,250,False,16,exp,train__phantom__250__0__16__exp,/ocean/projects/asc170022p/mtragoza/lung-proje...
1,250,False,32,relu,train__phantom__250__0__32__relu,/ocean/projects/asc170022p/mtragoza/lung-proje...


## Analyze results

[[Setup](#Setup-experiment)] [[Submit](#Submit-jobs)] [[Monitor](#Monitor-jobs)] [[Analyze](#Analyze-results)]

In [13]:
# Gather the logged metrics of every job in `jobs` into one DataFrame.
# NOTE(review): sep=',' is presumably the delimiter of the per-job metrics files -- confirm.
m = ps.metrics(jobs, sep=',')
m

Unnamed: 0,anat_range,dummy_targets,conv_channels,output_func,job_name,job_id,partition,job_state,node_id,runtime,...,CTE,e_true_corr,e_anat_corr,true_anat_corr,e_950_corr,e_900_corr,e_850_corr,true_950_corr,true_900_corr,true_850_corr
0,250,False,8,relu,train__phantom__250__0__8__relu,26914456,BatComputer,PENDING,(None),0:00,...,0.473760,0.099661,0.094319,0.914121,-0.007062,-0.007062,-0.007062,-0.149265,-0.149265,-0.149265
1,250,False,8,relu,train__phantom__250__0__8__relu,26914456,BatComputer,PENDING,(None),0:00,...,0.064161,0.004093,0.005775,0.662333,0.009496,0.009496,0.009496,-0.638444,-0.638444,-0.638444
2,250,False,8,relu,train__phantom__250__0__8__relu,26914456,BatComputer,PENDING,(None),0:00,...,0.219511,0.142122,0.518732,0.166048,-0.164889,-0.340262,-0.417392,-0.051188,-0.114213,-0.176516
3,250,False,8,relu,train__phantom__250__0__8__relu,26914456,BatComputer,PENDING,(None),0:00,...,0.260782,-0.081808,0.171183,0.243648,,,,,,
4,250,False,8,relu,train__phantom__250__0__8__relu,26914456,BatComputer,PENDING,(None),0:00,...,0.270263,-0.118556,0.056751,0.468907,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331231,250,False,32,relu,train__phantom__250__0__32__relu,27071905,BatComputer,PENDING,(None),0:00,...,0.783187,0.064159,0.065445,0.876121,,,,,,
331232,250,False,32,relu,train__phantom__250__0__32__relu,27071905,BatComputer,PENDING,(None),0:00,...,0.651761,0.057217,0.051532,0.263599,,,,,,
331233,250,False,32,relu,train__phantom__250__0__32__relu,27071905,BatComputer,PENDING,(None),0:00,...,0.602666,0.680370,0.344907,0.394150,-0.034725,-0.081183,-0.171760,-0.036759,-0.081035,-0.161773
331234,250,False,32,relu,train__phantom__250__0__32__relu,27071905,BatComputer,PENDING,(None),0:00,...,0.785567,0.082583,0.073594,0.916334,,,,,,


In [14]:
# Highest epoch logged for each job name, as a one-column frame indexed by job_name.
# NOTE(review): job names repeat across experiments (e.g. the resubmission), so
# rows from different experiments are pooled here -- confirm this is intended.
d = m.groupby('job_name')['epoch'].max().to_frame()
d

Unnamed: 0_level_0,epoch
job_name,Unnamed: 1_level_1
train__phantom__250__0__16__exp,200
train__phantom__250__0__16__relu,200
train__phantom__250__0__32__exp,200
train__phantom__250__0__32__relu,200
train__phantom__250__0__8__exp,200
train__phantom__250__0__8__relu,200
train__phantom__250__1__16__exp,200
train__phantom__250__1__16__relu,200
train__phantom__250__1__32__exp,200
train__phantom__250__1__32__relu,200


In [16]:
# Jobs whose highest logged epoch is below 200.
# NOTE(review): 200 is presumably the training length used by train.py -- confirm.
unfinished_jobs = d.loc[d['epoch'] < 200]
unfinished_jobs

Unnamed: 0_level_0,epoch
job_name,Unnamed: 1_level_1


In [17]:
# List the columns of the metrics table to pick the x / y variables for the plots.
m.columns

Index(['anat_range', 'dummy_targets', 'conv_channels', 'output_func',
       'job_name', 'job_id', 'partition', 'job_state', 'node_id', 'runtime',
       'work_dir', 'array_idx', 'expt_name', 'epoch', 'batch', 'example',
       'phase', 'rep', 'u_error', 'u_pred_norm', 'u_true_norm', 'e_error',
       'e_pred_norm', 'e_true_norm', 'CTE', 'e_true_corr', 'e_anat_corr',
       'true_anat_corr', 'e_950_corr', 'e_900_corr', 'e_850_corr',
       'true_950_corr', 'true_900_corr', 'true_850_corr'],
      dtype='object')

In [21]:
%autoreload
fig = ps.plot(
    m[(m.phase == 'train') & (m.epoch > 100) & (m.conv_channels > 8)],
    x=['dummy_targets', 'conv_channels', 'output_func'],
    y=['u_error', 'e_error', 'e_anat_corr', 'CTE'],
    hue=None,
    legend=True,
    legend_kws=dict(bbox_to_anchor=(0, -0.2)),
    tight=True,
    height=2.25, width=2.75
)

<IPython.core.display.Javascript object>

In [22]:
%autoreload
fig = ps.plot(
    m[(m.phase == 'test') & (m.epoch > 100) & (m.conv_channels > 8)],
    x=['dummy_targets', 'conv_channels', 'output_func'],
    y=['u_error', 'e_error', 'e_anat_corr', 'CTE'],
    hue=None,
    legend=True,
    legend_kws=dict(bbox_to_anchor=(0, -0.2)),
    tight=True,
    height=2.25, width=2.75
)

<IPython.core.display.Javascript object>

In [84]:
# Summary statistics of the test-phase metrics over the last epochs (> 190),
# one row per (dummy_targets, conv_channels, output_func, phase) group.
group_keys = ['dummy_targets', 'conv_channels', 'output_func', 'phase']
metric_cols = ['u_error', 'e_error', 'e_anat_corr', 'CTE']
late_test_rows = m[(m.phase == 'test') & (m.epoch > 190) & (m.conv_channels > 8)]
late_test_rows.groupby(group_keys)[metric_cols].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,u_error,u_error,u_error,u_error,u_error,u_error,u_error,u_error,e_error,e_error,...,e_anat_corr,e_anat_corr,CTE,CTE,CTE,CTE,CTE,CTE,CTE,CTE
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
dummy_targets,conv_channels,output_func,phase,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
False,16,relu,test,20.0,0.001592,0.001617,0.000198,0.000393,0.000782,0.00218,0.005494,20.0,1.333001,...,0.182263,0.326268,20.0,0.523165,0.23024,0.18297,0.294335,0.580904,0.738801,0.787861
False,32,exp,test,20.0,0.001662,0.001393,0.000164,0.000524,0.001012,0.002517,0.004392,20.0,8.668284,...,0.274917,0.385551,20.0,0.585586,0.221956,0.206835,0.3486,0.661662,0.793236,0.828707
True,16,exp,test,20.0,0.002067,0.001691,0.000492,0.00096,0.001384,0.002558,0.006746,20.0,23.971448,...,0.206008,0.304233,20.0,0.59784,0.174649,0.294312,0.427728,0.648954,0.753921,0.827736
True,16,relu,test,20.0,0.002884,0.002647,0.00047,0.001196,0.001697,0.004714,0.009315,20.0,1.075325,...,0.164611,0.256176,20.0,0.49904,0.221738,0.146012,0.29876,0.494041,0.73662,0.776693
True,32,exp,test,20.0,0.00242,0.001993,0.000418,0.001109,0.001761,0.002808,0.006742,20.0,7.60274,...,0.23833,0.359781,20.0,0.620617,0.18944,0.214952,0.497774,0.705064,0.775577,0.827599
True,32,relu,test,20.0,0.00279,0.002211,0.00031,0.001285,0.002258,0.003387,0.007604,20.0,1.393699,...,0.209315,0.284903,20.0,0.546983,0.206228,0.221254,0.358529,0.567576,0.750819,0.813488
