# SUBMISSION NOTEBOOK

## SETUP

#### imports 

In [1]:
import pandas as pd
import os
import datetime
import subprocess
import re
from omegaconf import OmegaConf
import pathlib
from tqdm.auto import tqdm
import itertools

In [2]:
import wandb

#### CONFIG:

In [3]:
# --- Locations ---------------------------------------------------------------
# Shared checkout of the code and the directory that receives the generated job
# files. To work from a personal checkout instead, point both constants at it,
# e.g. '/home/<user>/projects/cardiors/code/RetinalRisk'.
CODE_BASE = '/sc-projects/sc-proj-ukb-cvd/code/RetinalRisk'
SUBMISSION_BASE = '/sc-projects/sc-proj-ukb-cvd/submissions/RetinalRisk'

# --- Experiment identity -----------------------------------------------------
TAG = 230426
JOBNAME = 'ret_preds'

# Directory name (below SUBMISSION_BASE) under which the generated .sh scripts,
# yamls and SLURM logs are stored.
EXPERIMENT_NAME = '22_retinalrisk_{}_{}'.format(TAG, JOBNAME)
# Config directory handed to the script through `--config-path`.
TEMPLATE_CONFIG = CODE_BASE + '/config/'
# Python script that every submitted job executes.
TRAIN_SCRIPT = CODE_BASE + '/retinalrisk/scripts/predict_retina.py'

# Make sure the per-experiment folders exist before anything is written to them.
for _subdir in ('job_submissions', 'job_configs', 'job_outputs'):
    os.makedirs(f'{SUBMISSION_BASE}/{EXPERIMENT_NAME}/{_subdir}', exist_ok=True)

In [4]:
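# Select the data partitions to submit jobs for: uncomment exactly ONE of the
# lines below before running the rest of the notebook. The filtering cell
# (`runs_df.query('partition in @partitions')`) fails if `partitions` is undefined.
#partitions = [i for i in range(0, 5)] # CHRISTINA
#partitions =  [i for i in range(5, 10)] # PAUL 
#partitions =  [i for i in range(10, 16)] # THORE 
#partitions = [i for i in range(16, 22)] # LUKAS 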


#### get wandb runs

In [5]:
# Query the Weights & Biases public API for the runs of the project.
entity, project = "cardiors", "retina"
api = wandb.Api()
runs = api.runs(f"{entity}/{project}")

In [6]:
# W&B stores run tags as strings, while TAG may be configured as an int.
# Comparing the raw int against the tag list never matches, so normalise once.
_wanted_tag = str(TAG)

# Collect one record per run that carries the experiment tag.
run_list = []
for run in tqdm(runs):
    if _wanted_tag not in run.tags:
        continue

    # NOTE(security): `eval` executes whatever string is stored in the run
    # config. This is only acceptable because the configs come from our own
    # W&B project; do not point this at runs from untrusted sources.
    datamodule_cfg = eval(run.config['_content']['datamodule'])

    # Runs without stored predictions get `None` as path.
    if "predictions_path" in run.config:
        predictions_path = str(pathlib.Path(run.config["predictions_path"]))
    else:
        predictions_path = None

    run_list.append(
        {
            "id": run.id,
            "name": run.name,
            "tags": run.tags,
            "config": run.config,
            "partition": datamodule_cfg['partition'],
            "summary": run.summary._json_dict,
            "path": predictions_path,
        }
    )

  0%|          | 0/215 [00:00<?, ?it/s]

In [7]:
# One row per collected run; the columns are the keys of the run records.
runs_df = pd.DataFrame(data=run_list)

In [8]:
# Display the table of collected runs for a visual check.
runs_df

Unnamed: 0,id,name,tags,config,partition,summary,path
0,ljhjndx2,220812_fullrun,"[220812, baseline_data, image]",{'losses': ['<retinalrisk.models.loss_wrapper....,4,{'valid/phecode_526-11 - Intestinal e-coli_CIn...,
1,1ts15g03,220812_fullrun,"[220812, baseline_data, image]",{'losses': ['<retinalrisk.models.loss_wrapper....,9,{'gradients/encoder.features.5.19.block.0.bias...,
2,jtx4az09,220812_fullrun_15,"[220812, baseline_data, image]",{'losses': ['<retinalrisk.models.loss_wrapper....,15,{'valid/phecode_703-11 - Gout_CIndex': 0.69062...,
3,10r747tq,220812_fullrun_3,"[220812, baseline_data, image]",{'losses': ['<retinalrisk.models.loss_wrapper....,3,{'valid/phecode_709-2 - Acquired deformities o...,
4,3kkkwx1h,220812_fullrun_8,"[220812, baseline_data, image]",{'losses': ['<retinalrisk.models.loss_wrapper....,8,{'valid/phecode_138 - Benign neoplasm of the s...,
5,kr1fcpov,220812_fullrun_7,"[220812, baseline_data, image]",{'losses': ['<retinalrisk.models.loss_wrapper....,7,{'gradients/encoder.features.0.0.bias': {'bins...,
6,2i64fvxa,220812_fullrun_2,"[220812, baseline_data, image]",{'losses': ['<retinalrisk.models.loss_wrapper....,2,{'valid/phecode_375 - Abnormal intraocular pre...,
7,2lsliaj8,220812_fullrun_21,"[220812, baseline_data, image]",{'losses': ['<retinalrisk.models.loss_wrapper....,21,{'valid/phecode_526-12 - Intestinal infection ...,
8,9e6xmhpu,220812_fullrun_14,"[220812, baseline_data, image]",{'losses': ['<retinalrisk.models.loss_wrapper....,14,{'gradients/encoder.features.5.11.layer_scale'...,
9,227i0u0y,220812_fullrun_13,"[220812, baseline_data, image]",{'losses': ['<retinalrisk.models.loss_wrapper....,13,{'valid/phecode_410-2 - Endocarditis_CIndex': ...,


In [9]:
# Keep only the runs whose partition is listed in `partitions`.
_in_selected_partitions = runs_df['partition'].isin(partitions)
runs_df = runs_df[_in_selected_partitions]

# Show which partitions are left after filtering.
runs_df['partition'].unique()

array([15, 14, 13, 12, 11, 10])

In [10]:
# `key=value` overrides shared by every job. make_job_script() puts them on the
# python command line, followed by the per-job overrides.
BASE_HYPERPARAMS = [
    f"setup.name={TAG}_{JOBNAME}",
    "training.gradient_checkpointing=False",
    "training.patience=40",
    "datamodule/covariates=no_covariates",
    "model=image",
    "setup.use_data_artifact_if_available=False",
    "head=mlp",
    "head.kwargs.num_hidden=512",
    "head.kwargs.num_layers=2",
    "head.dropout=0.5",
    "training.optimizer_kwargs.weight_decay=0.001",
    "training.optimizer_kwargs.lr=0.0001",
    "model.freeze_encoder=False",
    "model.encoder=convnext_small",
    "datamodule.batch_size=100",
    "training.warmup_period=8",
    "datamodule/augmentation=contrast_sharpness_posterize",
    "datamodule.img_size_to_gpu=420",
    "datamodule.num_workers=16",
    "model.pretrained=True",
]

In [11]:
# In contrast to the job submission script, these parameters are NOT expanded
# in a nested for-loop. Instead, the lists are walked through simultaneously:
# the i-th restore id is paired with the i-th partition.

parameters = {
    'setup.restore_id': runs_df['id'].tolist(),
    'datamodule.partition': runs_df['partition'].tolist(),
}

#### Functions

In [12]:
#SBATCH --gres=gpu:nvidia_a100-sxm-80gb:1           # Generic resources; 1 GPU
def make_job_script(job_name, hyperparams):
    """Render the SLURM batch script for a single job.

    Args:
        job_name: Value written into the ``--job-name`` SBATCH directive.
        hyperparams: List of ``key=value`` overrides; they are placed after
            ``BASE_HYPERPARAMS`` on the python command line.

    Returns:
        The complete bash script as one string.
    """
    # stdout/stderr logs of the job end up in the experiment's output folder.
    log_dir = f'{SUBMISSION_BASE}/{EXPERIMENT_NAME}/job_outputs'
    overrides = ' '.join(BASE_HYPERPARAMS + hyperparams)

    return f'''#!/bin/bash
#SBATCH --job-name={job_name}                # Specify job name
#SBATCH --partition=pgpu                     # Specify partition name
#SBATCH --nodes=1-1                          # Specify number of nodes
#SBATCH --cpus-per-gpu=32
#SBATCH --mem=200GB                          # Use entire memory of node
#SBATCH --gres=gpu:nvidia_a100-sxm4-80gb:1   # Generic resources; 1 80GB GPU
#SBATCH --time=50:00:00                      # Set a limit on the total run time
#SBATCH --error={log_dir}/slurm-%A_%a.err
#SBATCH --output={log_dir}/slurm-%A_%a.out

source ~/miniconda3/etc/profile.d/conda.sh
conda activate /sc-projects/sc-proj-ukb-cvd/environments/retina

python {TRAIN_SCRIPT} --config-path {TEMPLATE_CONFIG} {overrides}'''

In [13]:
def submit(path, job_name, job_script, time_stamp=None):
    """Write a job script to disk and hand it to SLURM via ``sbatch``.

    The script is stored as ``{path}/{job_name}_{time_stamp}.sh``, and
    ``{path}/{job_name}.sh`` is (re)created as a symlink to that file.
    The symlink is what gets piped into ``sbatch``.

    Args:
        path: Existing directory in which the script files are written.
        job_name: Base name of the script files.
        job_script: Full text of the batch script.
        time_stamp: Suffix for the script file name; defaults to the current
            local time formatted as ``%Y-%m-%d_%H:%M:%S``.

    Returns:
        The SLURM job id as ``int`` when sbatch reports
        ``Submitted batch job <id>``, otherwise ``None`` (e.g. the submission
        failed or sbatch is not available).
    """
    if not time_stamp:
        time_stamp = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

    script_path_long = f'{path}/{job_name}_{time_stamp}.sh'
    with open(script_path_long, 'w') as outfile:
        outfile.write(job_script)

    script_path = f'{path}/{job_name}.sh'
    try:
        os.unlink(script_path)
    except FileNotFoundError:  # because we cannot overwrite symlinks directly
        pass
    os.symlink(os.path.realpath(script_path_long), script_path)

    print('\n\nSubmission:\n===========\n')
    sub_cmd = f'sbatch < {script_path}'
    print(sub_cmd)

    ret = subprocess.run(sub_cmd, shell=True, cwd=os.getcwd(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout = ret.stdout.decode(errors='replace')
    print(stdout)

    # Surface submission problems instead of silently dropping sbatch's stderr.
    stderr = ret.stderr.decode(errors='replace')
    if ret.returncode != 0 or stderr.strip():
        print(f'sbatch exited with code {ret.returncode}:\n{stderr}')

    # sbatch reports success as "Submitted batch job <id>"; return that id so
    # callers can track the job.
    match = re.search(r'Submitted batch job (\d+)', stdout)
    return int(match.group(1)) if match else None

## RUN

In [14]:
# Collects the return value of every `submit` call made in the submission loop.
jobids = []

In [15]:
# Walk through the parameter lists in lockstep: the i-th entry of every list in
# `parameters` forms the per-job overrides of job i.
submission_dir = f"{SUBMISSION_BASE}/{EXPERIMENT_NAME}/job_submissions"
param_names = list(parameters.keys())

for job_idx, values in enumerate(zip(*parameters.values())):
    overrides = [f"{name}={value}" for name, value in zip(param_names, values)]

    script_text = make_job_script(job_name=JOBNAME, hyperparams=overrides)
    print(script_text)

    # Each job gets its own script file, suffixed with the job index.
    jobids.append(
        submit(
            path=submission_dir,
            job_name=JOBNAME + f'_{job_idx}',
            job_script=script_text,
        )
    )

#!/bin/bash
#SBATCH --job-name=write_preds                # Specify job name
#SBATCH --partition=pgpu                     # Specify partition name
#SBATCH --nodes=1-1                          # Specify number of nodes
#SBATCH --cpus-per-gpu=32
#SBATCH --mem=200GB                          # Use entire memory of node
#SBATCH --gres=gpu:nvidia_a100-sxm4-80gb:1   # Generic resources; 1 80GB GPU
#SBATCH --time=50:00:00                      # Set a limit on the total run time
#SBATCH --error=/sc-projects/sc-proj-ukb-cvd/submissions/RetinalRisk/22_retinalrisk_220812_write_preds/job_outputs/slurm-%A_%a.err
#SBATCH --output=/sc-projects/sc-proj-ukb-cvd/submissions/RetinalRisk/22_retinalrisk_220812_write_preds/job_outputs/slurm-%A_%a.out

source ~/miniconda3/etc/profile.d/conda.sh
conda activate /sc-projects/sc-proj-ukb-cvd/environments/retina

python /sc-projects/sc-proj-ukb-cvd/code/RetinalRisk/retinalrisk/scripts/predict_retina.py --config-path /sc-projects/sc-proj-ukb-cvd/code/RetinalRisk/co

In [16]:
# Show what `submit` returned for each submitted job.
print(jobids)

[None, None, None, None, None, None]
