# Benchmark Models

## Initialize

In [1]:
%load_ext autoreload
%autoreload 2

import os
from tqdm.auto import tqdm
import pathlib
import datetime
import subprocess
import numpy as np
import pandas as pd
import lifelines



In [2]:
node = !hostname
if "sc" in node[0]:
    base_path = "/sc-projects/sc-proj-ukb-cvd"
else: 
    base_path = "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
print(base_path)

project_label = "22_retina_phewas"
project_path = f"{base_path}/results/projects/{project_label}"
figure_path = f"{project_path}/figures"
output_path = f"{project_path}/data"

pathlib.Path(figure_path).mkdir(parents=True, exist_ok=True)
pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)

experiment = '221108'
experiment_path = f"{output_path}/{experiment}"
pathlib.Path(experiment_path).mkdir(parents=True, exist_ok=True)


USER = 'buergelt'
BASE = pathlib.Path(f"/home/{USER}/")
EXPERIMENT_NAME = '221109'
#TEMPLATE_CONFIG = f"{BASE}/config/"  # template yaml to use
TRAIN_SCRIPT = f"{BASE}/projects/cardiors/code/22_retina_phewas_evaluation/1_processing/10_benchmarks_iteration_CVD.py"
#TRAIN_SCRIPT = f"{BASE}/riskiano/riskiano/experiments/lukas/phewas/22_retina_phewas_notebooks/1_processing/08_coxph_fit_partition.py"
ACTIVATE_ENV_CMD = """conda activate retrisk"""

TAG = '221109'
JOBNAME = 'benchmark'


name_dict = {
    "predictions_cropratio0.66": "ConvNextSmall(Retina)+MLP_cropratio0.66",
}

partitions = [i for i in range(22)]
partitions

/sc-projects/sc-proj-ukb-cvd


[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]

In [3]:
# Date tag of the benchmark run; a later cell uses it as the sub-folder name
# under benchmarks_cvd when collecting results.
today = '221109'

## Submit Benchmark jobs

In [4]:
# Per-experiment scratch folders: submission scripts, job configs, and the
# stdout/stderr files written by the submitted jobs.
for subdir in ("job_submissions", "job_configs", "job_outputs"):
    os.makedirs(f"/home/{USER}/tmp/{EXPERIMENT_NAME}/{subdir}", exist_ok=True)

In [5]:
f"/home/{USER}/tmp/{EXPERIMENT_NAME}/job_outputs"

'/home/buergelt/tmp/221109/job_outputs'

In [6]:
def make_job_script(user, job_name, iteration, model, partition):
    """Render the text of the batch script for a single benchmark job.

    The script requests 1 task, 16 CPUs, 32G of memory and a 2:30:00 time
    limit, sources conda, runs the module-level ``ACTIVATE_ENV_CMD`` and then
    calls ``TRAIN_SCRIPT`` with the given iteration, model and partition.

    Args:
        user: Accepted for the caller's convenience; not used in the script.
        job_name: Value for ``#SBATCH --job-name``.
        iteration: Forwarded as ``--iteration``.
        model: Forwarded as ``--model``.
        partition: Forwarded as ``--partition``.

    Returns:
        The script as one string, without a trailing newline.
    """
    # Built line by line so that significant whitespace (e.g. the trailing
    # space after "--ntasks 1") is explicit rather than hidden at a line end.
    sbatch_header = [
        "#!/bin/bash",
        "",
        f"#SBATCH --job-name={job_name}  # Specify job name",
        "#SBATCH --ntasks 1 ",
        "#SBATCH --cpus-per-task 16",
        "#SBATCH --mem=32G              # Specify number of nodes",
        "#SBATCH --time=2:30:00        # Set a limit on the total run time",
    ]
    environment_setup = [
        "",
        "source ~/miniconda3/etc/profile.d/conda.sh",
        f"{ACTIVATE_ENV_CMD}",
    ]
    run_command = [
        "",
        "# ray start --head --num-cpus 16",
        f"python {TRAIN_SCRIPT} --iteration {iteration} --model {model} --partition {partition}",
    ]

    return "\n".join(sbatch_header + environment_setup + run_command)


In [7]:
def submit(path, job_name, job_script, time_stamp=None):
    """Write a job script to disk and hand it to ``sbatch``.

    The script is saved as ``{path}/{job_name}_{time_stamp}.sh`` and
    ``{path}/{job_name}.sh`` is (re)pointed at it through a symlink, so the
    un-stamped name always refers to the most recent submission. The script
    text and the submission command are printed for inspection.

    Args:
        path: Directory that receives the script file and the symlink.
        job_name: Used for the script file name and the log file prefix.
        job_script: Full text of the batch script.
        time_stamp: Optional suffix for the script file name; when falsy the
            current local time formatted as ``%Y-%m-%d_%H:%M:%S`` is used.

    Returns:
        The job id reported by ``sbatch`` as a string, or ``None`` when the
        submission failed or no id could be found in its output.
    """
    import re  # local import: only needed to parse the sbatch reply

    if not time_stamp:
        time_stamp = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

    script_path_long = f"{path}/{job_name}_{time_stamp}.sh"
    with open(script_path_long, "w") as outfile:
        outfile.write(job_script)

    # A symlink cannot be overwritten in place, so drop any previous one first.
    script_path = f"{path}/{job_name}.sh"
    try:
        os.unlink(script_path)
    except FileNotFoundError:
        pass
    os.symlink(os.path.realpath(script_path_long), script_path)

    # Prefix for the per-job stderr/stdout files; sbatch expands %j itself.
    log_prefix = f"/home/{USER}/tmp/{EXPERIMENT_NAME}/job_outputs/{job_name}"

    print(job_script)
    print("\n\nSubmission:\n===========\n")
    sub_cmd = (
        f"sbatch --error={log_prefix}_%j_stderr.out --output={log_prefix}_%j_stdout.out <"
        f" {script_path}"
    )
    print(sub_cmd)

    ret = subprocess.run(sub_cmd, shell=True, cwd=os.getcwd(), capture_output=True)
    stdout_text = ret.stdout.decode()
    print(stdout_text)

    # Surface failures instead of silently discarding stderr and the exit code.
    if ret.returncode != 0:
        print(f"sbatch exited with status {ret.returncode}:")
        print(ret.stderr.decode())
        return None

    # Hand the job id back so callers can actually collect it.
    # Assumes the usual "Submitted batch job <id>" reply from sbatch.
    job_id_match = re.search(r"Submitted batch job (\d+)", stdout_text)
    return job_id_match.group(1) if job_id_match else None

In [8]:
# CHANGE HERE: iteration ranges assigned per person.
# Thore: range(0,10) + range(10,25)
# Lukas: range(25,50)
# Ben: range(50, 75)
# Jakob: range(75, 100)

# Iterations to submit; the original author also noted "10,100" and
# "100,1000" next to this line.
iterations = list(range(100))
#iterations = [79, 82, 84, 86, 88, 92, 99]

# Models to benchmark; the 0.5 and 0.8 crop-ratio variants are switched off.
models = [
    'ImageTraining_[]_ConvNeXt_MLPHead_predictions_cropratio0.66',
    # 'ImageTraining_[]_ConvNeXt_MLPHead_predictions_cropratio0.5',
    # 'ImageTraining_[]_ConvNeXt_MLPHead_predictions_cropratio0.8'
]

In [None]:
import time

# Submit one job per (iteration, model, partition) combination and collect
# whatever submit() returns for each of them.
jobids = []
submission_dir = f"/home/{USER}/tmp/{EXPERIMENT_NAME}/job_submissions"
grid = (
    (iteration, model, partition)
    for iteration in iterations
    for model in models
    for partition in partitions
)
for iteration, model, partition in grid:
    job_name = f"{iteration}_{model}_{partition}_{JOBNAME}"

    # make_job_script forwards the partition on the command line
    # (--partition); whether the training script acts on it is not visible here.
    job_script = make_job_script(USER, job_name, iteration, model, partition)

    jobids.append(submit(submission_dir, job_name, job_script))

print(jobids)

#!/bin/bash

#SBATCH --job-name=0_ImageTraining_[]_ConvNeXt_MLPHead_predictions_cropratio0.66_0_benchmark  # Specify job name
#SBATCH --ntasks 1 
#SBATCH --cpus-per-task 16
#SBATCH --mem=32G              # Specify number of nodes
#SBATCH --time=2:30:00        # Set a limit on the total run time

source ~/miniconda3/etc/profile.d/conda.sh
conda activate retrisk

# ray start --head --num-cpus 16
python /home/buergelt/projects/cardiors/code/22_retina_phewas_evaluation/1_processing/10_benchmarks_iteration_CVD.py --iteration 0 --model ImageTraining_[]_ConvNeXt_MLPHead_predictions_cropratio0.66 --partition 0


Submission:

sbatch --error=/home/buergelt/tmp/221109/job_outputs/0_ImageTraining_[]_ConvNeXt_MLPHead_predictions_cropratio0.66_0_benchmark_%j_stderr.out --output=/home/buergelt/tmp/221109/job_outputs/0_ImageTraining_[]_ConvNeXt_MLPHead_predictions_cropratio0.66_0_benchmark_%j_stdout.out < /home/buergelt/tmp/221109/job_submissions/0_ImageTraining_[]_ConvNeXt_MLPHead_predictions_croprat

## Check progress

In [3]:
# Storage root is fixed here (no host-name lookup in this section).
base_path = "/sc-projects/sc-proj-ukb-cvd"
print(base_path)

# Project folder layout below the storage root.
project_label = "22_retina_phewas"
project_path = "/".join((base_path, "results", "projects", project_label))
figure_path = project_path + "/figures"
output_path = project_path + "/data"

# Folder of the experiment whose benchmark results are inspected.
experiment = '221108'
experiment_path = "/".join((output_path, experiment))
experiment_path

/sc-projects/sc-proj-ukb-cvd


'/sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas/data/221108'

In [4]:
# Date tag of the benchmark run to inspect; when set to None, the next cell
# substitutes the current date.
today = '221109'

In [5]:
from datetime import date

# Keep an explicitly set tag; only fall back to the current date
# (ISO formatted, via str()) when none was given.
if today is None:
    today = str(date.today())

In [6]:
from pathlib import Path

# Collect every feather file below the run's benchmark folder (recursively).
benchmark_paths = list((Path(experiment_path) / "benchmarks_cvd" / today).rglob('*.feather'))

# Stack all files row-wise into a single frame.
benchmarks_df = pd.concat(list(map(pd.read_feather, benchmark_paths)), axis=0)

# Number of rows per iteration, ordered by iteration.
benchmarks_df.value_counts(["iteration"]).to_frame().sort_values("iteration")

Unnamed: 0_level_0,0
iteration,Unnamed: 1_level_1
0,108
1,108
2,108
3,108
4,108
...,...
95,108
96,108
97,108
98,108


In [7]:
# Iterations expected but absent from the collected results, in ascending
# order. The set of iterations present is built once instead of calling
# .unique() again for every candidate.
# NOTE(review): only iterations 0 and 1 are checked here, while the submission
# cell covers range(0, 100) — confirm whether the range should be widened.
missing = sorted(set(range(0, 2)) - set(benchmarks_df["iteration"].unique()))
missing

[]

In [None]:
# Show the folder the benchmark files were collected from.
Path(f"{experiment_path}/benchmarks_cvd/{today}")

In [None]:
# Write the combined results to a single feather file. reset_index(drop=True)
# discards the row labels carried over from the individual files (the concat
# that built benchmarks_df does not renumber rows); to_feather expects a
# default index.
benchmarks_df.reset_index(drop=True).to_feather(f"{experiment_path}/benchmarks_cvd_cindex_{today}.feather")

In [None]:
# Mean C-index per score, ascending. The column is selected explicitly: the
# first positional argument of GroupBy.mean() is `numeric_only`, so passing
# "cindex" there does not restrict the aggregation to that column.
benchmarks_df.groupby(["score"])[["cindex"]].mean().sort_values("cindex")

## Cleanup

In [None]:
# Deliberately raises ZeroDivisionError. NOTE(review): presumably a guard so
# that running all cells stops before the file deletion below — confirm.
1/0

In [None]:
# Delete the per-iteration benchmark files (iterations 0..999) for the
# cropratio0.66 model; iterations without a file are reported and skipped.
basedir = '/sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas/data/221108/benchmarks_cvd/221109'
for i in range(1000):
    filename = (
        f'benchmark_cindex_{today}_model_'
        'ImageTraining_[]_ConvNeXt_MLPHead_predictions_cropratio0.66'
        f'_iteration_{i}.feather'
    )
    target = os.path.join(basedir, filename)
    try:
        os.remove(target)
    except FileNotFoundError:
        print(f'{i} not found')

## Debugging

In [None]:
# Load the prediction_paths_CVD table of experiment 221108 and preview its
# first rows.
x = pd.read_feather('/sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas/data/221108/prediction_paths_CVD.feather')
x.head()

In [None]:
# Distinct `path` values recorded for the endpoint phecode_431-11.
x.query('endpoint=="phecode_431-11"').path.unique()

In [None]:
# Value in the first row, fifth column (position 4) of `x`.
# NOTE(review): presumably the relative prediction path — the lookup is
# positional, so confirm the column order.
path = x.iloc[0, 4]
# Absolute location of that file below the coxph_cvd/predictions folder.
file = f'/sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas/data/221108/coxph_cvd/predictions/{path}'

# Load the file found at that location.
preds = pd.read_feather(file)

In [None]:
# Display the frame loaded in the previous cell.
preds