# Benchmark Models

## Initialize

In [1]:
%load_ext autoreload
%autoreload 2

import os
from tqdm.auto import tqdm
import pathlib
import datetime
import subprocess
import numpy as np
import pandas as pd
import lifelines

In [2]:
node = !hostname
if "sc" in node[0]:
    base_path = "/sc-projects/sc-proj-ukb-cvd"
else: 
    base_path = "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
print(base_path)

project_label = "22_retina_phewas"
project_path = f"{base_path}/results/projects/{project_label}"
figure_path = f"{project_path}/figures"
output_path = f"{project_path}/data"

pathlib.Path(figure_path).mkdir(parents=True, exist_ok=True)
pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)

experiment = '230905'
experiment_path = f"{output_path}/{experiment}"
print('experiment path:', experiment_path)
pathlib.Path(experiment_path).mkdir(parents=True, exist_ok=True)

name_dict = {
#     "predictions_cropratio0.3": "ConvNextSmall(Retina)+MLP_cropratio0.3",
#     "predictions_cropratio0.5": "ConvNextSmall(Retina)+MLP_cropratio0.5",
#    "predictions_cropratio0.66": "ConvNextSmall(Retina)+MLP_cropratio0.66",
    "predictions": "ConvNextSmall(Retina)+MLP_cropratio0.66",
}

#partitions = [i for i in range(22)]
partitions = [4, 5, 7, 9, 10, 20] # Partitions with eye test centers


USER = 'loockl'
BASE = pathlib.Path(f"/home/{USER}/")
EXPERIMENT_NAME = '230905'
#TEMPLATE_CONFIG = f"{BASE}/config/"  # template yaml to use
TRAIN_SCRIPT = f"{BASE}/retina_phewas_eval_paper_23/22_retina_phewas_evaluation-main/1_processing/10_benchmarks_iteration.py"
ACTIVATE_ENV_CMD = """conda activate retina_risk"""

TAG = '230905'
JOBNAME = 'benchmark'

/sc-projects/sc-proj-ukb-cvd
experiment path: /sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas/data/230905


In [3]:
today = '230905'

## Submit Benchmark jobs

In [5]:
os.makedirs(f"/home/{USER}/tmp/{EXPERIMENT_NAME}/job_submissions", exist_ok=True)
os.makedirs(f"/home/{USER}/tmp/{EXPERIMENT_NAME}/job_configs", exist_ok=True)
os.makedirs(f"/home/{USER}/tmp/{EXPERIMENT_NAME}/job_outputs", exist_ok=True)

In [6]:
def make_job_script(user, job_name, iteration, model, partition):

    job_script_str = (
        f"""#!/bin/bash

#SBATCH --job-name={job_name}  # Specify job name
#SBATCH --nodes=1              # Specify number of nodes
#SBATCH --mem=200G              # Specify number of nodes
#SBATCH --time=2:30:00        # Set a limit on the total run time
#SBATCH --tasks-per-node=1
#SBATCH --exclusive

source ~/miniconda3/etc/profile.d/conda.sh
{ACTIVATE_ENV_CMD}

ray start --head --port=6378 --num-cpus 16
python {TRAIN_SCRIPT} --iteration {iteration} --model {model} --partition {partition}"""
            )

    return job_script_str


In [7]:
def submit(path, job_name, job_script, time_stamp=None):
    if not time_stamp:
        time_stamp = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

    script_path_long = f"{path}/{job_name}_{time_stamp}.sh"

    with open(script_path_long, "w") as outfile:
        outfile.write(job_script)
    script_path = f"{path}/{job_name}.sh"
    try:
        os.unlink(script_path)
    except FileNotFoundError:  # because we cannot overwrite symlinks directly
        pass
    os.symlink(os.path.realpath(script_path_long), script_path)

    output_path = f"/home/{USER}/tmp/{EXPERIMENT_NAME}/job_outputs/{job_name}"

    print(job_script)
    print("\n\nSubmission:\n===========\n")
    sub_cmd = (
        f"sbatch --error={output_path}_%j_stderr.out --output={output_path}_%j_stdout.out <"
        f" {script_path}"
    )
    print(sub_cmd)

    ret = subprocess.run(sub_cmd, shell=True, cwd=os.getcwd(), capture_output=True)
    print(ret.stdout.decode())

In [8]:
# HIER AENDERN:
# Thore: range(0,10) + range(10,25)
# Lukas: range(25,50)
# Ben: range(50, 75)
# Jakob: range(75, 100)

iterations = [i for i in range(0,100)] #10,100, # 100,1000
partitions = [i for i in [4, 5, 7, 9, 10, 20]]
models = ['ImageTraining_[]_ConvNeXt_MLPHead_predictions',
              #'ImageTraining_[]_ConvNeXt_MLPHead_predictions_cropratio0.66', 
              #'ImageTraining_[]_ConvNeXt_MLPHead_predictions_cropratio0.5', 
              #'ImageTraining_[]_ConvNeXt_MLPHead_predictions_cropratio0.8'
             ]

In [9]:
import time

jobids = []
for iteration in iterations:
    for model in models:
        for partition in partitions:
            job_name = f"{iteration}_{model}_{partition}_{JOBNAME}"

            job_script = make_job_script(user=USER, job_name=job_name, iteration=iteration, model=model, partition=partition) # partition currently not used in script

            jobid = submit(
                path=f"/home/{USER}/tmp/{EXPERIMENT_NAME}/job_submissions",
                job_name=job_name,
                job_script=job_script,
            )

            jobids.append(jobid)

print(jobids)

#!/bin/bash

#SBATCH --job-name=0_ImageTraining_[]_ConvNeXt_MLPHead_predictions_4_benchmark  # Specify job name
#SBATCH --nodes=1              # Specify number of nodes
#SBATCH --mem=200G              # Specify number of nodes
#SBATCH --time=2:30:00        # Set a limit on the total run time
#SBATCH --tasks-per-node=1
#SBATCH --exclusive

source ~/miniconda3/etc/profile.d/conda.sh
conda activate retina_risk

ray start --head --port=6378 --num-cpus 16
python /home/loockl/retina_phewas_eval_paper_23/22_retina_phewas_evaluation-main/1_processing/10_benchmarks_iteration.py --iteration 0 --model ImageTraining_[]_ConvNeXt_MLPHead_predictions --partition 4


Submission:

sbatch --error=/home/loockl/tmp/230426/job_outputs/0_ImageTraining_[]_ConvNeXt_MLPHead_predictions_4_benchmark_%j_stderr.out --output=/home/loockl/tmp/230426/job_outputs/0_ImageTraining_[]_ConvNeXt_MLPHead_predictions_4_benchmark_%j_stdout.out < /home/loockl/tmp/230426/job_submissions/0_ImageTraining_[]_ConvNeXt_MLPHead_predi

## Check progress

In [4]:
base_path = "/sc-projects/sc-proj-ukb-cvd"
print(base_path)

project_label = "22_retina_phewas"
project_path = f"{base_path}/results/projects/{project_label}"
figure_path = f"{project_path}/figures"
output_path = f"{project_path}/data"

experiment = '230905'
experiment_path = f"{output_path}/{experiment}"
experiment_path

/sc-projects/sc-proj-ukb-cvd


'/sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas/data/230905'

In [5]:
today = '230905' #221109

In [6]:
from datetime import date
today = str(date.today()) if today is None else today

In [7]:
from pathlib import Path
benchmark_paths = list(Path(f"{experiment_path}/benchmarks/{today}").rglob('*.feather'))

benchmarks_df = pd.concat([pd.read_feather(p) for p in benchmark_paths], axis=0)

benchmarks_df.value_counts(["iteration"]).to_frame().sort_values("iteration")

Unnamed: 0_level_0,0
iteration,Unnamed: 1_level_1
0,6768
1,6768
2,6768
3,6768
4,6768
...,...
95,6768
96,6768
97,6768
98,6768


In [8]:
Path(f"{experiment_path}/benchmarks/{today}")

PosixPath('/sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas/data/230905/benchmarks/230905')

In [9]:
benchmarks_df.reset_index(drop=True).to_feather(f"{experiment_path}/benchmarks_cindex_{today}.feather")

In [10]:
benchmarks_df.groupby(["score"]).mean("cindex").sort_values("cindex")

Unnamed: 0_level_0,iteration,time,cindex
score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Retina,49.5,10.0,0.608685
Age+Sex,49.5,10.0,0.617782
Age+Sex+Retina,49.5,10.0,0.631226
SCORE2,49.5,10.0,0.63907
ASCVD,49.5,10.0,0.641816
SCORE2+Retina,49.5,10.0,0.644866
ASCVD+Retina,49.5,10.0,0.645261
QRISK3,49.5,10.0,0.652176
QRISK3+Retina,49.5,10.0,0.653455


# old stuff

In [21]:
%env MKL_NUM_THREADS=1
%env NUMEXPR_NUM_THREADS=1
%env OMP_NUM_THREADS=1

env: MKL_NUM_THREADS=1
env: NUMEXPR_NUM_THREADS=1
env: OMP_NUM_THREADS=1


In [22]:
ray.shutdown()

NameError: name 'ray' is not defined

In [23]:
import ray
ray.init(num_cpus=24)#, webui_url="0.0.0.0")

*** SIGTERM received at time=1667655368 on cpu 9 ***
PC: @     0x7f1a6427cb29  (unknown)  __xstat64
    @     0x7f1a64ec6ce0  (unknown)  (unknown)
[2022-11-05 14:36:08,200 E 2417229 2417229] logging.cc:325: *** SIGTERM received at time=1667655368 on cpu 9 ***
[2022-11-05 14:36:08,200 E 2417229 2417229] logging.cc:325: PC: @     0x7f1a6427cb29  (unknown)  __xstat64
[2022-11-05 14:36:08,200 E 2417229 2417229] logging.cc:325:     @     0x7f1a64ec6ce0  (unknown)  (unknown)
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/buergelt/miniconda3/envs/retrisk/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3457, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_2417229/1545136565.py", line 2, in <module>
    ray.init(num_cpus=24)#, webui_url="0.0.0.0")
  File "/home/buergelt/miniconda3/envs/retrisk/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
    return func(*args, **kwargs)
  File "/home/buergelt/miniconda3/envs/retrisk/lib/python3.9/site-packages/ray/worker.py", line 1097, in init
    connect(
  File "/home/buergelt/miniconda3/envs/retrisk/lib/python3.9/site-packages/ray/worker.py", line 1576, in connect
    worker.core_worker = ray._raylet.CoreWorker(
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/buergelt/miniconda3/envs/retrisk/lib/python3.9/site-packages/IPython/core/in

TypeError: object of type 'NoneType' has no len()

In [None]:
in_path = f"{experiment_path}/coxph/predictions"

In [None]:
prediction_paths = !ls $in_path
predictions = pd.Series(prediction_paths).str.split("_", expand=True)\
    .assign(path = prediction_paths)\
    .assign(endpoint = lambda x: x[0]+"_"+x[1])\
    .assign(score = lambda x: x[2])\
    .assign(partition = lambda x: x[3].str.replace(".feather", "", regex=True).astype(int))\
    [["endpoint", "score", "partition", "path"]].sort_values(["endpoint", "score", "partition"]).reset_index(drop=True)
predictions

In [None]:
import pandas as pd
all_endpoints = sorted([l.replace('_prevalent', '') for l in list(pd.read_csv('/sc-projects/sc-proj-ukb-cvd/results/projects/22_retinal_risk/data/220602/endpoints.csv').endpoint.values)])

#all_endpoints = sorted(endpoints_all_md.endpoint.to_list())
print(len(all_endpoints))

endpoints_not_overlapping_with_preds = []
#endpoints_not_overlapping_with_preds_md = pd.read_csv(f"{experiment_path}/endpoints_not_overlapping.csv", header=None)
#print(len(endpoints_not_overlapping_with_preds_md))
#endpoints_not_overlapping_with_preds = list(endpoints_not_overlapping_with_preds_md[0].values)

endpoints = []
for c in all_endpoints:
    if c not in endpoints_not_overlapping_with_preds: # this is what i want
        #print('OK    - ',c)
        endpoints.append(c)
    #if c in endpoints_not_overlapping_with_preds: # this is what causes errors!
    #    print('ERROR - ',c)
print(len(endpoints))

In [None]:
scores = ['Age+Sex', 'Retina', 'Age+Sex+Retina']
partitions = sorted(predictions.partition.unique().tolist())

In [None]:
from datetime import date
today = str(date.today())

In [None]:
eligable_eids = pd.read_feather(f"{output_path}/eligable_eids_{today}.feather")
eids_dict = eligable_eids.set_index("endpoint")["eid_list"].to_dict()

In [None]:
data_outcomes = pd.read_feather(
    f"{output_path}/baseline_outcomes_220531.feather", 
    columns= ["eid"] + [f"{e}_event" for e in endpoints] + [f"{e}_time" for e in endpoints])\
    .set_index("eid")

In [None]:
eids = data_outcomes.index.values

In [None]:
def read_partitions(endpoint, score, time):
    paths = predictions.query("endpoint==@endpoint").query("score==@score").path.to_list()
    data_preds = pd.concat([pd.read_feather(f"{in_path}/{path}", columns=["eid", f"Ft_{time}"]) 
                      for path in paths], axis=0).set_index("eid").sort_index()
    data_preds.columns = ["Ft"]
    return data_preds

In [None]:
def prepare_data(data_outcomes, endpoint, score, t_eval):
    temp_preds = read_partitions(endpoint, score, t_eval)
    temp_tte = data_outcomes[[f"{endpoint}_event", f"{endpoint}_time"]]
    temp_tte.columns = ["event", "time"]
    temp_data = temp_preds.merge(temp_tte, left_index=True, right_index=True, how="left")
    
    condition = (temp_data['event'] == 0) | (temp_data['time'] > t_eval)
    
    temp_data["event"] = (np.where(condition, 0, 1))
    
    temp_data["time"] = (np.where(condition, t_eval, temp_data['time']))
    return temp_data

In [None]:
from lifelines.utils import concordance_index

def calculate_cindex(data_outcomes, endpoint, score, time, iteration, eids_i):  
    temp_data = prepare_data(data_outcomes, endpoint, score, time)
    temp_data = temp_data[temp_data.index.isin(eids_i)]
    
    try:
        cindex = 1-concordance_index(temp_data["time"], temp_data["Ft"], temp_data["event"])
    except ZeroDivisionError: 
        cindex=np.nan
    return {"endpoint":endpoint, "score": score, "iteration": iteration, "time":time, "cindex":cindex}

@ray.remote
def calculate_iteration(data_outcomes, endpoint, score, time, iteration, eids_i):  
    dicts = []
    for score in scores:
        dicts.append(calculate_cindex(data_outcomes, endpoint, score, 10, iteration, eids_i))
    return dicts

In [None]:
iterations=[i for i in range(1000)] # 100

In [None]:
out_path = f"{experiment_path}/benchmarks"
pathlib.Path(out_path).mkdir(parents=True, exist_ok=True)

In [None]:
ray_outcomes = ray.put(data_outcomes)

rows_ray = []
for endpoint in tqdm(endpoints):
    eids_e = eids_dict[endpoint]
    for iteration in iterations:
        eids_i = np.random.choice(eids_e, size=len(eids_e))
        ds = calculate_iteration.remote(ray_outcomes, endpoint, scores, 10, iteration, eids_i)
        rows_ray.append(ds)

In [None]:
rows = [ray.get(r) for r in tqdm(rows_ray)]

In [None]:
rows_finished = [item for sublist in rows for item in sublist]

In [None]:
benchmark_endpoints = pd.DataFrame({}).append(rows_finished, ignore_index=True)

In [None]:
name = f"benchmark_cindex_agesexcoxph_{today}"
benchmark_endpoints.to_feather(f"{experiment_path}/{name}.feather")

In [None]:
print(f"{experiment_path}/{name}")

In [None]:
len(rows_finished), len(rows)

In [None]:
pd.DataFrame({}).append(rows_finished, ignore_index=True).to_feather(f"{out_path}/{endpoint}.feather")

In [None]:
%%time
temp_data = prepare_data(data_outcomes, endpoint, score, 10)

In [None]:
# cancel jobs


for i in range(528114, 528500):
    sub_cmd =  f"scancel {i}"
    subprocess.run(sub_cmd, shell=True, cwd=os.getcwd(), capture_output=False)
    
    