# Benchmark Models

## Initialize

In [4]:
%load_ext autoreload
%autoreload 2

import os
from tqdm.auto import tqdm
import pathlib
import datetime
import subprocess
import numpy as np
import pandas as pd
import lifelines

In [5]:
node = !hostname
if "sc" in node[0]:
    base_path = "/sc-projects/sc-proj-ukb-cvd"
else: 
    base_path = "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
print(base_path)

project_label = "22_retina_phewas_220603_fullrun"
project_path = f"{base_path}/results/projects/{project_label}"
figure_path = f"{project_path}/figures"
output_path = f"{project_path}/data"

pathlib.Path(figure_path).mkdir(parents=True, exist_ok=True)
pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)

USER = 'loockl'
BASE = pathlib.Path(f"/home/{USER}/")
EXPERIMENT_NAME = '220603_fullrun'
#TEMPLATE_CONFIG = f"{BASE}/config/"  # template yaml to use
TRAIN_SCRIPT = f"{BASE}/riskiano/riskiano/experiments/lukas/phewas/22_retina_phewas_notebooks/1_processing/10_benchmarks_iteration.py"
ACTIVATE_ENV_CMD = """conda activate retina_risk"""

TAG = '220603_fullrun'
JOBNAME = 'benchmark'

name_dict = {
    "predictions_cropratio0.3": "ConvNextSmall(Retina)+MLP_cropratio0.3",
    "predictions_cropratio0.5": "ConvNextSmall(Retina)+MLP_cropratio0.5",
    "predictions_cropratio0.8": "ConvNextSmall(Retina)+MLP_cropratio0.8",
}

partitions = [i for i in range(22)]
partitions

/sc-projects/sc-proj-ukb-cvd


[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]

## Submit Benchmark jobs

In [16]:
os.makedirs(f"/home/{USER}/tmp/{EXPERIMENT_NAME}/job_submissions", exist_ok=True)
os.makedirs(f"/home/{USER}/tmp/{EXPERIMENT_NAME}/job_configs", exist_ok=True)
os.makedirs(f"/home/{USER}/tmp/{EXPERIMENT_NAME}/job_outputs", exist_ok=True)

In [26]:
def make_job_script(user, job_name, iteration, model, partition):

    job_script_str = (
        f"""#!/bin/bash

#SBATCH --job-name={job_name}  # Specify job name
#SBATCH --nodes=1              # Specify number of nodes
#SBATCH --mem=500G              # Specify number of nodes
#SBATCH --time=2:30:00        # Set a limit on the total run time
#SBATCH --tasks-per-node=1
#SBATCH --exclusive

{ACTIVATE_ENV_CMD}

ray start --head --port=6378 --num-cpus 16
python {TRAIN_SCRIPT} --iteration {iteration} --model {model} --partition {partition}"""
            )

    return job_script_str


In [18]:
def submit(path, job_name, job_script, time_stamp=None):
    if not time_stamp:
        time_stamp = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

    script_path_long = f"{path}/{job_name}_{time_stamp}.sh"

    with open(script_path_long, "w") as outfile:
        outfile.write(job_script)
    script_path = f"{path}/{job_name}.sh"
    try:
        os.unlink(script_path)
    except FileNotFoundError:  # because we cannot overwrite symlinks directly
        pass
    os.symlink(os.path.realpath(script_path_long), script_path)

    output_path = f"/home/{USER}/tmp/{EXPERIMENT_NAME}/job_outputs/{job_name}"

    print(job_script)
    print("\n\nSubmission:\n===========\n")
    sub_cmd = (
        f"sbatch --error={output_path}_%j_stderr.out --output={output_path}_%j_stdout.out <"
        f" {script_path}"
    )
    print(sub_cmd)

    ret = subprocess.run(sub_cmd, shell=True, cwd=os.getcwd(), capture_output=True)
    print(ret.stdout.decode())

In [27]:
iterations = [i for i in range(1,50)] #10,100, # 100,1000
partitions = [i for i in range(22)]
models = ['ImageTraining_[]_ConvNeXt_MLPHead_predictions_cropratio0.3', 
              'ImageTraining_[]_ConvNeXt_MLPHead_predictions_cropratio0.5', 
              'ImageTraining_[]_ConvNeXt_MLPHead_predictions_cropratio0.8'
             ]

In [28]:
import time

jobids = []
for iteration in iterations:
    for model in models:
        partition = '0' #for partition in partitions:
        
        job_name = f"{iteration}_{model}_{partition}_{JOBNAME}"

        job_script = make_job_script(user=USER, job_name=job_name, iteration=iteration, model=model, partition=partition) # partition currently not used in script

        jobid = submit(
            path=f"/home/{USER}/tmp/{EXPERIMENT_NAME}/job_submissions",
            job_name=job_name,
            job_script=job_script,
        )

        jobids.append(jobid)

print(jobids)

#!/bin/bash

#SBATCH --job-name=1_ImageTraining_[]_ConvNeXt_MLPHead_predictions_cropratio0.3_0_benchmark  # Specify job name
#SBATCH --nodes=1              # Specify number of nodes
#SBATCH --mem=500G              # Specify number of nodes
#SBATCH --time=2:30:00        # Set a limit on the total run time
#SBATCH --tasks-per-node=1
#SBATCH --exclusive

conda activate retina_risk

ray start --head --port=6378 --num-cpus 16
python /home/loockl/riskiano/riskiano/experiments/lukas/phewas/22_retina_phewas_notebooks/1_processing/10_benchmarks_iteration.py --iteration 1 --model ImageTraining_[]_ConvNeXt_MLPHead_predictions_cropratio0.3 --partition 0


Submission:

sbatch --error=/home/loockl/tmp/220603_fullrun/job_outputs/1_ImageTraining_[]_ConvNeXt_MLPHead_predictions_cropratio0.3_0_benchmark_%j_stderr.out --output=/home/loockl/tmp/220603_fullrun/job_outputs/1_ImageTraining_[]_ConvNeXt_MLPHead_predictions_cropratio0.3_0_benchmark_%j_stdout.out < /home/loockl/tmp/220603_fullrun/job_submissions

## Check progress

In [1]:
base_path = "/sc-projects/sc-proj-ukb-cvd"
print(base_path)

project_label = "22_retina_phewas_220603_fullrun"
project_path = f"{base_path}/results/projects/{project_label}"
figure_path = f"{project_path}/figures"
output_path = f"{project_path}/data"

experiment = '220603_fullrun'
experiment_path = f"{output_path}/{experiment}"
experiment_path

/sc-projects/sc-proj-ukb-cvd


'/sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas_220603_fullrun/data/220603_fullrun'

In [8]:
from datetime import date
#today = str(date.today())
today = '2022-07-12'

In [9]:
from pathlib import Path
benchmark_paths = list(Path(f"{experiment_path}/benchmarks/{today}").rglob('*.feather'))

benchmarks_df = pd.concat([pd.read_feather(p) for p in benchmark_paths], axis=0)

benchmarks_df.value_counts(["iteration"]).to_frame().sort_values("iteration")

Unnamed: 0_level_0,0
iteration,Unnamed: 1_level_1
0,31617
1,31617
2,31617
3,31617
4,31617
5,31617
6,31617
7,31617
8,31617
9,31617


In [10]:
benchmarks_df.reset_index(drop=True).to_feather(f"{experiment_path}/benchmarks_cindex_{today}.feather")

In [11]:
benchmarks_df.groupby(["score"]).mean("cindex").sort_values("cindex")

Unnamed: 0_level_0,iteration,time,cindex
score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Retina,24.5,10.0,0.564257
Age+Sex,24.5,10.0,0.604635
Age+Sex+Retina,24.5,10.0,0.61007
SCORE2,24.5,10.0,0.621229
SCORE2+Retina,24.5,10.0,0.623006
ASCVD,24.5,10.0,0.626902
ASCVD+Retina,24.5,10.0,0.627034
QRISK3+Retina,24.5,10.0,0.638689
QRISK3,24.5,10.0,0.638777


# old stuff

In [None]:
%env MKL_NUM_THREADS=1
%env NUMEXPR_NUM_THREADS=1
%env OMP_NUM_THREADS=1

env: MKL_NUM_THREADS=1
env: NUMEXPR_NUM_THREADS=1
env: OMP_NUM_THREADS=1


In [None]:
ray.shutdown()

NameError: name 'ray' is not defined

In [None]:
import ray
ray.init(num_cpus=24)#, webui_url="0.0.0.0")

RayContext(dashboard_url='', python_version='3.9.7', ray_version='1.12.1', ray_commit='4863e33856b54ccf8add5cbe75e41558850a1b75', address_info={'node_ip_address': '10.32.105.2', 'raylet_ip_address': '10.32.105.2', 'redis_address': None, 'object_store_address': '/tmp/ray/session_2022-06-08_18-22-29_185810_1027861/sockets/plasma_store', 'raylet_socket_name': '/tmp/ray/session_2022-06-08_18-22-29_185810_1027861/sockets/raylet', 'webui_url': '', 'session_dir': '/tmp/ray/session_2022-06-08_18-22-29_185810_1027861', 'metrics_export_port': 49949, 'gcs_address': '10.32.105.2:58760', 'address': '10.32.105.2:58760', 'node_id': '7f1a8bf36f18cb087d1a94c5fc24115fb56181bff5980da09aa04cc0'})

In [6]:
in_path = f"{experiment_path}/coxph/predictions"

In [7]:
prediction_paths = !ls $in_path
predictions = pd.Series(prediction_paths).str.split("_", expand=True)\
    .assign(path = prediction_paths)\
    .assign(endpoint = lambda x: x[0]+"_"+x[1])\
    .assign(score = lambda x: x[2])\
    .assign(partition = lambda x: x[3].str.replace(".feather", "", regex=True).astype(int))\
    [["endpoint", "score", "partition", "path"]].sort_values(["endpoint", "score", "partition"]).reset_index(drop=True)
predictions

Unnamed: 0,endpoint,score,partition,path
0,OMOP_4306655,Age+Sex,0,OMOP_4306655_Age+Sex_0.feather
1,OMOP_4306655,Age+Sex+Retina,0,OMOP_4306655_Age+Sex+Retina_0.feather
2,OMOP_4306655,Retina,0,OMOP_4306655_Retina_0.feather
3,phecode_002,Age+Sex,0,phecode_002_Age+Sex_0.feather
4,phecode_002,Age+Sex+Retina,0,phecode_002_Age+Sex+Retina_0.feather
...,...,...,...,...
3508,phecode_977-72,Age+Sex+Retina,0,phecode_977-72_Age+Sex+Retina_0.feather
3509,phecode_977-72,Retina,0,phecode_977-72_Retina_0.feather
3510,phecode_979,Age+Sex,0,phecode_979_Age+Sex_0.feather
3511,phecode_979,Age+Sex+Retina,0,phecode_979_Age+Sex+Retina_0.feather


In [8]:
import pandas as pd
all_endpoints = sorted([l.replace('_prevalent', '') for l in list(pd.read_csv('/sc-projects/sc-proj-ukb-cvd/results/projects/22_retinal_risk/data/220602/endpoints.csv').endpoint.values)])

#all_endpoints = sorted(endpoints_all_md.endpoint.to_list())
print(len(all_endpoints))

endpoints_not_overlapping_with_preds = []
#endpoints_not_overlapping_with_preds_md = pd.read_csv(f"{experiment_path}/endpoints_not_overlapping.csv", header=None)
#print(len(endpoints_not_overlapping_with_preds_md))
#endpoints_not_overlapping_with_preds = list(endpoints_not_overlapping_with_preds_md[0].values)

endpoints = []
for c in all_endpoints:
    if c not in endpoints_not_overlapping_with_preds: # this is what i want
        #print('OK    - ',c)
        endpoints.append(c)
    #if c in endpoints_not_overlapping_with_preds: # this is what causes errors!
    #    print('ERROR - ',c)
print(len(endpoints))

1171
1171


In [9]:
scores = ['Age+Sex', 'Retina', 'Age+Sex+Retina']
partitions = sorted(predictions.partition.unique().tolist())

In [10]:
from datetime import date
today = str(date.today())

In [11]:
eligable_eids = pd.read_feather(f"{output_path}/eligable_eids_{today}.feather")
eids_dict = eligable_eids.set_index("endpoint")["eid_list"].to_dict()

In [12]:
data_outcomes = pd.read_feather(
    f"{output_path}/baseline_outcomes_220531.feather", 
    columns= ["eid"] + [f"{e}_event" for e in endpoints] + [f"{e}_time" for e in endpoints])\
    .set_index("eid")

In [13]:
eids = data_outcomes.index.values

In [14]:
def read_partitions(endpoint, score, time):
    paths = predictions.query("endpoint==@endpoint").query("score==@score").path.to_list()
    data_preds = pd.concat([pd.read_feather(f"{in_path}/{path}", columns=["eid", f"Ft_{time}"]) 
                      for path in paths], axis=0).set_index("eid").sort_index()
    data_preds.columns = ["Ft"]
    return data_preds

In [15]:
def prepare_data(data_outcomes, endpoint, score, t_eval):
    temp_preds = read_partitions(endpoint, score, t_eval)
    temp_tte = data_outcomes[[f"{endpoint}_event", f"{endpoint}_time"]]
    temp_tte.columns = ["event", "time"]
    temp_data = temp_preds.merge(temp_tte, left_index=True, right_index=True, how="left")
    
    condition = (temp_data['event'] == 0) | (temp_data['time'] > t_eval)
    
    temp_data["event"] = (np.where(condition, 0, 1))
    
    temp_data["time"] = (np.where(condition, t_eval, temp_data['time']))
    return temp_data

In [16]:
from lifelines.utils import concordance_index

def calculate_cindex(data_outcomes, endpoint, score, time, iteration, eids_i):  
    temp_data = prepare_data(data_outcomes, endpoint, score, time)
    temp_data = temp_data[temp_data.index.isin(eids_i)]
    
    try:
        cindex = 1-concordance_index(temp_data["time"], temp_data["Ft"], temp_data["event"])
    except ZeroDivisionError: 
        cindex=np.nan
    return {"endpoint":endpoint, "score": score, "iteration": iteration, "time":time, "cindex":cindex}

@ray.remote
def calculate_iteration(data_outcomes, endpoint, score, time, iteration, eids_i):  
    dicts = []
    for score in scores:
        dicts.append(calculate_cindex(data_outcomes, endpoint, score, 10, iteration, eids_i))
    return dicts

In [17]:
iterations=[i for i in range(1000)] # 100

In [18]:
out_path = f"{experiment_path}/benchmarks"
pathlib.Path(out_path).mkdir(parents=True, exist_ok=True)

In [19]:
ray_outcomes = ray.put(data_outcomes)

rows_ray = []
for endpoint in tqdm(endpoints):
    eids_e = eids_dict[endpoint]
    for iteration in iterations:
        eids_i = np.random.choice(eids_e, size=len(eids_e))
        ds = calculate_iteration.remote(ray_outcomes, endpoint, scores, 10, iteration, eids_i)
        rows_ray.append(ds)

  0%|          | 0/1171 [00:00<?, ?it/s]

In [20]:
rows = [ray.get(r) for r in tqdm(rows_ray)]

  0%|          | 0/1171000 [00:00<?, ?it/s]

In [28]:
rows_finished = [item for sublist in rows for item in sublist]

In [29]:
benchmark_endpoints = pd.DataFrame({}).append(rows_finished, ignore_index=True)

In [30]:
name = f"benchmark_cindex_agesexcoxph_{today}"
benchmark_endpoints.to_feather(f"{experiment_path}/{name}.feather")

In [31]:
print(f"{experiment_path}/{name}")

/sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas_220608/data/2af9tvdp/benchmark_cindex_agesexcoxph_2022-06-08


In [32]:
len(rows_finished), len(rows)

(3513000, 1171000)

In [33]:
pd.DataFrame({}).append(rows_finished, ignore_index=True).to_feather(f"{out_path}/{endpoint}.feather")

In [27]:
%%time
temp_data = prepare_data(data_outcomes, endpoint, score, 10)

NameError: name 'score' is not defined