# Benchmarks

## Initialize

In [1]:
import os
import math
import pathlib
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from IPython.display import clear_output

import warnings
from lifelines.utils import CensoringType
from lifelines.utils import concordance_index

In [2]:
node = !hostname
if "sc" in node[0]:
    base_path = "/sc-projects/sc-proj-ukb-cvd"
else: 
    base_path = "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
print(base_path)

project_label = "22_retina_phewas_220603_fullrun"
project_path = f"{base_path}/results/projects/{project_label}"
figure_path = f"{project_path}/figures"
output_path = f"{project_path}/data"

pathlib.Path(figure_path).mkdir(parents=True, exist_ok=True)
pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)

experiment = '220603_fullrun'
experiment_path = f"{output_path}/{experiment}"
pathlib.Path(experiment_path).mkdir(parents=True, exist_ok=True)

name_dict = {
    "predictions_cropratio0.3": "ConvNextSmall(Retina)+MLP_cropratio0.3",
    "predictions_cropratio0.5": "ConvNextSmall(Retina)+MLP_cropratio0.5",
    "predictions_cropratio0.8": "ConvNextSmall(Retina)+MLP_cropratio0.8",
}

partitions = [i for i in range(22)]
partitions

/sc-projects/sc-proj-ukb-cvd


[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]

In [3]:
# Shell out to list everything (including hidden entries) in the project's data directory.
!ls -al {output_path}

total 35468184
drwxrwx--- 3 loockl   posix-nogroup         586 Jul  5 12:30 .
drwxrwx--- 4 loockl   posix-nogroup          47 Jun 22 20:19 ..
drwxrwx--- 7 loockl   posix-nogroup         398 Jul  5 11:57 220603_fullrun
-rwxrwx--- 1 loockl   posix-nogroup  4913114786 Jun 23 13:31 baseline_outcomes_220531.feather
-rwxrwx--- 1 loockl   posix-nogroup  4936136130 Jul  5 12:25 baseline_outcomes_220627.feather
-rwxrwx--- 1 loockl   posix-nogroup 10680216930 Jun 23 13:32 baseline_outcomes_long_220531.feather
-rwxrwx--- 1 loockl   posix-nogroup 10702426538 Jul  5 12:28 baseline_outcomes_long_220627.feather
-rwxrwx--- 1 loockl   posix-nogroup   200319338 Jul  5 12:30 baseline_records_220627.feather
-rwxrwx--- 1 loockl   posix-nogroup  2247323418 Jun 23 13:41 eligable_eids_2022-06-23.feather
-rwxrwx--- 1 loockl   posix-nogroup  2247323418 Jul  1 01:04 eligable_eids_2022-07-01.feather
-rwxrwx--- 1 loockl   posix-nogroup  2233384746 Jun 23 13:44 eligable_eids_long_2022-06-23.feather
-rwxrwx--- 1 loo

In [4]:
# Load the phecode definitions table and order its rows by the "endpoint" column.
endpoint_defs = (
    pd.read_feather(f"{output_path}/phecode_defs_220306.feather")
    .sort_values(by="endpoint")
)

In [5]:
# Load the baseline outcomes table and index it by "eid".
# NOTE(review): the directory listing also shows a newer 220627 snapshot -
# confirm that 220531 is the intended input.
data_outcomes = (
    pd.read_feather(f"{output_path}/baseline_outcomes_220531.feather")
    .set_index(keys="eid")
)

In [6]:
from datetime import date

# Current date as an ISO "YYYY-MM-DD" string; used to build a dated filename.
today = date.today().isoformat()
# Manual override for re-running against a specific snapshot:
# today = '2022-07-01'

In [7]:
# Resolve which eligibility snapshot to load.
# The filename embeds `today`, so on any day without a freshly written snapshot
# the dated file does not exist. In that case fall back to the most recent dated
# snapshot instead of failing, and report which file was used.
# ("eligable" is kept as spelled because it matches the filenames on disk.)
eligible_file = pathlib.Path(output_path) / f"eligable_eids_{today}.feather"
if not eligible_file.exists():
    # "[0-9]" right after the prefix skips the "eligable_eids_long_*" files.
    # ISO dates sort lexicographically in chronological order, so the last
    # entry is the newest snapshot.
    snapshots = sorted(pathlib.Path(output_path).glob("eligable_eids_[0-9]*.feather"))
    if not snapshots:
        raise FileNotFoundError(
            f"No eligable_eids_<date>.feather snapshot found in {output_path}"
        )
    eligible_file = snapshots[-1]
    print(f"No snapshot for {today}; using {eligible_file.name}")

eligable_eids = pd.read_feather(eligible_file)
# Lookup from endpoint to the content of its "eid_list" column.
eids_dict = eligable_eids.set_index("endpoint")["eid_list"].to_dict()

In [8]:
# Endpoint names are read from the 22_retinal_risk project's endpoint list;
# every "_prevalent" occurrence is removed from a name and the names are sorted.
# The file is resolved under base_path (not a hard-coded /sc-projects root) so
# this cell follows the same host-dependent storage root as the rest of the notebook.
# pandas is already imported in the first cell, so it is not re-imported here.
endpoints_file = f"{base_path}/results/projects/22_retinal_risk/data/220602/endpoints.csv"
endpoints = sorted(
    name.replace('_prevalent', '')
    for name in pd.read_csv(endpoints_file)["endpoint"]
)

In [9]:
import glob  # `os` is already imported in the first cell

# Image folder resolved under base_path so it follows the host-dependent
# storage root instead of a hard-coded /sc-projects path.
img_root = f"{base_path}/data/retina/preprocessed/preprocessed"
img_visit = 0
img_file_extension = '.png'

# Match every file when no extension is configured, otherwise only that extension.
img_pattern = f'*{img_file_extension}' if img_file_extension is not None else '*'
visit_tag = f'_{img_visit}_'

# The visit tag is checked against the file name only, so a directory component
# that happens to contain the tag cannot cause every file to match.
# The eid is the integer before the first underscore of the file name; the list
# holds one entry per matching file, so an eid repeats if it has several files.
eids_with_retinapic = [
    int(file_name.split('_')[0])
    for file_name in map(os.path.basename, sorted(glob.glob(os.path.join(img_root, img_pattern))))
    if visit_tag in file_name
]
len(eids_with_retinapic)

113122

In [10]:
# Per-endpoint event counts among participants that are both eligible for the
# endpoint and present in the retina image list.
# The image eid list does not change between endpoints, so it is deduplicated
# once here rather than inside every np.intersect1d call.
retina_eids = np.unique(eids_with_retinapic)

d = []
for endpoint in tqdm(endpoints):
    # Computed once per endpoint and reused for both the row selection and the count.
    eids = np.intersect1d(eids_dict[endpoint], retina_eids)
    events = data_outcomes[f"{endpoint}_event"].loc[eids]
    d.append({
        "endpoint": endpoint,
        "eligable": len(eids),    # number of eids in the intersection
        "n": events.sum(),        # sum of the "<endpoint>_event" column
        "freq": events.mean(),    # mean of the "<endpoint>_event" column
    })

  0%|          | 0/1171 [00:00<?, ?it/s]

In [11]:
# One row per endpoint record collected in `d`.
endpoints_freqs = pd.DataFrame(d)

In [12]:
# Keep rows with n > 100, order them by endpoint, renumber the index, and join
# the definitions table on the columns the two frames have in common.
endpoints_ds = (
    endpoints_freqs
    .loc[lambda frame: frame["n"] > 100]
    .sort_values(by="endpoint")
    .reset_index(drop=True)
    .merge(endpoint_defs)
)
endpoints_ds

Unnamed: 0,endpoint,eligable,n,freq,phecode,phecode_string,phecode_category,sex,ICD10_only,phecode_top,leaf
0,OMOP_4306655,61256,3548,0.057921,4306655,All-Cause Death,Death,Both,,,
1,phecode_002,60945,658,0.010797,002,Staphylococcus,ID,Both,0.0,002,0.0
2,phecode_002-1,61010,486,0.007966,002.1,Staphylococcus aureus,ID,Both,0.0,002,1.0
3,phecode_003,60757,1017,0.016739,003,Escherichia coli,ID,Both,0.0,003,1.0
4,phecode_004,60584,494,0.008154,004,Streptococcus,ID,Both,0.0,004,0.0
...,...,...,...,...,...,...,...,...,...,...,...
1166,phecode_977-52,31669,520,0.016420,977.52,Hormone replacement therapy (postmenopausal),Rx,Female,0.0,977,1.0
1167,phecode_977-7,60032,2231,0.037164,977.7,Long term (current) use of insulin or oral hyp...,Rx,Both,0.0,977,0.0
1168,phecode_977-71,60936,472,0.007746,977.71,Long term (current) use of insulin,Rx,Both,0.0,977,1.0
1169,phecode_977-72,60207,2148,0.035677,977.72,Long term (current) use of oral hypoglycemic d...,Rx,Both,0.0,977,1.0


In [13]:
# Persist the filtered endpoint table into the experiment folder
# (to_csv defaults are used, so the row index is written as the first column).
endpoints_ds.to_csv(path_or_buf=f"{experiment_path}/endpoints.csv")

In [14]:
# Echo the full path of the endpoints CSV as the cell output.
f"{experiment_path}/endpoints.csv"

'/sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas_220603_fullrun/data/220603_fullrun/endpoints.csv'