# Benchmarks

## Initialize

In [1]:
import glob
import math
import os
import pathlib
import socket
import warnings
from datetime import date

import numpy as np
import pandas as pd
from IPython.display import clear_output
from lifelines.utils import CensoringType
from lifelines.utils import concordance_index
from tqdm.auto import tqdm

In [2]:
# Pick the storage root from the machine's hostname. socket.gethostname() replaces the
# `!hostname` shell escape so this load-bearing switch is plain Python.
hostname = socket.gethostname()
node = [hostname]  # kept as a one-element list so existing `node[0]` lookups keep working
if "sc" in hostname:
    base_path = "/sc-projects/sc-proj-ukb-cvd"
else:
    base_path = "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
print(base_path)

# Project directory layout below the storage root.
project_label = "22_retina_phewas"
project_path = f"{base_path}/results/projects/{project_label}"
figure_path = f"{project_path}/figures"
output_path = f"{project_path}/data"

# Create the figure and data directories if they do not exist yet.
pathlib.Path(figure_path).mkdir(parents=True, exist_ok=True)
pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)

# Outputs of this run go into a sub-directory of the data directory named after the experiment.
experiment = '230426'
experiment_path = f"{output_path}/{experiment}"
print('experiment path:', experiment_path)
pathlib.Path(experiment_path).mkdir(parents=True, exist_ok=True)

# Key -> label mapping; presumably prediction file stems and model display names
# (not used in the visible cells -- verify against the cells that consume it).
name_dict = {
#     "predictions_cropratio0.3": "ConvNextSmall(Retina)+MLP_cropratio0.3",
#     "predictions_cropratio0.5": "ConvNextSmall(Retina)+MLP_cropratio0.5",
#    "predictions_cropratio0.66": "ConvNextSmall(Retina)+MLP_cropratio0.66",
    "predictions": "ConvNextSmall(Retina)+MLP_cropratio0.66",
}

#partitions = [i for i in range(22)]
partitions = [4, 5, 7, 9, 10, 20] # Partitions with eye test centers

/sc-projects/sc-proj-ukb-cvd
experiment path: /sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas/data/230426


In [7]:
# Date stamp that selects which eligible-eids file is loaded further down
# (f"{output_path}/eligable_eids_{today}.feather"). A later cell replaces a
# value of None with the current date.
today = '230426'

In [8]:
# List the contents of the project data directory (IPython shell escape;
# {output_path} is interpolated from the Python variable).
!ls -al {output_path}

total 33497560
drwxrwx--- 7 buergelt posix-nogroup         736 May  3 08:41 .
drwxrwx--- 4 buergelt posix-nogroup          47 Aug 24  2022 ..
drwxrwx--- 3 loockl   posix-nogroup          84 Nov 16 14:31 220812_agesexretmodel_for_reference
drwxrwx--- 5 buergelt posix-nogroup         382 Nov  3 15:51 220812_test
drwxrwx--- 8 buergelt posix-nogroup         848 Apr 13 14:15 221108
drwxrwx--- 2 loockl   posix-nogroup          31 Apr  3 14:58 230403
drwxrwx--- 4 loockl   posix-nogroup          87 May  2 14:37 230426
-rwxrwx--- 1 buergelt posix-nogroup  4936136130 Aug 24  2022 baseline_outcomes_220627.feather
-rwxrwx--- 1 buergelt posix-nogroup 10680216930 Aug 24  2022 baseline_outcomes_long_220531.feather
-rwxrwx--- 1 buergelt posix-nogroup 10702426538 Aug 24  2022 baseline_outcomes_long_220627.feather
-rwxrwx--- 1 buergelt posix-nogroup    10721490 Aug 25  2022 data_covariates_full.feather
-rwxrwx--- 1 buergelt posix-nogroup  2247373170 Aug 25  2022 eligable_eids_220824.feather
-rwxrwx--- 1

In [9]:
# Load the phecode endpoint definitions and order the rows by endpoint identifier.
endpoint_defs_file = f"{output_path}/phecode_defs_220306.feather"
endpoint_defs = pd.read_feather(endpoint_defs_file)
endpoint_defs = endpoint_defs.sort_values(by="endpoint")

In [10]:
# Display the endpoint definitions table (sorted by endpoint).
endpoint_defs

Unnamed: 0,phecode,endpoint,phecode_string,phecode_category,sex,ICD10_only,phecode_top,leaf
3662,4306655,OMOP_4306655,All-Cause Death,Death,Both,,,
2073,001,phecode_001,Salmonella,ID,Both,0.0,001,1.0
2074,002,phecode_002,Staphylococcus,ID,Both,0.0,002,0.0
2229,002.1,phecode_002-1,Staphylococcus aureus,ID,Both,0.0,002,1.0
2075,003,phecode_003,Escherichia coli,ID,Both,0.0,003,1.0
...,...,...,...,...,...,...,...,...
3558,992,phecode_992,Family history of diseases of the skin and sub...,Stat,Both,0.0,992,1.0
3559,993,phecode_993,Family history of musculoskeletal disease,Stat,Both,0.0,993,1.0
3560,994,phecode_994,Family history of congenital anomalies,Stat,Both,0.0,994,1.0
3561,995,phecode_995,Family history of genetic condition,Stat,Both,0.0,995,1.0


In [11]:
# Read the baseline outcomes table and use its "eid" column as the index.
outcomes_file = f"{output_path}/baseline_outcomes_220627.feather"
outcomes_table = pd.read_feather(outcomes_file)
data_outcomes = outcomes_table.set_index("eid")

In [12]:
# Preview the first rows of the outcomes table.
data_outcomes.head()

Unnamed: 0_level_0,OMOP_4306655_prev,phecode_202_prev,phecode_401_prev,phecode_401-1_prev,phecode_475_prev,phecode_202-2_prev,phecode_718_prev,phecode_089_prev,phecode_130_prev,phecode_713_prev,...,phecode_247-85_time,phecode_361-24_time,phecode_424-7_time,phecode_244-3_time,phecode_824-1_time,phecode_206-1_time,phecode_018_time,phecode_363-8_time,phecode_396-6_time,phecode_361-93_time
eid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000018,False,False,True,True,False,False,False,False,False,False,...,11.866089,11.866089,11.866089,11.866089,11.866089,11.866089,11.866089,11.866089,11.866089,11.866089
1000020,False,False,False,False,False,False,False,False,False,False,...,13.596446,13.596446,13.596446,13.596446,13.596446,13.596446,13.596446,13.596446,13.596446,13.596446
1000037,False,False,False,False,False,False,True,False,False,True,...,12.868163,12.868163,12.868163,12.868163,12.868163,12.868163,12.868163,12.868163,12.868163,12.868163
1000043,False,False,True,True,False,False,False,True,True,True,...,12.309629,12.309629,12.309629,12.309629,12.309629,12.309629,12.309629,12.309629,12.309629,12.309629
1000051,False,True,False,False,False,True,False,False,False,False,...,15.29121,15.29121,15.29121,15.29121,15.29121,15.29121,15.29121,15.29121,15.29121,15.29121


In [13]:
# Fall back to the current date when no explicit stamp was configured.
# The stamp is formatted as YYMMDD because every other date stamp in this
# notebook uses that form (e.g. '230426', and file names such as
# eligable_eids_220824.feather); str(date.today()) would give 'YYYY-MM-DD'
# and build a file name that does not follow that pattern.
today = date.today().strftime("%y%m%d") if today is None else today

In [14]:
# Load the eligibility table for the selected date stamp and turn it into a
# lookup from endpoint to that endpoint's "eid_list" entry.
eligable_eids_file = f"{output_path}/eligable_eids_{today}.feather"
eligable_eids = pd.read_feather(eligable_eids_file)
eid_list_by_endpoint = eligable_eids.set_index("endpoint").loc[:, "eid_list"]
eids_dict = eid_list_by_endpoint.to_dict()

In [15]:
# Endpoint names come from the 22_retinal_risk project's endpoint list, with the
# '_prevalent' marker removed from each name and the result sorted.
# The file is resolved against base_path so it follows the host switch made in
# the configuration cell instead of being pinned to one machine's mount point.
endpoints_csv = f"{base_path}/results/projects/22_retinal_risk/data/220602/endpoints.csv"
endpoints = sorted(
    name.replace('_prevalent', '') for name in pd.read_csv(endpoints_csv)["endpoint"]
)
len(endpoints)

1171

In [16]:
# Collect the eid of every preprocessed retina image that belongs to the selected visit.
# The image directory is resolved against base_path so it follows the host switch
# made in the configuration cell.
img_root = f"{base_path}/data/retina/preprocessed/preprocessed"
img_visit = 0
img_file_extension = '.png'

# Match every file when no extension is configured.
img_pattern = f'*{img_file_extension}' if img_file_extension is not None else '*'
visit_tag = f'_{img_visit}_'

# Work on the file name only, so neither the visit filter nor the eid parsing
# can be affected by underscores in a directory component of img_root.
img_names = [os.path.basename(fp) for fp in sorted(glob.glob(os.path.join(img_root, img_pattern)))]

# The eid is the text before the first '_' of the file name. The list holds one
# entry per matching file, so it is not de-duplicated.
eids_with_retinapic = [int(name.split('_')[0]) for name in img_names if visit_tag in name]
len(eids_with_retinapic)

113122

In [17]:
# Build one summary record per endpoint, restricted to eids that are both
# eligible for that endpoint and have a retina image.
# np.intersect1d de-duplicates and sorts its inputs on every call, so the image
# eids are reduced to their unique values once, outside the loop.
retina_eids = np.unique(eids_with_retinapic)

d = []
for endpoint in tqdm(endpoints):
    # Compute the eligible-with-image eids once and reuse them for both the
    # event lookup and the cohort size (previously computed twice per endpoint).
    eids = np.intersect1d(eids_dict[endpoint], retina_eids)
    events = data_outcomes[f"{endpoint}_event"].loc[eids]
    d.append({
        "endpoint": endpoint,
        "eligable": len(eids),    # number of eids in the intersection
        "n": events.sum(),        # sum of the "<endpoint>_event" column over those eids
        "freq": events.mean(),    # mean of that column over those eids
    })

  0%|          | 0/1171 [00:00<?, ?it/s]

In [18]:
# Turn the per-endpoint records into a table (one row per endpoint).
# The constructor accepts the list of dicts directly; there is no need to
# create an empty DataFrame instance just to call from_dict on it.
endpoints_freqs = pd.DataFrame(d)

In [19]:
# Keep endpoints whose event sum `n` exceeds 100, order them by endpoint name and
# attach the endpoint definitions.
# The merge key is named explicitly so the join cannot silently widen if the two
# tables ever gain another column with the same name.
endpoints_ds = (
    endpoints_freqs
    .query("n>100")
    .sort_values("endpoint")
    .reset_index(drop=True)
    .merge(endpoint_defs, on="endpoint")
)
endpoints_ds

Unnamed: 0,endpoint,eligable,n,freq,phecode,phecode_string,phecode_category,sex,ICD10_only,phecode_top,leaf
0,OMOP_4306655,61256,3490,0.056974,4306655,All-Cause Death,Death,Both,,,
1,phecode_002,60945,658,0.010797,002,Staphylococcus,ID,Both,0.0,002,0.0
2,phecode_002-1,61010,486,0.007966,002.1,Staphylococcus aureus,ID,Both,0.0,002,1.0
3,phecode_003,60757,959,0.015784,003,Escherichia coli,ID,Both,0.0,003,1.0
4,phecode_004,60584,494,0.008154,004,Streptococcus,ID,Both,0.0,004,0.0
...,...,...,...,...,...,...,...,...,...,...,...
1165,phecode_977-52,31669,520,0.016420,977.52,Hormone replacement therapy (postmenopausal),Rx,Female,0.0,977,1.0
1166,phecode_977-7,60032,2231,0.037164,977.7,Long term (current) use of insulin or oral hyp...,Rx,Both,0.0,977,0.0
1167,phecode_977-71,60936,472,0.007746,977.71,Long term (current) use of insulin,Rx,Both,0.0,977,1.0
1168,phecode_977-72,60207,2148,0.035677,977.72,Long term (current) use of oral hypoglycemic d...,Rx,Both,0.0,977,1.0


In [20]:
# Write the selected endpoints (including the row index) to the experiment directory.
endpoints_csv_file = f"{experiment_path}/endpoints.csv"
endpoints_ds.to_csv(endpoints_csv_file)

In [21]:
# Show the full path of the endpoints.csv file inside the experiment directory.
f"{experiment_path}/endpoints.csv"

'/sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas/data/230426/endpoints.csv'

In [22]:
# Build the lookup from phecode endpoint to its list of ICD-10 codes.
# The mapping file is resolved against base_path so it follows the host switch
# made in the configuration cell.
icd10_map_file = f"{base_path}/data/mapping/phecodes/ICD10_to_phecode_V2.csv"
phecode_icd10_mapping = (
    pd.read_csv(icd10_map_file, dtype={"phecode": str})  # read phecodes as text, not numbers
    .assign(endpoint=lambda x: "phecode_" + x.phecode)
    .loc[:, ["endpoint", "icd10"]]
    .groupby("endpoint")
    .agg(list)  # one row per endpoint holding all of its ICD-10 codes
)
# The grouped index keeps the dotted phecode (e.g. "phecode_002.1"). Add an
# "endpoint" column with "." replaced by "-" (e.g. "phecode_002-1"), the form
# used by the endpoint names in the other tables of this notebook.
phecode_icd10_mapping['endpoint'] = phecode_icd10_mapping.index.str.replace('.', '-', regex=False)
phecode_icd10_mapping

Unnamed: 0_level_0,icd10,endpoint
endpoint,Unnamed: 1_level_1,Unnamed: 2_level_1
phecode_001,"[A01, A01.0, A01.00, A01.01, A01.02, A01.03, A...",phecode_001
phecode_002,"[A05.0, A41.0, A41.01, A41.02, A41.1, A41.2, A...",phecode_002
phecode_002.1,"[A41.0, A41.01, A41.02, A49.01, A49.02, B95.6,...",phecode_002-1
phecode_003,"[A04.0, A04.1, A04.2, A04.3, A04.4, A41.51, B9...",phecode_003
phecode_004,"[A38, A38.0, A38.1, A38.8, A38.9, A40, A40.0, ...",phecode_004
...,...,...
phecode_992,[Z84.0],phecode_992
phecode_993,"[Z82.6, Z82.61, Z82.62, Z82.69]",phecode_993
phecode_994,"[Z82.7, Z82.79]",phecode_994
phecode_995,[Z84.81],phecode_995


In [23]:
# Join the selected endpoints with their ICD-10 code lists. The mapping's index is
# dropped first because it shares the name "endpoint" with the column used as key.
st1 = endpoints_ds.merge(phecode_icd10_mapping.reset_index(drop=True), how='left', on='endpoint')
# Label each row by coding system based on its endpoint name. Iterating over the
# column directly avoids one st1.loc[n] row lookup per row and does not depend on
# the frame having a default 0..N-1 index.
systems = ['OMOP' if 'OMOP' in endpoint_name else 'Phecode' for endpoint_name in st1['endpoint']]
st1['system'] = systems
# Keep only the columns written to the table, in output order.
st1 = st1[['system', 'phecode', 'phecode_string', 'phecode_category', 'eligable', 'n', 'freq', 'icd10']].reset_index(drop=True)
#st1.to_csv('/sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas/data/221108/ST1_UKB_retina_EndpointDefinition.csv')
# The path is relative to the notebook's working directory.
st1.to_csv('../2_figures/outputs/ST1_UKB_Retina_EndpointDefinition.csv', index=False)

In [24]:
# Display the assembled st1 table.
st1

Unnamed: 0,system,phecode,phecode_string,phecode_category,eligable,n,freq,icd10
0,OMOP,4306655,All-Cause Death,Death,61256,3490,0.056974,
1,Phecode,002,Staphylococcus,ID,60945,658,0.010797,"[A05.0, A41.0, A41.01, A41.02, A41.1, A41.2, A..."
2,Phecode,002.1,Staphylococcus aureus,ID,61010,486,0.007966,"[A41.0, A41.01, A41.02, A49.01, A49.02, B95.6,..."
3,Phecode,003,Escherichia coli,ID,60757,959,0.015784,"[A04.0, A04.1, A04.2, A04.3, A04.4, A41.51, B9..."
4,Phecode,004,Streptococcus,ID,60584,494,0.008154,"[A38, A38.0, A38.1, A38.8, A38.9, A40, A40.0, ..."
...,...,...,...,...,...,...,...,...
1165,Phecode,977.52,Hormone replacement therapy (postmenopausal),Rx,31669,520,0.016420,[Z79.890]
1166,Phecode,977.7,Long term (current) use of insulin or oral hyp...,Rx,60032,2231,0.037164,"[T85.614, T85.614A, T85.614D, T85.614S, T85.62..."
1167,Phecode,977.71,Long term (current) use of insulin,Rx,60936,472,0.007746,"[T85.614, T85.614A, T85.614D, T85.614S, T85.62..."
1168,Phecode,977.72,Long term (current) use of oral hypoglycemic d...,Rx,60207,2148,0.035677,[Z79.84]
