# Benchmarks

## Initialize

In [1]:
import glob
import math
import os
import pathlib
import socket
import warnings
from datetime import date

import numpy as np
import pandas as pd
from IPython.display import clear_output
from lifelines.utils import CensoringType
from lifelines.utils import concordance_index
from tqdm.auto import tqdm

In [2]:
# Resolve the storage root from the machine name: hosts whose name contains
# "sc" use the /sc-projects mount, every other host uses the shared
# /data/analysis tree. The name is read with the standard library instead of
# shelling out, so the cell also runs outside IPython.
# `node` stays a one-element list so that `node[0]` keeps working for any code
# written against the previous shell capture.
node = [socket.gethostname()]
if "sc" in node[0]:
    base_path = "/sc-projects/sc-proj-ukb-cvd"
else:
    base_path = "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
print(base_path)

# Project-level locations derived from the storage root.
project_label = "22_retina_phewas"
project_path = f"{base_path}/results/projects/{project_label}"
figure_path = f"{project_path}/figures"
output_path = f"{project_path}/data"

# Create the output folders up front; existing folders are left untouched.
pathlib.Path(figure_path).mkdir(parents=True, exist_ok=True)
pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)

# Experiment label: names the sub-folder of `output_path` that receives this
# notebook's results.
experiment = '230905'
experiment_path = f"{output_path}/{experiment}"
print('experiment path:', experiment_path)
pathlib.Path(experiment_path).mkdir(parents=True, exist_ok=True)

# NOTE(review): presumably maps a prediction key to a model display name; it is
# not used in the visible part of this notebook — confirm before removing.
name_dict = {
#     "predictions_cropratio0.3": "ConvNextSmall(Retina)+MLP_cropratio0.3",
#     "predictions_cropratio0.5": "ConvNextSmall(Retina)+MLP_cropratio0.5",
#    "predictions_cropratio0.66": "ConvNextSmall(Retina)+MLP_cropratio0.66",
    "predictions": "ConvNextSmall(Retina)+MLP_cropratio0.66",
}

#partitions = [i for i in range(22)]
partitions = [4, 5, 7, 9, 10, 20] # Partitions with eye test centers

/sc-projects/sc-proj-ukb-cvd
experiment path: /sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas/data/230905


In [3]:
# Version label interpolated into input file names further down
# (e.g. eligable_eids_{today}.feather).
today = '230905'

In [4]:
# List the contents of the project data folder (IPython shell escape).
!ls -al {output_path}

total 48768952
drwxrwx--- 8 buergelt posix-nogroup         962 Sep 14 10:14 .
drwxrwx--- 4 buergelt posix-nogroup          47 Aug 24  2022 ..
drwxrwx--- 3 loockl   posix-nogroup          84 Nov 16  2022 220812_agesexretmodel_for_reference
drwxrwx--- 5 buergelt posix-nogroup         382 Nov  3  2022 220812_test
drwxrwx--- 8 buergelt posix-nogroup         848 Apr 13 14:15 221108
drwxrwx--- 2 loockl   posix-nogroup          31 Apr  3 14:58 230403
drwxrwx--- 5 loockl   posix-nogroup         324 Jun 19 11:02 230426
drwxrwx--- 4 loockl   posix-nogroup          87 Sep 13 16:10 230905
-rwxrwx--- 1 buergelt posix-nogroup  4936136130 Aug 24  2022 baseline_outcomes_220627.feather
-rwxrwx--- 1 loockl   posix-nogroup  4618109234 Sep 14 09:43 baseline_outcomes_230905.feather
-rwxrwx--- 1 buergelt posix-nogroup 10680216930 Aug 24  2022 baseline_outcomes_long_220531.feather
-rwxrwx--- 1 buergelt posix-nogroup 10702426538 Aug 24  2022 baseline_outcomes_long_220627.feather
-rwxrwx--- 1 loockl   posix-no

In [5]:
# Read the endpoint definition table and order its rows by the `endpoint` column.
phecode_defs_file = f"{output_path}/phecode_defs_220306.feather"
endpoint_defs = pd.read_feather(phecode_defs_file).sort_values("endpoint")

In [6]:
# Display the endpoint definition table for a visual check.
endpoint_defs

Unnamed: 0,phecode,endpoint,phecode_string,phecode_category,sex,ICD10_only,phecode_top,leaf
3662,4306655,OMOP_4306655,All-Cause Death,Death,Both,,,
2073,001,phecode_001,Salmonella,ID,Both,0.0,001,1.0
2074,002,phecode_002,Staphylococcus,ID,Both,0.0,002,0.0
2229,002.1,phecode_002-1,Staphylococcus aureus,ID,Both,0.0,002,1.0
2075,003,phecode_003,Escherichia coli,ID,Both,0.0,003,1.0
...,...,...,...,...,...,...,...,...
3558,992,phecode_992,Family history of diseases of the skin and sub...,Stat,Both,0.0,992,1.0
3559,993,phecode_993,Family history of musculoskeletal disease,Stat,Both,0.0,993,1.0
3560,994,phecode_994,Family history of congenital anomalies,Stat,Both,0.0,994,1.0
3561,995,phecode_995,Family history of genetic condition,Stat,Both,0.0,995,1.0


In [7]:
# Read the wide outcome table and use the `eid` column as its row index.
baseline_outcomes_file = f"{output_path}/baseline_outcomes_230905.feather"
data_outcomes = pd.read_feather(baseline_outcomes_file).set_index("eid")

In [8]:
# Preview the first rows of the outcome table.
data_outcomes.head()

Unnamed: 0_level_0,OMOP_4306655_prev,phecode_001_prev,phecode_002_prev,phecode_002-1_prev,phecode_003_prev,phecode_004_prev,phecode_004-1_prev,phecode_004-2_prev,phecode_004-3_prev,phecode_004-4_prev,...,phecode_986_time,phecode_987_time,phecode_988_time,phecode_989_time,phecode_990_time,phecode_991_time,phecode_992_time,phecode_993_time,phecode_994_time,phecode_997_time
eid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000018,False,False,False,False,False,False,False,False,False,False,...,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0
1000020,False,False,False,False,False,False,False,False,False,False,...,13.730322,13.730322,13.730322,13.730322,13.730322,13.730322,13.730322,13.730322,13.730322,13.730322
1000037,False,False,False,False,False,False,False,False,False,False,...,13.002053,13.002053,13.002053,13.002053,13.002053,13.002053,13.002053,13.002053,13.002053,13.002053
1000043,False,False,False,False,False,False,False,False,False,False,...,12.443532,12.443532,12.443532,12.443532,12.443532,12.443532,12.443532,12.443532,12.443532,12.443532
1000051,False,False,False,False,False,False,False,False,False,False,...,15.425052,15.425052,15.425052,15.425052,15.425052,15.425052,15.425052,15.425052,15.425052,15.425052


In [9]:
# Fall back to the current date only when no version label was pinned earlier;
# a pinned `today` is left unchanged. `date` comes from the import cell at the
# top of the notebook.
# NOTE(review): str(date.today()) yields "YYYY-MM-DD", whereas the pinned label
# ('230905') is a compact six-digit string — confirm which form the input file
# names actually use before relying on this fallback.
if today is None:
    today = str(date.today())

In [10]:
# Read the eligibility table for the selected version label and turn it into a
# lookup: endpoint -> value of the `eid_list` column for that endpoint.
eligable_eids_file = f"{output_path}/eligable_eids_{today}.feather"
eligable_eids = pd.read_feather(eligable_eids_file)
eid_list_by_endpoint = eligable_eids.set_index("endpoint")["eid_list"]
eids_dict = eid_list_by_endpoint.to_dict()

In [11]:
# Endpoint shortlist exported by the 22_retinal_risk project. The path is built
# from `base_path` so it follows the same storage root as the rest of the
# notebook instead of a hard-coded cluster mount.
min100_endpoints_file = f"{base_path}/results/projects/22_retinal_risk/data/230905/min100_endpoints.csv"

# Strip the "_prevalent" marker from every entry and keep the names sorted.
endpoints = sorted(
    name.replace('_prevalent', '')
    for name in pd.read_csv(min100_endpoints_file)["endpoint"]
)
len(endpoints)

773

In [12]:
# Folder with the preprocessed retina images, resolved below the configured
# storage root instead of a hard-coded cluster mount.
img_root = f"{base_path}/data/retina/preprocessed/preprocessed"
img_visit = 0
img_file_extension = '.png'

# Glob either "*<extension>" or every file when no extension is configured.
img_pattern = f'*{img_file_extension}' if img_file_extension is not None else '*'
img_paths = sorted(glob.glob(os.path.join(img_root, img_pattern)))

# Keep the files whose *name* carries the "_<visit>_" token (the directory part
# of the path is deliberately ignored) and parse the integer that precedes the
# first underscore. One entry is produced per matching file, so the same id can
# occur more than once.
eids_with_retinapic = [
    int(file_name.split('_')[0])
    for file_name in map(os.path.basename, img_paths)
    if f'_{img_visit}_' in file_name
]
len(eids_with_retinapic)

113122

In [13]:
# Per-endpoint event statistics restricted to ids that are both eligible for
# the endpoint and have a retina image.
# De-duplicating the image ids once up front avoids re-converting the large
# Python list on every iteration; np.intersect1d returns the same sorted,
# unique result either way.
retina_eids = np.unique(eids_with_retinapic)

d = []
for endpoint in tqdm(endpoints):
    # Computed once per endpoint and reused for both the row selection and the count.
    eligible_with_image = np.intersect1d(eids_dict[endpoint], retina_eids)
    events = data_outcomes.loc[eligible_with_image, f"{endpoint}_event"]
    d.append({
        "endpoint": endpoint,
        "eligable": len(eligible_with_image),
        "n": events.sum(),       # sum of the event column over the selection
        "freq": events.mean(),   # mean of the event column over the selection
    })

  0%|          | 0/773 [00:00<?, ?it/s]

In [15]:
# `d` is a list of per-endpoint records, so the plain constructor is the direct
# way to build the frame (one row per record, one column per key).
endpoints_freqs = pd.DataFrame(d)

In [16]:
# Keep endpoints with more than 100 events, order them by name and attach the
# endpoint definitions. The join key is spelled out so the merge cannot
# silently start matching on additional columns if the two frames ever share
# more column names. The default inner join drops endpoints that have no
# definition row.
endpoints_ds = (
    endpoints_freqs
    .query("n>100")
    .sort_values("endpoint")
    .reset_index(drop=True)
    .merge(endpoint_defs, on="endpoint")
)
endpoints_ds

Unnamed: 0,endpoint,eligable,n,freq,phecode,phecode_string,phecode_category,sex,ICD10_only,phecode_top,leaf
0,OMOP_4306655,61256,3474,0.056713,4306655,All-Cause Death,Death,Both,,,
1,phecode_002,61048,516,0.008452,002,Staphylococcus,ID,Both,0.0,002,0.0
2,phecode_002-1,61109,342,0.005597,002.1,Staphylococcus aureus,ID,Both,0.0,002,1.0
3,phecode_003,61138,691,0.011302,003,Escherichia coli,ID,Both,0.0,003,1.0
4,phecode_004,61092,353,0.005778,004,Streptococcus,ID,Both,0.0,004,0.0
...,...,...,...,...,...,...,...,...,...,...,...
747,phecode_981,60339,1674,0.027743,981,Family history of malignant neoplasm,Stat,Both,0.0,981,1.0
748,phecode_983,61221,125,0.002042,983,Family history of other endocrine and metaboli...,Stat,Both,0.0,983,1.0
749,phecode_988,60407,2249,0.037231,988,Family history of cardiovascular disease,Stat,Both,0.0,988,1.0
750,phecode_990,61196,122,0.001994,990,Family history of digestive disorders,Stat,Both,0.0,990,1.0


In [17]:
# Write the selected endpoints into the experiment folder (pandas defaults,
# so the row index is written as well).
endpoints_csv_file = f"{experiment_path}/endpoints.csv"
endpoints_ds.to_csv(endpoints_csv_file)

In [18]:
# Echo the full path of the endpoints CSV in the experiment folder.
f"{experiment_path}/endpoints.csv"

'/sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas/data/230905/endpoints.csv'

In [19]:
# merge with icd10 definitions
# The mapping file is resolved below the configured storage root instead of a
# hard-coded cluster mount.
icd10_to_phecode_file = f"{base_path}/data/mapping/phecodes/ICD10_to_phecode_V2.csv"

# Phecodes are read as strings so they are not parsed as numbers. Each phecode
# becomes "phecode_<code>" and its ICD-10 codes are collected into one list.
phecode_icd10_mapping = (
    pd.read_csv(icd10_to_phecode_file, dtype={"phecode": str})
    .assign(endpoint=lambda x: "phecode_" + x.phecode)
    .loc[:, ["endpoint", "icd10"]]
    .groupby("endpoint")
    .agg(list)
)

# The grouped index keeps the dotted code; the extra `endpoint` column holds the
# same name with "." replaced by "-" (literal replacement, not a regex).
phecode_icd10_mapping['endpoint'] = phecode_icd10_mapping.index.str.replace('.', '-', regex=False)
phecode_icd10_mapping

Unnamed: 0_level_0,icd10,endpoint
endpoint,Unnamed: 1_level_1,Unnamed: 2_level_1
phecode_001,"[A01, A01.0, A01.00, A01.01, A01.02, A01.03, A...",phecode_001
phecode_002,"[A05.0, A41.0, A41.01, A41.02, A41.1, A41.2, A...",phecode_002
phecode_002.1,"[A41.0, A41.01, A41.02, A49.01, A49.02, B95.6,...",phecode_002-1
phecode_003,"[A04.0, A04.1, A04.2, A04.3, A04.4, A41.51, B9...",phecode_003
phecode_004,"[A38, A38.0, A38.1, A38.8, A38.9, A40, A40.0, ...",phecode_004
...,...,...
phecode_992,[Z84.0],phecode_992
phecode_993,"[Z82.6, Z82.61, Z82.62, Z82.69]",phecode_993
phecode_994,"[Z82.7, Z82.79]",phecode_994
phecode_995,[Z84.81],phecode_995


In [20]:
# Supplementary table 1: endpoint definitions with counts and ICD-10 codes.
# The mapping's dotted index is dropped before merging so that only its
# `endpoint` column is used as the join key. The left join keeps endpoints
# that have no ICD-10 mapping row.
st1_columns = ['system', 'phecode', 'phecode_string', 'phecode_category', 'eligable', 'n', 'freq', 'icd10']
st1 = (
    endpoints_ds
    .merge(phecode_icd10_mapping.reset_index(drop=True), how='left', on='endpoint')
    # Label the coding system in one vectorised pass instead of a per-row lookup.
    .assign(system=lambda df: np.where(
        df["endpoint"].str.contains("OMOP", regex=False, na=False), "OMOP", "Phecode"
    ))
    .loc[:, st1_columns]
    .reset_index(drop=True)
)
# NOTE(review): `icd10` holds Python lists, so the CSV stores their string
# representation — confirm that this is the intended format for the table.
st1.to_csv('../2_figures/outputs/ST1_UKB_Retina_EndpointDefinition.csv', index=False)

In [21]:
# Display the supplementary table for a visual check.
st1

Unnamed: 0,system,phecode,phecode_string,phecode_category,eligable,n,freq,icd10
0,OMOP,4306655,All-Cause Death,Death,61256,3474,0.056713,
1,Phecode,002,Staphylococcus,ID,61048,516,0.008452,"[A05.0, A41.0, A41.01, A41.02, A41.1, A41.2, A..."
2,Phecode,002.1,Staphylococcus aureus,ID,61109,342,0.005597,"[A41.0, A41.01, A41.02, A49.01, A49.02, B95.6,..."
3,Phecode,003,Escherichia coli,ID,61138,691,0.011302,"[A04.0, A04.1, A04.2, A04.3, A04.4, A41.51, B9..."
4,Phecode,004,Streptococcus,ID,61092,353,0.005778,"[A38, A38.0, A38.1, A38.8, A38.9, A40, A40.0, ..."
...,...,...,...,...,...,...,...,...
747,Phecode,981,Family history of malignant neoplasm,Stat,60339,1674,0.027743,"[Z80, Z80.0, Z80.1, Z80.2, Z80.3, Z80.4, Z80.4..."
748,Phecode,983,Family history of other endocrine and metaboli...,Stat,61221,125,0.002042,"[Z83.3, Z83.4, Z83.42, Z83.43, Z83.430, Z83.43..."
749,Phecode,988,Family history of cardiovascular disease,Stat,60407,2249,0.037231,"[Z82.3, Z82.4, Z82.41, Z82.49]"
750,Phecode,990,Family history of digestive disorders,Stat,61196,122,0.001994,"[Z83.7, Z83.71, Z83.79]"
