# Benchmarks

## Initialize

In [1]:
%load_ext autoreload
%autoreload 2

import os
import math
import pathlib
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from IPython.display import clear_output

import warnings
from lifelines.utils import CensoringType
from lifelines.utils import concordance_index

In [2]:
node = !hostname
if "sc" in node[0]:
    base_path = "/sc-projects/sc-proj-ukb-cvd"
else: 
    base_path = "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
print(base_path)

project_label = "22_retina_phewas"
project_path = f"{base_path}/results/projects/{project_label}"
figure_path = f"{project_path}/figures"
output_path = f"{project_path}/data"

pathlib.Path(figure_path).mkdir(parents=True, exist_ok=True)
pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)

experiment = '230905'
experiment_path = f"{output_path}/{experiment}"
print('experiment path:', experiment_path)
pathlib.Path(experiment_path).mkdir(parents=True, exist_ok=True)

name_dict = {
#     "predictions_cropratio0.3": "ConvNextSmall(Retina)+MLP_cropratio0.3",
#     "predictions_cropratio0.5": "ConvNextSmall(Retina)+MLP_cropratio0.5",
#    "predictions_cropratio0.66": "ConvNextSmall(Retina)+MLP_cropratio0.66",
    "predictions": "ConvNextSmall(Retina)+MLP_cropratio0.66",
}

#partitions = [i for i in range(22)]
partitions = [4, 5, 7, 9, 10, 20] # Partitions with eye test centers

/sc-projects/sc-proj-ukb-cvd
experiment path: /sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas/data/230905


In [3]:
# Date tag that is interpolated into the names of the files written further down.
today = '230905'

In [4]:
import pandas as pd

# Load the endpoint table, remove every '_prevalent' occurrence from the labels
# in its `endpoint` column, and keep the cleaned labels as a sorted list.
endpoint_table = pd.read_csv('/sc-projects/sc-proj-ukb-cvd/results/projects/22_retinal_risk/data/230905/min100_endpoints.csv')
endpoints = sorted(
    label.replace('_prevalent', '') for label in endpoint_table.endpoint.values
)

In [5]:
# Display the resolved data directory.
output_path

'/sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas/data'

In [10]:
# Load the phecode definitions, keep only the rows whose endpoint appears in
# `endpoints`, and index the endpoint-sorted table by endpoint.
endpoint_defs = (
    pd.read_feather(f"{output_path}/phecode_defs_220306.feather")
    .query("endpoint==@endpoints")
    .sort_values("endpoint")
    .set_index("endpoint")
)
endpoint_defs.head()

Unnamed: 0_level_0,phecode,phecode_string,phecode_category,sex,ICD10_only,phecode_top,leaf
endpoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
OMOP_4306655,4306655.0,All-Cause Death,Death,Both,,,
phecode_002,2.0,Staphylococcus,ID,Both,0.0,2.0,0.0
phecode_002-1,2.1,Staphylococcus aureus,ID,Both,0.0,2.0,1.0
phecode_003,3.0,Escherichia coli,ID,Both,0.0,3.0,1.0
phecode_004,4.0,Streptococcus,ID,Both,0.0,4.0,0.0


In [11]:
# Participant sex, indexed by eid.
# Only the two required columns are read from disk instead of loading the whole
# covariate table and discarding the rest afterwards.
# (Disabled earlier source: column "sex_f31_0_0" of
#  /sc-projects/sc-proj-ukb-cvd/data/2_datasets_pre/211110_anewbeginning/baseline_covariates_220503.feather)
data_covariates = pd.read_feather(
    f"{output_path}/data_covariates_full.feather", columns=["eid", "sex"]
).set_index("eid")

In [13]:
# Outcome table (columns endpoint / prev / event / time in the preview below),
# indexed by eid.
data_outcomes = (
    pd.read_feather(f"{output_path}/baseline_outcomes_long_230905.feather")
    .set_index("eid")
)
data_outcomes.head()

Unnamed: 0_level_0,endpoint,prev,event,time
eid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1000018,OMOP_4306655,False,False,12.0
1000020,OMOP_4306655,False,False,13.730322
1000037,OMOP_4306655,False,False,13.002053
1000043,OMOP_4306655,False,False,12.443532
1000051,OMOP_4306655,False,False,15.425052


In [14]:
# Attach the covariates to every outcome row (left join on the eid index), then
# turn eid back into a regular column and key the table by endpoint instead.
data_all = (
    data_outcomes
    .merge(data_covariates, how="left", left_index=True, right_index=True)
    .reset_index()
    .set_index("endpoint")
)

In [15]:
# Preview the merged outcome/covariate table.
data_all.head()

Unnamed: 0_level_0,eid,prev,event,time,sex
endpoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
OMOP_4306655,1000018,False,False,12.0,Female
phecode_001,1000018,False,False,12.0,Female
phecode_002,1000018,False,False,12.0,Female
phecode_002-1,1000018,False,False,12.0,Female
phecode_003,1000018,False,False,12.0,Female


In [16]:
# Split the merged table into one frame per endpoint; each frame drops the
# endpoint index and is re-indexed by eid.
data_dict = {
    endpoint_name: endpoint_rows.reset_index(drop=True).set_index("eid")
    for endpoint_name, endpoint_rows in data_all.groupby('endpoint')
}

In [17]:
# Show which endpoints have a per-endpoint frame in data_dict.
data_dict.keys()

dict_keys(['OMOP_4306655', 'phecode_001', 'phecode_002', 'phecode_002-1', 'phecode_003', 'phecode_004', 'phecode_004-1', 'phecode_004-2', 'phecode_004-3', 'phecode_004-4', 'phecode_005', 'phecode_005-1', 'phecode_005-2', 'phecode_006', 'phecode_006-1', 'phecode_006-2', 'phecode_007', 'phecode_007-1', 'phecode_009', 'phecode_010', 'phecode_011', 'phecode_012', 'phecode_013', 'phecode_014', 'phecode_015', 'phecode_015-1', 'phecode_015-2', 'phecode_016', 'phecode_016-1', 'phecode_017', 'phecode_019', 'phecode_020', 'phecode_020-1', 'phecode_021', 'phecode_021-2', 'phecode_023', 'phecode_024', 'phecode_025', 'phecode_027', 'phecode_028', 'phecode_029', 'phecode_030', 'phecode_031', 'phecode_032', 'phecode_034', 'phecode_035', 'phecode_038', 'phecode_039', 'phecode_040', 'phecode_041', 'phecode_050', 'phecode_050-1', 'phecode_050-4', 'phecode_050-5', 'phecode_050-6', 'phecode_052', 'phecode_052-1', 'phecode_052-3', 'phecode_052-31', 'phecode_052-32', 'phecode_052-4', 'phecode_052-5', 'pheco

In [18]:
#endpoints

In [22]:
# Distinct values of the `sex` column in the endpoint definitions; the
# eligibility function below branches on this value.
endpoint_defs.sex.unique()

array(['Both', 'Female', 'Male'], dtype=object)

In [23]:
def get_eligable_eids(data_dict, endpoint, defs=None):
    """Collect the participants that are eligible for one endpoint.

    A participant is eligible when the endpoint is not already prevalent
    (``prev == 0``) and, for endpoints whose definition is restricted to one
    sex, when the participant's ``sex`` equals that restriction.

    Parameters
    ----------
    data_dict : dict
        Maps an endpoint name to a DataFrame indexed by eid with a ``prev``
        column (and a ``sex`` column, read only for sex-restricted endpoints).
    endpoint : str
        Endpoint to evaluate; must be a key of ``data_dict`` and a label in
        the index of the definitions table.
    defs : pandas.DataFrame, optional
        Endpoint definitions indexed by endpoint with a ``sex`` column holding
        ``"Both"`` or the sex the endpoint is restricted to. Defaults to the
        notebook-level ``endpoint_defs`` so existing two-argument calls keep
        working.

    Returns
    -------
    dict
        ``{"endpoint": endpoint, "n_eids": <count>, "eid_list": <list of eids>}``.

    Raises
    ------
    KeyError
        If ``endpoint`` is missing from ``data_dict`` or from the definitions.
    """
    if defs is None:
        defs = endpoint_defs

    data_temp = data_dict[endpoint]
    eligibility = defs.loc[endpoint, "sex"]

    # Boolean masks avoid copying the whole per-endpoint frame on every call.
    eligible = data_temp["prev"] == 0
    if eligibility != "Both":
        eligible &= data_temp["sex"] == eligibility

    eids_incl = data_temp.index[eligible.to_numpy()].to_list()

    return {"endpoint": endpoint,
            "n_eids": len(eids_incl),
            "eid_list": eids_incl}

In [24]:
# (Disabled variant: restrict `endpoints` to those that occur in
#  data_outcomes['endpoint'] before running the loop.)
# Evaluate eligibility for every endpoint, then stack the per-endpoint records
# into one table with the columns endpoint / n_eids / eid_list.
d_list = [
    get_eligable_eids(data_dict, endpoint_name)
    for endpoint_name in tqdm(endpoints)
]
eid_df = pd.DataFrame(d_list)

  0%|          | 0/773 [00:00<?, ?it/s]

In [25]:
#eid_df.set_index("endpoint")["eid_list"].to_dict()

In [26]:
from datetime import date

# Fall back to the current date when no tag was set earlier. The fallback uses
# the same YYMMDD layout as the hard-coded tags (e.g. '230905') so that
# generated file names stay consistent with the existing ones.
today = date.today().strftime("%y%m%d") if today is None else today

In [27]:
# Preview the path the eligibility table is written to in the next cell.
f"{output_path}/eligable_eids_{today}.feather"

'/sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas/data/eligable_eids_230905.feather'

In [28]:
# Persist the per-endpoint eligibility table and echo where it was written.
eligable_eids_file = f"{output_path}/eligable_eids_{today}.feather"
eid_df.to_feather(eligable_eids_file)
eligable_eids_file

'/sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas/data/eligable_eids_230905.feather'

In [29]:
# Long format: one row per (endpoint, eligible eid) pair, with both columns
# stored as categoricals.
eid_df_long = (
    eid_df[["endpoint", "eid_list"]]
    .explode("eid_list")
    .reset_index(drop=True)
    .rename(columns={"eid_list": "eid"})
    .astype({"endpoint": "category", "eid": "category"})
)

In [30]:
# Persist the long-format eligibility table next to the per-endpoint one.
eid_df_long.to_feather(f"{output_path}/eligable_eids_long_{today}.feather")