# Benchmarks

## Initialize

In [1]:
%load_ext autoreload
%autoreload 2

import os
import math
import pathlib
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from IPython.display import clear_output

import warnings
from lifelines.utils import CensoringType
from lifelines.utils import concordance_index

In [2]:
# Resolve the host name in pure Python instead of the IPython-only `!hostname`
# capture, so this configuration cell is also valid outside IPython.
# `node` stays a one-element list so any `node[0]` lookup keeps working.
node = [os.uname().nodename]

# Hosts whose name contains "sc" use the /sc-projects mount; every other host
# uses the shared analysis directory.
if "sc" in node[0]:
    base_path = "/sc-projects/sc-proj-ukb-cvd"
else:
    base_path = "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
print(base_path)

# Project-level locations derived from the storage root.
project_label = "22_retina_phewas_220603_fullrun"
project_path = f"{base_path}/results/projects/{project_label}"
figure_path = f"{project_path}/figures"
output_path = f"{project_path}/data"

# Experiment-specific output directory inside the project data directory.
experiment = '220603_fullrun'
experiment_path = f"{output_path}/{experiment}"

# Create all output directories up front; existing directories are left untouched.
for directory in (figure_path, output_path, experiment_path):
    pathlib.Path(directory).mkdir(parents=True, exist_ok=True)

# Maps a prediction-file label to a display name.
name_dict = {
    "predictions_cropratio0.3": "ConvNextSmall(Retina)+MLP_cropratio0.3",
    "predictions_cropratio0.5": "ConvNextSmall(Retina)+MLP_cropratio0.5",
    "predictions_cropratio0.8": "ConvNextSmall(Retina)+MLP_cropratio0.8",
}

# Partition ids 0..21.
partitions = list(range(22))
partitions

/sc-projects/sc-proj-ukb-cvd


[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]

In [3]:
import pandas as pd

# Read the endpoint table, remove the '_prevalent' substring from every entry
# of its `endpoint` column, and keep the resulting names as a sorted list.
endpoint_table = pd.read_csv('/sc-projects/sc-proj-ukb-cvd/results/projects/22_retinal_risk/data/220602/endpoints.csv')
endpoints = sorted(
    endpoint_name.replace('_prevalent', '')
    for endpoint_name in endpoint_table.endpoint.values
)

In [4]:
output_path

'/sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas_220603_fullrun/data'

In [5]:
# Load the phecode definitions, keep only the rows whose `endpoint` is one of
# `endpoints`, and index the sorted result by endpoint.
endpoint_defs = (
    pd.read_feather(f"{output_path}/phecode_defs_220306.feather")
    .query("endpoint==@endpoints")
    .sort_values("endpoint")
    .set_index("endpoint")
)
endpoint_defs.head()

Unnamed: 0_level_0,phecode,phecode_string,phecode_category,sex,ICD10_only,phecode_top,leaf
endpoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
OMOP_4306655,4306655.0,All-Cause Death,Death,Both,,,
phecode_002,2.0,Staphylococcus,ID,Both,0.0,2.0,0.0
phecode_002-1,2.1,Staphylococcus aureus,ID,Both,0.0,2.0,1.0
phecode_003,3.0,Escherichia coli,ID,Both,0.0,3.0,1.0
phecode_004,4.0,Streptococcus,ID,Both,0.0,4.0,0.0


In [6]:
# Only the participant id and the sex covariate are used from the baseline table.
covariates_file = "/sc-projects/sc-proj-ukb-cvd/data/2_datasets_pre/211110_anewbeginning/baseline_covariates_220503.feather"
data_covariates = (
    pd.read_feather(covariates_file)
    .loc[:, ["eid", "sex_f31_0_0"]]
    .set_index("eid")
)
data_covariates.head()

Unnamed: 0_level_0,sex_f31_0_0
eid,Unnamed: 1_level_1
1000018,Female
1000020,Male
1000037,Female
1000043,Male
1000051,Female


In [7]:
# Long-format outcomes table, indexed by eid.
outcomes_long = pd.read_feather(f"{output_path}/baseline_outcomes_long_220531.feather")
data_outcomes = outcomes_long.set_index("eid")
data_outcomes.head()

Unnamed: 0_level_0,endpoint,prev,event,time
eid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1000018,OMOP_4306655,False,False,11.866089
1000020,OMOP_4306655,False,False,13.596446
1000037,OMOP_4306655,False,False,12.868163
1000043,OMOP_4306655,False,False,12.309629
1000051,OMOP_4306655,False,False,15.29121


In [8]:
# Attach the sex covariate to every outcome row (left join on the eid index),
# then move eid back into a column and index the result by endpoint.
data_all = (
    data_outcomes
    .merge(data_covariates, left_index=True, right_index=True, how="left")
    .reset_index()
    .set_index("endpoint")
)

In [9]:
data_all.head()

Unnamed: 0_level_0,eid,prev,event,time,sex_f31_0_0
endpoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
OMOP_4306655,1000018,False,False,11.866089,Female
phecode_001,1000018,False,False,11.866089,Female
phecode_002,1000018,False,False,11.866089,Female
phecode_002-1,1000018,False,False,11.866089,Female
phecode_003,1000018,False,False,11.866089,Female


In [10]:
# One DataFrame per endpoint, each re-indexed by eid (the endpoint index is dropped).
data_dict = {
    endpoint_name: endpoint_frame.reset_index(drop=True).set_index("eid")
    for endpoint_name, endpoint_frame in data_all.groupby('endpoint')
}

In [11]:
#endpoints

In [12]:
endpoint_defs.sex.unique()

array(['Both', 'Female', 'Male'], dtype=object)

In [13]:
def get_eligable_eids(data_dict, endpoint, defs=None):
    """Collect the eids that are eligible for the analysis of one endpoint.

    A row is eligible when its `prev` value equals 0 and, for endpoints whose
    `sex` entry in the definitions table is not "Both", its `sex_f31_0_0`
    value equals that entry.

    Parameters
    ----------
    data_dict : dict
        Maps an endpoint name to a DataFrame indexed by eid that has at least
        the columns `prev` and `sex_f31_0_0`.
    endpoint : str
        Key into `data_dict` and row label in the definitions table.
    defs : pandas.DataFrame, optional
        Endpoint definitions indexed by endpoint with a `sex` column. When
        omitted, the notebook-level `endpoint_defs` table is used, which is
        what the function always did before this parameter existed.

    Returns
    -------
    dict
        ``{"endpoint": endpoint, "n_eids": <count>, "eid_list": <list of eids>}``.

    Raises
    ------
    KeyError
        If `endpoint` is missing from `data_dict` or from the definitions table.
    """
    if defs is None:
        # Fall back to the notebook global so existing two-argument calls keep working.
        defs = endpoint_defs

    frame = data_dict[endpoint]
    eligibility = defs.loc[endpoint, "sex"]

    # Boolean masks select the rows directly; no copy of the full frame is needed.
    keep = frame["prev"] == 0
    if eligibility != "Both":
        keep = keep & (frame["sex_f31_0_0"] == eligibility)

    eids_incl = frame.loc[keep].index.to_list()

    return {"endpoint": endpoint,
            "n_eids": len(eids_incl),
            "eid_list": eids_incl}

In [14]:
# Build one record (endpoint, n_eids, eid_list) per endpoint and stack the
# records into a DataFrame.
# NOTE: get_eligable_eids indexes data_dict[endpoint], so an endpoint without
# an entry in data_dict raises a KeyError here.
d_list = [
    get_eligable_eids(data_dict, endpoint_name)
    for endpoint_name in tqdm(endpoints)
]
eid_df = pd.DataFrame(d_list)

  0%|          | 0/1171 [00:00<?, ?it/s]

In [15]:
#eid_df.set_index("endpoint")["eid_list"].to_dict()

In [16]:
from datetime import date

# Current date as an ISO string (YYYY-MM-DD); used as a suffix in output file names.
today = date.today().isoformat()

In [17]:
# Persist the per-endpoint eligibility table, stamped with today's date.
eligible_file = f"{output_path}/eligable_eids_{today}.feather"
eid_df.to_feather(eligible_file)

In [18]:
# Long format: one row per (endpoint, eid) pair, with both columns stored as
# categoricals.
eid_df_long = (
    eid_df[["endpoint", "eid_list"]]
    .explode("eid_list")
    .reset_index(drop=True)
    .rename(columns={"eid_list": "eid"})
    .astype({"endpoint": "category", "eid": "category"})
)

In [19]:
# Persist the long-format eligibility table, stamped with today's date.
eligible_long_file = f"{output_path}/eligable_eids_long_{today}.feather"
eid_df_long.to_feather(eligible_long_file)