# Benchmarks

## Initialize

In [1]:
import os
import math
import pathlib
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from IPython.display import clear_output

import warnings
from lifelines.utils import CensoringType
from lifelines.utils import concordance_index

In [2]:
node = !hostname
if "sc" in node[0]:
    base_path = "/sc-projects/sc-proj-ukb-cvd"
else: 
    base_path = "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
print(base_path)

project_label = "22_retina_phewas"
project_path = f"{base_path}/results/projects/{project_label}"
figure_path = f"{project_path}/figures"
output_path = f"{project_path}/data"

pathlib.Path(figure_path).mkdir(parents=True, exist_ok=True)
pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)

experiment = '230905'
experiment_path = f"{output_path}/{experiment}"
print('experiment path:', experiment_path)
pathlib.Path(experiment_path).mkdir(parents=True, exist_ok=True)

name_dict = {
#     "predictions_cropratio0.3": "ConvNextSmall(Retina)+MLP_cropratio0.3",
#     "predictions_cropratio0.5": "ConvNextSmall(Retina)+MLP_cropratio0.5",
#    "predictions_cropratio0.66": "ConvNextSmall(Retina)+MLP_cropratio0.66",
    "predictions": "ConvNextSmall(Retina)+MLP_cropratio0.66",
}

#partitions = [i for i in range(22)]
partitions = [4, 5, 7, 9, 10, 20] # Partitions with eye test centers

/sc-projects/sc-proj-ukb-cvd
experiment path: /sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas/data/230905


In [3]:
today = '230905'

In [4]:
splits = ["train", "valid", 'test'] # "test_left", 'test_right']

In [5]:
import pandas as pd
endpoints = sorted([l.replace('_prevalent', '') for l in list(pd.read_csv(f'/sc-projects/sc-proj-ukb-cvd/results/projects/{project_label}/data/{today}/endpoints.csv').endpoint.values)])

In [6]:
endpoint_defs = pd.read_feather(f"{output_path}/phecode_defs_220306.feather").query("endpoint==@endpoints").sort_values("endpoint").set_index("endpoint")

In [7]:
from datetime import date
today = str(date.today()) if today is None else today


In [8]:
eligable_eids = pd.read_feather(f"{output_path}/eligable_eids_{today}.feather")
eids_dict = eligable_eids.set_index("endpoint")["eid_list"].to_dict()

In [9]:
%env MKL_NUM_THREADS=4
%env NUMEXPR_NUM_THREADS=4
%env OMP_NUM_THREADS=4

env: MKL_NUM_THREADS=4
env: NUMEXPR_NUM_THREADS=4
env: OMP_NUM_THREADS=4


In [10]:
#ray.shutdown()

In [11]:
import ray
#ray.init(address='auto') #, dashboard_port=24762, dashboard_host="0.0.0.0", include_dashboard=True)#, webui_url="0.0.0.0"))
ray.init(num_cpus=16)

RayContext(dashboard_url='', python_version='3.9.7', ray_version='1.12.1', ray_commit='4863e33856b54ccf8add5cbe75e41558850a1b75', address_info={'node_ip_address': '10.32.105.11', 'raylet_ip_address': '10.32.105.11', 'redis_address': None, 'object_store_address': '/tmp/ray/session_2023-09-18_10-36-25_224260_1575641/sockets/plasma_store', 'raylet_socket_name': '/tmp/ray/session_2023-09-18_10-36-25_224260_1575641/sockets/raylet', 'webui_url': '', 'session_dir': '/tmp/ray/session_2023-09-18_10-36-25_224260_1575641', 'metrics_export_port': 61019, 'gcs_address': '10.32.105.11:54073', 'address': '10.32.105.11:54073', 'node_id': '3c1dad185cfac0983bbff68de812423019b09f8e1be4016054d03f07'})

# Load COX predictions

In [12]:
in_path = pathlib.Path(f"{experiment_path}/coxph/input")
model_path = f"{experiment_path}/coxph/models"

out_path = f"{experiment_path}/coxph/predictions"
pathlib.Path(out_path).mkdir(parents=True, exist_ok=True)

In [13]:
import pickle
import zstandard

def load_pickle(fp):
    with open(fp, "rb") as fh:
        dctx = zstandard.ZstdDecompressor()
        with dctx.stream_reader(fh) as decompressor:
            data = pickle.loads(decompressor.read())
    return data

In [14]:
cox_paths = !ls $model_path
cox_paths = [p for p in cox_paths if "_Retina" in p or "+Retina" in p or "I(" in p]
cox = pd.Series(cox_paths).str.split("_", expand=True)\
    .assign(path = cox_paths)\
    .assign(endpoint = lambda x: x[0]+"_"+x[1])\
    .assign(score = lambda x: x[2])\
    .assign(model = lambda x: x[3]+"_"+x[4]+"_"+x[5]+"_"+x[6]+"_"+x[7])\
    .assign(partition = lambda x: x[8].str.replace(".p", "", regex=True).astype(int))\
    [["endpoint", "score", "model", "partition", "path"]].sort_values(["endpoint", "score", "partition"])\
    .query("endpoint ==@ endpoints")\
    .reset_index(drop=True)
cox

Unnamed: 0,endpoint,score,model,partition,path
0,OMOP_4306655,ASCVD+Retina,ImageTraining_[]_ConvNeXt_MLPHead_predictions,4,OMOP_4306655_ASCVD+Retina_ImageTraining_[]_Con...
1,OMOP_4306655,ASCVD+Retina,ImageTraining_[]_ConvNeXt_MLPHead_predictions,5,OMOP_4306655_ASCVD+Retina_ImageTraining_[]_Con...
2,OMOP_4306655,ASCVD+Retina,ImageTraining_[]_ConvNeXt_MLPHead_predictions,7,OMOP_4306655_ASCVD+Retina_ImageTraining_[]_Con...
3,OMOP_4306655,ASCVD+Retina,ImageTraining_[]_ConvNeXt_MLPHead_predictions,9,OMOP_4306655_ASCVD+Retina_ImageTraining_[]_Con...
4,OMOP_4306655,ASCVD+Retina,ImageTraining_[]_ConvNeXt_MLPHead_predictions,10,OMOP_4306655_ASCVD+Retina_ImageTraining_[]_Con...
...,...,...,...,...,...
22555,phecode_997,SCORE2+Retina,ImageTraining_[]_ConvNeXt_MLPHead_predictions,5,phecode_997_SCORE2+Retina_ImageTraining_[]_Con...
22556,phecode_997,SCORE2+Retina,ImageTraining_[]_ConvNeXt_MLPHead_predictions,7,phecode_997_SCORE2+Retina_ImageTraining_[]_Con...
22557,phecode_997,SCORE2+Retina,ImageTraining_[]_ConvNeXt_MLPHead_predictions,9,phecode_997_SCORE2+Retina_ImageTraining_[]_Con...
22558,phecode_997,SCORE2+Retina,ImageTraining_[]_ConvNeXt_MLPHead_predictions,10,phecode_997_SCORE2+Retina_ImageTraining_[]_Con...


In [15]:
#endpoints = sorted(cox.endpoint.unique().tolist())
scores = sorted(cox.score.unique().tolist())
partitions = sorted(cox.partition.unique().tolist())

In [16]:
#import ray

@ray.remote
def get_cox_info(p):
    cph = load_pickle(f"{model_path}/{p}")
    p_split = p.split("_")
    endpoint = f"{p_split[0]}_{p_split[1]}"
    score = p_split[2]
    model = p_split[3]+"_"+p_split[4]+"_"+p_split[5]+"_"+p_split[6]+"_"+p_split[7]
    partition = p_split[8].replace(".p", "")
    hrs = cph.hazard_ratios_.to_dict()
    
    if score=="Age+Sex+MedicalHistory+I(Age*MH)":
        hr_ret = hrs[endpoint.replace("-", "")]
        
        key_int_age = [k for k in hrs if "age_at_recruitment_f21022_0_0" in k and endpoint.replace("-", "") in k][0]
        hr_ret_age = hrs[key_int_age]
        
        try:
            key_int_sex = [k for k in hrs if "sex_f31_0_0" in k and endpoint.replace("-", "") in k][0]
            hr_ret_sex = hrs[key_int_sex]
        except:
            hr_ret_sex = np.nan
    else:
        hr_ret = hrs[endpoint] 
        hr_ret_age = np.nan
        hr_ret_sex = np.nan
        
    return {"endpoint": endpoint, 
            "score": score, 
            "model": model,
            "partition": partition, 
            "hrs": hrs, 
            "hrs_ret": hr_ret, 
            "hrs_ret_age": hr_ret_age, 
            "hrs_ret_sex": hr_ret_sex
           }

In [17]:
rows = []

for p in tqdm(cox.path.tolist()):
    rows.append(get_cox_info.remote(p))

  0%|          | 0/22560 [00:00<?, ?it/s]

In [18]:
rows = [ray.get(r) for r in tqdm(rows)]

  0%|          | 0/22560 [00:00<?, ?it/s]

In [19]:
rows[10]

{'endpoint': 'OMOP_4306655',
 'score': 'Age+Sex+Retina',
 'model': 'ImageTraining_[]_ConvNeXt_MLPHead_predictions',
 'partition': '10',
 'hrs': {'age': 1.2767926173525,
  'sex_Male': 1.490091219292325,
  'OMOP_4306655': 3.328164413075899},
 'hrs_ret': 3.328164413075899,
 'hrs_ret_age': nan,
 'hrs_ret_sex': nan}

In [20]:
hrs_endpoints = pd.DataFrame({}).append(rows, ignore_index=True)

In [21]:
hrs_endpoints 

Unnamed: 0,endpoint,score,model,partition,hrs,hrs_ret,hrs_ret_age,hrs_ret_sex
0,OMOP_4306655,ASCVD+Retina,ImageTraining_[]_ConvNeXt_MLPHead_predictions,4,"{'age': 1.3743899847208205, 'sex_Male': 1.2559...",2.689602,,
1,OMOP_4306655,ASCVD+Retina,ImageTraining_[]_ConvNeXt_MLPHead_predictions,5,"{'age': 1.4148546850313075, 'sex_Male': 1.2714...",2.704538,,
2,OMOP_4306655,ASCVD+Retina,ImageTraining_[]_ConvNeXt_MLPHead_predictions,7,"{'age': 1.5515199437202438, 'sex_Male': 1.2752...",2.141984,,
3,OMOP_4306655,ASCVD+Retina,ImageTraining_[]_ConvNeXt_MLPHead_predictions,9,"{'age': 1.408388416455127, 'sex_Male': 1.23747...",2.777396,,
4,OMOP_4306655,ASCVD+Retina,ImageTraining_[]_ConvNeXt_MLPHead_predictions,10,"{'age': 1.3331989378780975, 'sex_Male': 1.2634...",2.966433,,
...,...,...,...,...,...,...,...,...
22555,phecode_997,SCORE2+Retina,ImageTraining_[]_ConvNeXt_MLPHead_predictions,5,"{'age': 1.1887971840256635, 'sex_Male': 0.6322...",3.853138,,
22556,phecode_997,SCORE2+Retina,ImageTraining_[]_ConvNeXt_MLPHead_predictions,7,"{'age': 1.3487486636709136, 'sex_Male': 0.6066...",3.471968,,
22557,phecode_997,SCORE2+Retina,ImageTraining_[]_ConvNeXt_MLPHead_predictions,9,"{'age': 1.2147138678134335, 'sex_Male': 0.6011...",3.948018,,
22558,phecode_997,SCORE2+Retina,ImageTraining_[]_ConvNeXt_MLPHead_predictions,10,"{'age': 1.1388850124469634, 'sex_Male': 0.6427...",4.313317,,


In [22]:
name = f"hrs_endpoints"
hrs_endpoints.to_feather(f"{experiment_path}/{name}.feather")

In [22]:
cph.plot()

NameError: name 'cph' is not defined

In [23]:
#[[]]

In [24]:
cph.print_summary()

NameError: name 'cph' is not defined