# Benchmarks

## Initialize

In [1]:
import os
import math
import pathlib
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from IPython.display import clear_output

import warnings
from lifelines.utils import CensoringType
from lifelines.utils import concordance_index

In [2]:
node = !hostname
if "sc" in node[0]:
    base_path = "/sc-projects/sc-proj-ukb-cvd"
else: 
    base_path = "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
print(base_path)

project_label = "22_retina_phewas"
project_path = f"{base_path}/results/projects/{project_label}"
figure_path = f"{project_path}/figures"
output_path = f"{project_path}/data"

pathlib.Path(figure_path).mkdir(parents=True, exist_ok=True)
pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)

experiment = '221108'
experiment_path = f"{output_path}/{experiment}"
pathlib.Path(experiment_path).mkdir(parents=True, exist_ok=True)

name_dict = {
    "predictions_cropratio0.66": "ConvNextSmall(Retina)+MLP_cropratio0.66",
}

partitions = [i for i in range(22)]
partitions

/sc-projects/sc-proj-ukb-cvd


[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]

In [3]:
today = '221109'

In [4]:
splits = ["train", "valid", 'test'] # "test_left", 'test_right']

In [5]:
import pandas as pd
endpoints = sorted([l.replace('_prevalent', '') for l in list(pd.read_csv('/sc-projects/sc-proj-ukb-cvd/results/projects/22_retinal_risk/data/220602/endpoints.csv').endpoint.values)])

In [6]:
endpoint_defs = pd.read_feather(f"{output_path}/phecode_defs_220306.feather").query("endpoint==@endpoints").sort_values("endpoint").set_index("endpoint")

In [7]:
from datetime import date
today = str(date.today()) if today is None else today


In [8]:
eligable_eids = pd.read_feather(f"{output_path}/eligable_eids_{today}.feather")
eids_dict = eligable_eids.set_index("endpoint")["eid_list"].to_dict()

In [9]:
%env MKL_NUM_THREADS=4
%env NUMEXPR_NUM_THREADS=4
%env OMP_NUM_THREADS=4

env: MKL_NUM_THREADS=4
env: NUMEXPR_NUM_THREADS=4
env: OMP_NUM_THREADS=4


In [10]:
#ray.shutdown()

In [12]:
import ray

ray.init(address="auto")#, dashboard_port=24762, dashboard_host="0.0.0.0", include_dashboard=True)#, webui_url="0.0.0.0"))

RayContext(dashboard_url='', python_version='3.9.7', ray_version='1.13.0', ray_commit='e4ce38d001dbbe09cd21c497fedd03d692b2be3e', address_info={'node_ip_address': '10.32.105.13', 'raylet_ip_address': '10.32.105.13', 'redis_address': None, 'object_store_address': '/tmp/ray/session_2022-11-18_21-48-23_999151_2509896/sockets/plasma_store', 'raylet_socket_name': '/tmp/ray/session_2022-11-18_21-48-23_999151_2509896/sockets/raylet', 'webui_url': '', 'session_dir': '/tmp/ray/session_2022-11-18_21-48-23_999151_2509896', 'metrics_export_port': 44551, 'gcs_address': '10.32.105.13:6321', 'address': '10.32.105.13:6321', 'node_id': 'd351af6132de951635bcaf21cc3b94e74d7e56c7400c4d620212c41c'})

# Train COX

In [13]:
in_path = pathlib.Path(f"{experiment_path}/coxph/input")
model_path = f"{experiment_path}/coxph/models"

out_path = f"{experiment_path}/coxph/predictions"
pathlib.Path(out_path).mkdir(parents=True, exist_ok=True)

In [14]:
import pickle
import zstandard

def load_pickle(fp):
    with open(fp, "rb") as fh:
        dctx = zstandard.ZstdDecompressor()
        with dctx.stream_reader(fh) as decompressor:
            data = pickle.loads(decompressor.read())
    return data

In [15]:
cox_paths = !ls $model_path
cox_paths = [p for p in cox_paths if "_Retina" in p or "+Retina" in p or "I(" in p]
cox = pd.Series(cox_paths).str.split("_", expand=True)\
    .assign(path = cox_paths)\
    .assign(endpoint = lambda x: x[0]+"_"+x[1])\
    .assign(score = lambda x: x[2])\
    .assign(model = lambda x: x[3]+"_"+x[4]+"_"+x[5]+"_"+x[6]+"_"+x[8])\
    .assign(partition = lambda x: x[9].str.replace(".p", "", regex=True).astype(int))\
    [["endpoint", "score", "model", "partition", "path"]].sort_values(["endpoint", "score", "partition"])\
    .query("endpoint ==@ endpoints")\
    .reset_index(drop=True)
cox

Unnamed: 0,endpoint,score,model,partition,path
0,OMOP_4306655,Age+Sex+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,0,OMOP_4306655_Age+Sex+Retina_ImageTraining_[]_C...
1,OMOP_4306655,Age+Sex+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,1,OMOP_4306655_Age+Sex+Retina_ImageTraining_[]_C...
2,OMOP_4306655,Age+Sex+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,2,OMOP_4306655_Age+Sex+Retina_ImageTraining_[]_C...
3,OMOP_4306655,Age+Sex+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,3,OMOP_4306655_Age+Sex+Retina_ImageTraining_[]_C...
4,OMOP_4306655,Age+Sex+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,4,OMOP_4306655_Age+Sex+Retina_ImageTraining_[]_C...
...,...,...,...,...,...
51519,phecode_979,Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,17,phecode_979_Retina_ImageTraining_[]_ConvNeXt_M...
51520,phecode_979,Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,18,phecode_979_Retina_ImageTraining_[]_ConvNeXt_M...
51521,phecode_979,Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,19,phecode_979_Retina_ImageTraining_[]_ConvNeXt_M...
51522,phecode_979,Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,20,phecode_979_Retina_ImageTraining_[]_ConvNeXt_M...


In [16]:
#endpoints = sorted(cox.endpoint.unique().tolist())
scores = sorted(cox.score.unique().tolist())
partitions = sorted(cox.partition.unique().tolist())

In [17]:
#import ray

@ray.remote
def get_cox_info(p):
    cph = load_pickle(f"{model_path}/{p}")
    p_split = p.split("_")
    endpoint = f"{p_split[0]}_{p_split[1]}"
    score = p_split[2]
    model = p_split[3]+"_"+p_split[4]+"_"+p_split[5]+"_"+p_split[6]+"_"+p_split[8]
    partition = p_split[9].replace(".p", "")
    hrs = cph.hazard_ratios_.to_dict()
    
    if score=="Age+Sex+MedicalHistory+I(Age*MH)":
        hr_ret = hrs[endpoint.replace("-", "")]
        
        key_int_age = [k for k in hrs if "age_at_recruitment_f21022_0_0" in k and endpoint.replace("-", "") in k][0]
        hr_ret_age = hrs[key_int_age]
        
        try:
            key_int_sex = [k for k in hrs if "sex_f31_0_0" in k and endpoint.replace("-", "") in k][0]
            hr_ret_sex = hrs[key_int_sex]
        except:
            hr_ret_sex = np.nan
    else:
        hr_ret = hrs[endpoint] 
        hr_ret_age = np.nan
        hr_ret_sex = np.nan
        
    return {"endpoint": endpoint, 
            "score": score, 
            "model": model,
            "partition": partition, 
            "hrs": hrs, 
            "hrs_ret": hr_ret, 
            "hrs_ret_age": hr_ret_age, 
            "hrs_ret_sex": hr_ret_sex
           }

In [18]:
rows = []

for p in tqdm(cox.path.tolist()):
    rows.append(get_cox_info.remote(p))

  0%|          | 0/51524 [00:00<?, ?it/s]

In [19]:
rows = [ray.get(r) for r in tqdm(rows)]

  0%|          | 0/51524 [00:00<?, ?it/s]

[2m[33m(raylet)[0m [2022-11-18 21:50:00,583 E 2509968 2509968] (raylet) worker_pool.cc:502: Some workers of the worker process(2510339) have not registered within the timeout. The process is still alive, probably it's hanging during start.
[2m[33m(raylet)[0m [2022-11-18 21:50:00,586 E 2509968 2509968] (raylet) worker_pool.cc:502: Some workers of the worker process(2510341) have not registered within the timeout. The process is still alive, probably it's hanging during start.
[2m[33m(raylet)[0m [2022-11-18 21:50:00,588 E 2509968 2509968] (raylet) worker_pool.cc:502: Some workers of the worker process(2510342) have not registered within the timeout. The process is still alive, probably it's hanging during start.
[2m[33m(raylet)[0m [2022-11-18 21:50:00,591 E 2509968 2509968] (raylet) worker_pool.cc:502: Some workers of the worker process(2510345) have not registered within the timeout. The process is still alive, probably it's hanging during start.
[2m[33m(raylet)[0m [2022-

In [20]:
rows[10]

{'endpoint': 'OMOP_4306655',
 'score': 'Age+Sex+Retina',
 'model': 'ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66',
 'partition': '10',
 'hrs': {'age': 1.1832623910227895,
  'sex_Male': 1.2578132317215376,
  'OMOP_4306655': 3.525375107297933},
 'hrs_ret': 3.525375107297933,
 'hrs_ret_age': nan,
 'hrs_ret_sex': nan}

In [21]:
hrs_endpoints = pd.DataFrame({}).append(rows, ignore_index=True)

In [22]:
hrs_endpoints 

Unnamed: 0,endpoint,score,model,partition,hrs,hrs_ret,hrs_ret_age,hrs_ret_sex
0,OMOP_4306655,Age+Sex+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,0,"{'age': 1.1569650058150076, 'sex_Male': 1.2521...",3.067072,,
1,OMOP_4306655,Age+Sex+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,1,"{'age': 1.1930293430626582, 'sex_Male': 1.2329...",3.343209,,
2,OMOP_4306655,Age+Sex+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,2,"{'age': 1.1675932654272487, 'sex_Male': 1.2889...",3.046918,,
3,OMOP_4306655,Age+Sex+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,3,"{'age': 1.1449403577207453, 'sex_Male': 1.2303...",3.465731,,
4,OMOP_4306655,Age+Sex+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,4,"{'age': 1.2232534874397427, 'sex_Male': 1.2894...",3.204823,,
...,...,...,...,...,...,...,...,...
51519,phecode_979,Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,17,{'phecode_979': 2.9028276127399497},2.902828,,
51520,phecode_979,Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,18,{'phecode_979': 2.8184150503434156},2.818415,,
51521,phecode_979,Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,19,{'phecode_979': 2.8725367312067025},2.872537,,
51522,phecode_979,Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,20,{'phecode_979': 3.0356558826419855},3.035656,,


In [23]:
name = f"hrs_endpoints"
hrs_endpoints.to_feather(f"{experiment_path}/{name}.feather")

In [24]:
hrs_endpoints

Unnamed: 0,endpoint,score,model,partition,hrs,hrs_ret,hrs_ret_age,hrs_ret_sex
0,OMOP_4306655,Age+Sex+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,0,"{'age': 1.1569650058150076, 'sex_Male': 1.2521...",3.067072,,
1,OMOP_4306655,Age+Sex+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,1,"{'age': 1.1930293430626582, 'sex_Male': 1.2329...",3.343209,,
2,OMOP_4306655,Age+Sex+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,2,"{'age': 1.1675932654272487, 'sex_Male': 1.2889...",3.046918,,
3,OMOP_4306655,Age+Sex+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,3,"{'age': 1.1449403577207453, 'sex_Male': 1.2303...",3.465731,,
4,OMOP_4306655,Age+Sex+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,4,"{'age': 1.2232534874397427, 'sex_Male': 1.2894...",3.204823,,
...,...,...,...,...,...,...,...,...
51519,phecode_979,Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,17,{'phecode_979': 2.9028276127399497},2.902828,,
51520,phecode_979,Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,18,{'phecode_979': 2.8184150503434156},2.818415,,
51521,phecode_979,Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,19,{'phecode_979': 2.8725367312067025},2.872537,,
51522,phecode_979,Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,20,{'phecode_979': 3.0356558826419855},3.035656,,


In [25]:
cph.plot()

NameError: name 'cph' is not defined

In [None]:
#[[]]

In [None]:
cph.print_summary()