# Benchmarks

## Initialize

In [1]:
import os
import math
import pathlib
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from IPython.display import clear_output

import warnings
from lifelines.utils import CensoringType
from lifelines.utils import concordance_index

In [2]:
node = !hostname
if "sc" in node[0]:
    base_path = "/sc-projects/sc-proj-ukb-cvd"
else: 
    base_path = "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
print(base_path)

project_label = "22_retina_phewas"
project_path = f"{base_path}/results/projects/{project_label}"
figure_path = f"{project_path}/figures"
output_path = f"{project_path}/data"

pathlib.Path(figure_path).mkdir(parents=True, exist_ok=True)
pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)

experiment = '220812_test'
experiment_path = f"{output_path}/{experiment}"
pathlib.Path(experiment_path).mkdir(parents=True, exist_ok=True)

name_dict = {
    "predictions_cropratio0.66": "ConvNextSmall(Retina)+MLP_cropratio0.66",
}

partitions = [i for i in range(22)]
partitions

/sc-projects/sc-proj-ukb-cvd


[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]

In [3]:
today = '220824'

In [4]:
splits = ["train", "valid", 'test'] # "test_left", 'test_right']

In [5]:
import pandas as pd
endpoints = sorted([l.replace('_prevalent', '') for l in list(pd.read_csv('/sc-projects/sc-proj-ukb-cvd/results/projects/22_retinal_risk/data/220602/endpoints.csv').endpoint.values)])

In [6]:
endpoint_defs = pd.read_feather(f"{output_path}/phecode_defs_220306.feather").query("endpoint==@endpoints").sort_values("endpoint").set_index("endpoint")

In [7]:
from datetime import date
today = str(date.today()) if today is None else today


In [8]:
eligable_eids = pd.read_feather(f"{output_path}/eligable_eids_{today}.feather")
eids_dict = eligable_eids.set_index("endpoint")["eid_list"].to_dict()

In [9]:
%env MKL_NUM_THREADS=4
%env NUMEXPR_NUM_THREADS=4
%env OMP_NUM_THREADS=4

env: MKL_NUM_THREADS=4
env: NUMEXPR_NUM_THREADS=4
env: OMP_NUM_THREADS=4


In [10]:
#ray.shutdown()

In [11]:
import ray

ray.init(address="auto")#, dashboard_port=24762, dashboard_host="0.0.0.0", include_dashboard=True)#, webui_url="0.0.0.0"))

RayContext(dashboard_url='', python_version='3.9.7', ray_version='1.13.0', ray_commit='e4ce38d001dbbe09cd21c497fedd03d692b2be3e', address_info={'node_ip_address': '10.32.105.1', 'raylet_ip_address': '10.32.105.1', 'redis_address': None, 'object_store_address': '/tmp/ray/session_2022-09-20_10-24-44_275985_2461381/sockets/plasma_store', 'raylet_socket_name': '/tmp/ray/session_2022-09-20_10-24-44_275985_2461381/sockets/raylet', 'webui_url': '', 'session_dir': '/tmp/ray/session_2022-09-20_10-24-44_275985_2461381', 'metrics_export_port': 49547, 'gcs_address': '10.32.105.1:6321', 'address': '10.32.105.1:6321', 'node_id': '7367de6e34a8d76cf7d2286667218e92bcb3c98107ce385a1de9a7c5'})

# Train COX

In [12]:
in_path = pathlib.Path(f"{experiment_path}/coxph/input")
model_path = f"{experiment_path}/coxph/models"

out_path = f"{experiment_path}/coxph/predictions"
pathlib.Path(out_path).mkdir(parents=True, exist_ok=True)

In [13]:
import pickle
import zstandard

def load_pickle(fp):
    with open(fp, "rb") as fh:
        dctx = zstandard.ZstdDecompressor()
        with dctx.stream_reader(fh) as decompressor:
            data = pickle.loads(decompressor.read())
    return data

In [14]:
cox_paths = !ls $model_path
cox_paths = [p for p in cox_paths if "_Retina" in p or "+Retina" in p or "I(" in p]
cox = pd.Series(cox_paths).str.split("_", expand=True)\
    .assign(path = cox_paths)\
    .assign(endpoint = lambda x: x[0]+"_"+x[1])\
    .assign(score = lambda x: x[2])\
    .assign(model = lambda x: x[3]+"_"+x[4]+"_"+x[5]+"_"+x[6]+"_"+x[8])\
    .assign(partition = lambda x: x[9].str.replace(".p", "", regex=True).astype(int))\
    [["endpoint", "score", "model", "partition", "path"]].sort_values(["endpoint", "score", "partition"])\
    .query("endpoint ==@ endpoints")\
    .reset_index(drop=True)
cox

Unnamed: 0,endpoint,score,model,partition,path
0,OMOP_4306655,ASCVD+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,0,OMOP_4306655_ASCVD+Retina_ImageTraining_[]_Con...
1,OMOP_4306655,ASCVD+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,1,OMOP_4306655_ASCVD+Retina_ImageTraining_[]_Con...
2,OMOP_4306655,ASCVD+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,2,OMOP_4306655_ASCVD+Retina_ImageTraining_[]_Con...
3,OMOP_4306655,ASCVD+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,3,OMOP_4306655_ASCVD+Retina_ImageTraining_[]_Con...
4,OMOP_4306655,ASCVD+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,4,OMOP_4306655_ASCVD+Retina_ImageTraining_[]_Con...
...,...,...,...,...,...
128805,phecode_979,SCORE2+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,17,phecode_979_SCORE2+Retina_ImageTraining_[]_Con...
128806,phecode_979,SCORE2+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,18,phecode_979_SCORE2+Retina_ImageTraining_[]_Con...
128807,phecode_979,SCORE2+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,19,phecode_979_SCORE2+Retina_ImageTraining_[]_Con...
128808,phecode_979,SCORE2+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,20,phecode_979_SCORE2+Retina_ImageTraining_[]_Con...


In [15]:
#endpoints = sorted(cox.endpoint.unique().tolist())
scores = sorted(cox.score.unique().tolist())
partitions = sorted(cox.partition.unique().tolist())

In [16]:
#import ray

@ray.remote
def get_cox_info(p):
    cph = load_pickle(f"{model_path}/{p}")
    p_split = p.split("_")
    endpoint = f"{p_split[0]}_{p_split[1]}"
    score = p_split[2]
    model = p_split[3]+"_"+p_split[4]+"_"+p_split[5]+"_"+p_split[6]+"_"+p_split[8]
    partition = p_split[9].replace(".p", "")
    hrs = cph.hazard_ratios_.to_dict()
    
    if score=="Age+Sex+MedicalHistory+I(Age*MH)":
        hr_ret = hrs[endpoint.replace("-", "")]
        
        key_int_age = [k for k in hrs if "age_at_recruitment_f21022_0_0" in k and endpoint.replace("-", "") in k][0]
        hr_ret_age = hrs[key_int_age]
        
        try:
            key_int_sex = [k for k in hrs if "sex_f31_0_0" in k and endpoint.replace("-", "") in k][0]
            hr_ret_sex = hrs[key_int_sex]
        except:
            hr_ret_sex = np.nan
    else:
        hr_ret = hrs[endpoint] 
        hr_ret_age = np.nan
        hr_ret_sex = np.nan
        
    return {"endpoint": endpoint, 
            "score": score, 
            "model": model,
            "partition": partition, 
            "hrs": hrs, 
            "hrs_ret": hr_ret, 
            "hrs_ret_age": hr_ret_age, 
            "hrs_ret_sex": hr_ret_sex
           }

In [17]:
rows = []

for p in tqdm(cox.path.tolist()):
    rows.append(get_cox_info.remote(p))

  0%|          | 0/128810 [00:00<?, ?it/s]

In [18]:
rows = [ray.get(r) for r in tqdm(rows)]

  0%|          | 0/128810 [00:00<?, ?it/s]

[2m[33m(raylet)[0m [2022-09-20 10:26:34,994 E 2461447 2461447] (raylet) worker_pool.cc:502: Some workers of the worker process(2462164) have not registered within the timeout. The process is still alive, probably it's hanging during start.
[2m[33m(raylet)[0m [2022-09-20 10:26:34,996 E 2461447 2461447] (raylet) worker_pool.cc:502: Some workers of the worker process(2462165) have not registered within the timeout. The process is still alive, probably it's hanging during start.
[2m[33m(raylet)[0m [2022-09-20 10:26:34,999 E 2461447 2461447] (raylet) worker_pool.cc:502: Some workers of the worker process(2462166) have not registered within the timeout. The process is still alive, probably it's hanging during start.
[2m[33m(raylet)[0m [2022-09-20 10:26:35,001 E 2461447 2461447] (raylet) worker_pool.cc:502: Some workers of the worker process(2462169) have not registered within the timeout. The process is still alive, probably it's hanging during start.
[2m[33m(raylet)[0m [2022-

In [19]:
rows[10]

{'endpoint': 'OMOP_4306655',
 'score': 'ASCVD+Retina',
 'model': 'ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66',
 'partition': '10',
 'hrs': {'age': 1.1502596901660636,
  'sex_Male': 1.104178199658831,
  'ethnic_background_Asian': 0.829136397112348,
  'ethnic_background_Chinese': 0.7750242851799667,
  'ethnic_background_Black': 0.9574486801157565,
  'ethnic_background_Mixed': 0.8462106052829345,
  'smoking_status_Previous': 1.1798495241491116,
  'smoking_status_Current': 1.8732631188970352,
  'diabetes': 0.7448466395959744,
  'antihypertensives': 1.0301843888885007,
  'systolic_blood_pressure': 1.0297199771770786,
  'cholesterol': 0.9092578622827848,
  'hdl_cholesterol': 0.9682630302304317,
  'OMOP_4306655': 3.554118116223837},
 'hrs_ret': 3.554118116223837,
 'hrs_ret_age': nan,
 'hrs_ret_sex': nan}

In [20]:
hrs_endpoints = pd.DataFrame({}).append(rows, ignore_index=True)

In [21]:
hrs_endpoints 

Unnamed: 0,endpoint,score,model,partition,hrs,hrs_ret,hrs_ret_age,hrs_ret_sex
0,OMOP_4306655,ASCVD+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,0,"{'age': 1.1527501934027027, 'sex_Male': 1.1052...",3.018703,,
1,OMOP_4306655,ASCVD+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,1,"{'age': 1.2049262438451085, 'sex_Male': 1.0758...",3.200564,,
2,OMOP_4306655,ASCVD+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,2,"{'age': 1.1716578072816477, 'sex_Male': 1.1377...",2.949344,,
3,OMOP_4306655,ASCVD+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,3,"{'age': 1.1387289772428877, 'sex_Male': 1.0844...",3.553097,,
4,OMOP_4306655,ASCVD+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,4,"{'age': 1.2175686789659195, 'sex_Male': 1.1328...",3.180296,,
...,...,...,...,...,...,...,...,...
128805,phecode_979,SCORE2+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,17,"{'age': 1.1752389202653595, 'sex_Male': 0.0517...",0.008859,,
128806,phecode_979,SCORE2+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,18,"{'age': 0.6773848730115812, 'sex_Male': 1.3933...",3.111668,,
128807,phecode_979,SCORE2+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,19,"{'age': 4.2836864937835575, 'sex_Male': 1.3976...",0.000026,,
128808,phecode_979,SCORE2+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,20,"{'age': 0.6542848174750082, 'sex_Male': 1.5836...",3.337431,,


In [22]:
name = f"hrs_endpoints"
hrs_endpoints.to_feather(f"{experiment_path}/{name}.feather")

In [23]:
hrs_endpoints

Unnamed: 0,endpoint,score,model,partition,hrs,hrs_ret,hrs_ret_age,hrs_ret_sex
0,OMOP_4306655,ASCVD+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,0,"{'age': 1.1527501934027027, 'sex_Male': 1.1052...",3.018703,,
1,OMOP_4306655,ASCVD+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,1,"{'age': 1.2049262438451085, 'sex_Male': 1.0758...",3.200564,,
2,OMOP_4306655,ASCVD+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,2,"{'age': 1.1716578072816477, 'sex_Male': 1.1377...",2.949344,,
3,OMOP_4306655,ASCVD+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,3,"{'age': 1.1387289772428877, 'sex_Male': 1.0844...",3.553097,,
4,OMOP_4306655,ASCVD+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,4,"{'age': 1.2175686789659195, 'sex_Male': 1.1328...",3.180296,,
...,...,...,...,...,...,...,...,...
128805,phecode_979,SCORE2+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,17,"{'age': 1.1752389202653595, 'sex_Male': 0.0517...",0.008859,,
128806,phecode_979,SCORE2+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,18,"{'age': 0.6773848730115812, 'sex_Male': 1.3933...",3.111668,,
128807,phecode_979,SCORE2+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,19,"{'age': 4.2836864937835575, 'sex_Male': 1.3976...",0.000026,,
128808,phecode_979,SCORE2+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.66,20,"{'age': 0.6542848174750082, 'sex_Male': 1.5836...",3.337431,,


In [24]:
cph.plot()

NameError: name 'cph' is not defined

In [None]:
#[[]]

In [None]:
cph.print_summary()