# Benchmarks

## Initialize

In [1]:
import os
import math
import pathlib
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from IPython.display import clear_output

import warnings
from lifelines.utils import CensoringType
from lifelines.utils import concordance_index

In [2]:
node = !hostname
if "sc" in node[0]:
    base_path = "/sc-projects/sc-proj-ukb-cvd"
else: 
    base_path = "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
print(base_path)

project_label = "22_retina_phewas_220603_fullrun"
project_path = f"{base_path}/results/projects/{project_label}"
figure_path = f"{project_path}/figures"
output_path = f"{project_path}/data"

pathlib.Path(figure_path).mkdir(parents=True, exist_ok=True)
pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)

experiment = '220603_fullrun'
experiment_path = f"{output_path}/{experiment}"
pathlib.Path(experiment_path).mkdir(parents=True, exist_ok=True)

name_dict = {
    "predictions_cropratio0.3": "ConvNextSmall(Retina)+MLP_cropratio0.3",
    "predictions_cropratio0.5": "ConvNextSmall(Retina)+MLP_cropratio0.5",
    "predictions_cropratio0.8": "ConvNextSmall(Retina)+MLP_cropratio0.8",
}

partitions = [i for i in range(22)]
partitions

/sc-projects/sc-proj-ukb-cvd


[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]

In [3]:
splits = ["train", "valid", 'test'] # "test_left", 'test_right']

In [4]:
import pandas as pd
endpoints = sorted([l.replace('_prevalent', '') for l in list(pd.read_csv('/sc-projects/sc-proj-ukb-cvd/results/projects/22_retinal_risk/data/220602/endpoints.csv').endpoint.values)])

In [5]:
endpoint_defs = pd.read_feather(f"{output_path}/phecode_defs_220306.feather").query("endpoint==@endpoints").sort_values("endpoint").set_index("endpoint")

In [6]:
from datetime import date
today = str(date.today())
#today = '2022-07-01'

In [7]:
eligable_eids = pd.read_feather(f"{output_path}/eligable_eids_{today}.feather")
eids_dict = eligable_eids.set_index("endpoint")["eid_list"].to_dict()

In [8]:
%env MKL_NUM_THREADS=4
%env NUMEXPR_NUM_THREADS=4
%env OMP_NUM_THREADS=4

env: MKL_NUM_THREADS=4
env: NUMEXPR_NUM_THREADS=4
env: OMP_NUM_THREADS=4


In [9]:
ray.shutdown()

NameError: name 'ray' is not defined

In [9]:
import ray

ray.init(address="auto")#, dashboard_port=24762, dashboard_host="0.0.0.0", include_dashboard=True)#, webui_url="0.0.0.0"))

RayContext(dashboard_url='', python_version='3.9.7', ray_version='1.12.1', ray_commit='4863e33856b54ccf8add5cbe75e41558850a1b75', address_info={'node_ip_address': '10.32.105.13', 'raylet_ip_address': '10.32.105.13', 'redis_address': None, 'object_store_address': '/tmp/ray/session_2022-07-13_17-05-45_491760_236754/sockets/plasma_store', 'raylet_socket_name': '/tmp/ray/session_2022-07-13_17-05-45_491760_236754/sockets/raylet', 'webui_url': '', 'session_dir': '/tmp/ray/session_2022-07-13_17-05-45_491760_236754', 'metrics_export_port': 47551, 'gcs_address': '10.32.105.13:6378', 'address': '10.32.105.13:6378', 'node_id': '3eeabeb6a5246609ce6ec9fcd899d2336a91d1d0765b3b8c35df353a'})

# Train COX

In [10]:
in_path = pathlib.Path(f"{experiment_path}/coxph/input")
model_path = f"{experiment_path}/coxph/models"

out_path = f"{experiment_path}/coxph/predictions"
pathlib.Path(out_path).mkdir(parents=True, exist_ok=True)

In [11]:
import pickle
import zstandard

def load_pickle(fp):
    with open(fp, "rb") as fh:
        dctx = zstandard.ZstdDecompressor()
        with dctx.stream_reader(fh) as decompressor:
            data = pickle.loads(decompressor.read())
    return data

In [12]:
cox_paths = !ls $model_path
cox_paths = [p for p in cox_paths if "_Retina" in p or "+Retina" in p or "I(" in p]
cox = pd.Series(cox_paths).str.split("_", expand=True)\
    .assign(path = cox_paths)\
    .assign(endpoint = lambda x: x[0]+"_"+x[1])\
    .assign(score = lambda x: x[2])\
    .assign(model = lambda x: x[3]+"_"+x[4]+"_"+x[5]+"_"+x[6]+"_"+x[8])\
    .assign(partition = lambda x: x[9].str.replace(".p", "", regex=True).astype(int))\
    [["endpoint", "score", "model", "partition", "path"]].sort_values(["endpoint", "score", "partition"])\
    .query("endpoint ==@ endpoints")\
    .reset_index(drop=True)
cox

Unnamed: 0,endpoint,score,model,partition,path
0,OMOP_4306655,ASCVD+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.3,0,OMOP_4306655_ASCVD+Retina_ImageTraining_[]_Con...
1,OMOP_4306655,ASCVD+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.5,0,OMOP_4306655_ASCVD+Retina_ImageTraining_[]_Con...
2,OMOP_4306655,ASCVD+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.8,0,OMOP_4306655_ASCVD+Retina_ImageTraining_[]_Con...
3,OMOP_4306655,ASCVD+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.3,1,OMOP_4306655_ASCVD+Retina_ImageTraining_[]_Con...
4,OMOP_4306655,ASCVD+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.5,1,OMOP_4306655_ASCVD+Retina_ImageTraining_[]_Con...
...,...,...,...,...,...
386425,phecode_979,SCORE2+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.5,20,phecode_979_SCORE2+Retina_ImageTraining_[]_Con...
386426,phecode_979,SCORE2+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.8,20,phecode_979_SCORE2+Retina_ImageTraining_[]_Con...
386427,phecode_979,SCORE2+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.3,21,phecode_979_SCORE2+Retina_ImageTraining_[]_Con...
386428,phecode_979,SCORE2+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.5,21,phecode_979_SCORE2+Retina_ImageTraining_[]_Con...


In [13]:
#endpoints = sorted(cox.endpoint.unique().tolist())
scores = sorted(cox.score.unique().tolist())
partitions = sorted(cox.partition.unique().tolist())

In [14]:
#import ray

@ray.remote
def get_cox_info(p):
    cph = load_pickle(f"{model_path}/{p}")
    p_split = p.split("_")
    endpoint = f"{p_split[0]}_{p_split[1]}"
    score = p_split[2]
    model = p_split[3]+"_"+p_split[4]+"_"+p_split[5]+"_"+p_split[6]+"_"+p_split[8]
    partition = p_split[9].replace(".p", "")
    hrs = cph.hazard_ratios_.to_dict()
    
    if score=="Age+Sex+MedicalHistory+I(Age*MH)":
        hr_ret = hrs[endpoint.replace("-", "")]
        
        key_int_age = [k for k in hrs if "age_at_recruitment_f21022_0_0" in k and endpoint.replace("-", "") in k][0]
        hr_ret_age = hrs[key_int_age]
        
        try:
            key_int_sex = [k for k in hrs if "sex_f31_0_0" in k and endpoint.replace("-", "") in k][0]
            hr_ret_sex = hrs[key_int_sex]
        except:
            hr_ret_sex = np.nan
    else:
        hr_ret = hrs[endpoint] 
        hr_ret_age = np.nan
        hr_ret_sex = np.nan
        
    return {"endpoint": endpoint, 
            "score": score, 
            "model": model,
            "partition": partition, 
            "hrs": hrs, 
            "hrs_ret": hr_ret, 
            "hrs_ret_age": hr_ret_age, 
            "hrs_ret_sex": hr_ret_sex
           }

In [15]:
rows = []

for p in tqdm(cox.path.tolist()):
    rows.append(get_cox_info.remote(p))

  0%|          | 0/386430 [00:00<?, ?it/s]

In [16]:
rows = [ray.get(r) for r in tqdm(rows)]

  0%|          | 0/386430 [00:00<?, ?it/s]

In [17]:
rows[10]

{'endpoint': 'OMOP_4306655',
 'score': 'ASCVD+Retina',
 'model': 'ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.5',
 'partition': '3',
 'hrs': {'age': 1.1658720262780802,
  'sex_Male': 1.17834813590126,
  'ethnic_background_Asian': 0.899660103297356,
  'ethnic_background_Chinese': 0.7784202298775144,
  'ethnic_background_Black': 1.1342145252230424,
  'ethnic_background_Mixed': 0.7588221611844511,
  'smoking_status_Previous': 1.2023608899989966,
  'smoking_status_Current': 2.0405203361888113,
  'diabetes': 0.8678770361173397,
  'antihypertensives': 1.110139487563005,
  'systolic_blood_pressure': 1.0457669360442585,
  'cholesterol': 0.922918208411687,
  'hdl_cholesterol': 0.9405610924613153,
  'OMOP_4306655': 3.7906500603892397},
 'hrs_ret': 3.7906500603892397,
 'hrs_ret_age': nan,
 'hrs_ret_sex': nan}

In [18]:
hrs_endpoints = pd.DataFrame({}).append(rows, ignore_index=True)

In [19]:
hrs_endpoints 

Unnamed: 0,endpoint,score,model,partition,hrs,hrs_ret,hrs_ret_age,hrs_ret_sex
0,OMOP_4306655,ASCVD+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.3,0,"{'age': 1.6785519293942757, 'sex_Male': 1.2723...",1.841906,,
1,OMOP_4306655,ASCVD+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.5,0,"{'age': 1.4742082051582992, 'sex_Male': 1.2456...",2.305291,,
2,OMOP_4306655,ASCVD+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.8,0,"{'age': 1.3630921970920196, 'sex_Male': 1.1772...",2.619298,,
3,OMOP_4306655,ASCVD+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.3,1,"{'age': 1.500555052160527, 'sex_Male': 1.26175...",2.247894,,
4,OMOP_4306655,ASCVD+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.5,1,"{'age': 1.2022493243146264, 'sex_Male': 1.1839...",3.231338,,
...,...,...,...,...,...,...,...,...
386425,phecode_979,SCORE2+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.5,20,"{'age': 0.9232972866678388, 'sex_Male': 1.9362...",2.254796,,
386426,phecode_979,SCORE2+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.8,20,"{'age': 0.9228791348222163, 'sex_Male': 1.9614...",2.079002,,
386427,phecode_979,SCORE2+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.3,21,"{'age': 0.920288993569344, 'sex_Male': 1.74713...",3.047668,,
386428,phecode_979,SCORE2+Retina,ImageTraining_[]_ConvNeXt_MLPHead_cropratio0.5,21,"{'age': 0.8972068192409791, 'sex_Male': 1.8803...",3.146781,,


In [20]:
name = f"hrs_endpoints"
hrs_endpoints.to_feather(f"{experiment_path}/{name}.feather")

In [24]:
hrs_endpoints

Unnamed: 0,endpoint,score,partition,hrs,hrs_ret,hrs_ret_age,hrs_ret_sex
0,OMOP_4306655,Age+Sex+Retina,0,{'age_at_recruitment_f21022_0_0': 1.8536014768...,1.358667,,
1,OMOP_4306655,Retina,0,{'OMOP_4306655': 1.9896215221014746},1.989622,,
2,phecode_002,Age+Sex+Retina,0,{'age_at_recruitment_f21022_0_0': 1.1513855518...,1.369798,,
3,phecode_002,Retina,0,{'phecode_002': 1.5118973422633717},1.511897,,
4,phecode_002-1,Age+Sex+Retina,0,{'age_at_recruitment_f21022_0_0': 1.0310421848...,1.366880,,
...,...,...,...,...,...,...,...
2337,phecode_977-71,Retina,0,{'phecode_977-71': 1.3476976946053403},1.347698,,
2338,phecode_977-72,Age+Sex+Retina,0,{'age_at_recruitment_f21022_0_0': 0.8926973277...,1.741369,,
2339,phecode_977-72,Retina,0,{'phecode_977-72': 1.6181321563468856},1.618132,,
2340,phecode_979,Age+Sex+Retina,0,{'age_at_recruitment_f21022_0_0': 0.9573377834...,1.230307,,


In [47]:
cph.plot()

NameError: name 'cph' is not defined

In [None]:
#[[]]

In [None]:
cph.print_summary()