# Benchmarks

## Initialize

In [1]:
import os
import math
import pathlib
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from IPython.display import clear_output

import warnings
from lifelines.utils import CensoringType
from lifelines.utils import concordance_index

In [48]:
node = !hostname
if "sc" in node[0]:
    base_path = "/sc-projects/sc-proj-ukb-cvd"
else: 
    base_path = "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
print(base_path)

project_label = "22_retina_phewas"
project_path = f"{base_path}/results/projects/{project_label}"
figure_path = f"{project_path}/figures"
output_path = f"{project_path}/data"

##### BEGIN ADAPT #####
# second best model
# wandb_name = 'aug++_convnext_s_mlp'
# wandb_id = '8ngm6apd'
# best model
wandb_name = 'aug++_convnext_s_mlp+'
wandb_id = '3p3smraz'
partitions = [0] # [i for i in range(22)]
##### END   ADAPT #####

experiment = wandb_id
experiment_path = f"{output_path}/{experiment}"
pathlib.Path(experiment_path).mkdir(parents=True, exist_ok=True)

/sc-projects/sc-proj-ukb-cvd


In [27]:
import pandas as pd
all_endpoints = sorted([l.replace('_prevalent', '') for l in list(pd.read_csv('/sc-projects/sc-proj-ukb-cvd/results/projects/22_medical_records/data/220413/frequent_endpoints.csv').endpoint.values)])

#all_endpoints = sorted(endpoints_all_md.endpoint.to_list())
print(len(all_endpoints))

endpoints_not_overlapping_with_preds = []
#endpoints_not_overlapping_with_preds_md = pd.read_csv(f"{experiment_path}/endpoints_not_overlapping.csv", header=None)
#print(len(endpoints_not_overlapping_with_preds_md))
#endpoints_not_overlapping_with_preds = list(endpoints_not_overlapping_with_preds_md[0].values)

endpoints = []
for c in all_endpoints:
    if c not in endpoints_not_overlapping_with_preds: # this is what i want
        #print('OK    - ',c)
        endpoints.append(c)
    #if c in endpoints_not_overlapping_with_preds: # this is what causes errors!
    #    print('ERROR - ',c)
print(len(endpoints))

498
498


In [28]:
splits = ["train", "valid", 'test'] # "test_left", 'test_right']

In [29]:
endpoint_defs = pd.read_feather(f"{output_path}/phecode_defs_220306.feather").query("endpoint==@endpoints").sort_values("endpoint").set_index("endpoint")

In [30]:
from datetime import date
today = str(date.today())

In [31]:
eligable_eids = pd.read_feather(f"{output_path}/eligable_eids_{today}.feather")
eids_dict = eligable_eids.set_index("endpoint")["eid_list"].to_dict()

In [32]:
%env MKL_NUM_THREADS=4
%env NUMEXPR_NUM_THREADS=4
%env OMP_NUM_THREADS=4

env: MKL_NUM_THREADS=4
env: NUMEXPR_NUM_THREADS=4
env: OMP_NUM_THREADS=4


In [33]:
ray.shutdown()

In [34]:
import ray

ray.init(num_cpus=24)#, dashboard_port=24762, dashboard_host="0.0.0.0", include_dashboard=True)#, webui_url="0.0.0.0"))

{'node_ip_address': '10.32.105.6',
 'raylet_ip_address': '10.32.105.6',
 'redis_address': '10.32.105.6:18860',
 'object_store_address': '/tmp/ray/session_2022-05-25_13-50-42_158303_2615733/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2022-05-25_13-50-42_158303_2615733/sockets/raylet',
 'webui_url': None,
 'session_dir': '/tmp/ray/session_2022-05-25_13-50-42_158303_2615733',
 'metrics_export_port': 34163,
 'node_id': '88c696479797462c0d87258b54e0aa2f975eda1899cef47197e51d2d'}

# Train COX

In [35]:
in_path = pathlib.Path(f"{experiment_path}/coxph/input")
model_path = f"{experiment_path}/coxph/models"

out_path = f"{experiment_path}/coxph/predictions"
pathlib.Path(out_path).mkdir(parents=True, exist_ok=True)

In [36]:
import pickle
import zstandard

def load_pickle(fp):
    with open(fp, "rb") as fh:
        dctx = zstandard.ZstdDecompressor()
        with dctx.stream_reader(fh) as decompressor:
            data = pickle.loads(decompressor.read())
    return data

In [37]:
cox_paths = !ls $model_path
cox_paths = [p for p in cox_paths if "_Retina" in p or "+Retina" in p or "I(" in p]
cox = pd.Series(cox_paths).str.split("_", expand=True)\
    .assign(path = cox_paths)\
    .assign(endpoint = lambda x: x[0]+"_"+x[1])\
    .assign(score = lambda x: x[2])\
    .assign(partition = lambda x: x[3].str.replace(".p", "", regex=True).astype(int))\
    [["endpoint", "score", "partition", "path"]].sort_values(["endpoint", "score", "partition"])\
    .query("endpoint ==@ endpoints")\
    .reset_index(drop=True)
cox

Unnamed: 0,endpoint,score,partition,path
0,OMOP_4306655,Age+Sex+Retina,0,OMOP_4306655_Age+Sex+Retina_0.p
1,OMOP_4306655,Retina,0,OMOP_4306655_Retina_0.p
2,phecode_052,Age+Sex+Retina,0,phecode_052_Age+Sex+Retina_0.p
3,phecode_052,Retina,0,phecode_052_Retina_0.p
4,phecode_052-1,Age+Sex+Retina,0,phecode_052-1_Age+Sex+Retina_0.p
...,...,...,...,...
991,phecode_977-52,Retina,0,phecode_977-52_Retina_0.p
992,phecode_977-7,Age+Sex+Retina,0,phecode_977-7_Age+Sex+Retina_0.p
993,phecode_977-7,Retina,0,phecode_977-7_Retina_0.p
994,phecode_977-72,Age+Sex+Retina,0,phecode_977-72_Age+Sex+Retina_0.p


In [38]:
#endpoints = sorted(cox.endpoint.unique().tolist())
scores = sorted(cox.score.unique().tolist())
partitions = sorted(cox.partition.unique().tolist())

In [39]:
#import ray

@ray.remote
def get_cox_info(p):
    cph = load_pickle(f"{model_path}/{p}")
    p_split = p.split("_")
    endpoint = f"{p_split[0]}_{p_split[1]}"
    score = p_split[2]
    partition = p_split[3][:-2]
    hrs = cph.hazard_ratios_.to_dict()
    
    if score=="Age+Sex+MedicalHistory+I(Age*MH)":
        hr_ret = hrs[endpoint.replace("-", "")]
        
        key_int_age = [k for k in hrs if "age_at_recruitment_f21022_0_0" in k and endpoint.replace("-", "") in k][0]
        hr_ret_age = hrs[key_int_age]
        
        try:
            key_int_sex = [k for k in hrs if "sex_f31_0_0" in k and endpoint.replace("-", "") in k][0]
            hr_ret_sex = hrs[key_int_sex]
        except:
            hr_ret_sex = np.nan
    else:
        hr_ret = hrs[endpoint] 
        hr_ret_age = np.nan
        hr_ret_sex = np.nan
        
    return {"endpoint": endpoint, "score": score, "partition": partition, "hrs": hrs, 
            "hrs_ret": hr_ret, 
            "hrs_ret_age": hr_ret_age, 
            "hrs_ret_sex": hr_ret_sex
           }

In [40]:
rows = []

for p in tqdm(cox.path.tolist()):
    rows.append(get_cox_info.remote(p))

  0%|          | 0/996 [00:00<?, ?it/s]

In [41]:
rows = [ray.get(r) for r in tqdm(rows)]

  0%|          | 0/996 [00:00<?, ?it/s]

In [42]:
rows[10]

{'endpoint': 'phecode_056',
 'score': 'Age+Sex+Retina',
 'partition': '0',
 'hrs': {'age_at_recruitment_f21022_0_0': 0.9808010715409536,
  'sex_f31_0_0': 0.952271183601438,
  'phecode_056': 1.0233563959046248},
 'hrs_ret': 1.0233563959046248,
 'hrs_ret_age': nan,
 'hrs_ret_sex': nan}

In [43]:
hrs_endpoints = pd.DataFrame({}).append(rows, ignore_index=True)

In [44]:
hrs_endpoints 

Unnamed: 0,endpoint,score,partition,hrs,hrs_ret,hrs_ret_age,hrs_ret_sex
0,OMOP_4306655,Age+Sex+Retina,0,{'age_at_recruitment_f21022_0_0': 1.6914116214...,1.602345,,
1,OMOP_4306655,Retina,0,{'OMOP_4306655': 2.336103909521066},2.336104,,
2,phecode_052,Age+Sex+Retina,0,{'age_at_recruitment_f21022_0_0': 1.0976916029...,1.091788,,
3,phecode_052,Retina,0,{'phecode_052': 1.1146724721387116},1.114672,,
4,phecode_052-1,Age+Sex+Retina,0,{'age_at_recruitment_f21022_0_0': 0.8123810820...,1.056389,,
...,...,...,...,...,...,...,...
991,phecode_977-52,Retina,0,{'phecode_977-52': 0.5545877399512523},0.554588,,
992,phecode_977-7,Age+Sex+Retina,0,{'age_at_recruitment_f21022_0_0': 0.8336892946...,2.149991,,
993,phecode_977-7,Retina,0,{'phecode_977-7': 2.0171895465419705},2.017190,,
994,phecode_977-72,Age+Sex+Retina,0,{'age_at_recruitment_f21022_0_0': 1.3521349906...,0.879994,,


In [45]:
name = f"hrs_endpoints"
hrs_endpoints.to_feather(f"{experiment_path}/{name}.feather")

In [46]:
hrs_endpoints

Unnamed: 0,endpoint,score,partition,hrs,hrs_ret,hrs_ret_age,hrs_ret_sex
0,OMOP_4306655,Age+Sex+Retina,0,{'age_at_recruitment_f21022_0_0': 1.6914116214...,1.602345,,
1,OMOP_4306655,Retina,0,{'OMOP_4306655': 2.336103909521066},2.336104,,
2,phecode_052,Age+Sex+Retina,0,{'age_at_recruitment_f21022_0_0': 1.0976916029...,1.091788,,
3,phecode_052,Retina,0,{'phecode_052': 1.1146724721387116},1.114672,,
4,phecode_052-1,Age+Sex+Retina,0,{'age_at_recruitment_f21022_0_0': 0.8123810820...,1.056389,,
...,...,...,...,...,...,...,...
991,phecode_977-52,Retina,0,{'phecode_977-52': 0.5545877399512523},0.554588,,
992,phecode_977-7,Age+Sex+Retina,0,{'age_at_recruitment_f21022_0_0': 0.8336892946...,2.149991,,
993,phecode_977-7,Retina,0,{'phecode_977-7': 2.0171895465419705},2.017190,,
994,phecode_977-72,Age+Sex+Retina,0,{'age_at_recruitment_f21022_0_0': 1.3521349906...,0.879994,,


In [47]:
cph.plot()

NameError: name 'cph' is not defined

In [None]:
#[[]]

In [None]:
cph.print_summary()