# Benchmarks

## Initialize

In [1]:
%load_ext autoreload
%autoreload 2

import os
from tqdm.auto import tqdm
import pathlib

import numpy as np
import pandas as pd
import lifelines

In [2]:
%env MKL_NUM_THREADS=1
%env NUMEXPR_NUM_THREADS=1
%env OMP_NUM_THREADS=1

env: MKL_NUM_THREADS=1
env: NUMEXPR_NUM_THREADS=1
env: OMP_NUM_THREADS=1


In [3]:
ray.shutdown()

NameError: name 'ray' is not defined

In [4]:
import ray
ray.init(num_cpus=24)#, webui_url="0.0.0.0")

{'node_ip_address': '10.32.105.6',
 'raylet_ip_address': '10.32.105.6',
 'redis_address': '10.32.105.6:65170',
 'object_store_address': '/tmp/ray/session_2022-05-25_11-43-02_882196_3787670/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2022-05-25_11-43-02_882196_3787670/sockets/raylet',
 'webui_url': None,
 'session_dir': '/tmp/ray/session_2022-05-25_11-43-02_882196_3787670',
 'metrics_export_port': 57067,
 'node_id': 'c2440f6ee9c44c208b27600c5d0b610cc1638d4777424ddf9defa032'}

In [5]:
node = !hostname
if "sc" in node[0]:
    base_path = "/sc-projects/sc-proj-ukb-cvd"
else: 
    base_path = "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
print(base_path)

project_label = "22_retina_phewas"
project_path = f"{base_path}/results/projects/{project_label}"
figure_path = f"{project_path}/figures"
output_path = f"{project_path}/data"

##### BEGIN ADAPT #####
# second best model
# wandb_name = 'aug++_convnext_s_mlp'
# wandb_id = '8ngm6apd'
# best model
wandb_name = 'aug++_convnext_s_mlp+'
wandb_id = '3p3smraz'
partitions = [0] # [i for i in range(22)]
##### END   ADAPT #####

experiment = wandb_id
experiment_path = f"{output_path}/{experiment}"
pathlib.Path(experiment_path).mkdir(parents=True, exist_ok=True)

/sc-projects/sc-proj-ukb-cvd


In [6]:
in_path = f"{experiment_path}/coxph/predictions"

In [7]:
prediction_paths = !ls $in_path
predictions = pd.Series(prediction_paths).str.split("_", expand=True)\
    .assign(path = prediction_paths)\
    .assign(endpoint = lambda x: x[0]+"_"+x[1])\
    .assign(score = lambda x: x[2])\
    .assign(partition = lambda x: x[3].str.replace(".feather", "", regex=True).astype(int))\
    [["endpoint", "score", "partition", "path"]].sort_values(["endpoint", "score", "partition"]).reset_index(drop=True)
predictions

Unnamed: 0,endpoint,score,partition,path
0,OMOP_4306655,Age+Sex,0,OMOP_4306655_Age+Sex_0.feather
1,OMOP_4306655,Age+Sex+Retina,0,OMOP_4306655_Age+Sex+Retina_0.feather
2,OMOP_4306655,Retina,0,OMOP_4306655_Retina_0.feather
3,phecode_052,Age+Sex,0,phecode_052_Age+Sex_0.feather
4,phecode_052,Age+Sex+Retina,0,phecode_052_Age+Sex+Retina_0.feather
...,...,...,...,...
1489,phecode_977-7,Age+Sex+Retina,0,phecode_977-7_Age+Sex+Retina_0.feather
1490,phecode_977-7,Retina,0,phecode_977-7_Retina_0.feather
1491,phecode_977-72,Age+Sex,0,phecode_977-72_Age+Sex_0.feather
1492,phecode_977-72,Age+Sex+Retina,0,phecode_977-72_Age+Sex+Retina_0.feather


In [8]:
import pandas as pd
all_endpoints = sorted([l.replace('_prevalent', '') for l in list(pd.read_csv('/sc-projects/sc-proj-ukb-cvd/results/projects/22_medical_records/data/220413/frequent_endpoints.csv').endpoint.values)])

#all_endpoints = sorted(endpoints_all_md.endpoint.to_list())
print(len(all_endpoints))

endpoints_not_overlapping_with_preds = []
#endpoints_not_overlapping_with_preds_md = pd.read_csv(f"{experiment_path}/endpoints_not_overlapping.csv", header=None)
#print(len(endpoints_not_overlapping_with_preds_md))
#endpoints_not_overlapping_with_preds = list(endpoints_not_overlapping_with_preds_md[0].values)

endpoints = []
for c in all_endpoints:
    if c not in endpoints_not_overlapping_with_preds: # this is what i want
        #print('OK    - ',c)
        endpoints.append(c)
    #if c in endpoints_not_overlapping_with_preds: # this is what causes errors!
    #    print('ERROR - ',c)
print(len(endpoints))

498
498


In [9]:
scores = ['Age+Sex', 'Retina', 'Age+Sex+Retina']
partitions = sorted(predictions.partition.unique().tolist())

In [10]:
from datetime import date
today = str(date.today())

In [11]:
eligable_eids = pd.read_feather(f"{output_path}/eligable_eids_{today}.feather")
eids_dict = eligable_eids.set_index("endpoint")["eid_list"].to_dict()

In [12]:
data_outcomes = pd.read_feather(
    f"{output_path}/baseline_outcomes_220412.feather", 
    columns= ["eid"] + [f"{e}_event" for e in endpoints] + [f"{e}_time" for e in endpoints])\
    .set_index("eid")

In [13]:
eids = data_outcomes.index.values

In [14]:
def read_partitions(endpoint, score, time):
    paths = predictions.query("endpoint==@endpoint").query("score==@score").path.to_list()
    data_preds = pd.concat([pd.read_feather(f"{in_path}/{path}", columns=["eid", f"Ft_{time}"]) 
                      for path in paths], axis=0).set_index("eid").sort_index()
    data_preds.columns = ["Ft"]
    return data_preds

In [15]:
def prepare_data(data_outcomes, endpoint, score, t_eval):
    temp_preds = read_partitions(endpoint, score, t_eval)
    temp_tte = data_outcomes[[f"{endpoint}_event", f"{endpoint}_time"]]
    temp_tte.columns = ["event", "time"]
    temp_data = temp_preds.merge(temp_tte, left_index=True, right_index=True, how="left")
    
    condition = (temp_data['event'] == 0) | (temp_data['time'] > t_eval)
    
    temp_data["event"] = (np.where(condition, 0, 1))
    
    temp_data["time"] = (np.where(condition, t_eval, temp_data['time']))
    return temp_data

In [16]:
from lifelines.utils import concordance_index

def calculate_cindex(data_outcomes, endpoint, score, time, iteration, eids_i):  
    temp_data = prepare_data(data_outcomes, endpoint, score, time)
    temp_data = temp_data[temp_data.index.isin(eids_i)]
    
    try:
        cindex = 1-concordance_index(temp_data["time"], temp_data["Ft"], temp_data["event"])
    except ZeroDivisionError: 
        cindex=np.nan
    return {"endpoint":endpoint, "score": score, "iteration": iteration, "time":time, "cindex":cindex}

@ray.remote
def calculate_iteration(data_outcomes, endpoint, score, time, iteration, eids_i):  
    dicts = []
    for score in scores:
        dicts.append(calculate_cindex(data_outcomes, endpoint, score, 10, iteration, eids_i))
    return dicts

In [17]:
iterations=[i for i in range(1000)] # 100

In [18]:
out_path = f"{experiment_path}/benchmarks"
pathlib.Path(out_path).mkdir(parents=True, exist_ok=True)

In [19]:
ray_outcomes = ray.put(data_outcomes)

rows_ray = []
for endpoint in tqdm(endpoints):
    eids_e = eids_dict[endpoint]
    for iteration in iterations:
        eids_i = np.random.choice(eids_e, size=len(eids_e))
        ds = calculate_iteration.remote(ray_outcomes, endpoint, scores, 10, iteration, eids_i)
        rows_ray.append(ds)

  0%|          | 0/498 [00:00<?, ?it/s]

[2m[36m(calculate_iteration pid=3787841)[0m 
[2m[36m(calculate_iteration pid=3787842)[0m 
[2m[36m(calculate_iteration pid=3787836)[0m 
[2m[36m(calculate_iteration pid=3787846)[0m 
[2m[36m(calculate_iteration pid=3787854)[0m 
[2m[36m(calculate_iteration pid=3787847)[0m 
[2m[36m(calculate_iteration pid=3787845)[0m 
[2m[36m(calculate_iteration pid=3787837)[0m 
[2m[36m(calculate_iteration pid=3787849)[0m 
[2m[36m(calculate_iteration pid=3787855)[0m 
[2m[36m(calculate_iteration pid=3787854)[0m 
[2m[36m(calculate_iteration pid=3787843)[0m 
[2m[36m(calculate_iteration pid=3787844)[0m 
[2m[36m(calculate_iteration pid=3787848)[0m 
[2m[36m(calculate_iteration pid=3787858)[0m 
[2m[36m(calculate_iteration pid=3787856)[0m 
[2m[36m(calculate_iteration pid=3787843)[0m 
[2m[36m(calculate_iteration pid=3787856)[0m 
[2m[36m(calculate_iteration pid=3787849)[0m 
[2m[36m(calculate_iteration pid=3787842)[0m 
[2m[36m(calculate_iteration pid=378785

In [20]:
rows = [ray.get(r) for r in tqdm(rows_ray)]

  0%|          | 0/498000 [00:00<?, ?it/s]

In [21]:
rows_finished = [item for sublist in rows for item in sublist]

In [22]:
benchmark_endpoints = pd.DataFrame({}).append(rows_finished, ignore_index=True)

In [23]:
name = f"benchmark_cindex_agesexcoxph_{today}"
benchmark_endpoints.to_feather(f"{experiment_path}/{name}.feather")

In [24]:
print(f"{experiment_path}/{name}")

/sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas/data/3p3smraz/benchmark_cindex_agesexcoxph_2022-05-25


In [25]:
len(rows_finished), len(rows)

(1494000, 498000)

In [26]:
pd.DataFrame({}).append(rows_finished, ignore_index=True).to_feather(f"{out_path}/{endpoint}.feather")

In [None]:
%%time
temp_data = prepare_data(data_outcomes, endpoint, score, 10)