# Benchmarks

## Initialize

In [1]:
import os
import math
import pathlib
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from IPython.display import clear_output

import warnings
from lifelines.utils import CensoringType
from lifelines.utils import concordance_index

In [2]:
node = !hostname
if "sc" in node[0]:
    base_path = "/sc-projects/sc-proj-ukb-cvd"
else: 
    base_path = "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
print(base_path)

project_label = "22_medical_records"
project_path = f"{base_path}/results/projects/{project_label}"
figure_path = f"{project_path}/figures"
output_path = f"{project_path}/data"

experiment = 220413
experiment_path = f"{output_path}/{experiment}"
pathlib.Path(experiment_path).mkdir(parents=True, exist_ok=True)

/sc-projects/sc-proj-ukb-cvd


In [3]:
# Endpoint definition table, ordered by its "endpoint" column.
endpoint_defs = pd.read_feather(f"{output_path}/phecode_defs_220306.feather")
endpoint_defs = endpoint_defs.sort_values("endpoint")

In [4]:
# Endpoints analysed in this notebook; the matching outcome columns are named
# "<endpoint>_event". The list could alternatively be derived from those columns:
#   [e[:-6] for e in data_outcomes.columns if "_event" in e]
endpoints = [
    f"phecode_{code}"
    for code in (
        "008", "092-2", "105", "107-2", "164", "202-2",
        "284", "292", "324-11", "328", "371", "401",
        "404", "424", "440-11", "468", "474", "522-1",
        "542-1", "581-1", "583", "665", "705-1",
    )
]
# The only endpoint in the list that is not a phecode.
endpoints.append("OMOP_4306655")

In [5]:
# Load the baseline outcomes indexed by eid and keep only the "<endpoint>_event"
# columns of the selected endpoints (c[:-6] strips the 6-character "_event" suffix).
data_outcomes = (
    pd.read_feather(f"{output_path}/baseline_outcomes_220412.feather")
    .set_index("eid")
    .loc[:, lambda frame: [c for c in frame.columns if "_event" in c and c[:-6] in endpoints]]
)

In [6]:
data_shap = pd.read_feather(f"{experiment_path}/shap_local.feather")

# Keep every record name that contains none of the age, sex or phecode markers.
records = [
    name
    for name in data_shap["record"]
    if not any(
        marker in name
        for marker in (
            "age_at_recruitment_f21022_0_0",
            "sex_f31_0_0_Female",
            "sex_f31_0_0_Male",
            "phecode_",
        )
    )
]

In [7]:
# Read only the eid column plus the selected record columns, then index by eid.
data_records = pd.read_feather(
    f"{output_path}/baseline_records_220412.feather",
    columns=["eid", *records],
).set_index("eid")

In [8]:
# Left-join the outcome columns onto the records by index (eid); every row of
# data_records is kept.
data_all = pd.merge(
    data_records,
    data_outcomes,
    how="left",
    left_index=True,
    right_index=True,
)

In [9]:
# Preview the merged frame (record columns followed by the "*_event" outcome columns).
data_all

Unnamed: 0_level_0,OMOP_1000560,OMOP_1000632,OMOP_1000772,OMOP_1000995,OMOP_1036157,OMOP_1036228,OMOP_1036252,OMOP_1036487,OMOP_1036525,OMOP_1036690,...,phecode_424_event,phecode_328_event,phecode_468_event,phecode_440-11_event,phecode_284_event,phecode_008_event,phecode_092-2_event,phecode_581-1_event,phecode_292_event,phecode_542-1_event
eid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000018,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1000020,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1000037,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1000043,False,True,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
1000051,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6025150,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6025165,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6025173,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6025182,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [10]:
# Table with one "eid_list" entry per endpoint.
eligable_eids = pd.read_feather(f"{output_path}/eligable_eids_220414.feather")

# Lookup: endpoint -> its eid_list entry.
eids_dict = dict(zip(eligable_eids["endpoint"], eligable_eids["eid_list"]))

In [11]:
# Column-wise mean of the record columns (the share of True values when the
# columns are boolean, as the preview of data_all suggests), most frequent first.
record_freqs = data_records.mean()
record_freqs = record_freqs.sort_values(ascending=False)
record_freqs

OMOP_4081598     0.615561
OMOP_4052351     0.540684
OMOP_4061103     0.527156
OMOP_4144272     0.493337
OMOP_4057411     0.443337
                   ...   
OMOP_40482194    0.000050
OMOP_4113107     0.000050
OMOP_4122590     0.000050
OMOP_434951      0.000050
OMOP_4306655     0.000014
Length: 19366, dtype: float64

In [12]:
from nancorrmp.nancorrmp import NaNCorrMp

# Pairwise correlation between all columns of data_all, computed by NaNCorrMp
# (n_jobs / chunks are passed through unchanged; presumably worker count and chunk size).
corr_df = NaNCorrMp.calculate(
    data_all,
    n_jobs=32,
    chunks=1000,
)

In [16]:
# Persist the correlation matrix. reset_index() moves the row labels into a
# regular "index" column first, because feather files do not store a custom index.
corr_df.reset_index().to_feather(f"{experiment_path}/corr.feather")

In [17]:
# Show the location of the correlation feather file.
f"{experiment_path}/corr.feather"

'/sc-projects/sc-proj-ukb-cvd/results/projects/22_medical_records/data/220413/corr.feather'

In [15]:
# Preview the correlation matrix with the row labels exposed as an "index" column.
corr_df.reset_index()

Unnamed: 0,index,OMOP_1000560,OMOP_1000632,OMOP_1000772,OMOP_1000995,OMOP_1036157,OMOP_1036228,OMOP_1036252,OMOP_1036487,OMOP_1036525,...,phecode_424_event,phecode_328_event,phecode_468_event,phecode_440-11_event,phecode_284_event,phecode_008_event,phecode_092-2_event,phecode_581-1_event,phecode_292_event,phecode_542-1_event
0,OMOP_1000560,1.000000,0.012527,0.041327,0.002898,-0.000337,0.009051,0.003080,0.003080,0.001118,...,0.008210,-0.000946,0.007735,0.008020,0.003997,0.000312,0.005506,0.001004,0.008724,0.004405
1,OMOP_1000632,0.012527,1.000000,0.006785,0.082518,0.006415,0.020247,0.047821,0.047821,0.040106,...,0.015130,0.007260,0.025319,0.012269,0.023195,0.020236,0.017388,0.023034,0.011331,0.008986
2,OMOP_1000772,0.041327,0.006785,1.000000,0.002274,-0.000129,0.016549,0.004912,0.004912,0.002850,...,0.005051,0.000623,0.004809,0.001149,0.001460,-0.000711,0.001669,-0.000599,-0.000320,-0.000418
3,OMOP_1000995,0.002898,0.082518,0.002274,1.000000,0.030364,0.006922,0.004444,0.004444,0.473181,...,0.012438,0.009589,0.014531,0.009313,0.010525,0.008431,0.011747,0.003861,0.004813,0.011583
4,OMOP_1036157,-0.000337,0.006415,-0.000129,0.030364,1.000000,-0.000417,-0.000662,-0.000662,0.014621,...,0.004062,0.003028,0.003788,0.001920,0.001131,-0.001248,0.002815,0.000851,-0.000562,-0.000734
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19385,phecode_008_event,0.000312,0.020236,-0.000711,0.008431,-0.001248,0.001209,0.002978,0.002978,0.004396,...,0.005268,0.000928,0.006215,0.002399,0.007894,1.000000,0.006139,0.003966,0.007289,0.003919
19386,phecode_092-2_event,0.005506,0.017388,0.001669,0.011747,0.002815,0.005088,-0.000266,-0.000266,0.003981,...,0.049065,0.048720,0.116693,0.039307,0.015652,0.006139,1.000000,0.046090,0.002185,0.022959
19387,phecode_581-1_event,0.001004,0.023034,-0.000599,0.003861,0.000851,0.001180,0.008042,0.008042,0.001198,...,0.008335,0.002701,0.018008,0.005484,0.006753,0.003966,0.046090,1.000000,-0.001068,0.008971
19388,phecode_292_event,0.008724,0.011331,-0.000320,0.004813,-0.000562,0.002832,0.002013,0.002013,0.000627,...,0.003164,0.003569,0.007293,0.001656,0.023002,0.007289,0.002185,-0.001068,1.000000,0.008063


## PCA

In [None]:
import numpy as np
from sklearn.decomposition import PCA

In [None]:
# Feature matrix for the PCA: every column of data_all except the "*_event"
# outcome columns. The column list needs its own brackets; a bare generator
# expression inside [] is a SyntaxError.
pca_data = data_all[[c for c in data_all.columns if "_event" not in c]]

In [None]:
# Fit a PCA that keeps all components (i.e. 100% of the variance).
# scikit-learn accepts an int, a float strictly between 0 and 1, "mle" or None
# for n_components; the float 1.0 raises a ValueError, so None is used instead.
# NOTE(review): assumes "explain all variance" was the intent; use an int or a
# fraction < 1 here if a reduced number of components was meant.
pca = PCA(n_components=None)
pca.fit(pca_data)

In [12]:
import ray

# Start Ray with 24 CPUs and the dashboard disabled.
# (Dashboard settings used previously: dashboard_port=24763, dashboard_host="0.0.0.0".)
ray.init(
    num_cpus=24,
    include_dashboard=False,
)

{'node_ip_address': '10.32.105.13',
 'raylet_ip_address': '10.32.105.13',
 'redis_address': None,
 'object_store_address': '/tmp/ray/session_2022-04-28_15-13-46_926610_2579164/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2022-04-28_15-13-46_926610_2579164/sockets/raylet',
 'webui_url': None,
 'session_dir': '/tmp/ray/session_2022-04-28_15-13-46_926610_2579164',
 'metrics_export_port': 64722,
 'gcs_address': '10.32.105.13:51011',
 'address': '10.32.105.13:51011',
 'node_id': '3efb0ed08865c18e166779f909341456e589ddd14cf7771c6aa881c5'}

In [None]:
# Spearman rank correlation between all columns of data_all.
# NOTE(review): this rebinds corr_df, which an earlier cell computes with
# NaNCorrMp and writes to corr.feather; the cell has no execution count and
# corr_df is not read again below. Confirm it is still needed.
corr_df = data_all.corr(method="spearman")

In [13]:
@ray.remote
def calc_cor(data_all, eids_dict, record, eids_record, eids_nonrecord, endpoints):
    """Summarise, for one record, how often each endpoint event occurs with and without it.

    For every endpoint the eligible eids are split into those that have the
    record and those that do not. Event counts and event frequencies are
    computed per group, together with a Jaccard index between "has the record"
    and "has the event".

    Args:
        data_all: DataFrame indexed by eid that holds one "<endpoint>_event"
            column per endpoint.
        eids_dict: Mapping endpoint -> eligible eids. The value is indexed with
            an integer array, so it is presumably a numpy array (TODO confirm).
        record: Name of the record; only copied into the output rows.
        eids_record: eids for which the record is present.
        eids_nonrecord: eids for which the record is absent.
        endpoints: Iterable of endpoint names to evaluate.

    Returns:
        A list with one dict per endpoint, with the keys "endpoint",
        "n_eligable", "record", "n_records", "n_events_record",
        "freq_events_record", "n_events_nonrecord", "freq_events_nonrecord"
        and "jaccard". Frequencies and the Jaccard index are NaN when their
        denominator is zero.
    """
    r_ds = []
    groups = {"record": eids_record, "nonrecord": eids_nonrecord}

    for endpoint in endpoints:
        eids_endpoint = eids_dict[endpoint]
        events = data_all[f"{endpoint}_event"]

        group_size, n_events, freq_events = {}, {}, {}
        for key, eids_group in groups.items():
            # Positions of the eligible eids that belong to this group.
            eid_idxs = np.where(np.in1d(eids_endpoint, eids_group, assume_unique=True))[0]
            s = events.loc[eids_endpoint[eid_idxs]]
            n = s.sum()

            group_size[key] = len(eid_idxs)
            n_events[key] = n
            # An empty group has no defined event frequency.
            freq_events[key] = n / len(s) if len(s) else float("nan")

        # Jaccard = |record AND event| / |record OR event| among the eligible eids.
        # The union is everyone with the record plus the events among those without it.
        union_size = n_events["nonrecord"] + group_size["record"]
        jaccard = n_events["record"] / union_size if union_size else float("nan")

        r_ds.append({
            "endpoint": endpoint,
            "n_eligable": len(eids_dict[endpoint]),
            "record": record,
            "n_records": len(eids_record),
            "n_events_record": n_events["record"],
            "freq_events_record": freq_events["record"],
            "n_events_nonrecord": n_events["nonrecord"],
            "freq_events_nonrecord": freq_events["nonrecord"],
            "jaccard": jaccard,
        })
    return r_ds

In [14]:
# Submit one Ray task per record column, then collect the results in
# submission order.
d_nested = []
# Put the large shared inputs into Ray's object store once and pass references,
# instead of shipping them with every task.
ref_data_all = ray.put(data_all)
ref_eids_dict = ray.put(eids_dict)
for record in tqdm(record_freqs.index):
    s_record = data_all[record]
    # Split the eids by whether the record is present or absent.
    eids_record = s_record[s_record == True].index.values  # noqa: E712
    eids_nonrecord = s_record[s_record == False].index.values  # noqa: E712
    # calc_cor is the remote function defined above.
    ref_results = calc_cor.remote(ref_data_all, ref_eids_dict, record, eids_record, eids_nonrecord, endpoints)
    d_nested.append(ref_results)
# Block until every task has finished; each element becomes a list of result dicts.
d_nested = [ray.get(e) for e in tqdm(d_nested)]
# Drop the object-store references now that all tasks are done.
del ref_data_all
del ref_eids_dict

  0%|          | 0/19366 [00:00<?, ?it/s]



  0%|          | 0/19366 [00:00<?, ?it/s]



In [15]:
from itertools import chain

# Flatten the per-record result lists into a single flat list of dicts.
d = list(chain.from_iterable(d_nested))

In [16]:
# One row per result dict; the dict keys become the columns.
endpoints_freqs = pd.DataFrame(d)

In [17]:
# Persist the per-record / per-endpoint statistics table.
endpoints_freqs.to_feather(f"{experiment_path}/record_inc_disease_freq_rnr.feather")

In [35]:
# Keep rows with n > 100, order them by endpoint and renumber the index.
# NOTE(review): the rows built in calc_cor carry n_eligable, n_records,
# n_events_record and n_events_nonrecord but no "n" column, so this query looks
# stale (the execution counts are also out of sequence). Confirm which count
# was meant before re-running.
endpoints_ds = endpoints_freqs.query("n>100").sort_values("endpoint").reset_index(drop=True)

In [36]:
# Export the filtered table as CSV (the DataFrame index is written as the first column).
endpoints_ds.to_csv(f"{experiment_path}/endpoints.csv")

In [37]:
# Show the location of the exported CSV.
f"{experiment_path}/endpoints.csv"

'/sc-projects/sc-proj-ukb-cvd/results/projects/22_medical_records/data/220413/endpoints.csv'