# Benchmarks

## Initialize

In [1]:
import os
import math
import pathlib
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.feather as feather
from tqdm.auto import tqdm
from IPython.display import clear_output

import warnings
from lifelines.utils import CensoringType
from lifelines.utils import concordance_index

In [2]:
# --- Paths and experiment configuration -------------------------------------
# Resolve the host name in pure Python instead of shelling out with `!hostname`,
# so the cell is plain Python and does not depend on an external command.
# `node` stays list-shaped (as the IPython capture was) so `node[0]` keeps working.
node = [os.uname().nodename]
if "sc" in node[0]:
    base_path = "/sc-projects/sc-proj-ukb-cvd"
else:
    base_path = "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
print(base_path)

# Project-level directories derived from the host-specific base path.
project_label = "22_retina_phewas"
project_path = f"{base_path}/results/projects/{project_label}"
figure_path = f"{project_path}/figures"
output_path = f"{project_path}/data"

pathlib.Path(figure_path).mkdir(parents=True, exist_ok=True)
pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)

# Experiment identifier; all inputs/outputs of this notebook live below experiment_path.
experiment = '230905'
experiment_path = f"{output_path}/{experiment}"
print('experiment path:', experiment_path)
pathlib.Path(experiment_path).mkdir(parents=True, exist_ok=True)

# Maps a prediction-file label to a display name; the commented entries are
# alternative crop-ratio runs that are currently disabled.
name_dict = {
#     "predictions_cropratio0.3": "ConvNextSmall(Retina)+MLP_cropratio0.3",
#     "predictions_cropratio0.5": "ConvNextSmall(Retina)+MLP_cropratio0.5",
#    "predictions_cropratio0.66": "ConvNextSmall(Retina)+MLP_cropratio0.66",
    "predictions": "ConvNextSmall(Retina)+MLP_cropratio0.66",
}

#partitions = [i for i in range(22)]
partitions = [4, 5, 7, 9, 10, 20] # Partitions with eye test centers

/sc-projects/sc-proj-ukb-cvd
experiment path: /sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas/data/230905


In [3]:
# Date tag (YYMMDD) of the data folder that holds endpoints.csv.
today = "230905"

In [4]:
import pandas as pd

# Load the endpoint list for this run and strip the "_prevalent" marker from the names.
# The file is resolved through `output_path` (derived from the host-specific `base_path`)
# rather than a hard-coded /sc-projects root, so the cell also works on the non-"sc" host.
endpoints_fp = f"{output_path}/{today}/endpoints.csv"
endpoints = sorted(
    name.replace('_prevalent', '')
    for name in pd.read_csv(endpoints_fp).endpoint.values
)

In [5]:
# Directory with the CoxPH prediction files of this experiment; created if missing.
out_path = f"{experiment_path}/coxph/predictions"
os.makedirs(out_path, exist_ok=True)

In [6]:
from sklearn.preprocessing import StandardScaler
import pickle
import zstandard

def read_data(fp_in):
    """Read a feather file and return it as a DataFrame indexed by its ``eid`` column.

    Parameters
    ----------
    fp_in : str or path-like
        Location of the feather file; it is formatted into a string before reading.

    Returns
    -------
    pandas.DataFrame
        The file contents with ``eid`` moved from the columns into the index.
    """
    frame = pd.read_feather(f"{fp_in}")
    return frame.set_index("eid")
    
def save_pickle(data, data_path):
    """Pickle ``data`` and write it zstandard-compressed to ``data_path``.

    Parameters
    ----------
    data : object
        Any picklable object; serialized with ``pickle.HIGHEST_PROTOCOL``.
    data_path : str or path-like
        Destination file, opened in binary write mode (an existing file is overwritten).
    """
    # The file is opened first, then the compressor is created and wrapped around it;
    # the context managers are released in reverse order (stream writer, then file).
    with open(data_path, "wb") as sink, zstandard.ZstdCompressor().stream_writer(sink) as writer:
        payload = pickle.dumps(data, protocol=pickle.HIGHEST_PROTOCOL)
        writer.write(payload)
    
def read_predictions(endpoint, feature_set, partition, model):
    """Load the prediction file for one endpoint / feature set / model / partition.

    The file is looked up in the module-level ``out_path`` directory under the name
    ``{endpoint}_{feature_set}_{model}_{partition}.feather`` and read via ``read_data``.

    Returns
    -------
    pandas.DataFrame
        Whatever ``read_data`` returns for that file.
    """
    # For mean-aggregated predictions the identifier would carry a "_mean" suffix:
    #   f"{endpoint}_{feature_set}_{model}_{partition}_mean"
    file_stem = f"{endpoint}_{feature_set}_{model}_{partition}"
    return read_data(f"{out_path}/{file_stem}.feather")

In [7]:
# Every sub-directory of the CoxPH input folder is treated as one model run;
# notebook checkpoint folders are skipped.
model_path = pathlib.Path(f"{experiment_path}/coxph/input")
# iterdir() yields entries in arbitrary order, so sort to keep `models`
# (and everything derived from it) deterministic across runs and hosts.
models = sorted(
    f.name
    for f in model_path.iterdir()
    if f.is_dir() and "ipynb_checkpoints" not in str(f)
)
models

['ImageTraining_[]_ConvNeXt_MLPHead_predictions']

In [8]:
# Availability scan: try to read the prediction file of every
# endpoint x feature set x partition x model combination and record whether it worked.
feature_sets = [
    "Age+Sex",
    "Retina",
    "Age+Sex+Retina",
    "SCORE2",
    "SCORE2+Retina",
    "ASCVD",
    "ASCVD+Retina",
    "QRISK3",
    "QRISK3+Retina",
]

d = []

for endpoint in tqdm(endpoints):
    for feature_set in feature_sets:
        for partition in partitions:
            for model in models:
                try:
                    read_predictions(endpoint, feature_set, partition, model)
                    available = True
                except Exception:
                    # Missing/unreadable file. `Exception` (not a bare except) so that
                    # KeyboardInterrupt / SystemExit still stop the loop.
                    available = False
                # Success and failure rows share the same keys; previously failure rows
                # lacked "model" and therefore vanished from the per-model groupby.
                d.append({
                    "endpoint": endpoint,
                    "features": feature_set,
                    "model": model,
                    "partition": partition,
                    "available": available,
                })

  0%|          | 0/752 [00:00<?, ?it/s]

In [9]:
# Distinct availability flags found by the scan (a lone True means every file could be read).
pd.DataFrame(d)["available"].unique()

array([ True])

In [10]:
# Number of readable prediction files per feature set.
(
    pd.DataFrame(d)
    .groupby(["features"])["available"]
    .sum()
    .to_frame()
)

Unnamed: 0_level_0,available
features,Unnamed: 1_level_1
ASCVD,4512
ASCVD+Retina,4512
Age+Sex,4512
Age+Sex+Retina,4512
QRISK3,4512
QRISK3+Retina,4512
Retina,4512
SCORE2,4512
SCORE2+Retina,4512


In [11]:
# Number of readable prediction files per model and feature set.
(
    pd.DataFrame(d)
    .groupby(["model", "features"])["available"]
    .sum()
    .to_frame()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,available
model,features,Unnamed: 2_level_1
ImageTraining_[]_ConvNeXt_MLPHead_predictions,ASCVD,4512
ImageTraining_[]_ConvNeXt_MLPHead_predictions,ASCVD+Retina,4512
ImageTraining_[]_ConvNeXt_MLPHead_predictions,Age+Sex,4512
ImageTraining_[]_ConvNeXt_MLPHead_predictions,Age+Sex+Retina,4512
ImageTraining_[]_ConvNeXt_MLPHead_predictions,QRISK3,4512
ImageTraining_[]_ConvNeXt_MLPHead_predictions,QRISK3+Retina,4512
ImageTraining_[]_ConvNeXt_MLPHead_predictions,Retina,4512
ImageTraining_[]_ConvNeXt_MLPHead_predictions,SCORE2,4512
ImageTraining_[]_ConvNeXt_MLPHead_predictions,SCORE2+Retina,4512


In [12]:
# Build an index of all prediction files by parsing their file names.
in_path = f"{experiment_path}/coxph/predictions"

# List the directory in pure Python (instead of `!ls $in_path`), restricted to
# *.feather files so stray entries cannot break the integer partition parsing.
prediction_paths = sorted(p.name for p in pathlib.Path(in_path).glob("*.feather"))
if not prediction_paths:
    raise FileNotFoundError(f"no .feather prediction files found in {in_path}")
print(prediction_paths[0])

# File names are parsed positionally after splitting on "_":
#   tokens 0-1 -> endpoint, token 2 -> score, tokens 3-7 -> model,
#   token 8 -> "<partition>.feather".
name_parts = pd.Series(prediction_paths).str.split("_", expand=True)
assert name_parts.shape[1] == 9, "unexpected number of '_'-separated tokens in prediction file names"

predictions = (
    name_parts
    .assign(path=prediction_paths)
    .assign(endpoint=lambda x: x[0] + "_" + x[1])
    .assign(score=lambda x: x[2])
    .assign(model=lambda x: x[3] + "_" + x[4] + "_" + x[5] + "_" + x[6] + "_" + x[7])
    # regex=False: ".feather" is a literal suffix; as a regex the dot would match any character.
    .assign(partition=lambda x: x[8].str.replace(".feather", "", regex=False).astype(int))
    [["model", "endpoint", "score", "partition", "path"]]
    .sort_values(["model", "endpoint", "score", "partition"])
    .reset_index(drop=True)
)
predictions

OMOP_4306655_Age+Sex_ImageTraining_[]_ConvNeXt_MLPHead_predictions_10.feather


Unnamed: 0,model,endpoint,score,partition,path
0,ImageTraining_[]_ConvNeXt_MLPHead_predictions,OMOP_4306655,ASCVD,4,OMOP_4306655_ASCVD_ImageTraining_[]_ConvNeXt_M...
1,ImageTraining_[]_ConvNeXt_MLPHead_predictions,OMOP_4306655,ASCVD,5,OMOP_4306655_ASCVD_ImageTraining_[]_ConvNeXt_M...
2,ImageTraining_[]_ConvNeXt_MLPHead_predictions,OMOP_4306655,ASCVD,7,OMOP_4306655_ASCVD_ImageTraining_[]_ConvNeXt_M...
3,ImageTraining_[]_ConvNeXt_MLPHead_predictions,OMOP_4306655,ASCVD,9,OMOP_4306655_ASCVD_ImageTraining_[]_ConvNeXt_M...
4,ImageTraining_[]_ConvNeXt_MLPHead_predictions,OMOP_4306655,ASCVD,10,OMOP_4306655_ASCVD_ImageTraining_[]_ConvNeXt_M...
...,...,...,...,...,...
40603,ImageTraining_[]_ConvNeXt_MLPHead_predictions,phecode_997,SCORE2+Retina,5,phecode_997_SCORE2+Retina_ImageTraining_[]_Con...
40604,ImageTraining_[]_ConvNeXt_MLPHead_predictions,phecode_997,SCORE2+Retina,7,phecode_997_SCORE2+Retina_ImageTraining_[]_Con...
40605,ImageTraining_[]_ConvNeXt_MLPHead_predictions,phecode_997,SCORE2+Retina,9,phecode_997_SCORE2+Retina_ImageTraining_[]_Con...
40606,ImageTraining_[]_ConvNeXt_MLPHead_predictions,phecode_997,SCORE2+Retina,10,phecode_997_SCORE2+Retina_ImageTraining_[]_Con...


In [13]:
# Persist the parsed prediction-file index inside the experiment directory.
prediction_index_fp = f"{experiment_path}/prediction_paths.feather"
predictions.to_feather(prediction_index_fp)

In [14]:
# Display the resolved experiment directory.
# NOTE(review): the saved output of this cell ends in ".../data/230426", while the
# configuration cell sets experiment = '230905' -- the stored output appears stale
# (out-of-order or earlier execution); confirm with Restart Kernel -> Run All.
experiment_path

'/sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas/data/230426'