In [1]:
import numpy as np
import pandas as pd
import pathlib
import os
from tqdm.auto import tqdm

import hydra
from omegaconf import DictConfig, OmegaConf

import torch
#from torch_geometric import seed_everything

import ray

In [2]:
node = !hostname
if "sc" in node[0]:
    base_path = "/sc-projects/sc-proj-ukb-cvd"
else: 
    base_path = "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
print(base_path)

project_label = "22_retina_phewas"
project_path = f"{base_path}/results/projects/{project_label}"
figure_path = f"{project_path}/figures"
output_path = f"{project_path}/data"

pathlib.Path(figure_path).mkdir(parents=True, exist_ok=True)
pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)

# experiment = '220812_test'
experiment = '221108'
experiment_path = f"{output_path}/{experiment}"
pathlib.Path(experiment_path).mkdir(parents=True, exist_ok=True)

name_dict = {
#     "predictions_cropratio0.3": "ConvNextSmall(Retina)+MLP_cropratio0.3",
#     "predictions_cropratio0.5": "ConvNextSmall(Retina)+MLP_cropratio0.5",
    "predictions_cropratio0.66": "ConvNextSmall(Retina)+MLP_cropratio0.66",
}

partitions = [i for i in range(22)]

/sc-projects/sc-proj-ukb-cvd


In [3]:
# Endpoint names with the '_prevalent' marker removed, sorted alphabetically.
# The CSV is resolved relative to `base_path` so the cell also works on hosts
# that do not mount /sc-projects (the path was previously hard-coded).
endpoints_csv = f"{base_path}/results/projects/22_retinal_risk/data/220602/endpoints.csv"
endpoint_columns = sorted(
    name.replace('_prevalent', '')
    for name in pd.read_csv(endpoints_csv).endpoint.values
)

In [4]:
#ray.shutdown()
#ray.init(num_cpus=24)
# ray.init(address='auto')

In [5]:
import wandb

# Fetch every run of the W&B project whose tag list contains `tag`.
api = wandb.Api()
entity, project = "cardiors", "retina"  # 'll-cha'
tag = '220812'
runs = api.runs(f"{entity}/{project}", filters={"tags": {"$in": [tag]}})

In [6]:
# Collect one plain-dict record per W&B run.
run_list = []
for run in tqdm(runs):
    # NOTE(review): eval() executes whatever string is stored in the remote run
    # config. If that string is always a plain literal, ast.literal_eval would
    # be the safer choice -- TODO confirm before switching.
    datamodule_cfg = eval(run.config['_content']['datamodule'])

    # Only config entries whose key does not start with an underscore are kept.
    public_config = {key: value for key, value in run.config.items() if not key.startswith('_')}

    # Runs without a recorded predictions path get None.
    # (alternative: f'/sc-projects/sc-proj-ukb-cvd/results/models/retina/{run.id}/checkpoints/predictions/')
    if "predictions_path" in run.config.keys():
        predictions_path = str(pathlib.Path(run.config["predictions_path"]))
    else:
        predictions_path = None

    record = {
        "id": run.id,  # alternative: run.path[-1]
        "name": run.name,
        "tags": run.tags,
        "partition": datamodule_cfg['partition'],
        "config": public_config,
        "summary": run.summary._json_dict,
        "path": predictions_path,
    }
    run_list.append(record)

  0%|          | 0/22 [00:00<?, ?it/s]

In [7]:
# select those w/ predictions path:
runs_df = pd.DataFrame(run_list)
runs_df = runs_df.query('partition in @partitions')#, parser='python')

In [8]:
# Display the filtered run table.
runs_df

Unnamed: 0,id,name,tags,partition,config,summary,path
0,ljhjndx2,220812_fullrun,"[220812, baseline_data, image]",4,{'losses': ['<retinalrisk.models.loss_wrapper....,{'valid/phecode_688-3 - Pyogenic granuloma of ...,/sc-projects/sc-proj-ukb-cvd/results/models/re...
1,1ts15g03,220812_fullrun_9,"[220812, baseline_data, image]",9,{'losses': ['<retinalrisk.models.loss_wrapper....,{'gradients/encoder.features.5.9.block.2.bias'...,/sc-projects/sc-proj-ukb-cvd/results/models/re...
2,jtx4az09,220812_fullrun_15,"[220812, baseline_data, image]",15,{'losses': ['<retinalrisk.models.loss_wrapper....,{'gradients/encoder.features.6.1.bias': {'valu...,/sc-projects/sc-proj-ukb-cvd/results/models/re...
3,10r747tq,220812_fullrun_3,"[220812, baseline_data, image]",3,{'losses': ['<retinalrisk.models.loss_wrapper....,{'valid/phecode_257 - Polydipsia_CIndex': 0.43...,/sc-projects/sc-proj-ukb-cvd/results/models/re...
4,3kkkwx1h,220812_fullrun_8,"[220812, baseline_data, image]",8,{'losses': ['<retinalrisk.models.loss_wrapper....,{'valid/phecode_431-12 - Hemorrhagic stroke_CI...,/sc-projects/sc-proj-ukb-cvd/results/models/re...
5,kr1fcpov,220812_fullrun_7,"[220812, baseline_data, image]",7,{'losses': ['<retinalrisk.models.loss_wrapper....,"{'train/CoxPH_scaled': 4.406993865966797, 'val...",/sc-projects/sc-proj-ukb-cvd/results/models/re...
6,2i64fvxa,220812_fullrun_2,"[220812, baseline_data, image]",2,{'losses': ['<retinalrisk.models.loss_wrapper....,{'gradients/head.layers.6.bias': {'values': [1...,/sc-projects/sc-proj-ukb-cvd/results/models/re...
7,2lsliaj8,220812_fullrun_21,"[220812, baseline_data, image]",21,{'losses': ['<retinalrisk.models.loss_wrapper....,{'valid/phecode_355-1 - Coma_CIndex': 0.519532...,/sc-projects/sc-proj-ukb-cvd/results/models/re...
8,9e6xmhpu,220812_fullrun_14,"[220812, baseline_data, image]",14,{'losses': ['<retinalrisk.models.loss_wrapper....,{'valid/phecode_601-12 - Chronic prostatitis_C...,/sc-projects/sc-proj-ukb-cvd/results/models/re...
9,227i0u0y,220812_fullrun_13,"[220812, baseline_data, image]",13,{'losses': ['<retinalrisk.models.loss_wrapper....,{'gradients/encoder.features.5.3.block.3.bias'...,/sc-projects/sc-proj-ukb-cvd/results/models/re...


In [9]:
# Spot-check the predictions path recorded for the first run in the table.
print(runs_df['path'].iloc[0])

/sc-projects/sc-proj-ukb-cvd/results/models/retina/ljhjndx2/predictions


## Process Predictions

In [10]:
# Identifier columns; prepare_predictions casts each of them to categorical dtype.
id_vars = ["eid", "model", "partition", "split"]

In [11]:
# Root directory for the processed prediction files of this experiment.
out_path = f"{experiment_path}/loghs"
os.makedirs(out_path, exist_ok=True)

In [12]:
# Show the resolved output directory.
out_path

'/sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas/data/221108/loghs'

In [13]:
# @ray.remote
def prepare_predictions(in_path, out_path):
    """Average duplicated-``eid`` predictions and write one feather file per split.

    For every key ``cr`` of the notebook-global ``name_dict`` the file
    ``<in_path>/<cr>.feather`` is read and processed as follows:

    1. rows sharing the same ``eid`` are averaged over their numeric columns;
    2. all columns not listed in the global ``endpoint_columns`` are re-attached
       from the first row of each ``eid``;
    3. columns containing ``'ft'`` or ``'St'`` are dropped and the
       ``'1_10_Ft__'`` prefix is stripped from columns containing ``'Ft'``;
    4. a ``model`` label is built from the ``module``, ``covariate_cols``,
       ``encoder`` and ``head`` columns, which are then dropped;
    5. the columns named in the global ``id_vars`` are cast to ``category``.

    The result is written to
    ``<out_path>/<model>_<cr>/<partition>/<split>.feather`` for the splits
    ``train``, ``valid`` and ``test``; every written path is printed.

    Parameters
    ----------
    in_path : str or path-like
        Directory holding the prediction feather files of a single run.
    out_path : str
        Root directory below which the processed files are written.

    Returns
    -------
    None
    """
    for cr in name_dict:
        in_path_cr = os.path.join(in_path, f'{cr}.feather')
        temp = pd.read_feather(in_path_cr).rename(columns={"index": "eid"}).set_index('eid')

        # Mean over duplicated indices (per the original author: the left and
        # right eye value per eid).
        # numeric_only=True is spelled out because pandas >= 2.0 no longer drops
        # non-numeric columns silently and would raise a TypeError here; the
        # explicit flag reproduces the pandas 1.x default behaviour.
        temp_with_meaned_test_preds = temp.groupby(level=0).mean(numeric_only=True)

        # Recover the columns that are non-endpoints, taking the first row of every eid.
        other_cols = [c for c in temp.columns.values if c not in endpoint_columns]
        temp_with_meaned_test_preds = temp_with_meaned_test_preds.merge(
            temp[other_cols][~temp.index.duplicated()],
            left_index=True,
            right_index=True,
            how='left',
        )

        temp = temp_with_meaned_test_preds.reset_index(drop=False)

        # Drop 'ft' and 'St' columns if present; strip the '1_10_Ft__' prefix
        # from 'Ft' columns.
        cols_to_drop = []
        cols_to_rename = {}
        for col in temp.columns.values:
            if 'ft' in col or 'St' in col:
                cols_to_drop.append(col)
            elif 'Ft' in col:
                cols_to_rename[col] = col.replace('1_10_Ft__', '')

        # Creating 'record_cols' guarantees the drop below never fails on a missing column.
        temp['record_cols'] = None
        temp["model"] = (temp.module.astype(str) + "_" + temp.covariate_cols.astype(str) + "_" + temp.encoder.astype(str) + "_" + temp["head"].astype(str)).astype("category")
        temp = temp.replace({"model":name_dict}).drop(columns=["module", "encoder", "head", "covariate_cols", "record_cols"]).drop(columns=cols_to_drop).rename(columns=cols_to_rename)
        for c in id_vars:
            temp[c] = temp[c].astype("category")

        # The first model / partition value names the output directory.
        model = temp.model.unique()[0]
        model = f'{model}_{cr}'
        partition = temp.partition.unique()[0]

        # Optional skip of already-written partitions (disabled):
#         if os.path.exists(f"{out_path}/{model}/{partition}/train.feather"):
#             if os.path.exists(f"{out_path}/{model}/{partition}/test.feather") and os.path.exists(f"{out_path}/{model}/{partition}/valid.feather"):
#                 print(f'skipping {partition} as already exists')
#                 continue

        # The target directory does not depend on the split, so create it once.
        fp_out = f"{out_path}/{model}/{partition}"
        pathlib.Path(fp_out).mkdir(parents=True, exist_ok=True)

        for split in ["train", "valid", "test"]: #"test_left", 'test_right'
            split_file = f"{fp_out}/{split}.feather"
            t = temp.query("split==@split")
            t.reset_index(drop=True).to_feather(split_file)
            print(split_file)

In [None]:
# Process the predictions of every run in `runs_df`, one after another.
# Iterating the path column directly avoids the unused `id` variable that
# shadowed the builtin `id` for the rest of the kernel session.
for p in tqdm(runs_df['path']):
#     prepare_predictions.remote(p, out_path)
    prepare_predictions(p, out_path)

  0%|          | 0/22 [00:00<?, ?it/s]

/sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas/data/221108/loghs/ImageTraining_[]_ConvNeXt_MLPHead_predictions_cropratio0.66/4/train.feather
/sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas/data/221108/loghs/ImageTraining_[]_ConvNeXt_MLPHead_predictions_cropratio0.66/4/valid.feather
/sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas/data/221108/loghs/ImageTraining_[]_ConvNeXt_MLPHead_predictions_cropratio0.66/4/test.feather
/sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas/data/221108/loghs/ImageTraining_[]_ConvNeXt_MLPHead_predictions_cropratio0.66/9/train.feather
/sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas/data/221108/loghs/ImageTraining_[]_ConvNeXt_MLPHead_predictions_cropratio0.66/9/valid.feather
/sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas/data/221108/loghs/ImageTraining_[]_ConvNeXt_MLPHead_predictions_cropratio0.66/9/test.feather
/sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phew

In [None]:
# Show the directory the processed files were written to.
out_path

In [None]:
# List the contents of the model's output directory below out_path.
!ls -lah {out_path}/'ImageTraining_[]_ConvNeXt_MLPHead_predictions_cropratio0.66'

In [None]:
# Show the output directory once more.
out_path