In [1]:
import pathlib
import socket

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import hydra
from omegaconf import DictConfig, OmegaConf

import torch
#from torch_geometric import seed_everything

import ray

In [2]:
# Pick the storage root for the machine this notebook runs on. The host name is
# resolved in pure Python (no IPython shell escape) because every path below
# depends on it.
hostname = socket.gethostname()
node = [hostname]  # one-element list, same shape the former `!hostname` capture had
if "sc" in hostname:
    base_path = "/sc-projects/sc-proj-ukb-cvd"
else:
    base_path = "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
print(base_path)

# Project layout: <base_path>/results/projects/<project_label>/{figures,data}
project_label = "22_retina_phewas_220603_fullrun"
project_path = f"{base_path}/results/projects/{project_label}"
figure_path = f"{project_path}/figures"
output_path = f"{project_path}/data"

# Every experiment gets its own sub-directory below the data directory.
experiment = '220603_fullrun'
experiment_path = f"{output_path}/{experiment}"

# Create the whole directory tree up front; directories that exist are left as they are.
for _directory in (figure_path, output_path, experiment_path):
    pathlib.Path(_directory).mkdir(parents=True, exist_ok=True)

# Maps the stem of each prediction file (one per crop ratio) to a display name.
# The keys also determine which files prepare_predictions reads.
name_dict = {
    f"predictions_cropratio{ratio}": f"ConvNextSmall(Retina)+MLP_cropratio{ratio}"
    for ratio in ("0.3", "0.5", "0.8")
}

# Partition indices 0..21.
partitions = list(range(22))
partitions

/sc-projects/sc-proj-ukb-cvd


In [3]:
# Endpoint names, sorted, with the '_prevalent' marker removed. The file is
# located via base_path so the cell follows the host-dependent storage root
# instead of a fixed absolute path.
endpoints_file = f"{base_path}/results/projects/22_retinal_risk/data/220602/endpoints.csv"
endpoint_columns = sorted(
    name.replace('_prevalent', '') for name in pd.read_csv(endpoints_file)["endpoint"]
)

In [4]:
# Attach to the already running Ray cluster. For a private local instance
# use `ray.shutdown()` followed by `ray.init(num_cpus=24)` instead.
# ignore_reinit_error keeps the cell re-runnable in a live kernel: a second
# ray.init() call would otherwise raise.
ray.init(address='auto', ignore_reinit_error=True)

RayContext(dashboard_url='', python_version='3.9.7', ray_version='1.12.1', ray_commit='4863e33856b54ccf8add5cbe75e41558850a1b75', address_info={'node_ip_address': '10.32.105.3', 'raylet_ip_address': '10.32.105.3', 'redis_address': None, 'object_store_address': '/tmp/ray/session_2022-06-22_23-07-33_053617_2867176/sockets/plasma_store', 'raylet_socket_name': '/tmp/ray/session_2022-06-22_23-07-33_053617_2867176/sockets/raylet', 'webui_url': '', 'session_dir': '/tmp/ray/session_2022-06-22_23-07-33_053617_2867176', 'metrics_export_port': 63529, 'gcs_address': '10.32.105.3:6379', 'address': '10.32.105.3:6379', 'node_id': '73cd632bd19b155e9337bb2335f96117c6cde321e3b587c86b37604b'})

In [5]:
import wandb

# Fetch every run of the W&B project that carries the experiment tag.
api = wandb.Api()
entity = "cardiors"
project = "retina"  # the original line carried the remark 'll-cha' here
tag = '220603_fullrun'
runs = api.runs(f"{entity}/{project}", filters={"tags": {"$in": [tag]}})

In [6]:
# Root directory with one sub-directory per W&B run id; derived from base_path
# so it follows the host-dependent storage root.
models_path = f"{base_path}/results/models/retina"

# Collect the metadata of every run as a plain dict (one future DataFrame row each).
run_list = []
for run in tqdm(runs):
    run_list.append(
        {
            "id": run.id,
            "name": run.name,
            "tags": run.tags,
            # skip config entries whose key starts with an underscore
            "config": {k: v for k, v in run.config.items() if not k.startswith('_')},
            "summary": run.summary._json_dict,
            # the trailing slash matters: prepare_predictions appends the file
            # name to this string without adding a separator
            "path": f"{models_path}/{run.id}/checkpoints/predictions/",
        }
    )

  0%|          | 0/22 [00:00<?, ?it/s]

In [7]:
# One row per collected run; print the first predictions directory as a sanity check.
runs_df = pd.DataFrame(run_list)
print(runs_df.iloc[0]['path'])

/sc-projects/sc-proj-ukb-cvd/results/models/retina/i4np1grc/checkpoints/predictions/


In [8]:
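# Optional extra filter (disabled): keep only runs whose tags / id contain the
# experiment tag and whose path is not NaN (`path==path` is False for NaN).
# The W&B query above already filters by tag.
#tag = experiment # or 'your_experiment_tag'
# runs_df = runs_df[runs_df.tags.astype(str).str.contains(tag)].query("path==path")
#runs_df = runs_df[runs_df.id.astype(str).str.contains(tag)].query("path==path")
#runs_df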

## Process Predictions

In [11]:
# Identifier columns; prepare_predictions casts each of them to the category dtype.
id_vars = ["eid", "model", "partition", "split"]

In [12]:
# Root directory for the processed prediction files; passed to
# prepare_predictions, which creates <model>/<partition>/ sub-directories below it.
out_path = f"{experiment_path}/loghs"
pathlib.Path(out_path).mkdir(parents=True, exist_ok=True)

In [13]:
@ray.remote
def prepare_predictions(in_path, out_path):
    """Reshape one run's prediction files into per-split feather files.

    For every key ``cr`` of the module-level ``name_dict`` the file
    ``f"{in_path}{cr}.feather"`` is read, rows sharing an ``eid`` are collapsed
    into a single row, helper columns are dropped or renamed, and the result is
    written to ``{out_path}/{model}_{cr}/{partition}/{split}.feather`` for the
    splits ``train``, ``valid`` and ``test``.

    Relies on the module-level names ``name_dict``, ``endpoint_columns`` and
    ``id_vars``.

    Args:
        in_path: Directory prefix of the prediction files. The file name is
            appended without a separator, so it has to end with ``/``.
        out_path: Root directory below which the output tree is created.

    Returns:
        None. The path of every written file is printed.
    """
    endpoint_set = set(endpoint_columns)  # O(1) membership tests

    for cr in name_dict.keys():
        in_path_cr = f'{in_path}{cr}.feather'
        temp = pd.read_feather(in_path_cr).rename(columns={"index": "eid"}).set_index('eid')

        # mean duplicated indices aka left and right eye value per eid.
        # numeric_only=True: the string columns cannot be averaged; they are
        # restored from the first row of each eid right below.
        temp_with_meaned_test_preds = temp.groupby(level=0).mean(numeric_only=True)

        # The groupby result is sorted by eid while `temp` keeps the file order,
        # so the first-occurrence values must be aligned on the eid index.
        # Copying them positionally would attach them to the wrong eid whenever
        # the file is not sorted by eid.
        first_rows = temp[~temp.index.duplicated()]
        # NOTE(review): membership is tested on the raw column names, i.e.
        # before the '1_10_Ft__' prefix is stripped further down. Columns that
        # still carry the prefix are treated as non-endpoint columns and keep
        # the first row's value instead of the mean — confirm this is intended.
        for col in temp.columns.values:
            if col not in endpoint_set:
                temp_with_meaned_test_preds[col] = first_rows[col]
        temp = temp_with_meaned_test_preds.reset_index(drop=False)

        # rename 10_1_Ft__ and dropping ft and St cols if present
        cols_to_drop = []
        cols_to_rename = {}
        for col in temp.columns.values:
            if 'ft' in col or 'St' in col:
                cols_to_drop.append(col)
            elif 'Ft' in col:
                cols_to_rename[col] = col.replace('1_10_Ft__', '')

        # make sure the column exists so that the drop below cannot fail on it
        temp['record_cols'] = None

        temp["model"] = (
            temp.module.astype(str)
            + "_" + temp.covariate_cols.astype(str)
            + "_" + temp.encoder.astype(str)
            + "_" + temp["head"].astype(str)
        ).astype("category")
        temp = (
            temp.replace({"model": name_dict})
            .drop(columns=["module", "encoder", "head", "covariate_cols", "record_cols"])
            .drop(columns=cols_to_drop)
            .rename(columns=cols_to_rename)
        )
        for c in id_vars:
            temp[c] = temp[c].astype("category")

        # Model label and partition are taken from the first row's value.
        model = temp.model.unique()[0]
        model = f'{model}_{cr}'
        partition = temp.partition.unique()[0]

        # The target directory is the same for all splits: create it once.
        fp_out = f"{out_path}/{model}/{partition}"
        pathlib.Path(fp_out).mkdir(parents=True, exist_ok=True)
        for split in ["train", "valid", "test"]: #"test_left", 'test_right'
            t = temp.query("split==@split")
            t.reset_index(drop=True).to_feather(f"{fp_out}/{split}.feather")
            print(f"{fp_out}/{split}.feather")

In [14]:
# Launch one remote task per run and keep the object refs so they can be awaited.
pending = [prepare_predictions.remote(run_path, out_path) for run_path in runs_df['path']]

# Block until every task has finished. ray.get re-raises exceptions thrown
# inside a worker, so a failed run surfaces here instead of being dropped.
for ref in tqdm(pending):
    ray.get(ref)

  0%|          | 0/22 [00:00<?, ?it/s]

[2m[1m[36m(scheduler +34s)[0m Tip: use `ray status` to view detailed cluster status. To disable these messages, set RAY_SCHEDULER_EVENTS=0.
[2m[36m(prepare_predictions pid=2940331)[0m /sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas_220603_fullrun/data/220603_fullrun/loghs/ImageTraining_[]_ConvNeXt_MLPHead_predictions_cropratio0.3/18/train.feather
[2m[36m(prepare_predictions pid=2940330)[0m /sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas_220603_fullrun/data/220603_fullrun/loghs/ImageTraining_[]_ConvNeXt_MLPHead_predictions_cropratio0.3/19/train.feather
[2m[36m(prepare_predictions pid=2940331)[0m /sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas_220603_fullrun/data/220603_fullrun/loghs/ImageTraining_[]_ConvNeXt_MLPHead_predictions_cropratio0.3/18/valid.feather
[2m[36m(prepare_predictions pid=2940331)[0m /sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas_220603_fullrun/data/220603_fullrun/loghs/ImageTraining_[]_ConvNeX

In [None]:
# Display the output root that was handed to prepare_predictions.
out_path

In [None]:
# NOTE(review): this lists a different project / experiment directory
# (22_retina_phewas_220608/data/test_experiment) than `out_path` used in this
# notebook — confirm it is intended, otherwise use `!ls {out_path}`.
!ls /sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas_220608/data/test_experiment/loghs/