In [1]:
import pandas as pd
import numpy as np
import pathlib
from tqdm.auto import tqdm

import hydra
from omegaconf import DictConfig, OmegaConf

import torch
from pytorch_lightning import seed_everything

import ray

In [2]:
node = !hostname
if "sc" in node[0]:
    base_path = "/sc-projects/sc-proj-ukb-cvd"
else: 
    base_path = "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
print(base_path)

project_label = "22_retinal_risk"
project_path = f"{base_path}/results/projects/{project_label}"
figure_path = f"{project_path}/figures"
output_path = f"{project_path}/data"

pathlib.Path(figure_path).mkdir(parents=True, exist_ok=True)
pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)

/sc-projects/sc-proj-ukb-cvd


In [3]:
# Display the resolved output directory as a quick sanity check.
output_path

'/sc-projects/sc-proj-ukb-cvd/results/projects/22_retinal_risk/data'

## Get Data

In [None]:
# TODO: fetch the required files from wandb and create symlinks to them in the current folder.

In [12]:
from hydra import compose, initialize
from omegaconf import OmegaConf

# Clear the global Hydra state first so this cell can be re-run in the same
# kernel, then point Hydra at the project's config directory.
hydra.core.global_hydra.GlobalHydra().clear()
initialize(config_path="../../ehrgraphs/config")

# Overrides applied on top of the base "config".
config_overrides = [
    "datamodule.partition=0",
    "datamodule.use_top_n_phecodes=10000",
    "setup.use_data_artifact_if_available=False",
    "datamodule/covariates='no_covariates'",
    "datamodule.t0_mode=recruitment",
]
args = compose(config_name="config", overrides=config_overrides)

# Show the fully composed configuration as YAML.
print(OmegaConf.to_yaml(args))

setup:
  entity: cardiors
  project: retina
  group: null
  name: null
  root:
    charite-hpc: /sc-projects/sc-proj-ukb-cvd
    eils-hpc: /data/analysis/ag-reils/ag-reils-shared/cardioRS
  data_path: data/2_datasets_pre/211110_anewbeginning/artifacts
  output_path: results/models
  use_data_artifact_if_available: false
  data: null
  restore_id: null
  data_identifier: WandBBaselineData:latest
  tags:
  - baseline_data
head:
  model_type: MLP
  dropout: 0.0
  kwargs:
    num_hidden: 256
    num_layers: 1
    detach_clf: false
    initial_dropout: 0.0
datamodule:
  covariates: []
  augmentation:
    train: []
    valid: []
    test: []
  batch_size: 1024
  partition: 0
  num_workers: 4
  img_root: /sc-projects/sc-proj-ukb-cvd/data/retina/preprocessed/preprocessed
  img_visit: 0
  img_size_to_gpu: 420
  img_crop_ratio:
    train:
    - 0.3
    - 0.4
    - 0.5
    - 0.6
    - 0.7
    - 0.8
    test: 0.5
    valid: 0.5
  img_n_testtime_views: 10
  label_definition:
    all_cause_death: tr

In [32]:
def extract_records_events_times(args):
    
    records_list = []
    outcomes_list = []
    
    # prepare extraction
    datamodule, _, _ = setup_training(args)
    
#     record_cols = datamodule.record_cols
    label_cols = list(datamodule.label_mapping.keys())
    
    for s in tqdm(["train", "valid", "test"]):
        eids = datamodule.eids[s]
        
        if s=="train":  dataset = datamodule.train_dataloader(shuffle=False, drop_last=False).dataset
        if s=="valid":  dataset = datamodule.val_dataloader().dataset
        if s=="test":  dataset = datamodule.test_dataloader().dataset

        # extract records
#         records_temp = pd.DataFrame.sparse.from_spmatrix(dataset.records, index=eids, columns=[f"{c}" for c in record_cols]).rename_axis("eid")
#         records_list.append(records_temp)

        # extract exclusion & events
        exclusions_df = dataset.exclusions
        events_df = dataset.labels_events

        times = dataset.labels_times
        censorings = dataset.censorings

        no_event_idxs = times == 0
        print(censorings.shape)
        print(times.shape)
        print(no_event_idxs.shape)
        print(censorings.values.repeat((1, times.shape[1])))
        
        times[no_event_idxs] = censorings.values.repeat((1, times.shape[1]))[no_event_idxs]
        
        1/0

        times_df = pd.DataFrame(data=times, index=eids, columns=[f"{c}_time" for c in label_cols]).rename_axis("eid")

        outcomes_temp = pd.concat([exclusions_df, events_df, times_df], axis=1)
        outcomes_list.append(outcomes_temp)
        
#     records_df = pd.concat(records_list, axis=0)
    outcomes_df = pd.concat(outcomes_list, axis=0)
        
    return outcomes_df

In [33]:
from retinalrisk.training import setup_training

# Seed the global RNGs before the datamodule is built.
seed_everything(0)

# extract_records_events_times returns a single outcomes DataFrame, so bind
# exactly one name; unpacking the result into two names would fail.
# To process another partition, re-compose `args` with a different
# "datamodule.partition=<n>" override first.
outcomes_df = extract_records_events_times(args)

Global seed set to 0


/sc-projects/sc-proj-ukb-cvd/data/2_datasets_pre/211110_anewbeginning/artifacts/phecode_definitions_220328.feather
/sc-projects/sc-proj-ukb-cvd/data/2_datasets_pre/211110_anewbeginning/artifacts/eids_211209.yaml
/sc-projects/sc-proj-ukb-cvd/data/2_datasets_pre/211110_anewbeginning/artifacts/baseline_covariates_220503.feather
/sc-projects/sc-proj-ukb-cvd/data/2_datasets_pre/211110_anewbeginning/artifacts/baseline_outcomes_220412.feather
Labels are...
['OMOP_4306655', 'phecode_052', 'phecode_052-1', 'phecode_052-3', 'phecode_052-32', 'phecode_056', 'phecode_056-1', 'phecode_070', 'phecode_089', 'phecode_089-1', 'phecode_089-2', 'phecode_089-3', 'phecode_099', 'phecode_101', 'phecode_101-4', 'phecode_103', 'phecode_103-2', 'phecode_103-21', 'phecode_105', 'phecode_105-1', 'phecode_106', 'phecode_107', 'phecode_107-2', 'phecode_112', 'phecode_130', 'phecode_136', 'phecode_136-4', 'phecode_136-41', 'phecode_136-42', 'phecode_138', 'phecode_138-2', 'phecode_139', 'phecode_139-5', 'phecode_14

  0%|          | 0/3 [00:00<?, ?it/s]

(47354, 1)
(47354, 498)
(47354, 498)


ValueError: operands could not be broadcast together with shape (47354,) (2,)

## Write Records

In [None]:
# Summarise the records frame.
# NOTE(review): no visible cell assigns `records_df` -- the extraction
# function returns only the outcomes frame -- confirm where it should come from.
records_df.info()

In [7]:
# Densify the records frame one column at a time: cast the column to bool,
# then convert it from sparse to dense storage.
for col in tqdm(records_df.columns):
    dense_col = records_df[col].astype(bool).sparse.to_dense()
    records_df[col] = dense_col

  0%|          | 0/68527 [00:00<?, ?it/s]

In [45]:
# Sort rows by index so the written file has a stable row order.
records_df = records_df.sort_index()

In [46]:
# Check dtypes and memory footprint after densifying and sorting.
records_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 502460 entries, 1000018 to 6025198
Columns: 68527 entries, OMOP_1000560 to OMOP_998415
dtypes: bool(68527)
memory usage: 32.1 GB


In [47]:
# Move the index into a regular column, then write the frame as feather.
# NOTE(review): `artifact_date` is not defined in any visible cell -- define it
# (e.g. next to `output_path`) before running this cell.
records_df.reset_index().to_feather(f"{output_path}/baseline_records_{artifact_date}.feather")

## Write Outcomes

In [10]:
# Normalise column dtypes by suffix: "_prev"/"_event" columns become dense
# booleans, "_time" columns become float32. Other columns are left as they are.
for col in tqdm(outcomes_df.columns):
    if col.endswith(("_prev", "_event")):
        outcomes_df[col] = outcomes_df[col].astype(bool).sparse.to_dense()
    elif col.endswith("_time"):
        outcomes_df[col] = outcomes_df[col].astype(np.float32)

  0%|          | 0/5052 [00:00<?, ?it/s]

In [48]:
# Sort rows by index so the written file has a stable row order.
outcomes_df = outcomes_df.sort_index()

In [49]:
# Check dtypes and memory footprint of the outcomes frame before writing it.
outcomes_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 502460 entries, 1000018 to 6025198
Columns: 5052 entries, OMOP_4306655_prev to phecode_240_time
dtypes: bool(3368), float32(1684)
memory usage: 4.7 GB


In [50]:
# Move the index into a regular column, then write the frame as feather.
# NOTE(review): `artifact_date` is not defined in any visible cell -- define it
# (e.g. next to `output_path`) before running this cell.
outcomes_df.reset_index().to_feather(f"{output_path}/baseline_outcomes_{artifact_date}.feather")

### Outcomes long

In [51]:
# Derive the endpoint names by stripping the trailing "_prev" / "_event" /
# "_time" suffix from every column name. The pattern is anchored with "$" so
# that the same tokens inside an endpoint name are left intact.
endpoints = sorted(outcomes_df.columns.str.replace("_(?:prev|event|time)$", "", regex=True).unique().tolist())

In [52]:
# Placeholder only: `outcomes_long` is reassigned from pd.concat in a later cell.
outcomes_long = pd.DataFrame()

In [53]:
# Build one narrow frame per endpoint with the generic columns
# prev / event / time plus an "endpoint" column naming the endpoint.
cols = ["prev", "event", "time"]
outcomes_df_list = []
for endpoint in tqdm(endpoints):
    per_endpoint = outcomes_df[[f"{endpoint}_{c}" for c in cols]]
    # Drop the endpoint prefix so every frame shares the same column names.
    per_endpoint = per_endpoint.rename(columns={f"{endpoint}_{c}": c for c in cols})
    outcomes_df_list.append(per_endpoint.assign(endpoint=endpoint))

  0%|          | 0/1684 [00:00<?, ?it/s]

In [54]:
# Stack the per-endpoint frames into long format, put "endpoint" first,
# store it as a categorical, and turn the index into a regular column.
# Kept as a single chain so no extra references to the large intermediate
# frames stay alive.
outcomes_long = (
    pd.concat(outcomes_df_list, axis=0)
    .loc[:, ["endpoint", *cols]]
    .astype({"endpoint": "category"})
    .reset_index()
)

In [55]:
# Check dtypes and memory footprint of the long-format frame before writing it.
outcomes_long.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 846142640 entries, 0 to 846142639
Data columns (total 5 columns):
 #   Column    Dtype   
---  ------    -----   
 0   eid       int64   
 1   endpoint  category
 2   prev      bool    
 3   event     bool    
 4   time      float32 
dtypes: bool(2), category(1), float32(1), int64(1)
memory usage: 12.6 GB


In [56]:
# Write the long-format outcomes as feather.
# NOTE(review): `artifact_date` is not defined in any visible cell -- define it
# (e.g. next to `output_path`) before running this cell.
outcomes_long.to_feather(f"{output_path}/baseline_outcomes_long_{artifact_date}.feather")