In [None]:
import os
from os.path import join

import numpy as np
import pandas as pd
import torch

# Root directory of the dataset. Set the DATA_BASE environment variable to point
# at it; when unset, the current working directory is used.
BASE = os.environ.get('DATA_BASE', os.getcwd())
# All tabular inputs and generated label/id files live under <BASE>/tabular.
TABULAR_BASE = join(BASE,'tabular')

In [None]:
def check_array_equality(ob1, ob2):
  """Assert that ``ob2`` equals ``ob1``.

  Torch tensors and numpy arrays are compared element-wise; every other
  object is compared with plain ``==``.

  Args:
    ob1: Reference object; its type decides which comparison is used.
    ob2: Object compared against ``ob1``.

  Raises:
    AssertionError: If the shapes differ (when both objects expose a shape)
      or if any value differs.
  """
  if torch.is_tensor(ob1) or isinstance(ob1, np.ndarray):
    # Element-wise `==` broadcasts, so e.g. a length-1 array could silently
    # "match" a longer one; compare shapes first when both sides have one.
    if hasattr(ob2, 'shape'):
      assert tuple(ob2.shape) == tuple(ob1.shape), \
        f'Shape mismatch: {tuple(ob2.shape)} vs {tuple(ob1.shape)}'
    assert (ob2 == ob1).all(), 'Array contents differ'
  else:
    assert ob2 == ob1, 'Values differ'

def check_or_save(obj, path, index=None, header=None):
  """Save ``obj`` to ``path``, or verify it against what is already stored there.

  If ``path`` does not exist, ``obj`` is written (DataFrames as CSV, anything
  else with ``torch.save``). If it exists, the stored content is loaded and
  compared with ``obj``; nothing is overwritten.

  Args:
    obj: A pandas DataFrame, a list, or any object ``torch.save`` can store.
    path: Destination / reference file.
    index: DataFrames only - whether the index is written to / expected in the CSV.
    header: DataFrames only - whether the header is written to / expected in the CSV.

  Raises:
    ValueError: If ``obj`` is a DataFrame and ``index`` or ``header`` is None.
    AssertionError: If the stored content differs from ``obj``.
  """
  if isinstance(obj, pd.DataFrame):
    if index is None or header is None:
      raise ValueError('Index and header must be specified for saving a dataframe')
    if not os.path.exists(path):
      obj.to_csv(path, index=index, header=header)
      return

    saved_df = pd.read_csv(path) if header else pd.read_csv(path, header=None)
    # Strip index and column labels so only the cell values are compared.
    naked_df = saved_df.reset_index(drop=True)
    naked_df.columns = range(naked_df.shape[1])
    naked_obj = obj.reset_index(drop=not index)
    naked_obj.columns = range(naked_obj.shape[1])
    assert naked_df.shape == naked_obj.shape, \
      f'Dataframe shape {naked_obj.shape} differs from saved shape {naked_df.shape}'

    # Values are rounded to 6 decimals before comparing.
    rounded_saved = naked_df.round(6)
    rounded_obj = naked_obj.round(6)
    if rounded_saved.equals(rounded_obj):
      return
    diff = (rounded_saved == rounded_obj)
    # NaN != NaN, so cells that are missing on both sides count as equal.
    diff = diff | (naked_df.isnull() & naked_obj.isnull())
    assert diff.all().all(), "Dataframe is not the same as saved dataframe"
    return

  if not os.path.exists(path):
    print(f'Saving to {path}')
    torch.save(obj, path)
    return

  # NOTE: torch.load unpickles the file; only point this at files you trust.
  saved_obj = torch.load(path)
  if isinstance(obj, list):
    # Without this check a saved list that is longer than `obj` would pass.
    assert len(obj) == len(saved_obj), \
      f'List length {len(obj)} differs from saved length {len(saved_obj)}'
    for new_item, saved_item in zip(obj, saved_obj):
      check_array_equality(new_item, saved_item)
  else:
    check_array_equality(obj, saved_obj)

## Regression Targets

In [None]:
# Inputs for the regression label files: the tabular feature table (indexed by
# 'eid') and the id lists of the train / val / test splits.
features_csv = join(TABULAR_BASE,'cardiac_features_imputed_noOH.csv')
tabular_df = pd.read_csv(features_csv).set_index('eid')

train_ids, val_ids, test_ids = (
    torch.load(join(TABULAR_BASE, ids_file))
    for ids_file in (
        'ids_train_tabular_imaging.pt',
        'ids_val_tabular_imaging.pt',
        'ids_test_tabular_imaging.pt',
    )
)

def grab_target_in_split(df, _ids, target):
    """Return the values of column ``target`` for the rows labelled ``_ids``.

    Args:
        df: DataFrame whose index contains the labels in ``_ids``.
        _ids: Row labels to select, in the order the values should be returned.
        target: Name of the column to read.

    Returns:
        A list with one entry per selected row.
    """
    rows_in_split = df.loc[_ids]
    return [value for value in rows_in_split[target]]

# Columns stacked, in this order, into one float32 label tensor per split.
targets = [
    'LVESV (mL)', 'LVEDV (mL)', 'LVSV (mL)', 'LVEF (%)', 'LVCO (L/min)',
    'LVM (g)', 'RVEDV (mL)', 'RVESV (mL)', 'RVSV (mL)', 'RVEF (%)',
]
for _ids, split in [(train_ids, 'train'), (val_ids, 'val'), (test_ids, 'test')]:
    split_df = tabular_df.loc[_ids]
    label_tensor = torch.tensor(split_df[targets].to_numpy(), dtype=torch.float32)
    label_path = join(TABULAR_BASE, f'labels_{split}_ImagingDerived_regression.pt')
    # Writes the tensor on the first run; later runs verify it against the saved file.
    check_or_save(label_tensor, label_path, header=False, index=False)

## Infarct & CAD & Diabetes

In [None]:
# ICD-based targets: load the id splits, the labelled feature table (indexed by
# 'eid') and the cleaned feature table used to look up diagnoses.
train_ids, val_ids, test_ids = (
    torch.load(join(TABULAR_BASE, ids_file))
    for ids_file in (
        'ids_train_tabular_imaging.pt',
        'ids_val_tabular_imaging.pt',
        'ids_test_tabular_imaging.pt',
    )
)

labeled_features_path = join(TABULAR_BASE,'cardiac_feature_668815_vector_labeled_noOH.csv')
tabular_df = pd.read_csv(labeled_features_path).set_index('eid')

cleaned_features_path = join(TABULAR_BASE,'cardiac_features_668815_clean.csv')
data_df = pd.read_csv(cleaned_features_path)

# Attach the imaging-centre attendance date (column '53-2.0' of col67.txt) to
# every row of the cleaned features.
date_attended_imaging = pd.read_csv(join(TABULAR_BASE,'col67.txt')).rename(
    columns={'eid':'eid','53-2.0':'Date of attending imaging centre-2.0'}
)
data_df_extended = data_df.merge(
    date_attended_imaging, left_on='eid', right_on='eid', how='inner'
)
# The merge must neither drop nor duplicate rows.
assert len(data_df_extended) == len(data_df)

# ICD-10 codes that define each binary target. Every entry must be followed by
# a comma: adjacent string literals are silently concatenated by Python
# ('I249' 'I250' -> 'I249I250'), which drops both codes from the definition.
icd10_targets = {
  'CAD': ['I200', 'I201', 'I208', 'I209',
          'I220', 'I221', 'I228', 'I229',
          'I210', 'I211', 'I212', 'I213', 'I214', 'I219',
          'I240', 'I248', 'I249',
          'I250', 'I251', 'I252', 'I253', 'I254', 'I255', 'I256', 'I258', 'I259'],
  'Infarct': ['I210', 'I211', 'I212', 'I213', 'I214', 'I219', 'I220', 'I221', 'I228', 'I229'],
  'Diabetes': ['E100','E101','E102','E103','E104','E105','E106','E107','E108','E109',
               'E110','E111','E112','E113','E114','E115','E116','E117','E118','E119',
               'E121','E123','E125','E128','E129',
               'E130','E131','E132','E133','E134','E135','E136','E137','E138','E139',
               'E140','E141','E142','E143','E144','E145','E146','E147','E148','E149'],
}

# Diagnosis columns are named '<diag_name><i>' and their dates '<date_name><i>'
# for i in range(array_length).
array_length = 243
diag_name = 'Diagnoses - ICD10-0.'
date_name = 'Date of first in-patient diagnosis - ICD10-0.'
imaging_date_col = 'Date of attending imaging centre-2.0'

# (ids, balance, split): train and val are written both balanced and
# unbalanced, test only unbalanced.
split_configs = [
  (train_ids, True, 'train'),
  (train_ids, False, 'train'),
  (val_ids, True, 'val'),
  (val_ids, False, 'val'),
  (test_ids, False, 'test'),
]

for target_name, target in icd10_targets.items():
  # All codes above are four characters long; a longer entry means two
  # literals were concatenated by a missing comma.
  assert all(len(code) == 4 for code in target), \
    f'Malformed ICD-10 code in {target_name}: {[c for c in target if len(c) != 4]}'

  all_target_dates = []
  all_target_indices = []
  all_target_ids = []
  for i in range(array_length):
    # Filter once per diagnosis column and reuse the result for all three lists.
    matches = data_df[data_df[f'{diag_name}{i}'].isin(target)]
    all_target_dates.extend(list(matches[f'{date_name}{i}']))
    all_target_indices.extend(list(matches.index))
    all_target_ids.extend(list(matches['eid']))

  # NOTE(review): the row labels come from data_df but are looked up in
  # data_df_extended; this presumably relies on the merge keeping data_df's row
  # order and default index - confirm.
  date_attending_centre = (
    data_df_extended.loc[all_target_indices, imaging_date_col]
    .reset_index(drop=True)
    .astype('datetime64[ns]')
  )

  # One row per matching diagnosis entry, so an eid can appear more than once.
  target_df = pd.DataFrame({'eid':all_target_ids,'target date':all_target_dates,'imaging date':date_attending_centre})
  for time in ['all']:
    # NOTE(review): 'target date' is not converted to datetime in this block, so
    # the 'future' / 'past' comparisons may not work as written - check before
    # adding those values to the list above.
    if time == 'future':
      target_ids = target_df[target_df['target date']>target_df['imaging date']]['eid']
    elif time == 'past':
      target_ids = target_df[target_df['target date']<target_df['imaging date']]['eid']
    else:
      target_ids = target_df['eid']

    for _ids, balance, split in split_configs:
      addendum = ''
      split_df = tabular_df.loc[_ids]
      if balance:
        addendum = '_balanced'
        # Keep every positive and draw an equally sized, seeded sample of negatives.
        is_positive = split_df.index.isin(target_ids)
        positive_df = split_df[is_positive]
        negative_df = split_df[~is_positive]
        negative_df_balanced = negative_df.sample(len(positive_df), random_state=2023)
        subset_df = pd.concat([positive_df, negative_df_balanced])
        subset_ids = sorted(subset_df.index)
        check_or_save(subset_ids, join(TABULAR_BASE, f'ids_{split}_{target_name}_{time}{addendum}.pt'))
        split_df = tabular_df.loc[subset_ids]
      # 1 if the eid has at least one matching diagnosis, else 0, in split_df row order.
      labels = list(split_df.index.isin(target_ids).astype(int))
      check_or_save(labels, join(TABULAR_BASE, f'labels_{split}_{target_name}_{time}{addendum}.pt'))