In [11]:
import numpy as np
import torch 
import sys
import os
parent_dir = os.path.dirname(os.path.abspath(''))
sys.path.append(parent_dir)

import data_access.base_loader as base_loader
import data_access.ricu_loader as ricu_loader
import os
import datetime
import wandb
import ast
import logging
import json

import timeautodiff.processing_simple as processing
import timeautodiff.helper_simple as tdf_helper
import timeautodiff.timeautodiff_v4_efficient_simple as timeautodiff
import evaluation_framework.vis as vis
import tqdm.notebook


In [5]:
# splitting parameters
train_fraction = 0.45
val_fraction = 0.1
oracle_fraction = 0
oracle_min = 100
intersectional_min_threshold = 100
intersectional_max_threshold = 1000


# # data parameters
data_name = 'mimic' # 'mimic' 'eicu'
task_name = 'mortality24' # 'aki' 'kidney_function' 'los' 'los_24' 'mortality24' 
static_var = 'ethnicity'
features = None
ricu_dataset_path = f'../raw_data/{task_name}/{data_name}'
processed_output_path = f'outputs/{task_name}/{data_name}/processed/'
intermed_output_path = f'outputs/{task_name}/{data_name}/intermed/'
seed = 0

simple_imputation = True
mode = 'processed'
processed_data_timestamp = '20250527114407'  # change this to the timestamp of the processed data
intermed_data_timestamp = None

standardize = False
save_intermed_data = True
save_processed_data = True
split = True
stratify =  False
intersectional = False

if split == False:
    split_text = 'No Split'
else:
    split_text = 'Split'
data_params = {
    'processed_data_timestamp':processed_data_timestamp,
    'task_name': task_name,
    'data_name': data_name,
    'train_fraction': train_fraction,
    'val_fraction': val_fraction,
    'test_fraction': 1 - train_fraction - val_fraction,
    'oracle_fraction': oracle_fraction,
    'oracle_min': oracle_min,
    'intersectional_min_threshold': intersectional_min_threshold,
    'intersectional_max_threshold': intersectional_max_threshold,
    'split': split_text,
    'standardize' : standardize,
}

loader = ricu_loader.RicuLoader(seed, task_name, data_name,static_var,ricu_dataset_path,simple_imputation,
                                    features, processed_output_path,intermed_output_path)





X_dict_tf, y_dict, static = loader.get_data(
    mode='processed', 
    train_fraction=train_fraction,
    val_fraction=val_fraction,
    oracle_fraction=oracle_fraction,
    oracle_min=oracle_min,
    intersectional_min_threshold=intersectional_min_threshold,
    intersectional_max_threshold=intersectional_max_threshold,
    stratify=stratify,
    intersectional=intersectional,
    save_intermed_data=False,
    save_processed_data=False,
    demographics_to_stratify_on = ['age_group','ethnicity','gender'],
    processed_timestamp=processed_data_timestamp
)
    
if not isinstance(X_dict_tf, dict):
    X_dict_tf = {file: X_dict_tf[file] for file in X_dict_tf.files}
    y_dict = {file: y_dict[file] for file in y_dict.files}

X_dict_tf.keys()



dict_keys(['X_original_train', 'X_original_val', 'X_original_test', 'X_imputed_train', 'X_imputed_val', 'X_imputed_test', 'm_train', 'm_val', 'm_test', 'delta_t_train', 'delta_t_val', 'delta_t_test', 'feature_names'])

In [6]:

# most_important_features = [19, 27, 17, 35, 22, 44, 42, 43, 37, 26]
X_train = X_dict_tf['X_imputed_train'][:,:,:]
X_holdout = X_dict_tf['X_imputed_test'][:,:,:]
X_holdout_val = X_dict_tf['X_imputed_val'][:,:,:]

m_train = X_dict_tf['m_train'][:,:,:]
m_holdout = X_dict_tf['m_test'][:,:,:]
m_holdout_val = X_dict_tf['m_val'][:,:,:]

feature_names = X_dict_tf['feature_names'][:]
y_train = y_dict['y_train'][:]
y_holdout = y_dict['y_test'][:]
y_holdout_val = y_dict['y_val'][:]


static_feature_to_include = ['ethnicity','gender','age_group']
static_features_to_include_indices = sorted([y_dict['feature_names'].tolist().index(include)  for include in static_feature_to_include])
c_train = y_dict['c_train'][:,static_features_to_include_indices]
c_holdout = y_dict['c_test'][:,static_features_to_include_indices]
c_holdout_val = y_dict['c_val'][:,static_features_to_include_indices]

cond_names = y_dict['feature_names'][static_features_to_include_indices]



top10_important_features = [19, 27, 17, 35, 22, 44, 42, 43, 37, 26]
top3_important_features = [44,42,43]
top6_important_features = [42, 22, 27, 35, 43, 17]

important_features_names = X_dict_tf['feature_names'][top10_important_features]
important_features_names

X_train_10 = processing.normalize_and_reshape(X_train)
X_train_10 = X_train_10[:,:,top10_important_features]

print('Shape of X train:', X_train.shape)
print('Shape of X Holdout:', X_holdout.shape)
print('Shape of X Holdout val:', X_holdout_val.shape)

print('Shape of y train:', y_train.shape)
print('Shape of y Holdout:', y_holdout.shape)
print('Shape of y Holdout val:', y_holdout_val.shape)

print('Shape of c train:', c_train.shape)
print('Shape of c Holdout:', c_holdout.shape)
print('Shape of c Holdout val:', c_holdout_val.shape)


Shape of X train: (17157, 48, 25)
Shape of X Holdout: (17157, 48, 25)
Shape of X Holdout val: (3812, 48, 25)
Shape of y train: (17157,)
Shape of y Holdout: (17157,)
Shape of y Holdout val: (3812,)
Shape of c train: (17157, 3)
Shape of c Holdout: (17157, 3)
Shape of c Holdout val: (3812, 3)


## Model Loading

In [7]:
################################################################################################################
# Model Evaluation
################################################################################################################
output_dir = f'outputs/{task_name}/{data_name}/TimeAutoDiff/'
latest_diffusion_timestamp = sorted(os.listdir(output_dir))[-1]

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"############ Evaluating timestamp {latest_diffusion_timestamp}: ############")

model = tdf_helper.load_models_only(latest_diffusion_timestamp, task_name, data_name)



############ Evaluating timestamp 20250527_122626_10features_v4_efficient_simple_mimic_mortality24: ############
Latent features loaded successfully.
Latent features shape: torch.Size([17157, 25, 10])


In [9]:
response_train, outcome_train, static_train, time_info_train = processing.process_data_for_synthesizer(X_train, y_train, c_train, top10_important_features)
cond_train = torch.concatenate((static_train, outcome_train), axis=2)
response_train = response_train.float()
time_info_train = time_info_train.float()
cond_train = cond_train.float()


## Sampling

In [13]:
synth_data_list = []
synth_data_y_list = []



n_generations = 2
for i in tqdm.notebook.tqdm(range(n_generations), desc="Generating Synthetic Data", leave=True):



    _synth_data = tdf_helper.generate_synthetic_data_in_batches(model, cond_train, time_info_train, 
                                                                       batch_size = 10000)
    _synth_data_y = cond_train[:, 0, -1]
    synth_data_list.append(_synth_data.cpu().numpy())
    synth_data_y_list.append(_synth_data_y.cpu().numpy().reshape(-1,))




Generating Synthetic Data:   0%|          | 0/2 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 124.00 MiB. GPU 0 has a total capacity of 39.38 GiB of which 22.00 MiB is free. Process 3372021 has 678.00 MiB memory in use. Process 3726096 has 4.51 GiB memory in use. Process 4160385 has 22.46 GiB memory in use. Process 447704 has 798.00 MiB memory in use. Process 1480267 has 8.30 GiB memory in use. Process 1693100 has 928.00 MiB memory in use. Including non-PyTorch memory, this process has 1.71 GiB memory in use. Of the allocated memory 1.04 GiB is allocated by PyTorch, and 177.59 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)