In [None]:
# Enable IPython's autoreload extension so edits to imported project modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Synthetic data generation for demo purposes
- generate a target variable that is hierarchically structured like ICD10, with a large label space and exponential class imbalance
- generate features with dependencies to each other and the target variable
- generate different class distributions for different hospitals to simulate heterogeneity 
- corrupt the target label with categorical shift and swapped values to simulate label noise

# Standard preprocessing
- load data into ICD10data class (for preprocessing)
- conduct stratified train, calibrate, test split
- export data to csv

In [14]:
import os
import pickle

import polars as pl

# Generate synthetic data

(around 30,000 day admissions per year per hospital)

In [None]:
# Number of generated admissions per hospital: 5 years of data.
samples_per_hospital = 150_000

# 5 different distributions of the target classes, one per hospital specialization.
hospital_specializations = [
    "cardiology",
    "neurology",
    "oncology",
    "academic",
    "general",
]

In [15]:
from synthetic_admissions import generate_federated_sources

# Parameters of the synthetic data generator (consumed by the generation cell below).
p = 0.6  # proportion of common codes despite hospital specialization (the bigger, the less heterogeneity)
variance = 0.025  # variance of the noise
p2 = 0.5  # power factor for the digits
seed = 42  # random seed
corr = 0.89  # correlation to target
corr2 = 0.78  # correlation to target

# NOTE(review): this cell used to call generate_federated_sources(...) with
# corr/corr2 as well and bound the result to `noisy_federated_data`. That frame
# was never read: `noisy_federated_data` is reassigned by
# corrupt_target_label(...) before its first use, so the call only repeated the
# full data generation under a misleading name. It has been removed.
# `corr` and `corr2` are kept, but the remaining generation call does not pass
# them — confirm whether it should.

In [None]:
# Generate the synthetic admissions, drawing from a different distribution
# for each hospital specialization.
generation_settings = {
    "hospital_specializations": hospital_specializations,
    "samples_per_hospital": samples_per_hospital,
    "p": p,
    "variance": variance,
    "p2": p2,
    "seed": seed,
}
federated_data_sources = generate_federated_sources(**generation_settings)

# Preview the first rows of the generated frame.
federated_data_sources.head()

In [17]:
export_csv_path = "./data/synthetic_admission_data.csv"

# This is the first write into ./data; the folder is otherwise only created
# later (in the "Load data" cell), so make sure it exists before exporting.
os.makedirs(os.path.dirname(export_csv_path), exist_ok=True)

# Export the clean (not yet corrupted) synthetic admissions.
federated_data_sources.write_csv(export_csv_path)

# Corrupt target label to simulate label noise
(26 min runtime for 750k rows)

In [18]:
from synthetic_admissions import corrupt_target_label

# Simulate label noise on the clean data: 1/5th of the corruption swaps labels,
# 4/5th applies a categorical shift to a code close to the target.
# NOTE(review): confirm in corrupt_target_label how noise_rate=0.2 maps onto
# these fractions.
noisy_federated_data = corrupt_target_label(
    federated_data_sources,
    noise_rate=0.2,
    seed=42,
)

In [19]:
# Export the label-corrupted dataset; the "Load data" section reads this same
# path back in as `input_csv`.
data_path = "./data/synthetic_noisy_admission_data.csv"
noisy_federated_data.write_csv(data_path)

# Load data

In [2]:
import os
from uncertainty_aware_diagnosis import ICD10data

# Dataset fed into preprocessing: the noisy variant
# (alternative: "./data/synthetic_admission_data.csv").
input_csv = "./data/synthetic_noisy_admission_data.csv"

# Column roles and output locations used by the preprocessing steps.
data_config = dict(
    target='admission_code',  # simulated (fake) ICD10 code
    numerical_features=['age'],
    categorical_features=[
        'hospital',
        'gender',
        'clinical_specialty',
        'billing_diagnosis_code',
        'billing_specialty_code',
        'subtraject_code',
    ],
    high_cardinality_features=['procedure_code'],
    use_embedding=False,
    train_csv="./data/synthetic_train.csv",
    val_csv="./data/synthetic_val.csv",
    test_csv="./data/synthetic_test.csv",
    ohe_pkl="./data/synthetic_ohe_cats.pkl",
)

# Persist the configuration as a Python file inside the data folder,
# creating that folder first if it is missing.
data_config_path = "./data/data_config.py"
config_dir = os.path.dirname(data_config_path)
os.makedirs(config_dir, exist_ok=True)

# The file holds a single assignment: `data_config = <repr of the dict>`.
with open(data_config_path, "w") as config_file:
    config_file.write(f"data_config = {data_config!r}")

print(f"data_config stored in {data_config_path}")

In [21]:
# Load the complete dataset into ICD10data using the columns declared in data_config.
# NOTE(review): high_card is passed as [] although data_config lists
# 'high_cardinality_features' = ['procedure_code'] — confirm this is intentional.
data = ICD10data(
    csv_path=input_csv,
    numerical=data_config['numerical_features'],
    categorical=data_config['categorical_features'],
    high_card=[],
    target=data_config['target'],
    dropna=True,
    use_embedding=data_config['use_embedding'],  # read from the config instead of repeating the literal
)

# Save the encoder categories of the complete dataset (to be reused for the train, val, test set).
# The file goes to data_config['ohe_pkl'], the location recorded in the stored
# config, instead of a bare file name in the current working directory.
# NOTE(review): if `data.encoder` is a scikit-learn OneHotEncoder, the fitted
# categories live in `categories_`, while `categories` is the constructor
# argument — confirm this is the intended attribute.
with open(data_config['ohe_pkl'], 'wb') as f:
    pickle.dump(data.encoder.categories, f)

# Train, calibrate, test split

In [None]:
from uncertainty_aware_diagnosis import split_and_save_csv

# Output locations of the three splits, taken from the shared configuration.
split_paths = {
    key: data_config[key] for key in ("train_csv", "val_csv", "test_csv")
}

# 70 / 15 / 15 split of the input file, stratified on the target column.
# NOTE(review): min_class_count=25 presumably sets a minimum number of samples
# per class for the split — confirm the exact handling in split_and_save_csv.
split_and_save_csv(
    input_csv=input_csv,
    **split_paths,
    train_frac=0.7,
    val_frac=0.15,
    test_frac=0.15,
    stratify_col=data_config['target'],  # preserve class balance
    min_class_count=25,
    random_state=42,
)

In [None]:
# Read the training split back in and report how many distinct target classes it contains.
train = pl.read_csv(data_config['train_csv'])

# Look the target column up through data_config so this check follows the
# configured target instead of a hard-coded column name.
print("Total classes: ", train.get_column(data_config['target']).n_unique())

Splits sizes: train=522554, val=111976, test=111977 | Total classes:  793