# Heart Pre_Processing

In this notebook we pre-process the heart dataset and then use it to generate synthetic datasets.

SynthVAE is suitable for example datasets that contain no time-series variables. The heart dataset we use (`heart.csv`) is available on Kaggle at <https://www.kaggle.com/datasets/fedesoriano/heart-failure-prediction> and is a combination of multiple open-source datasets.

In order to run this notebook we need access to:

- the SynthVAE training code (the `VAE` and `utils` modules imported below)
- original heart.csv
- adapted RDT module for reproducible results

In [None]:
# Import necessary libraries

import sys
from pathlib import Path

import numpy as np
import pandas as pd
import torch

# For VAE dataset formatting
from torch.utils.data import TensorDataset, DataLoader

# Opacus support for differential privacy
from opacus.utils.uniform_sampler import UniformWithReplacementSampler

# Add the parent directory to the import path (for the local VAE / utils modules below)
sys.path.append("../")

from VAE import VAE, Encoder, Decoder

from utils import general_pre_proc, reverse_transformers, set_seed

#import warnings

#warnings.filterwarnings("ignore")

In [None]:
# Fix the seed once, up front, via the project helper before any other work is done.
# NOTE(review): presumably seeds the NumPy / torch RNGs so results are repeatable -
# confirm which generators utils.set_seed actually covers.
set_seed(0)

In [None]:
# Read the original heart dataset from disk; pre-processing happens in the next section

heart_csv_path = "Heart_Data/Original_Data/heart.csv"
heart_data = pd.read_csv(heart_csv_path)

## Pre-Processing Steps

We need to transform the continuous and categorical columns appropriately for synthetic data creation. Note that this differs from the pre-processing required for the actual predictive modelling.

In [None]:
# Column configuration: which method to use and which columns are treated as
# continuous / categorical by the pre-processing helper

pre_proc_method = "GMM"

continuous_columns = [
    "Age",
    "RestingBP",
    "Cholesterol",
    "MaxHR",
    "Oldpeak",
]
categorical_columns = ["Sex", "ChestPainType", "FastingBS", "RestingECG"]
categorical_columns += ["ExerciseAngina", "ST_Slope", "HeartDisease"]

# Run the shared pre-processing helper (no datetime columns in this dataset),
# then unpack its outputs into the names used by the rest of the notebook
pre_proc_outputs = general_pre_proc(
    data_supp=heart_data,
    user_categorical_columns=categorical_columns,
    user_continuous_columns=continuous_columns,
    user_datetime_columns=[],
    pre_proc_method=pre_proc_method,
)

(
    x_train,
    original_metric_set,
    reordered_dataframe_columns,
    continuous_transformers,
    categorical_transformers,
    datetime_transformers,
    num_categories,
    num_continuous,
) = pre_proc_outputs

# Synthetic Data Creation

Here we train SynthVAE and use it to generate synthetic data. Training and generation are repeated `n_seeds` times, and each resulting synthetic dataset is saved to its own CSV file.

In [None]:
# -------- Run configuration (identical for every repeat, so defined once) -------- #

n_seeds = 5  # Number of synthetic sets we want to create & test

# User defined hyperparams
# General training
batch_size = 32
latent_dim = 256
hidden_dim = 256
n_epochs = 100
# NOTE(review): the four settings below are defined here but are not passed to
# vae.train / vae.diff_priv_train in this cell - confirm whether they should be.
logging_freq = 1  # Number of epochs we should log the results to the user
patience = 5  # How many epochs should we allow the model train to see if improvement is made
delta = 10  # The difference between elbo values that registers an improvement
filepath = None  # Where to save the best model

# Privacy params
differential_privacy = False  # Do we want to implement differential privacy
C = 1e16  # Clipping threshold - any gradients above this are clipped
noise_scale = None  # Noise multiplier - influences how much noise to add
target_eps = 10  # Target epsilon for privacy accountant
target_delta = 1e-3  # Target delta for privacy accountant

# Prepare data for interaction with torch VAE (does not change between repeats)
Y = torch.Tensor(x_train)
dataset = TensorDataset(Y)

generator = None
# Sampling rate is derived from the batch size; it is used both by the sampler
# and by the differentially private training branch
sample_rate = batch_size / len(dataset)

# Pick the output folder once and make sure it exists, so that a missing
# directory is caught here rather than after a full training run
output_dir = Path("Heart_Data/Synthetic_Data") / ("DP" if differential_privacy else "No_DP")
output_dir.mkdir(parents=True, exist_ok=True)

for repeat_number in range(n_seeds):

    #%% -------- Create & Train VAE -------- #

    data_loader = DataLoader(
        dataset,
        batch_sampler=UniformWithReplacementSampler(
            num_samples=len(dataset), sample_rate=sample_rate, generator=generator
        ),
        pin_memory=True,
        generator=generator,
    )

    # Create a fresh VAE for every repeat so runs do not share trained weights

    encoder = Encoder(x_train.shape[1], latent_dim, hidden_dim=hidden_dim)
    decoder = Decoder(latent_dim, num_continuous, num_categories=num_categories)

    vae = VAE(encoder, decoder)

    if differential_privacy:
        (
            training_epochs,
            log_elbo,
            log_reconstruction,
            log_divergence,
            log_categorical,
            log_numerical,
        ) = vae.diff_priv_train(
            data_loader,
            n_epochs=n_epochs,
            C=C,
            target_eps=target_eps,
            target_delta=target_delta,
            sample_rate=sample_rate,
            noise_scale=noise_scale,
        )
        print(f"(epsilon, delta): {vae.get_privacy_spent(target_delta)}")
    else:
        (
            training_epochs,
            log_elbo,
            log_reconstruction,
            log_divergence,
            log_categorical,
            log_numerical,
        ) = vae.train(data_loader, n_epochs=n_epochs)

    #%% -------- Generate & Save Synthetic Data -------- #

    # Generate as many synthetic rows as there are rows in the original data
    synthetic_sample = vae.generate(heart_data.shape[0])

    # .cpu() is a no-op for tensors already on the CPU, so a single code path
    # covers both CPU and GPU runs
    synthetic_sample = pd.DataFrame(
        synthetic_sample.cpu().detach().numpy(), columns=reordered_dataframe_columns
    )

    # Reverse the transformations

    synthetic_supp = reverse_transformers(
        synthetic_set=synthetic_sample,
        data_supp_columns=heart_data.columns,
        cont_transformers=continuous_transformers,
        cat_transformers=categorical_transformers,
        date_transformers=None,
        pre_proc_method=pre_proc_method,
    )

    synthetic_supp.to_csv(
        output_dir / f"synthetic_heart_run_{repeat_number}.csv", index=False
    )