# **Data Loading and Preprocessing**

In [None]:
import pandas as pd

# Source CSV (imputed and encoded) and destination for the cleaned copy
file_path = '/content/imputed_encoded_dataset.csv'
new_file_path = 'cleaned_dataset.csv'

# Read the source CSV into a DataFrame
df = pd.read_csv(file_path)

# Keep only rows that have no missing value in any column
df_cleaned = df.dropna(axis=0, how='any')

# Write the cleaned rows out; the DataFrame index is not written
df_cleaned.to_csv(new_file_path, index=False)


In [None]:
# Show (rows, columns) of the cleaned DataFrame
print(df_cleaned.shape)

(75739, 68)


# **Synthetic Data Generation Using VAE**

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

# Load the cleaned dataset
df = pd.read_csv('/content/cleaned_dataset.csv')

# Continuous (real-valued) feature columns; these are the ones that get standardized
continuous_cols = [
    'careplan_length',
    'Diastolic Blood Pressure',
    'Systolic Blood Pressure',
    'Body Mass Index',
    'Total Cholesterol',
    'High Density Lipoprotein Cholesterol',
    'Triglycerides',
    'Low Density Lipoprotein Cholesterol',
    'Glucose',
    'Hemoglobin A1c/Hemoglobin.total in Blood',
    'Sodium',
    'Chloride',
    'Potassium',
    'Carbon Dioxide',
    'Calcium',
    'Urea Nitrogen',
    'Estimated Glomerular Filtration Rate',
]

# Every other column is assumed to be binary, except the target column,
# which is left out of the feature set if it is present in the DataFrame.
all_cols = df.columns.tolist()
_continuous_lookup = set(continuous_cols)  # set gives O(1) membership tests
binary_cols = [
    col for col in all_cols
    if col not in _continuous_lookup and col != 'Viral_sinusitis_present'
]

# Raw feature matrices (continuous part is still unscaled at this point)
continuous_data = df[continuous_cols].values
binary_data = df[binary_cols].values

# Split BEFORE scaling: fitting the scaler on all rows would leak the mean and
# variance of the held-out rows into the training data.
cont_train, cont_test, binary_train, binary_test = train_test_split(
    continuous_data, binary_data, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply the same transform
# to the held-out rows.
scaler = StandardScaler()
cont_train = scaler.fit_transform(cont_train)
cont_test = scaler.transform(cont_test)

# Keep `continuous_data` standardized (as before) for any later use, now
# expressed with the training-set statistics.
continuous_data = scaler.transform(continuous_data)

class CustomDataset(Dataset):
    """Dataset that pairs a continuous feature array with a binary one.

    Item ``idx`` is the tuple ``(continuous_data[idx], binary_data[idx])``.

    Args:
        continuous_data: Indexable, sized container of continuous features.
        binary_data: Indexable, sized container of binary features; must
            have the same length as ``continuous_data``.

    Raises:
        ValueError: If the two containers differ in length.
    """

    def __init__(self, continuous_data, binary_data):
        # Fail fast: a length mismatch would otherwise only show up as an
        # index error part-way through an epoch (or go unnoticed entirely
        # when binary_data is the longer of the two).
        if len(continuous_data) != len(binary_data):
            raise ValueError(
                "continuous_data and binary_data must have the same length "
                f"(got {len(continuous_data)} and {len(binary_data)})"
            )
        self.continuous_data = continuous_data
        self.binary_data = binary_data

    def __len__(self):
        """Return the number of samples."""
        return len(self.continuous_data)

    def __getitem__(self, idx):
        """Return the ``(continuous, binary)`` pair stored at ``idx``."""
        return self.continuous_data[idx], self.binary_data[idx]

# Wrap the train/test splits as float tensors inside CustomDataset instances
train_dataset = CustomDataset(
    continuous_data=torch.tensor(cont_train, dtype=torch.float32),
    binary_data=torch.tensor(binary_train, dtype=torch.float32),
)
test_dataset = CustomDataset(
    continuous_data=torch.tensor(cont_test, dtype=torch.float32),
    binary_data=torch.tensor(binary_test, dtype=torch.float32),
)

# Mini-batch loaders: training batches are shuffled, test batches keep their order
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

class VAE(nn.Module):
    """Variational autoencoder over concatenated continuous + binary features.

    The encoder maps an input of width ``continuous_dims + binary_dims`` to a
    vector of width ``2 * latent_dims``, which ``forward`` splits into ``mu``
    and ``log_var``. The decoder maps a latent vector back to the input width.
    """

    def __init__(self, continuous_dims, binary_dims, hidden_dims=256, latent_dims=64):
        super().__init__()
        feature_dims = continuous_dims + binary_dims
        self.encoder = nn.Sequential(
            nn.Linear(feature_dims, hidden_dims),
            nn.ReLU(),
            # One half of this output is used as mu, the other as log_var
            nn.Linear(hidden_dims, 2 * latent_dims),
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dims, hidden_dims),
            nn.ReLU(),
            nn.Linear(hidden_dims, feature_dims),
        )

    def reparameterize(self, mu, log_var):
        """Return ``mu + eps * exp(0.5 * log_var)`` with ``eps`` standard normal."""
        sigma = (0.5 * log_var).exp()
        noise = torch.randn_like(sigma)
        return mu + noise * sigma

    def forward(self, x):
        """Encode ``x``, sample a latent vector, decode it.

        Returns:
            Tuple ``(decoded, mu, log_var)``.
        """
        mu, log_var = torch.chunk(self.encoder(x), 2, dim=-1)
        latent = self.reparameterize(mu, log_var)
        return self.decoder(latent), mu, log_var

def loss_function(recon_x, x, mu, log_var, continuous_dims):
    """Per-sample VAE loss for inputs mixing continuous and binary features.

    The first ``continuous_dims`` columns of ``x`` / ``recon_x`` are scored
    with squared error; the remaining columns are scored with binary
    cross-entropy on logits. A KL-divergence term between the encoder's
    Gaussian (``mu``, ``log_var``) and a standard normal is added.

    All three terms are summed over features (and over the batch) and the
    total is divided by the batch size. Using mean-reduced reconstruction
    terms together with a summed KL term would put them on very different
    scales, letting the KL term dominate the objective.

    Args:
        recon_x: Decoder output, same shape as ``x``; the binary part is logits.
        x: Input batch, shape ``(batch, continuous_dims + binary_dims)``.
        mu: Latent means produced by the encoder.
        log_var: Latent log-variances produced by the encoder.
        continuous_dims: Number of leading continuous columns in ``x``.

    Returns:
        Scalar tensor: (BCE + MSE + KLD) averaged over the batch.
    """
    binary_dims = x.size(1) - continuous_dims
    recon_x_cont, recon_x_binary = recon_x.split([continuous_dims, binary_dims], dim=1)
    x_cont, x_binary = x.split([continuous_dims, binary_dims], dim=1)

    # Sum-reduce so reconstruction and KL terms share the same scale
    bce = nn.BCEWithLogitsLoss(reduction='sum')(recon_x_binary, x_binary)
    mse = nn.MSELoss(reduction='sum')(recon_x_cont, x_cont)
    kld = -0.5 * torch.sum(1 + log_var - mu.pow(2) - log_var.exp())

    # Normalise by batch size so the value does not depend on the batch length
    return (bce + mse + kld) / x.size(0)

# Build the model and its optimizer
model = VAE(len(continuous_cols), len(binary_cols))
optimizer = optim.Adam(model.parameters(), lr=1e-3)

epochs = 100  # Adjust based on your needs
for epoch in range(epochs):
    model.train()
    train_loss = 0
    # Loop variables are named *_batch so they do not overwrite the
    # module-level `binary_data` feature array.
    for cont_batch, binary_batch in train_loader:
        # Concatenate once and reuse for both the forward pass and the loss
        batch = torch.cat((cont_batch, binary_batch), dim=1)

        optimizer.zero_grad()
        recon_data, mu, log_var = model(batch)
        loss = loss_function(recon_data, batch, mu, log_var, len(continuous_cols))
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Average of the per-batch losses for this epoch
    print(f"Epoch {epoch}, Loss: {train_loss / len(train_loader)}")


In [None]:
def generate_synthetic_data(model, n_samples, scaler, continuous_dims, binary_dims):
    """Sample synthetic rows by decoding standard-normal latent vectors.

    Args:
        model: Trained model exposing a ``decoder`` module whose output is
            ``continuous_dims + binary_dims`` wide (binary part as logits).
        n_samples: Number of synthetic rows to generate.
        scaler: Fitted scaler providing ``inverse_transform`` for the
            continuous columns.
        continuous_dims: Number of leading continuous columns in the output.
        binary_dims: Number of trailing binary columns in the output.

    Returns:
        Tuple ``(synthetic_cont_np, synthetic_binary_np)`` of NumPy arrays:
        continuous values mapped back through ``scaler.inverse_transform``
        and binary values as 0.0 / 1.0.

    Note:
        The model is switched to eval mode and left in that mode.
    """
    # Read the latent width from the decoder's first layer that declares
    # `in_features`, rather than assuming it; fall back to the previous
    # hard-coded value of 64 if no such layer exists.
    latent_dims = next(
        (
            layer.in_features
            for layer in model.decoder.modules()
            if hasattr(layer, "in_features")
        ),
        64,
    )

    # Sample on the same device as the model's parameters (CPU if it has none)
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    with torch.no_grad():
        model.eval()
        # Sample from the latent prior and decode
        z = torch.randn(n_samples, latent_dims, device=device)
        synthetic_data = model.decoder(z)

        # The decoder output holds the continuous columns first, then the binary logits
        synthetic_cont, synthetic_binary_logits = synthetic_data.split(
            [continuous_dims, binary_dims], dim=1
        )

        # Logits -> probabilities -> hard 0/1 values using a 0.5 threshold
        synthetic_binary = (torch.sigmoid(synthetic_binary_logits) > 0.5).float()

        # Map the continuous columns back to their original scale
        synthetic_cont_np = scaler.inverse_transform(synthetic_cont.cpu().numpy())
        synthetic_binary_np = synthetic_binary.cpu().numpy()

        return synthetic_cont_np, synthetic_binary_np


In [None]:
# Generate synthetic data
n_synthetic_samples = 1000  # The desired number of synthetic samples
# Column counts are taken from the column lists; there are no module-level
# `continuous_dims` / `binary_dims` variables to pass.
synthetic_cont, synthetic_binary = generate_synthetic_data(
    model, n_synthetic_samples, scaler, len(continuous_cols), len(binary_cols)
)

# Combine continuous and binary parts (continuous columns first)
synthetic_data_combined = np.hstack((synthetic_cont, synthetic_binary))

# Create a DataFrame with matching column names and save it to CSV without the index
synthetic_df = pd.DataFrame(synthetic_data_combined, columns=continuous_cols + binary_cols)
synthetic_df.to_csv('synthetic_dataset.csv', index=False)

print("Synthetic dataset generated and saved to synthetic_dataset.csv.")
