<a href="https://colab.research.google.com/github/marty916/AI-Training-Colab-Notebooks/blob/main/Generate_Synthetic_Healthcare_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

# Load the Pima Indians Diabetes dataset.
# Passing `names=` supplies the column labels, so pandas treats every row of
# the file as data rather than consuming the first row as a header.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.csv"
columns = [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
    "Outcome",
]
df = pd.read_csv(url, names=columns)

# Keep only the three features the GAN learns to imitate, as a plain 2-D array.
# NOTE(review): this dataset is commonly reported to encode missing
# BloodPressure/BMI readings as 0 -- confirm whether such rows should be
# dropped before fitting the scaler.
data = df.loc[:, ["Age", "BloodPressure", "BMI"]].to_numpy()

# Rescale every feature to the scaler's default [0, 1] range; the fitted
# scaler is kept so generated samples can be mapped back to original units.
scaler = MinMaxScaler()
scaler.fit(data)
real_data = scaler.transform(data)

# Generator model
def build_generator(latent_dim, output_dim=3):
    """Build the generator network: latent noise in, synthetic features out.

    Args:
        latent_dim: Length of the latent (noise) vector fed to the network.
        output_dim: Number of features produced per sample. Defaults to 3
            (Age, Blood Pressure, BMI), the width that was previously
            hard-coded.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model. The final sigmoid keeps
        every output in (0, 1), i.e. the same range as min-max scaled data.
    """
    model = tf.keras.Sequential([
        # Declare the input with an explicit Input layer; passing `input_dim=`
        # to the first Dense layer is deprecated and emits a warning.
        layers.Input(shape=(latent_dim,)),
        layers.Dense(16, activation='relu'),
        layers.Dense(32, activation='relu'),
        layers.Dense(output_dim, activation='sigmoid'),
    ])
    return model

# Discriminator model
def build_discriminator(input_shape):
    """Build the discriminator network: feature vector in, P(real) out.

    Args:
        input_shape: Shape tuple of one sample, excluding the batch
            dimension (the script passes ``(3,)``).

    Returns:
        An uncompiled ``tf.keras.Sequential`` model whose single sigmoid
        unit outputs the probability that the input sample is real.
    """
    model = tf.keras.Sequential([
        # Declare the input with an explicit Input layer; passing
        # `input_shape=` to the first Dense layer is deprecated and emits a
        # warning.
        layers.Input(shape=input_shape),
        layers.Dense(32, activation='relu'),
        layers.Dense(16, activation='relu'),
        layers.Dense(1, activation='sigmoid'),  # Output: probability of being real
    ])
    return model

# GAN model
def build_gan(generator, discriminator):
    """Stack the generator and discriminator into a single model.

    The stacked model maps latent noise to the discriminator's verdict on the
    generated sample and is what the script trains to update the generator.

    Side effect: sets ``discriminator.trainable = False`` on the object that
    was passed in, not on a copy.

    NOTE(review): whether a discriminator that was compiled before this call
    still updates its own weights afterwards may depend on the Keras
    version -- confirm on the target runtime.

    Args:
        generator: Model mapping latent vectors to synthetic samples.
        discriminator: Model mapping samples to a real/fake probability.

    Returns:
        An uncompiled ``tf.keras.Sequential`` of generator then discriminator.
    """
    # Freeze before stacking so the combined model treats the discriminator's
    # weights as non-trainable.
    discriminator.trainable = False

    stacked = tf.keras.Sequential()
    stacked.add(generator)
    stacked.add(discriminator)
    return stacked

# Hyperparameters: noise vector length, number of training iterations
# (one batch per iteration), and samples per batch.
latent_dim, epochs, batch_size = 5, 5000, 32

# Build and compile models.
# Order matters: the discriminator is compiled on its own first, and only
# afterwards does build_gan() flip its `trainable` flag for the stacked model.
generator = build_generator(latent_dim)
discriminator = build_discriminator(input_shape=(3,))
discriminator.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

gan = build_gan(generator=generator, discriminator=discriminator)
gan.compile(loss='binary_crossentropy', optimizer='adam')

# Training the GAN
# The label arrays never change between iterations, so build them once.
real_labels = np.ones((batch_size, 1))
fake_labels = np.zeros((batch_size, 1))
misleading_labels = np.ones((batch_size, 1))  # We want the generator to produce 'real' data

for epoch in range(epochs):
    # Generate latent points
    latent_points = np.random.randn(batch_size, latent_dim)

    # Generate synthetic data. Call the model directly rather than using
    # predict(): for one small batch predict() adds per-call overhead and
    # prints a progress bar on every iteration.
    fake_data = np.asarray(generator(latent_points, training=False))

    # Select a random batch of real data (sampled with replacement)
    idx = np.random.randint(0, real_data.shape[0], batch_size)
    real_batch = real_data[idx]

    # Train the discriminator on a real batch, then on a fake batch.
    # The discriminator was compiled with an accuracy metric, so each result
    # is indexed below as [loss, accuracy]; average the two element-wise.
    d_loss_real = discriminator.train_on_batch(real_batch, real_labels)
    d_loss_fake = discriminator.train_on_batch(fake_data, fake_labels)
    d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

    # Train the generator via the GAN model: the same latent points are
    # labelled as real so the loss rewards fooling the discriminator.
    g_loss = gan.train_on_batch(latent_points, misleading_labels)

    # Print progress every 1000 epochs
    if (epoch + 1) % 1000 == 0:
        print(f'Epoch: {epoch + 1}, Discriminator Loss: {d_loss[0]}, Generator Loss: {g_loss}')

# Generate synthetic data with the trained generator
latent_points = np.random.randn(100, latent_dim)
synthetic_data = generator.predict(latent_points)

# De-normalize the synthetic data back to the original feature units
synthetic_data = scaler.inverse_transform(synthetic_data)

# Convert to DataFrame for better readability
synthetic_data_df = pd.DataFrame(synthetic_data, columns=['Age', 'Blood Pressure', 'BMI'])

# Display the first few rows of the generated data. A bare `.head()`
# expression only renders when it is the last statement of a notebook cell,
# so print it explicitly.
print(synthetic_data_df.head())

# Map the real samples back to original units as well; comparing the
# normalized real data with de-normalized synthetic data would put the two
# histograms on different axes.
real_data_original = scaler.inverse_transform(real_data)

# Plotting the real vs. synthetic data distributions
n_features = len(synthetic_data_df.columns)
plt.figure(figsize=(15, 5))
for i, column in enumerate(synthetic_data_df.columns):
    plt.subplot(1, n_features, i + 1)
    # density=True normalizes each histogram so the shapes are comparable
    # even though the real and synthetic sample counts differ.
    plt.hist(real_data_original[:, i], bins=30, alpha=0.5, density=True, label='Real')
    plt.hist(synthetic_data[:, i], bins=30, alpha=0.5, density=True, label='Synthetic')
    plt.title(column)
    plt.legend()
plt.show()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 531ms/step




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 111ms/step




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18