In [1]:
# Step 1: Data Preprocessing
import pandas as pd

# Load the dataset
data = pd.read_csv('diabetes_012_health_indicators_BRFSS2015.csv')
data.head()

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [2]:
# Check for missing values
print(data.isnull().sum())

# Handle missing values (assuming forward fill for simplicity)
data.fillna(method='ffill', inplace=True)

Diabetes_012            0
HighBP                  0
HighChol                0
CholCheck               0
BMI                     0
Smoker                  0
Stroke                  0
HeartDiseaseorAttack    0
PhysActivity            0
Fruits                  0
Veggies                 0
HvyAlcoholConsump       0
AnyHealthcare           0
NoDocbcCost             0
GenHlth                 0
MentHlth                0
PhysHlth                0
DiffWalk                0
Sex                     0
Age                     0
Education               0
Income                  0
dtype: int64


In [3]:
# Handle missing values (assuming forward fill for simplicity)
data.fillna(method='ffill', inplace=True)

# Drop any remaining NaN values if present
data.dropna(inplace=True)

In [4]:
# Encode categorical variables
data = pd.get_dummies(data, columns=['Sex', 'Education'])


In [5]:
# Scale numerical features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
data[['BMI', 'MentHlth', 'PhysHlth']] = scaler.fit_transform(data[['BMI', 'MentHlth', 'PhysHlth']])

# Step 2: Exploratory Data Analysis (EDA)
# Conduct exploratory data analysis to gain insights into the dataset
print(data.describe())

        Diabetes_012         HighBP       HighChol      CholCheck  \
count  253680.000000  253680.000000  253680.000000  253680.000000   
mean        0.296921       0.429001       0.424121       0.962670   
std         0.698160       0.494934       0.494210       0.189571   
min         0.000000       0.000000       0.000000       0.000000   
25%         0.000000       0.000000       0.000000       1.000000   
50%         0.000000       0.000000       0.000000       1.000000   
75%         0.000000       1.000000       1.000000       1.000000   
max         2.000000       1.000000       1.000000       1.000000   

                BMI         Smoker         Stroke  HeartDiseaseorAttack  \
count  2.536800e+05  253680.000000  253680.000000         253680.000000   
mean  -2.505162e-16       0.443169       0.040571              0.094186   
std    1.000002e+00       0.496761       0.197294              0.292087   
min   -2.478916e+00       0.000000       0.000000              0.000000   
25%

In [6]:
# Step 3: Model Selection
from sklearn.ensemble import GradientBoostingClassifier

# Step 4: Baseline Model Training
X = data.drop(columns=['Diabetes_012'])
y = data['Diabetes_012']
model_baseline = GradientBoostingClassifier()
model_baseline.fit(X, y)

In [7]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# Define the dimensions of the input noise vector
latent_dim = 100

# Define the generator model
def build_generator(latent_dim):
    input_noise = Input(shape=(latent_dim,))
    x = Dense(128, activation='relu')(input_noise)
    x = Dense(256, activation='relu')(x)
    x = Dense(512, activation='relu')(x)
    output = Dense(data.shape[1], activation='sigmoid')(x)  # Output layer matches the dimension of your dataset
    generator = Model(inputs=input_noise, outputs=output)
    return generator

# Define the discriminator model
def build_discriminator(input_shape):
    input_data = Input(shape=input_shape)
    x = Dense(512, activation='relu')(input_data)
    x = Dense(256, activation='relu')(x)
    x = Dense(128, activation='relu')(x)
    output = Dense(1, activation='sigmoid')(x)
    discriminator = Model(inputs=input_data, outputs=output)
    discriminator.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.0002, beta_1=0.5), metrics=['accuracy'])
    return discriminator

# Compile the discriminator
discriminator = build_discriminator(input_shape=(data.shape[1],))
discriminator.trainable = False

# Build the GAN model
generator = build_generator(latent_dim)
gan_input = Input(shape=(latent_dim,))
synthetic_data = generator(gan_input)
gan_output = discriminator(synthetic_data)
gan = Model(gan_input, gan_output)
gan.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.0002, beta_1=0.5))

# Training the GAN
def train_gan(data, generator, discriminator, gan, latent_dim, epochs=10000, batch_size=128):
    for epoch in range(epochs):
        # Sample random noise for generator input
        noise = np.random.normal(0, 1, (batch_size, latent_dim))

        # Generate synthetic data
        generated_data = generator.predict(noise)

        # Select a random batch of real data
        idx = np.random.randint(0, data.shape[0], batch_size)
        real_data = data[idx]

        # Train the discriminator
        d_loss_real = discriminator.train_on_batch(real_data, np.ones((batch_size, 1)))
        d_loss_fake = discriminator.train_on_batch(generated_data, np.zeros((batch_size, 1)))
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

        # Train the generator (via the GAN model)
        noise = np.random.normal(0, 1, (batch_size, latent_dim))
        valid_labels = np.ones((batch_size, 1))
        g_loss = gan.train_on_batch(noise, valid_labels)

        # Print progress
        if epoch % 100 == 0:
            print(f"Epoch {epoch}, Discriminator Loss: {d_loss[0]}, Generator Loss: {g_loss}")

# Train the GAN
train_gan(data.values.astype('float32'), generator, discriminator, gan, latent_dim)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch 5100, Discriminator Loss: 1.1355839897325936e-05, Generator Loss: 10.716450691223145
Epoch 5200, Discriminator Loss: 6.74907794941652e-06, Generator Loss: 11.22681713104248
Epoch 5300, Discriminator Loss: 5.368935946376855e-06, Generator Loss: 11.455483436584473
Epoch 5400, Discriminator Loss: 2.1728912449597154e-06, Generator Loss: 12.349729537963867
Epoch 5500, Discriminator Loss: 1.5532338071338949e-06, Generator Loss: 12.684757232666016
Epoch 5600, Discriminator Loss: 1.9952030492756325e-06, Generator Loss: 12.435912132263184
Epoch 5700, Discriminator Loss: 3.1620231500096272e-06, Generator Loss: 11.97736930847168
Epoch 5800, Discriminator Loss: 5.6703056543589905e-06, Generator Loss: 11.398151397705078
Epoch 5900, Discriminator Loss: 2.5639928954812135e-05, Generator Loss: 9.93591022491455
Epoch 6000, Discriminator Loss: 9.085367310035508e-06, Generator Loss: 10.926456451416016
Epoch 6100, Discriminator Loss: 0

In [8]:
# Define X_train and y_train for the baseline model
X_train, y_train = data.drop(columns=['Diabetes_012']), data['Diabetes_012']

In [10]:
# Step 6: Model Training with Synthetic Data
# Generate synthetic data using the trained generator model

# Define the number of synthetic samples to generate
num_synthetic_samples = len(data)  # Generate as many synthetic samples as the original data

# Generate random noise for the generator input
noise = np.random.normal(0, 1, (num_synthetic_samples, latent_dim))

# Generate synthetic data
synthetic_data = generator.predict(noise)

# Combine original and synthetic data
augmented_data = np.concatenate([data.values, synthetic_data], axis=0)

# Create labels for the augmented dataset
augmented_labels = np.concatenate([np.ones(len(data)), np.zeros(len(synthetic_data))])

# Shuffle the augmented dataset
shuffled_indices = np.random.permutation(len(augmented_data))
augmented_data = augmented_data[shuffled_indices]
augmented_labels = augmented_labels[shuffled_indices]

# Split the augmented dataset into features and labels
X_augmented = augmented_data
y_augmented = augmented_labels

# Train your machine learning model using the augmented dataset
# Example:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Split the augmented dataset into train and test sets
X_train_augmented, X_test_augmented, y_train_augmented, y_test_augmented = train_test_split(X_augmented, y_augmented, test_size=0.2, random_state=42)

# Define and train your machine learning model (e.g., RandomForestClassifier)
model_augmented = RandomForestClassifier()
model_augmented.fit(X_train_augmented, y_train_augmented)

# Evaluate the model on the test set
accuracy_augmented = model_augmented.score(X_test_augmented, y_test_augmented)
print("Accuracy on augmented data:", accuracy_augmented)

Accuracy on augmented data: 1.0


In [11]:
# Step 6: Model Training with Synthetic Data

# Assuming you have trained the GAN and generated synthetic_data
# Combine original and synthetic data
augmented_data = np.concatenate([data.values, synthetic_data], axis=0)

# Create labels for the augmented dataset
augmented_labels = np.concatenate([np.ones(len(data)), np.zeros(len(synthetic_data))])

# Shuffle the augmented dataset
shuffled_indices = np.random.permutation(len(augmented_data))
augmented_data = augmented_data[shuffled_indices]
augmented_labels = augmented_labels[shuffled_indices]

# Split the augmented dataset into features and labels
X_augmented = augmented_data
y_augmented = augmented_labels

# Train your machine learning model using the augmented dataset
# Example:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Split the augmented dataset into train and test sets
X_train_augmented, X_test_augmented, y_train_augmented, y_test_augmented = train_test_split(X_augmented, y_augmented, test_size=0.2, random_state=42)

# Define and train your machine learning model (e.g., RandomForestClassifier)
model_augmented = RandomForestClassifier()
model_augmented.fit(X_train_augmented, y_train_augmented)

# Step 7: Model Evaluation
# Evaluate the performance of the trained models using appropriate evaluation metrics
# Compare the performance of models trained with and without synthetic data

# Assuming you have defined X_test and y_test using the original data
# Evaluate the baseline model on the original test set
X_test, y_test = data.drop(columns=['Diabetes_012']), data['Diabetes_012']
accuracy_baseline = model_baseline.score(X_test, y_test)
print("Baseline Model Accuracy:", accuracy_baseline)

# Evaluate the model trained with augmented data on the augmented test set
accuracy_augmented = model_augmented.score(X_test_augmented, y_test_augmented)
print("Model Trained with Augmented Data Accuracy:", accuracy_augmented)

# Compare the performance of the baseline and augmented models
if accuracy_augmented > accuracy_baseline:
    print("Model trained with augmented data performs better.")
else:
    print("Baseline model performs better.")


Baseline Model Accuracy: 0.8501024913276569
Model Trained with Augmented Data Accuracy: 1.0
Model trained with augmented data performs better.


In [12]:
# Step 8: Hyperparameter Tuning
# Perform hyperparameter tuning to optimize the performance of the models
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Define the hyperparameters grid (reduced for faster execution)
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10],
}

# Initialize GridSearchCV for baseline model
grid_search_baseline = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=3)
grid_search_baseline.fit(X_train, y_train)

# Initialize GridSearchCV for model trained on augmented data
grid_search_augmented = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=3)
grid_search_augmented.fit(X_train_augmented, y_train_augmented)

# Get the best hyperparameters and models
best_params_baseline = grid_search_baseline.best_params_
best_model_baseline = grid_search_baseline.best_estimator_

best_params_augmented = grid_search_augmented.best_params_
best_model_augmented = grid_search_augmented.best_estimator_

# Evaluate the best models on the test set
accuracy_baseline = best_model_baseline.score(X_test, y_test)
accuracy_augmented = best_model_augmented.score(X_test_augmented, y_test_augmented)

print("Best Model Accuracy (Baseline):", accuracy_baseline)
print("Best Model Hyperparameters (Baseline):", best_params_baseline)
print("Best Model Accuracy (Augmented Data):", accuracy_augmented)
print("Best Model Hyperparameters (Augmented Data):", best_params_augmented)


Best Model Accuracy (Baseline): 0.8513363292336802
Best Model Hyperparameters (Baseline): {'max_depth': 10, 'n_estimators': 50}
Best Model Accuracy (Augmented Data): 1.0
Best Model Hyperparameters (Augmented Data): {'max_depth': None, 'n_estimators': 50}


In [13]:
from sklearn.model_selection import cross_val_score

# Step 9: Validation and Interpretation
# Validate the trained models using cross-validation or a holdout validation set
# Interpret the results and assess the clinical relevance of the model predictions

# Validate the baseline model using cross-validation
baseline_scores = cross_val_score(best_model_baseline, X_train, y_train, cv=5)
print("Baseline Model Cross-Validation Scores:", baseline_scores)
print("Baseline Model Mean Cross-Validation Score:", baseline_scores.mean())

# Validate the model trained on augmented data using cross-validation
augmented_scores = cross_val_score(best_model_augmented, X_train_augmented, y_train_augmented, cv=5)
print("Augmented Data Model Cross-Validation Scores:", augmented_scores)
print("Augmented Data Model Mean Cross-Validation Score:", augmented_scores.mean())



Baseline Model Cross-Validation Scores: [0.84843109 0.84850993 0.84766241 0.84945601 0.84850993]
Baseline Model Mean Cross-Validation Score: 0.848513875748975
Augmented Data Model Cross-Validation Scores: [1. 1. 1. 1. 1.]
Augmented Data Model Mean Cross-Validation Score: 1.0
