In [4]:
import pandas as pd
import numpy as np
import os

from fairlearn.datasets import fetch_diabetes_hospital
from sklearn.model_selection import train_test_split
from sdv.metadata import Metadata
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.single_table import CTGANSynthesizer
from sdv.single_table import TVAESynthesizer
from sdv.evaluation.single_table import run_diagnostic, evaluate_quality
from sdv.evaluation.single_table import get_column_plot

In [5]:
data = fetch_diabetes_hospital(as_frame=True)

X = data.data.copy()
y = data.target.copy()

X.shape, y.shape

((101766, 24), (101766,))

In [6]:
dropped_columns = ['readmitted', 'readmit_binary']
X = X.drop(columns=dropped_columns)

real_data = X.copy()
real_data['readmit_binary'] = (y == 1)
real_data['readmit_binary'] = real_data['readmit_binary'].astype(bool)
real_data.shape, real_data['readmit_binary'].dtype

((101766, 23), dtype('bool'))

In [7]:
real_train, real_test = train_test_split(
    real_data,
    test_size=0.2,
    random_state=66,
    stratify=real_data['readmit_binary']
)

real_train = real_train.reset_index(drop=True)
real_test = real_test.reset_index(drop=True)

real_train.shape, real_test.shape

((81412, 23), (20354, 23))

In [8]:
cat_cols = real_train.select_dtypes(include=['category']).columns
real_train[cat_cols] = real_train[cat_cols].astype('object')
real_test[cat_cols] = real_test[cat_cols].astype('object')

real_train['readmit_binary'] = real_train['readmit_binary'].astype(bool)
real_test['readmit_binary'] = real_test['readmit_binary'].astype(bool)

In [9]:
metadata = Metadata.detect_from_dataframe(
    data=real_train,
    table_name='diabetes'
)

In [10]:
meta_dict = metadata.to_dict()
columns = list(meta_dict['tables']['diabetes']['columns'].keys())
for col in columns[:]:
    print(col, "->", meta_dict['tables']['diabetes']['columns'][col].get('sdtype'))

race -> categorical
gender -> categorical
age -> categorical
discharge_disposition_id -> id
admission_source_id -> id
time_in_hospital -> numerical
medical_specialty -> categorical
num_lab_procedures -> numerical
num_procedures -> categorical
num_medications -> numerical
primary_diagnosis -> categorical
number_diagnoses -> numerical
max_glu_serum -> categorical
A1Cresult -> categorical
insulin -> categorical
change -> categorical
diabetesMed -> categorical
medicare -> categorical
medicaid -> categorical
had_emergency -> categorical
had_inpatient_days -> categorical
had_outpatient_days -> categorical
readmit_binary -> categorical


In [11]:
sensitive_attributes = ['race', 'gender']
for col in sensitive_attributes:
    if col in real_train.columns:
        metadata.update_column(column_name=col, sdtype='categorical')
metadata.update_column(column_name='readmit_binary', sdtype='boolean')
metadata.validate()

In [12]:
baseline_models = {
    "gaussian_copuula" : GaussianCopulaSynthesizer(
        metadata,
        enforce_min_max_values=True,
        enforce_rounding=True,
    ),
    "ctgan" : CTGANSynthesizer(
        metadata,
        epochs=500,
        verbose=True,
        enforce_rounding=False,
    ),
    "tvae" : TVAESynthesizer(
        metadata,
        epochs=500,
        verbose=True,
        enforce_rounding=False,
    ),
}



In [13]:
synthetic_train = {}
for name, model in baseline_models.items():
    print(f"\n --- training {name} ---")
    model.fit(real_train)

    synthetic_train[name] = model.sample(num_rows=len(real_train))

    os.makedirs("../artifacts", exist_ok=True)
    model.save(f"../artifacts/{name}_diabetes.pkl")
    print(f"Saved: ../artifacts/{name}_diabetes.pkl")

metadata.save_to_json('../artifacts/diabetes_metadata.json')


 --- training gaussian_copuula ---
Saved: ../artifacts/gaussian_copuula_diabetes.pkl

 --- training ctgan ---


Gen. (-03.56) | Discrim. (-00.34): 100%|██████████| 500/500 [2:44:29<00:00, 19.74s/it]


Saved: ../artifacts/ctgan_diabetes.pkl

 --- training tvae ---


Loss: -02.10: 100%|██████████| 500/500 [2:55:56<00:00, 21.11s/it]


Saved: ../artifacts/tvae_diabetes.pkl


In [14]:
def diag_score(syn_data, real_train, metadata):
  diagnostic = run_diagnostic(
      real_data=real_train,
      synthetic_data=syn_data,
      metadata=metadata
  )
  return diagnostic.get_score()

In [18]:
def quality_score(syn_data, real_train, metadata):
  quality_report = evaluate_quality(
      real_data=real_train,
      synthetic_data=syn_data,
      metadata=metadata
  )
  return quality_report.get_score

In [19]:
for name, syn_data in synthetic_train.items():
  print(f"--{name}--:")
  print(f"\n", diag_score(syn_data, real_train, metadata))
  print(f"\n", quality_score(syn_data, real_train, metadata))

--gaussian_copuula--:
Generating report ...

(1/2) Evaluating Data Validity: |██████████| 23/23 [00:00<00:00, 83.67it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 223.08it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%


 1.0
Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 23/23 [00:00<00:00, 30.94it/s]|
Column Shapes Score: 97.93%

(2/2) Evaluating Column Pair Trends: |██████████| 253/253 [00:06<00:00, 38.19it/s]|
Column Pair Trends Score: 81.39%

Overall Score (Average): 89.66%


 <bound method BaseReport.get_score of <sdmetrics.reports.single_table.quality_report.QualityReport object at 0x7a010064a2d0>>
--ctgan--:
Generating report ...

(1/2) Evaluating Data Validity: |██████████| 23/23 [00:00<00:00, 86.73it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 192.85it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%


 1.0
Generatin

In [21]:
for name, syn_data in synthetic_train.items():
  print(f"--{name}--:")
  fig = get_column_plot(
      real_data=real_train,
      synthetic_data=syn_data,
      metadata=metadata,
      column_name='race'
  )
  fig.show()

Output hidden; open in https://colab.research.google.com to view.