In [20]:
import os
import json
from pathlib import Path
import numpy as np
import pandas as pd
import pickle
import plotly.io as pio
from itertools import combinations

from fairlearn.datasets import fetch_diabetes_hospital
from sklearn.model_selection import train_test_split
import sdmetrics
from sdmetrics.reports.single_table import QualityReport, DiagnosticReport
from sdmetrics.single_column import (
    KSComplement,
    TVComplement,
    CategoryCoverage,
    RangeCoverage,
    MissingValueSimilarity,
    StatisticSimilarity,
    BoundaryAdherence,
    CategoryAdherence,
)
from sdmetrics.column_pairs import CorrelationSimilarity, ContingencySimilarity
from sdmetrics.single_table import NewRowSynthesis, TableStructure
from sdmetrics.visualization import get_column_plot, get_column_pair_plot

In [2]:
# Load the diabetes hospital dataset as pandas objects and work on copies so
# the fetched bunch itself is never mutated.
data = fetch_diabetes_hospital(as_frame=True)
X = data.data.copy()
y = data.target.copy()

# Remove any outcome columns that are present among the features, so the
# target is only re-attached once below.
dropped_columns = [c for c in ["readmitted", "readmit_binary"] if c in X.columns]
X = X.drop(columns=dropped_columns)

real_data = X.copy()
# Boolean target: True where the fetched target equals 1.
real_data["readmit_binary"] = (y == 1)  # boolean target

# 80/20 split with a fixed seed, stratified on the target column.
real_train, real_test = train_test_split(
    real_data,
    test_size=0.2,
    random_state=66,
    stratify=real_data["readmit_binary"]
)

# Discard the shuffled original index so both splits are indexed 0..n-1.
real_train = real_train.reset_index(drop=True)
real_test  = real_test.reset_index(drop=True)

# Displayed as the cell output: split shapes and the target dtype.
real_train.shape, real_test.shape, real_train["readmit_binary"].dtype

((81412, 23), (20354, 23), dtype('bool'))

In [3]:
# Location of the optional metadata JSON; it is passed to
# load_or_build_single_table_metadata below, which reads it when the file exists.
ARTIFACT_META_PATH = Path("../artifacts/diabetes_metadata.json")

def guess_sdtype_from_series(s: pd.Series) -> str:
    """Map a pandas Series' dtype to an sdtype label.

    Args:
        s: Column whose dtype is inspected.

    Returns:
        "boolean", "datetime" or "numerical" for the matching dtype family,
        otherwise "categorical".
    """
    # Order matters: bool dtypes also satisfy the numeric check, so the
    # boolean test has to come first.
    dtype_checks = (
        (pd.api.types.is_bool_dtype, "boolean"),
        (pd.api.types.is_datetime64_any_dtype, "datetime"),
        (pd.api.types.is_numeric_dtype, "numerical"),
    )
    return next((label for check, label in dtype_checks if check(s)), "categorical")

def load_or_build_single_table_metadata(df: pd.DataFrame, path: Path | None = None, table_name_hint: str = "diabetes") -> dict:
    
    if path is not None and path.exists():
        with path.open("r") as f:
            meta = json.load(f)

        if "tables" in meta:

            table_name = table_name_hint if table_name_hint in meta["tables"] else next(iter(meta["tables"]))
            meta = meta["tables"][table_name]

        if "columns" in meta:
            return meta

    meta = {"columns": {}}
    for col in df.columns:
        meta["columns"][col] = {"sdtype": guess_sdtype_from_series(df[col])}

    for col in ["race", "gender"]:
        if col in meta["columns"]:
            meta["columns"][col]["sdtype"] = "categorical"
    if "readmit_binary" in meta["columns"]:
        meta["columns"]["readmit_binary"]["sdtype"] = "boolean"

    return meta

# Build (or load from ARTIFACT_META_PATH, when that file exists) the column
# metadata used by every sdmetrics call below.
metadata = load_or_build_single_table_metadata(real_train, ARTIFACT_META_PATH)
# Displayed as the cell output: the first five column entries and the target's entry.
list(metadata["columns"].items())[:5], metadata["columns"]["readmit_binary"]

([('race', {'sdtype': 'categorical'}),
  ('gender', {'sdtype': 'categorical'}),
  ('age', {'sdtype': 'categorical'}),
  ('discharge_disposition_id', {'sdtype': 'id'}),
  ('admission_source_id', {'sdtype': 'id'})],
 {'sdtype': 'boolean'})

In [4]:
def coerce_boolean(series: pd.Series) -> pd.Series:
    """Convert a boolean-like column to pandas' nullable "boolean" dtype.

    Recognised values are real booleans, 0/1, and the strings "0"/"1",
    "True"/"False" and "true"/"false". When fewer than 90% of the rows are
    recognised, the values that were NOT recognised are additionally parsed
    as numbers and treated as True when non-zero. Anything still unrecognised
    becomes <NA>.

    Args:
        series: Input column; it is not modified.

    Returns:
        A new Series with dtype "boolean".
    """
    s = series.copy()

    if pd.api.types.is_bool_dtype(s):
        return s.astype("boolean")

    # 0/1 also match False/True and 0.0/1.0 because they hash and compare equal.
    mapping = {
        0: False, 1: True,
        "0": False, "1": True,
        "False": False, "True": True,
        "false": False, "true": True,
    }
    s2 = s.map(mapping)

    if s2.notna().mean() < 0.9 and s.notna().mean() > 0:
        numeric = pd.to_numeric(s, errors="coerce").map(
            lambda x: np.nan if pd.isna(x) else bool(int(x))
        )
        # Only fill the gaps: values the lookup already resolved (for example
        # "True" strings in a sparse column) must not be wiped out by the
        # numeric parse, which cannot read them.
        merged = s2.to_numpy(dtype=object).copy()
        unmapped = pd.isna(merged)
        merged[unmapped] = numeric.to_numpy(dtype=object)[unmapped]
        s2 = pd.Series(merged, index=s.index, name=s.name, dtype=object)

    return s2.astype("boolean")

def sanitize_for_sdmetrics(df: pd.DataFrame, meta: dict) -> pd.DataFrame:
    """Return a copy of ``df`` whose columns are cast according to ``meta``.

    Args:
        df: Table to clean; it is left untouched.
        meta: Metadata dict; each entry under "columns" may carry an "sdtype".

    Returns:
        A new DataFrame where boolean columns are nullable booleans,
        categorical/id/other columns are object dtype, and numerical/datetime
        columns are parsed with unparseable values coerced to missing.
        Columns with any other sdtype, or not listed in ``meta``, are unchanged.
    """
    cleaned = df.copy()
    object_like = ("categorical", "id", "other")
    column_specs = meta.get("columns", {})

    for column_name in column_specs:
        if column_name not in cleaned.columns:
            continue

        kind = column_specs[column_name].get("sdtype")
        values = cleaned[column_name]

        if kind == "boolean":
            converted = coerce_boolean(values)
        elif kind in object_like:
            converted = values.astype("object")
        elif kind == "numerical":
            converted = pd.to_numeric(values, errors="coerce")
        elif kind == "datetime":
            converted = pd.to_datetime(values, errors="coerce")
        else:
            # Unknown sdtype: leave the column exactly as it is.
            continue

        cleaned[column_name] = converted

    return cleaned

# Cast both real splits to the dtypes implied by the metadata.
real_train_s = sanitize_for_sdmetrics(real_train, metadata)
real_test_s  = sanitize_for_sdmetrics(real_test, metadata)

# Displayed as the cell output: dtypes of the first ten columns and of the target.
real_train_s.dtypes.head(10), real_train_s["readmit_binary"].dtype

(race                        object
 gender                      object
 age                         object
 discharge_disposition_id    object
 admission_source_id         object
 time_in_hospital             int64
 medical_specialty           object
 num_lab_procedures           int64
 num_procedures              object
 num_medications              int64
 dtype: object,
 BooleanDtype)

In [10]:
def load_model_syn_data(model_path, sample_len):
    """Load a pickled synthesizer and draw a synthetic sample from it.

    Args:
        model_path: Path (``pathlib.Path`` or str) of the pickled model.
        sample_len: Number of rows requested via ``model.sample(num_rows=...)``.

    Returns:
        Tuple ``(model, synthetic_dataset)``.

    Raises:
        FileNotFoundError: If ``model_path`` is not an existing file. Without
            this check the function would return None and callers that unpack
            the result would fail with an unrelated TypeError.
    """
    model_path = Path(model_path)
    if not model_path.is_file():
        raise FileNotFoundError(f"Synthesizer artifact not found: {model_path}")

    # SECURITY: unpickling runs arbitrary code from the file. Only load
    # artifacts that were produced by this project / a trusted source.
    with model_path.open("rb") as f:
        model = pickle.load(f)

    synthetic_dataset = model.sample(num_rows=sample_len)
    return model, synthetic_dataset

In [5]:
# Pickled synthesizer artifacts, one per model family.
# NOTE(review): "copuula" looks like a typo, but it presumably matches the file
# name on disk (the load below succeeds) — confirm before renaming.
gc_path = Path("../artifacts/gaussian_copuula_diabetes.pkl")
ct_path = Path("../artifacts/ctgan_diabetes.pkl")
tv_path = Path("../artifacts/tvae_diabetes.pkl")

In [11]:
# Draw as many synthetic rows as there are real training rows.
# NOTE(review): no seed is passed to the sampler here, so the sample may
# differ between runs — confirm whether the saved model fixes its own seed.
sample_len = len(real_train_s)
gc_model, gc_gendata = load_model_syn_data(gc_path, sample_len)
gc_gendata.head()

Unnamed: 0,race,gender,age,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,...,A1Cresult,insulin,change,diabetesMed,medicare,medicaid,had_emergency,had_inpatient_days,had_outpatient_days,readmit_binary
0,Caucasian,Male,'Over 60 years','Discharged to Home',Emergency,4,Missing,54,3,9,...,,Steady,Ch,Yes,True,False,False,True,False,False
1,Caucasian,Male,'30 years or younger','Discharged to Home',Emergency,6,Family/GeneralPractice,29,0,15,...,,No,Ch,Yes,False,False,False,False,True,False
2,Caucasian,Male,'30-60 years','Discharged to Home',Emergency,4,Emergency/Trauma,11,0,5,...,,Steady,No,Yes,True,False,False,False,False,False
3,Caucasian,Male,'30-60 years','Discharged to Home',Emergency,2,Missing,6,6,8,...,,No,Ch,No,False,False,False,True,False,True
4,Unknown,Female,'Over 60 years','Discharged to Home',Other,6,InternalMedicine,53,0,27,...,,No,Ch,Yes,False,False,False,False,True,False


In [12]:
# Load the model stored at ct_path and sample sample_len rows from it.
ct_model, ct_gendata = load_model_syn_data(ct_path, sample_len)
ct_gendata.head()

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Unnamed: 0,race,gender,age,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,...,A1Cresult,insulin,change,diabetesMed,medicare,medicaid,had_emergency,had_inpatient_days,had_outpatient_days,readmit_binary
0,AfricanAmerican,Female,'Over 60 years','Discharged to Home',Referral,3,Missing,19,5,6,...,,Steady,No,Yes,True,False,False,False,False,False
1,Caucasian,Female,'Over 60 years',Other,Emergency,6,InternalMedicine,62,0,21,...,,No,No,Yes,True,False,False,False,False,False
2,Hispanic,Male,'30-60 years','Discharged to Home',Emergency,4,Missing,42,2,32,...,,Up,Ch,No,False,False,False,False,False,False
3,Caucasian,Female,'Over 60 years','Discharged to Home',Emergency,2,Missing,46,1,9,...,,No,No,Yes,True,False,True,False,False,True
4,Caucasian,Female,'30-60 years','Discharged to Home',Referral,4,Family/GeneralPractice,40,0,23,...,,No,Ch,Yes,True,False,False,False,False,False


In [13]:
# Load the model stored at tv_path and sample sample_len rows from it.
tv_model, tv_gendata = load_model_syn_data(tv_path, sample_len)
tv_gendata.head()

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Unnamed: 0,race,gender,age,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,...,A1Cresult,insulin,change,diabetesMed,medicare,medicaid,had_emergency,had_inpatient_days,had_outpatient_days,readmit_binary
0,Caucasian,Female,'30-60 years',Other,Emergency,6,Other,50,0,16,...,,No,No,No,True,False,False,False,False,False
1,Caucasian,Female,'Over 60 years',Other,Emergency,2,Missing,3,0,9,...,,No,No,No,False,False,False,False,False,False
2,AfricanAmerican,Male,'30-60 years','Discharged to Home',Emergency,5,Missing,50,3,10,...,,Steady,No,Yes,False,False,False,False,False,False
3,Caucasian,Female,'30-60 years','Discharged to Home',Emergency,8,Other,41,0,18,...,,Steady,No,Yes,False,False,False,False,False,False
4,Caucasian,Female,'30-60 years',Other,Emergency,2,Family/GeneralPractice,43,0,12,...,,No,No,No,False,False,False,True,False,False


In [14]:
# Synthetic samples keyed by a display name; the keys are reused below as
# report labels and as file-name prefixes under OUT_DIR.
syn_datasets = {
    "GaussianCopula" : gc_gendata,
    "CTGAN" : ct_gendata,
    "TVAE" : tv_gendata,
}

In [15]:
def align_like_real(real_df: pd.DataFrame, syn_df: pd.DataFrame) -> pd.DataFrame:
    """Give a synthetic table the same columns, in the same order, as the real one.

    Args:
        real_df: Reference table that defines the column set and order.
        syn_df: Synthetic table; it is not modified.

    Returns:
        A copy of ``syn_df`` without columns unknown to ``real_df``, with
        all-NaN columns added for any that were missing, ordered like ``real_df``.
    """
    target_columns = real_df.columns
    aligned = syn_df.copy()

    surplus = [name for name in aligned.columns if name not in target_columns]
    if surplus:
        aligned = aligned.drop(columns=surplus)

    for name in target_columns:
        if name not in aligned.columns:
            aligned[name] = np.nan

    return aligned[target_columns]

# For every synthetic sample: match the real training columns, then apply the
# same metadata-driven dtype casts that were applied to the real data.
syn_datas_s = {}
for name, syn_df in syn_datasets.items():
    syn_aligned = align_like_real(real_train_s, syn_df)
    syn_datas_s[name] = sanitize_for_sdmetrics(syn_aligned, metadata)

# Displayed as the cell output: shape of each cleaned synthetic table.
{k: v.shape for k, v in syn_datas_s.items()}

{'GaussianCopula': (81412, 23), 'CTGAN': (81412, 23), 'TVAE': (81412, 23)}

In [16]:
# Directory that receives every report artifact written below; created on
# first use (including parents) and left alone if it already exists.
OUT_DIR = Path("../reports/sdmetrics")
OUT_DIR.mkdir(parents=True, exist_ok=True)

def run_reports_for_one(name: str, real_df: pd.DataFrame, syn_df: pd.DataFrame, meta: dict, verbose: bool = False):
    """Generate the diagnostic and quality reports for one synthetic table.

    Args:
        name: Label of the synthetic dataset (not used by the computation).
        real_df: Real reference table.
        syn_df: Synthetic table to evaluate.
        meta: Single-table metadata dict passed to both reports.
        verbose: Forwarded to each report's ``generate`` call.

    Returns:
        Tuple ``(diagnostic_report, quality_report)``, both already generated.
    """
    generated = []
    # Diagnostic first, then quality, matching the order of the returned tuple.
    for report_cls in (DiagnosticReport, QualityReport):
        report = report_cls()
        report.generate(real_df, syn_df, meta, verbose=verbose)
        generated.append(report)

    diag, qual = generated
    return diag, qual

# Generate, persist and summarise both reports for every synthetic dataset.
all_reports = {}    # name -> {"diagnostic": report, "quality": report}
summary_rows = []   # one flat record per model, turned into summary_df below

for name, syn_df in syn_datas_s.items():
    diag, qual = run_reports_for_one(name, real_train_s, syn_df, metadata, verbose=False)

    # Persist both report objects under OUT_DIR, prefixed with the model name.
    diag_path = OUT_DIR / f"{name}_diagnostic_report.pkl"
    qual_path = OUT_DIR / f"{name}_quality_report.pkl"
    diag.save(str(diag_path))
    qual.save(str(qual_path))

    diag_score = float(diag.get_score())
    qual_score = float(qual.get_score())

    diag_props = diag.get_properties()
    qual_props = qual.get_properties()

    # Normalise the diagnostic properties to a {property: score} dict whether
    # they arrive as a DataFrame or as something dict()-convertible.
    if isinstance(diag_props, pd.DataFrame):
        diag_props_dict = dict(zip(diag_props["Property"], diag_props["Score"]))
    else:
        diag_props_dict = dict(diag_props)

    # NOTE(review): unlike diag_props, qual_props is assumed to always be a
    # DataFrame with "Property"/"Score" columns — confirm for the installed
    # sdmetrics version.
    qual_props_dict = dict(zip(qual_props["Property"], qual_props["Score"]))

    all_reports[name] = {"diagnostic": diag, "quality": qual}

    # Properties missing from a report are recorded as NaN.
    summary_rows.append({
        "model": name,
        "diagnostic_score": diag_score,
        "quality_score": qual_score,
        "diag_data_validity": float(diag_props_dict.get("Data Validity", np.nan)),
        "diag_data_structure": float(diag_props_dict.get("Data Structure", np.nan)),
        "qual_column_shapes": float(qual_props_dict.get("Column Shapes", np.nan)),
        "qual_column_pair_trends": float(qual_props_dict.get("Column Pair Trends", np.nan)),
    })

# Best quality score first.
summary_df = pd.DataFrame(summary_rows).sort_values("quality_score", ascending=False)
summary_df

  real_data = real_data.fillna(np.nan)
  synthetic_data = synthetic_data.fillna(np.nan)
  real_data = real_data.fillna(np.nan)
  synthetic_data = synthetic_data.fillna(np.nan)
  real_data = real_data.fillna(np.nan)
  synthetic_data = synthetic_data.fillna(np.nan)


Unnamed: 0,model,diagnostic_score,quality_score,diag_data_validity,diag_data_structure,qual_column_shapes,qual_column_pair_trends
2,TVAE,1.0,0.936193,1.0,1.0,0.946341,0.926045
0,GaussianCopula,1.0,0.89637,1.0,1.0,0.979264,0.813475
1,CTGAN,1.0,0.875933,1.0,1.0,0.899921,0.851945


In [18]:
# Export per-property detail tables (CSV) and visualizations (HTML) for every
# model, reusing the reports generated in the previous cell.
for name in syn_datas_s.keys():
    diag = all_reports[name]["diagnostic"]
    qual = all_reports[name]["quality"]

    diag_validity = diag.get_details("Data Validity")
    qual_shapes   = qual.get_details("Column Shapes")
    qual_pairs    = qual.get_details("Column Pair Trends")

    diag_validity.to_csv(OUT_DIR / f"{name}_diag_data_validity.csv", index=False)
    qual_shapes.to_csv(OUT_DIR / f"{name}_qual_column_shapes.csv", index=False)
    qual_pairs.to_csv(OUT_DIR / f"{name}_qual_column_pair_trends.csv", index=False)

    fig_diag = diag.get_visualization("Data Validity")
    fig_shapes = qual.get_visualization("Column Shapes")
    fig_pairs  = qual.get_visualization("Column Pair Trends")

    # include_plotlyjs="cdn": the HTML references plotly.js from a CDN rather
    # than embedding it, so the files need network access to render.
    fig_diag.write_html(OUT_DIR / f"{name}_diag_data_validity.html", include_plotlyjs="cdn")
    fig_shapes.write_html(OUT_DIR / f"{name}_qual_column_shapes.html", include_plotlyjs="cdn")
    fig_pairs.write_html(OUT_DIR / f"{name}_qual_column_pair_trends.html", include_plotlyjs="cdn")

print("Saved report artifacts to:", OUT_DIR.resolve())

Saved report artifacts to: /home/ics-home/capstone_project/capstone_project/reports/sdmetrics


In [19]:
def compute_column_metric_safe(metric_cls, real_col, syn_col, **kwargs) -> float:
    """Compute a single-column metric, returning NaN instead of raising.

    The metric is first called with ``real_data=``/``synthetic_data=`` keyword
    arguments; if that raises TypeError it is retried with positional
    arguments. Any failure is reported as a RuntimeWarning so broken metrics
    remain visible instead of silently turning into NaN.

    Args:
        metric_cls: Object exposing a ``compute`` callable.
        real_col: Real column passed to the metric.
        syn_col: Synthetic column passed to the metric.
        **kwargs: Extra options forwarded to ``compute``.

    Returns:
        The metric score as a float, or NaN when it could not be computed.
    """
    import warnings  # local import keeps this definition self-contained

    try:
        try:
            return float(metric_cls.compute(real_data=real_col, synthetic_data=syn_col, **kwargs))
        except TypeError:
            # Retry for metrics that do not accept the keyword names.
            return float(metric_cls.compute(real_col, syn_col, **kwargs))
    except Exception as exc:
        metric_name = getattr(metric_cls, "__name__", repr(metric_cls))
        warnings.warn(
            f"{metric_name} could not be computed ({type(exc).__name__}: {exc}); recording NaN.",
            RuntimeWarning,
            stacklevel=2,
        )
        return np.nan

def per_column_metrics(real_df: pd.DataFrame, syn_df: pd.DataFrame, meta: dict) -> pd.DataFrame:
    """Score every metadata column that exists in both tables.

    Each column gets MissingValueSimilarity. Numerical/datetime columns add
    distribution, coverage, boundary and summary-statistic metrics;
    categorical/boolean columns add category-based metrics. Columns with any
    other sdtype only get the missing-value score.

    Args:
        real_df: Real reference table.
        syn_df: Synthetic table.
        meta: Metadata dict with a "columns" mapping carrying "sdtype" entries.

    Returns:
        Long-format DataFrame with columns "column", "sdtype", "metric", "score".
    """
    continuous_kinds = ("numerical", "datetime")
    discrete_kinds = ("categorical", "boolean")
    records = []

    for column_name, column_info in meta["columns"].items():
        in_both = column_name in real_df.columns and column_name in syn_df.columns
        if not in_both:
            continue

        kind = column_info.get("sdtype")
        real_values = real_df[column_name]
        syn_values = syn_df[column_name]

        def add(label, metric_cls, **options):
            # Append one result row for the column currently being processed.
            records.append({
                "column": column_name,
                "sdtype": kind,
                "metric": label,
                "score": compute_column_metric_safe(metric_cls, real_values, syn_values, **options),
            })

        add("MissingValueSimilarity", MissingValueSimilarity)

        if kind in continuous_kinds:
            add("KSComplement", KSComplement)
            add("RangeCoverage", RangeCoverage)
            add("BoundaryAdherence", BoundaryAdherence)
            add("StatisticSimilarity(mean)", StatisticSimilarity, statistic="mean")
            add("StatisticSimilarity(median)", StatisticSimilarity, statistic="median")
            add("StatisticSimilarity(std)", StatisticSimilarity, statistic="std")
        elif kind in discrete_kinds:
            add("TVComplement", TVComplement)
            add("CategoryCoverage", CategoryCoverage)
            add("CategoryAdherence", CategoryAdherence)

    return pd.DataFrame(records)

# Compute the per-column metric table for every model, keep it in memory
# keyed by model name, and write it to OUT_DIR as CSV.
all_column_metrics = {}
for name, syn_df in syn_datas_s.items():
    dfm = per_column_metrics(real_train_s, syn_df, metadata)
    all_column_metrics[name] = dfm
    dfm.to_csv(OUT_DIR / f"{name}_individual_column_metrics.csv", index=False)

# Displayed as the cell output: first rows for one model.
all_column_metrics["GaussianCopula"].head(10)


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True

Unnamed: 0,column,sdtype,metric,score
0,race,categorical,MissingValueSimilarity,1.0
1,race,categorical,TVComplement,0.994976
2,race,categorical,CategoryCoverage,1.0
3,race,categorical,CategoryAdherence,1.0
4,gender,categorical,MissingValueSimilarity,1.0
5,gender,categorical,TVComplement,0.999656
6,gender,categorical,CategoryCoverage,1.0
7,gender,categorical,CategoryAdherence,1.0
8,age,categorical,MissingValueSimilarity,1.0
9,age,categorical,TVComplement,0.998968


In [None]:
def infer_cols_from_metadata(meta: dict):
    """Split metadata columns into numeric-like and category-like name lists.

    Args:
        meta: Metadata dict with a "columns" mapping carrying "sdtype" entries.

    Returns:
        Tuple ``(num_cols, cat_cols)``: names whose sdtype is
        numerical/datetime, and names whose sdtype is categorical/boolean,
        each in metadata order. Other sdtypes appear in neither list.
    """
    kinds = {name: info.get("sdtype") for name, info in meta["columns"].items()}
    numeric_like = [name for name, kind in kinds.items() if kind in ("numerical", "datetime")]
    category_like = [name for name, kind in kinds.items() if kind in ("categorical", "boolean")]
    return numeric_like, category_like

def compute_pair_metric_safe(metric_cls, real_df2, syn_df2, **kwargs) -> float:
    """Compute a column-pair metric, returning NaN instead of raising.

    The metric is first called with ``real_data=``/``synthetic_data=`` keyword
    arguments; if that raises TypeError it is retried with positional
    arguments. Any failure is reported as a RuntimeWarning so broken metrics
    remain visible instead of silently turning into NaN.

    Args:
        metric_cls: Object exposing a ``compute`` callable.
        real_df2: Real two-column table passed to the metric.
        syn_df2: Synthetic two-column table passed to the metric.
        **kwargs: Extra options forwarded to ``compute``.

    Returns:
        The metric score as a float, or NaN when it could not be computed.
    """
    import warnings  # local import keeps this definition self-contained

    try:
        try:
            return float(metric_cls.compute(real_data=real_df2, synthetic_data=syn_df2, **kwargs))
        except TypeError:
            # Retry for metrics that do not accept the keyword names.
            return float(metric_cls.compute(real_df2, syn_df2, **kwargs))
    except Exception as exc:
        metric_name = getattr(metric_cls, "__name__", repr(metric_cls))
        warnings.warn(
            f"{metric_name} could not be computed ({type(exc).__name__}: {exc}); recording NaN.",
            RuntimeWarning,
            stacklevel=2,
        )
        return np.nan

def per_pair_metrics(real_df: pd.DataFrame, syn_df: pd.DataFrame, meta: dict, subsample_rows: int = 5000, random_state: int = 0):
    """Score every numeric pair and every categorical pair of columns.

    Numerical/datetime column pairs are scored with CorrelationSimilarity and
    categorical/boolean pairs with ContingencySimilarity. Tables larger than
    ``subsample_rows`` are randomly subsampled first (real and synthetic
    independently, both with ``random_state``).

    Args:
        real_df: Real reference table.
        syn_df: Synthetic table.
        meta: Metadata dict with a "columns" mapping carrying "sdtype" entries.
        subsample_rows: Maximum number of rows used from each table.
        random_state: Seed for the row subsampling.

    Returns:
        Tuple ``(correlation_df, contingency_df)``, each with columns
        "col_a", "col_b", "metric", "score" and sorted by ascending score.
        A frame is empty (but keeps its columns) when fewer than two eligible
        columns exist.
    """
    num_cols, cat_cols = infer_cols_from_metadata(meta)

    # Ignore metadata columns that are absent from either table, mirroring
    # per_column_metrics, instead of failing with a KeyError on selection.
    shared = set(real_df.columns) & set(syn_df.columns)
    num_cols = [c for c in num_cols if c in shared]
    cat_cols = [c for c in cat_cols if c in shared]

    def _subsample(frame: pd.DataFrame) -> pd.DataFrame:
        # Cap the row count so the pairwise loops stay affordable.
        if len(frame) > subsample_rows:
            return frame.sample(subsample_rows, random_state=random_state)
        return frame

    real_use = _subsample(real_df)
    syn_use = _subsample(syn_df)

    corr_rows = [
        {
            "col_a": a,
            "col_b": b,
            "metric": "CorrelationSimilarity",
            "score": compute_pair_metric_safe(CorrelationSimilarity, real_use[[a, b]], syn_use[[a, b]]),
        }
        for a, b in combinations(num_cols, 2)
    ]

    cont_rows = [
        {
            "col_a": a,
            "col_b": b,
            "metric": "ContingencySimilarity",
            "score": compute_pair_metric_safe(ContingencySimilarity, real_use[[a, b]], syn_use[[a, b]]),
        }
        for a, b in combinations(cat_cols, 2)
    ]

    # Fixing the columns up front keeps sort_values("score") valid even when a
    # list is empty; an empty DataFrame has no "score" column otherwise.
    result_columns = ["col_a", "col_b", "metric", "score"]
    corr_df = pd.DataFrame(corr_rows, columns=result_columns).sort_values("score")
    cont_df = pd.DataFrame(cont_rows, columns=result_columns).sort_values("score")
    return corr_df, cont_df

# Compute both pairwise metric tables for every model (on at most 5000 rows
# per table), keep them in memory keyed by model name, and write them to
# OUT_DIR as CSV.
all_pair_metrics = {}
for name, syn_df in syn_datas_s.items():
    corr_df, cont_df = per_pair_metrics(real_train_s, syn_df, metadata, subsample_rows=5000)
    all_pair_metrics[name] = {"correlation": corr_df, "contingency": cont_df}
    corr_df.to_csv(OUT_DIR / f"{name}_pair_correlation_similarity.csv", index=False)
    cont_df.to_csv(OUT_DIR / f"{name}_pair_contingency_similarity.csv", index=False)

# Displayed as the cell output: lowest-scoring numeric pairs for one model.
all_pair_metrics["GaussianCopula"]["correlation"].head(10)

Unnamed: 0,col_a,col_b,metric,score
1,time_in_hospital,num_medications,CorrelationSimilarity,0.982495
0,time_in_hospital,num_lab_procedures,CorrelationSimilarity,0.985643
4,num_lab_procedures,number_diagnoses,CorrelationSimilarity,0.986576
5,num_medications,number_diagnoses,CorrelationSimilarity,0.986659
2,time_in_hospital,number_diagnoses,CorrelationSimilarity,0.987254
3,num_lab_procedures,num_medications,CorrelationSimilarity,0.991359
