In [4]:
import numpy as np
import pandas as pd

# ---------------------------------------------------------
# Synthetic kidney dataset generator
#
# - NO real patient rows
# - Encodes realistic kidney patterns:
#     * Dialysis patients -> worse kidney labs
#     * Non-dialysis patients -> closer to normal
# - Safe to publish on GitHub (fully synthetic)
# ---------------------------------------------------------

def sample_labs_for_class(rng, cls):
    """
    Draw one synthetic lab panel for a single patient.

    The panel is conditioned on dialysis status: ``cls == 1`` uses the
    dialysis profile (worse kidney labs), any other value uses the
    non-dialysis profile (closer to normal). All values are synthetic.

    Parameters
    ----------
    rng
        Random generator exposing ``normal``, ``lognormal`` and ``choice``
        (e.g. a ``numpy.random.Generator``). It is consumed in a fixed
        order: five normal draws, one lognormal draw, one categorical draw.
    cls
        1 for dialysis, otherwise non-dialysis.

    Returns
    -------
    dict
        Keys, in order: "BUN", "Creat", "K", "Phos", "Hb",
        "Urine Protein 24 hrs" (numeric, floored at 0) and "Protein"
        (semi-quantitative grade label).
    """
    if cls == 1:
        # Dialysis / advanced kidney dysfunction.
        gaussian_specs = (
            ("BUN", 70, 20),       # mg/dL
            ("Creat", 5.5, 1.5),   # mg/dL
            ("K", 5.2, 0.7),       # mmol/L
            ("Phos", 5.5, 1.2),    # mg/dL
            ("Hb", 10.0, 2.0),     # g/dL (often anemic)
        )
        # Heavier proteinuria: larger 24h protein and higher grades.
        log_mean, log_sigma = 3.5, 0.6
        grades, grade_probs = ["2+", "3+", "4+"], [0.3, 0.4, 0.3]
    else:
        # Non-dialysis (mix of healthy + mild CKD).
        gaussian_specs = (
            ("BUN", 20, 8),
            ("Creat", 1.0, 0.3),
            ("K", 4.2, 0.4),
            ("Phos", 3.7, 0.6),
            ("Hb", 13.5, 1.5),
        )
        # Little to no proteinuria.
        log_mean, log_sigma = 1.5, 0.7
        grades, grade_probs = ["neg", "trace", "1+"], [0.4, 0.4, 0.2]

    panel = {}
    # Normal draws happen first, in table order; negatives are floored at 0.
    for analyte, mu, sd in gaussian_specs:
        panel[analyte] = max(rng.normal(mu, sd), 0)

    panel["Urine Protein 24 hrs"] = max(
        rng.lognormal(mean=log_mean, sigma=log_sigma), 0
    )
    panel["Protein"] = rng.choice(grades, p=grade_probs)
    return panel


def generate_synthetic_kidney_long(
    n_patients=3000,
    dialysis_rate=0.333,  # ~33% dialysis, 67% non-dialysis
    random_state=42,
):
    """
    Generate a long-format synthetic dataset with the same column structure
    as the original hospital data:

    NationalNo, age, gender, ServiceCode, ServiceName, Answer, ReceptionDate, RequestPartName

    - Each patient has multiple rows (one per lab test returned by
      ``sample_labs_for_class``).
    - RequestPartName = "دياليز" for dialysis patients, some other department otherwise.
    - ReceptionDate is drawn independently for every row, so the tests of one
      patient may carry different dates (all within 2023).

    Parameters
    ----------
    n_patients
        Number of synthetic patients; ids run from 1 to ``n_patients``.
        Zero or a negative value yields an empty frame that still has the
        eight columns listed above.
    dialysis_rate
        Probability that a patient is assigned to the dialysis class.
    random_state
        Seed handed to ``np.random.default_rng``; the same seed reproduces
        the same dataset.

    Returns
    -------
    pandas.DataFrame
        One row per (patient, lab test). ``Answer`` holds numbers for most
        tests and a grade label for the semi-quantitative "Protein" test.
    """

    rng = np.random.default_rng(random_state)

    # Some non-dialysis departments (Persian labels)
    other_deps = [
        "اورژانس",       # emergency
        "داخلی",         # internal medicine
        "قلب",           # cardiology
        "جراحی عمومی",   # general surgery
    ]

    # ServiceCode is arbitrary but consistent per test; built once instead of
    # once per generated row.
    service_codes = {
        "BUN": "T001",
        "Creat": "T002",
        "K": "T003",
        "Phos": "T004",
        "Hb": "T005",
        "Urine Protein 24 hrs": "T006",
        "Protein": "T007",
    }

    # Explicit schema so the result has these columns even when no rows are
    # generated (a bare DataFrame built from an empty list has none).
    columns = [
        "NationalNo",
        "age",
        "gender",
        "ServiceCode",
        "ServiceName",
        "Answer",
        "ReceptionDate",
        "RequestPartName",
    ]

    # First day of the fake reception date range.
    start_date = pd.Timestamp("2023-01-01")

    rows = []

    for pid in range(1, n_patients + 1):
        # Dialysis or not
        cls = 1 if rng.random() < dialysis_rate else 0

        # Basic demographics: age ~ N(55 or 45, 15), clipped to [18, 90]
        age = int(np.clip(rng.normal(55 if cls == 1 else 45, 15), 18, 90))
        gender = rng.choice(["مرد", "زن"])  # Persian: male / female

        # Department / ward name
        if cls == 1:
            request_part = "دياليز"
        else:
            request_part = rng.choice(other_deps)

        # Sample synthetic labs for this patient
        labs = sample_labs_for_class(rng, cls)

        # For each test, create one row in "long" format
        for service_name, value in labs.items():
            # Offset of 0..364 days keeps every date inside 2023.
            reception_date = start_date + pd.to_timedelta(
                int(rng.integers(0, 365)), unit="D"
            )

            rows.append(
                {
                    "NationalNo": pid,
                    "age": age,
                    "gender": gender,
                    "ServiceCode": service_codes.get(service_name, "T999"),
                    "ServiceName": service_name,
                    # Numeric for most tests, string for the Protein grade.
                    "Answer": value,
                    "ReceptionDate": reception_date,
                    "RequestPartName": request_part,
                }
            )

    return pd.DataFrame(rows, columns=columns)


if __name__ == "__main__":
    # Demo run: build the synthetic long-format table with a fixed seed
    # (about one third of the patients on dialysis) ...
    df_syn = generate_synthetic_kidney_long(
        n_patients=3000, dialysis_rate=0.333, random_state=42
    )
    print("Synthetic dataset shape (long format):", df_syn.shape)

    # ... and write it, without the index column, to an Excel workbook at
    # this relative path.
    df_syn.to_excel("kidney_synthetic.xlsx", index=False)
    print("Saved kidney_synthetic.xlsx")


Synthetic dataset shape (long format): (21000, 8)
Saved kidney_synthetic.xlsx
