In [72]:
import os

import numpy as np
import pandas as pd

# Seed NumPy's global RNG so every np.random draw in the notebook is reproducible.
np.random.seed(seed=42)

# Number of synthetic rows to generate.
n = 5_000

# Start from an empty frame; the cells below add one column at a time.
df = pd.DataFrame(data=None)

In [73]:
# ------------------------------------------------
# 1. Demographics
# ------------------------------------------------
# All draws use NumPy's global random stream, so the order of the statements
# below determines the generated values.

# Integer ages; `high` is exclusive, so values fall in 18..89.
df["age"] = np.random.randint(low=18, high=90, size=n)

# Two equally likely labels (no `p` given, so the draw is uniform).
df["gender"] = np.random.choice(["Male", "Female"], size=n)

# 40% elective, 30% urgent, 30% emergency.
df["admission_type"] = np.random.choice(
    ["Elective", "Urgent", "Emergency"], size=n, p=[0.4, 0.3, 0.3]
)

# 50% private, 35% government, 15% self-pay.
df["insurance_type"] = np.random.choice(
    ["Private", "Government", "Self-pay"], size=n, p=[0.5, 0.35, 0.15]
)


In [74]:
# ------------------------------------------------
# 2. Clinical Vitals
# ------------------------------------------------
# Each vital is a normal draw clipped to a fixed range.
# Spec layout: column -> (mean, std, lower clip, upper clip).
# The dict order is the draw order, which matters for reproducibility.
vital_specs = {
    "bmi": (27, 5, 16, 45),
    "systolic_bp": (130, 20, 90, 200),
    "diastolic_bp": (80, 12, 50, 120),
    "blood_glucose": (130, 50, 70, 350),
    "cholesterol": (200, 40, 120, 350),
    "heart_rate": (80, 15, 50, 150),
    "hemoglobin": (13.5, 1.5, 8, 18),
    "creatinine": (1.0, 0.3, 0.5, 2.5),
}

for vital_name, (mu, sigma, lower, upper) in vital_specs.items():
    df[vital_name] = np.random.normal(mu, sigma, n).clip(lower, upper)

In [75]:
# ------------------------------------------------
# 3. Comorbidities
# ------------------------------------------------
# Binary flags (0/1) derived deterministically from the vitals generated above.
df["diabetes"] = df["blood_glucose"].gt(160).astype(int)
df["hypertension"] = df["systolic_bp"].gt(140).astype(int)

# Heart disease requires BOTH cholesterol above 240 and age above 55.
df["heart_disease"] = (df["cholesterol"].gt(240) & df["age"].gt(55)).astype(int)

# Smoking status is drawn independently: 50% never, 30% former, 20% current.
df["smoking_status"] = np.random.choice(
    ["Never", "Former", "Current"], size=n, p=[0.5, 0.3, 0.2]
)

In [76]:
# ------------------------------------------------
# 4. Hospital Operational Features
# ------------------------------------------------
# Condition count = the three comorbidity flags plus 0-2 additional
# conditions drawn from Binomial(2, 0.3).
df["num_conditions"] = (
    df["diabetes"]
    .add(df["hypertension"])
    .add(df["heart_disease"])
    .add(np.random.binomial(n=2, p=0.3, size=n))
)

# Poisson-distributed counts.
df["num_procedures"] = np.random.poisson(lam=1.5, size=n)
df["num_medications"] = np.random.poisson(lam=5, size=n)

# ICU flag is deterministic: emergency admission with at least two conditions.
df["icu_admission"] = np.where(
    df["admission_type"].eq("Emergency") & df["num_conditions"].ge(2), 1, 0
)

In [77]:
# ------------------------------------------------
# 5. Risk Score (Clinically Inspired)
# ------------------------------------------------
# Integer additive score: each term is a weight times a 0/1 flag or a count.
risk_score = (
    2 * df["age"].gt(60).astype(int)
    + 2 * df["systolic_bp"].gt(140).astype(int)
    + 2 * df["blood_glucose"].gt(150).astype(int)
    + 1 * df["cholesterol"].gt(220).astype(int)
    + 2 * df["creatinine"].gt(1.2).astype(int)
    + 2 * df["num_conditions"]
    + 4 * df["icu_admission"]
    + 2 * df["admission_type"].eq("Emergency").astype(int)
)

df["risk_score"] = risk_score

# Right-closed bins: Low = 0..6, Medium = 7..12, High = 13..30.
# The lower edge is -1 so that a score of 0 falls inside the first bin.
df["risk_category"] = pd.cut(
    x=df["risk_score"],
    bins=[-1, 6, 12, 30],
    labels=["Low", "Medium", "High"],
    right=True,
)

In [78]:
# Share of rows falling in each risk category.
df.risk_category.value_counts(normalize=True)

risk_category
Low       0.5986
Medium    0.2992
High      0.1022
Name: proportion, dtype: float64

In [79]:
# ------------------------------------------------
# 6. Length of Stay (Meaningful Regression Target)
# ------------------------------------------------
# Additive offset per admission type.
admission_factor = df["admission_type"].map(
    {"Elective": 0, "Urgent": 2, "Emergency": 4}
)

# Linear model plus unit-variance Gaussian noise, floored at 1.
# The terms are added in a fixed order so the floating-point result is stable.
df["length_of_stay"] = (
    df["risk_score"].mul(0.7).add(2)          # baseline of 2 + risk contribution
    .add(df["num_procedures"].mul(0.5))       # procedures
    .add(df["icu_admission"].mul(3))          # ICU stay
    .add(admission_factor)                    # admission-type offset
    .add(np.random.normal(0, 1, n))           # noise
    .clip(lower=1)                            # never below 1
)

In [80]:
# Sanity check: mean length of stay per risk category.
# `risk_category` is a categorical column (built with pd.cut), and grouping on a
# categorical without an explicit `observed` argument raises a pandas
# FutureWarning. observed=False keeps the current behaviour: every declared
# category gets a row, even one with no members.
df.groupby("risk_category", observed=False)["length_of_stay"].mean()

  df.groupby("risk_category")["length_of_stay"].mean()


risk_category
Low        6.836320
Medium    10.942772
High      19.151772
Name: length_of_stay, dtype: float64

In [81]:
# Pearson correlation between the risk score and the length-of-stay target.
df.loc[:, ["risk_score", "length_of_stay"]].corr(method="pearson")


Unnamed: 0,risk_score,length_of_stay
risk_score,1.0,0.877127
length_of_stay,0.877127,1.0


In [82]:
# Row-normalised table: share of ICU vs non-ICU rows within each risk category.
pd.crosstab(
    index=df["risk_category"],
    columns=df["icu_admission"],
    normalize="index",
)


icu_admission,0,1
risk_category,Unnamed: 1_level_1,Unnamed: 2_level_1
Low,1.0,0.0
Medium,0.919786,0.080214
High,0.18591,0.81409


In [83]:
# Conditions vs risk: mean number of conditions per risk category.
# `risk_category` is categorical (built with pd.cut); `observed` is passed
# explicitly because relying on its default raises a pandas FutureWarning.
# observed=False keeps the current behaviour (all declared categories shown).
df.groupby("risk_category", observed=False)["num_conditions"].mean()

  df.groupby("risk_category")["num_conditions"].mean()


risk_category
Low       0.732041
Medium    1.855615
High      2.575342
Name: num_conditions, dtype: float64

In [84]:
# ------------------------------------------------
# 7. Billing Information (Optional for ML / Dashboard)
# ------------------------------------------------
# Cost = 2000 per unit of length of stay + 1500 per procedure + Gaussian noise
# (std 1000), floored at 500.
df["total_claim_cost"] = (
    df["length_of_stay"].mul(2000)
    .add(df["num_procedures"].mul(1500))
    .add(np.random.normal(0, 1000, n))
    .clip(lower=500)
)

# Coverage fraction by insurance type; anything that is neither Private nor
# Government falls through to the 0.3 default.
df["payer_coverage"] = np.select(
    [df["insurance_type"] == "Private", df["insurance_type"] == "Government"],
    [0.8, 0.6],
    default=0.3,
)


In [85]:
# ------------------------------------------------
# 8. Patient Feedback (For NLP/Sentiment)
# ------------------------------------------------
# Five canned comments; each row gets one of them uniformly at random,
# independent of every other column.
feedback_options = [
    "Excellent care and professional staff.",
    "Good treatment but waiting time was long.",
    "Average experience overall.",
    "Unsatisfied with hospital service.",
    "Very poor communication and delay.",
]

df["patient_feedback"] = np.random.choice(feedback_options, size=n)

In [86]:
# ------------------------------------------------
# 9. Introduce Missing Values (~15%)
# ------------------------------------------------
# Columns that receive missing values; each is masked independently.
cols_with_missing = ["bmi", "cholesterol", "blood_glucose", "hemoglobin", "creatinine"]

for col in cols_with_missing:
    # Each row is blanked with probability 0.15.
    mask = np.random.rand(n) < 0.15
    # Keep the value where the mask is False; the rest become NaN.
    df[col] = df[col].where(~mask)

print("Dataset Created Successfully")
print(f"Shape: {df.shape}")

df.head()

Dataset Created Successfully
Shape: (5000, 26)


Unnamed: 0,age,gender,admission_type,insurance_type,bmi,systolic_bp,diastolic_bp,blood_glucose,cholesterol,heart_rate,...,num_conditions,num_procedures,num_medications,icu_admission,risk_score,risk_category,length_of_stay,total_claim_cost,payer_coverage,patient_feedback
0,69,Male,Elective,Government,24.198234,152.149985,91.246475,165.344609,210.342343,88.465785,...,3,0,3,0,12,Medium,10.54815,21709.929543,0.6,Good treatment but waiting time was long.
1,32,Female,Elective,Government,26.123209,156.112911,74.901063,138.279525,225.434062,65.557307,...,2,0,6,0,7,Medium,7.630583,15452.94621,0.6,Unsatisfied with hospital service.
2,89,Male,Elective,Government,27.658751,94.454724,73.024929,99.496088,153.370278,98.588967,...,1,1,5,0,4,Low,4.674591,9315.500867,0.6,Average experience overall.
3,78,Male,Emergency,Private,28.812254,139.520216,85.082641,70.0,,68.357545,...,0,1,4,0,4,Low,9.685285,20720.812733,0.8,Average experience overall.
4,38,Male,Emergency,Private,26.925373,103.863573,70.255702,70.0,167.883664,73.221997,...,1,0,3,0,4,Low,9.780789,18763.532501,0.8,Unsatisfied with hospital service.


In [87]:
# ------------------------------------------------
# 10. Save the dataset
# ------------------------------------------------
# Output directory, relative to the notebook's working directory. It is defined
# once so the directory creation and the CSV path cannot drift apart.
raw_dir = "../data/raw"

# Create the folder (and any missing parents) if it does not exist yet.
os.makedirs(raw_dir, exist_ok=True)

# Write the frame without the positional index column.
df.to_csv(os.path.join(raw_dir, "healthcare_dataset.csv"), index=False)

print("Dataset saved successfully in data/raw folder.")

Dataset saved successfully in data/raw folder.


In [88]:
# Mean length of stay per risk category.
# `risk_category` is categorical (built with pd.cut); `observed` is passed
# explicitly because relying on its default raises a pandas FutureWarning.
# observed=False keeps the current behaviour (all declared categories shown).
df.groupby("risk_category", observed=False)["length_of_stay"].mean()

  df.groupby("risk_category")["length_of_stay"].mean()


risk_category
Low        6.836320
Medium    10.942772
High      19.151772
Name: length_of_stay, dtype: float64

In [89]:
# ICU vs risk: within each risk category, the share of rows with and without ICU admission.
pd.crosstab(index=df.risk_category, columns=df.icu_admission, normalize="index")

icu_admission,0,1
risk_category,Unnamed: 1_level_1,Unnamed: 2_level_1
Low,1.0,0.0
Medium,0.919786,0.080214
High,0.18591,0.81409


In [90]:
# Mean systolic blood pressure for rows without (0) and with (1) the hypertension flag.
df["systolic_bp"].groupby(df["hypertension"]).mean()

hypertension
0    119.788957
1    152.933048
Name: systolic_bp, dtype: float64

In [91]:
# Mean cholesterol and age for rows without (0) and with (1) the heart-disease flag.
df[["heart_disease", "cholesterol", "age"]].groupby("heart_disease").mean()

Unnamed: 0_level_0,cholesterol,age
heart_disease,Unnamed: 1_level_1,Unnamed: 2_level_1
0,194.320202,51.907864
1,260.776452,72.795796


In [92]:
# Correlation matrix of the risk score and the length of stay.
df.filter(items=["risk_score", "length_of_stay"]).corr()

Unnamed: 0,risk_score,length_of_stay
risk_score,1.0,0.877127
length_of_stay,0.877127,1.0


In [93]:
# Class balance: fraction of rows in each risk category.
df.loc[:, "risk_category"].value_counts(normalize=True)

risk_category
Low       0.5986
Medium    0.2992
High      0.1022
Name: proportion, dtype: float64

In [94]:
# Summary statistics (count, mean, std, quartiles, min/max) of the length of stay.
df.length_of_stay.describe()

count    5000.000000
mean        9.323610
std         4.656670
min         1.000000
25%         6.184493
50%         8.532612
75%        11.130218
max        27.431176
Name: length_of_stay, dtype: float64

In [96]:
# Mean blood glucose for rows without (0) and with (1) the diabetes flag.
# NaN glucose values are skipped by mean().
df["blood_glucose"].groupby(df["diabetes"]).mean()

diabetes
0    109.716078
1    190.920916
Name: blood_glucose, dtype: float64

In [95]:
# Fraction of rows without (0) and with (1) the diabetes flag.
df.diabetes.value_counts(normalize=True)

diabetes
0    0.7186
1    0.2814
Name: proportion, dtype: float64