
# FAE-based Synthetic Patient Sampler

Steps:
1) Load and filter specialties 
2) Fit per-diagnosis distributions (sex, age, wait, LOS)
3) Sample N pseudo-patients and save

In [None]:

import pandas as pd, numpy as np, math, re
from pathlib import Path
import random

In [None]:
# --- Configuration -----------------------------------------------------------
# Seed NumPy's legacy global generator.
np.random.seed(123)

# Source table and destination for the sampled pseudo-patients.
INPUT_CSV = Path("../data/hosp-epis-stat-admi-diag-2024-25.csv")
OUTPUT_CSV = Path("../data/sampled_patients.csv")

# Specialties of interest.
SPECIALTIES_KEEP = [
    'Ophthalmology',
    'Geriatric Medicine',
    'Trauma and Orthopaedics',
    'Paediatrics',
    'Gastroenterology',
    'General Surgery',
    'General Internal Medicine',
]

In [39]:

# Load the source CSV into a DataFrame.
df = pd.read_csv(INPUT_CSV)

# Trim header names and collapse internal whitespace runs to a single space;
# non-string headers are left untouched.
df.columns = [re.sub(r"\s+", " ", c.strip()) if isinstance(c, str) else c for c in df.columns]

# Age-band columns are the ones whose name starts with the literal prefix "age_".
# A plain prefix test is used instead of the regex r"age_*": in that pattern the
# underscore is optional, so it would also accept names such as "agency", and
# re.match raises on non-string headers.
# NOTE(review): SPECIALTIES_KEEP is defined in the config cell but no visible cell
# filters on mappedSpeciality -- confirm whether rows should be filtered here.
age_cols = [c for c in df.columns if isinstance(c, str) and c.startswith("age_")]


In [40]:
def fit_lognormal(mean, median):
    """Derive log-normal parameters (mu, sigma) from a mean and a median.

    For a log-normal variable, median = exp(mu) and mean = exp(mu + sigma**2 / 2),
    hence mu = ln(median) and sigma**2 = 2 * ln(mean / median).

    Fallbacks:
      * mean or median missing, NaN, zero or negative ->
        (ln(max(mean, 1)), 0.5), where an unusable mean counts as 1.
      * mean <= median (no right skew to fit) -> (ln(median), 0.25).

    Returns:
        tuple[float, float]: (mu, sigma).
    """
    def _as_float(value):
        # Falsy inputs (None, 0, "") and NaN are all treated as "not supplied".
        if value and not pd.isna(value):
            return float(value)
        return None

    mean_f = _as_float(mean)
    median_f = _as_float(median)

    # Guard: both statistics must be present and strictly positive.
    if (not mean_f or mean_f <= 0) or (not median_f or median_f <= 0):
        base = mean_f if mean_f else 1.0
        return math.log(max(base, 1.0)), 0.5

    mu = math.log(median_f)
    if mean_f <= median_f:
        return mu, 0.25

    # Floor the variance so sigma never collapses to zero.
    variance = max(2.0 * math.log(mean_f / median_f), 1e-6)
    return mu, math.sqrt(variance)

def parse_age_bin(label: str):
    """Convert an age-band column label into integer (low, high) bounds.

    The "age_" prefix and any " (FCE)" suffix are removed first. Three label
    shapes are handled:
      * "<n>+"     -> (n, n + 11)   open-ended top band
      * "<a>to<b>" -> (a, b + 1)    closed band; upper bound is one past b
      * "<n>"      -> (n, n + 1)    single year of age
    """
    text = label.strip().replace("age_", "").replace(" (FCE)", "")

    if "+" in text:
        start = int(text.replace("+", ""))
        return start, start + 11

    if "to" in text:
        first, last = text.split("to")
        return int(first), int(last) + 1

    single = int(text)
    return single, single + 1

# Translate every age-band column header into its (low, high) integer bounds,
# in the same order as age_cols.
age_bins = list(map(parse_age_bin, age_cols))
print("Age bins:", age_bins)


Age bins: [(0, 20), (20, 25), (25, 30), (30, 35), (35, 40), (40, 45), (45, 50), (50, 55), (55, 60), (60, 65), (65, 70), (70, 75), (75, 80), (80, 85), (85, 90), (90, 101)]


In [41]:
# Inspection only: show the column names after header normalisation.
df.columns

Index(['priamryDiagCode', 'primaryDiagDescription', 'mappedSpeciality',
       'finishedConsultantEpisodes', 'finishedAdmissionEpisodes', 'male',
       'female', 'genderUnknown', 'emergency', 'waitingList', 'planned',
       'other', 'meanTimeWaited', 'medianTimeWaited', 'meanLengthOfStay',
       'medianLengthOfStay', 'meanAge', 'age_0to19', 'age_20to24',
       'age_25to29', 'age_30to34', 'age_35to39', 'age_40to44', 'age_45to49',
       'age_50to54', 'age_55to59', 'age_60to64', 'age_65to69', 'age_70to74',
       'age_75to79', 'age_80to84', 'age_85to89', 'age_90+', 'dayCase',
       'FCEBedDays'],
      dtype='object')

In [42]:
# Inspection only: show the values of the `male` column.
df["male"]

0       94452
1        4636
2         294
3      138332
4        1514
        ...  
210    185505
211     61985
212       489
213      9186
214      8206
Name: male, Length: 215, dtype: int64

In [48]:
def _as_count(value) -> float:
    """Return a count cell as a float, mapping None/NaN to 0.0.

    `float(value or 0.0)` is not enough: NaN is truthy, so it would pass
    through unchanged and defeat the `<= 0` guard below.
    """
    if value is None or pd.isna(value):
        return 0.0
    return float(value)


# Build one parameter record per input row: sampling weight, sex split,
# age-band probabilities and log-normal parameters for wait and length of stay.
param_rows = []
for _, r in df.iterrows():
    total_fae = _as_count(r["finishedAdmissionEpisodes"])
    if total_fae <= 0:
        # Rows without admissions carry no sampling weight.
        continue

    male = _as_count(r["male"])
    female = _as_count(r["female"])
    gender_total = male + female  # ignore unknowns
    p_male = (male / gender_total) if gender_total > 0 else 0.5

    if age_cols:
        # Convert to float explicitly (the row Series may be object-typed) and
        # count a missing band as zero instead of discarding the whole distribution.
        ages = r[age_cols].astype(float).fillna(0.0).to_numpy()
        age_total = ages.sum()
        age_probs = (ages / age_total).tolist() if age_total > 0 else None
    else:
        age_probs = None

    mu_w, sig_w = fit_lognormal(r["meanTimeWaited"], r["medianTimeWaited"])
    mu_l, sig_l = fit_lognormal(r["meanLengthOfStay"], r["medianLengthOfStay"])

    param_rows.append(dict(
        # "priamryDiagCode" is the spelling used by the source column header.
        code=r.get("priamryDiagCode", ""),
        desc=r.get("primaryDiagDescription", ""),
        speciality=r["mappedSpeciality"],
        total_fae=total_fae,
        p_male=p_male,
        age_probs=age_probs,
        mu_wait=mu_w, sigma_wait=sig_w,
        mu_los=mu_l,  sigma_los=sig_l
    ))

params = pd.DataFrame(param_rows)
print("Param rows:", len(params))
params.head(3)

Param rows: 215


Unnamed: 0,code,desc,speciality,total_fae,p_male,age_probs,mu_wait,sigma_wait,mu_los,sigma_los
0,A00-A09,Intestinal infectious diseases ...,General Internal Medicine,150546.0,0.432712,"[0.1923027387052557, 0.035211267605633804, 0.0...",2.995732,1.084239,0.0,1.665109
1,A15-A19,Tuberculosis ...,General Internal Medicine,2971.0,0.671981,"[0.0773673810609515, 0.11840705274357805, 0.13...",1.609438,1.435005,2.302585,1.133008
2,A20-A28,Certain zoonotic bacterial diseases,General Internal Medicine,245.0,0.672769,"[0.10514018691588785, 0.030373831775700934, 0....",1.609438,1.771223,1.609438,0.96954


In [76]:
# Seed BOTH random sources used by the sampling loop. The loop draws from the
# NumPy generator `rng` and from the stdlib `random` module; seeding only the
# former leaves the output non-reproducible. Seeding here (rather than only at
# the top of the notebook) also makes re-running this cell idempotent.
rng = np.random.default_rng(123)
random.seed(123)
N = 1000

# Sampling weights: each parameter row is drawn in proportion to its total_fae.
weights = params["total_fae"].to_numpy(dtype=float)
weight_total = weights.sum()
if not weight_total > 0:
    # Also catches NaN; fail here with a clear message instead of passing
    # NaN probabilities to rng.choice later.
    raise ValueError("params['total_fae'] has no positive total; cannot build sampling weights")
weights = weights / weight_total

def sample_age(age_probs, age_bins):
    """Draw one integer age using the module-level generator `rng`.

    A band is picked according to `age_probs` when it lines up one-to-one with
    `age_bins`; otherwise a band is picked uniformly. The age is then drawn
    uniformly from [low, high) of that band. With no bands at all the range
    (50, 80) is used.
    """
    n_bins = len(age_bins)

    if age_probs is not None and len(age_probs) == n_bins:
        bin_index = rng.choice(n_bins, p=np.array(age_probs))
    elif n_bins:
        bin_index = rng.integers(0, n_bins)
    else:
        bin_index = 0

    low, high = age_bins[bin_index] if n_bins else (50, 80)
    return int(rng.integers(low, high))

def _pick_vitals():
    """Return a vitals trend: 70% Improving, 20% Stable, 10% Deteriorating.

    One uniform draw is compared against the cumulative thresholds. Drawing a
    fresh number for the second threshold would give 70% / 27% / 3% instead.
    """
    u = random.random()
    if u < 0.7:
        return "Improving"
    if u < 0.9:
        return "Stable"
    return "Deteriorating"


def _pick_next_action(time_admission, time_discharge, los):
    """Choose the next action from where the patient is in their stay.

    Early in a long stay -> Assessment; close to the end -> Discharge;
    otherwise one draw splits roughly into thirds between
    Diagnostics/Investigation, Treatment and Review.
    """
    if time_admission < 1.5 and los > 3:
        return "Assessment"
    if time_discharge < 0.3 * los:
        return "Discharge"
    u = random.random()  # single draw against cumulative thresholds
    if u < 0.33:
        return "Diagnostics/Investigation"
    if u < 0.66:
        return "Treatment"
    return "Review"


def _pick_blocker(next_action, discharge_dependence):
    """Choose a blocker for the given next action.

    A highly dependent patient awaiting discharge is always blocked on social
    care. Otherwise the action-specific blocker applies with probability 0.3,
    then a generic staffing blocker with probability 0.3, else no blocker.
    """
    if next_action == "Discharge" and discharge_dependence == "High":
        return "Awaiting Social Care"
    action_blocker = {
        "Review": "Awaiting Test Results",
        "Diagnostics/Investigation": "Radiology Capacity",
        "Assessment": "Bed Availability",
        "Treatment": "Theatre slot availability",
    }.get(next_action)
    if action_blocker is not None and random.random() < 0.3:
        return action_blocker
    if random.random() < 0.3:
        return "Staff Availability"
    return "No Blocker"


# Draw N pseudo-patients: pick a parameter row by weight, then sample the
# demographic and stay attributes from that row's fitted parameters.
rows = []
for i in range(N):
    d = params.iloc[rng.choice(len(params), p=weights)]
    sex = "M" if rng.random() < float(d.p_male) else "F"
    age = sample_age(d.age_probs if isinstance(d.age_probs, list) else None, age_bins)
    complexity = random.uniform(0.5, 1.5)
    acuity = random.randint(1, 5)
    code = str(d.code)

    # Log-normal draws: exp(mu + sigma * z) with z ~ N(0, 1).
    wait = float(np.exp(float(d.mu_wait) + float(d.sigma_wait) * rng.standard_normal()))
    los = float(np.exp(float(d.mu_los) + float(d.sigma_los) * rng.standard_normal()))

    # Complex cases stay 10% longer, simple cases 10% shorter.
    if complexity > 1.2:
        los = los * 1.1
    elif complexity < 0.8:
        los = los * 0.9

    # Place the patient somewhere in the first 90% of their stay.
    timeAdmission = round(0.9 * random.random() * los)
    timeDischarge = los - timeAdmission
    dischargeDependence = "High" if random.random() < 0.3 else "Low"

    vitals = _pick_vitals()
    nextAction = _pick_next_action(timeAdmission, timeDischarge, los)
    blocker = _pick_blocker(nextAction, dischargeDependence)

    # Derive the reported durations from one rounded LOS so that
    # "Time since Admission" + "Time to Discharge" always equals "Length of Stay";
    # rounding the three values independently can leave them off by a day.
    los_days = max(1, int(round(los)))
    admitted_days = min(max(0, int(timeAdmission)), los_days)
    discharge_days = los_days - admitted_days

    rows.append({
        "pseudo_patient_id": f"SP{str(i+1).zfill(4)}",
        "age": age,
        "sex": sex,
        "Complexity": round(complexity, 2),
        "Acuity": acuity,
        "Primary Diagnosis Summary": code,
        "Speciality": d.speciality,
        "Vitals Trend": vitals,
        "Waiting Time (days)": max(0, int(round(wait))),
        "Length of Stay (days)": los_days,
        "Time since Admission (days)": admitted_days,
        "Time to Discharge (days)": discharge_days,
        "nextAction": nextAction,
        "blocker": blocker,
        "Discharge Dependence": dischargeDependence
    })



In [77]:
# Collect the sampled records into a DataFrame and write them to OUTPUT_CSV.
sampled = pd.DataFrame(rows)
OUTPUT_CSV.parent.mkdir(parents=True, exist_ok=True)  # make sure the target folder exists
sampled.to_csv(OUTPUT_CSV, index=False)
# Single braces so the path is interpolated; doubled braces are an escape and
# would print the placeholder text literally.
print(f"Saved to {OUTPUT_CSV.resolve()}")
sampled.head(100)

Saved to {OUTPUT_CSV.resolve()}


Unnamed: 0,pseudo_patient_id,age,sex,Complexity,Acuity,Primary Diagnosis Summary,Speciality,Vitals Trend,Waiting Time (days),Length of Stay (days),Time since Admission (days),Time to Discharge (days),nextAction,blocker,Discharge Dependence
0,SP0001,31,F,1.28,5,N80-N98,General Internal Medicine,Improving,164,1,0,1,Review,No Blocker,High
1,SP0002,80,M,1.22,3,S80-S89,Trauma and Orthopaedics,Improving,6,3,2,1,Discharge,No Blocker,Low
2,SP0003,67,F,0.67,3,H25-H28,Ophthalmology,Improving,41,1,0,1,Diagnostics/Investigation,No Blocker,High
3,SP0004,63,F,0.72,1,G60-G64,General Internal Medicine,Improving,1,15,5,10,Diagnostics/Investigation,Staff Availability,Low
4,SP0005,41,F,1.11,3,K55-K64,Gastronenterology,Improving,120,9,4,5,Diagnostics/Investigation,No Blocker,High
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,SP0096,66,F,1.40,3,I60-I69,Geriatric Medicine,Improving,7,5,2,3,Treatment,No Blocker,Low
96,SP0097,26,F,1.14,1,O30-O48,General Internal Medicine,Stable,8,1,0,1,Treatment,No Blocker,Low
97,SP0098,48,F,1.30,2,N10-N16,General Internal Medicine,Stable,56,2,0,2,Treatment,No Blocker,Low
98,SP0099,67,F,1.44,5,R00-R09,General Internal Medicine,Improving,43,2,0,2,Diagnostics/Investigation,Radiology Capacity,Low
