In [44]:
import json
from pathlib import Path
from subprocess import run
from urllib.request import urlretrieve

In [47]:
here = Path.cwd().resolve()

synthea_jar = here / "synthea.jar"
synthea_url = "https://github.com/synthetichealth/synthea/releases/download/v3.3.0/synthea-with-dependencies.jar"
java_image = "eclipse-temurin:24-jre"

if not synthea_jar.exists():
    print(f"Downloading {synthea_jar} from {synthea_url}")
    urlretrieve(synthea_url, filename=synthea_jar)

Downloading /Users/minrk/dev/jpy/health/test-data/synthea.jar from https://github.com/synthetichealth/synthea/releases/download/v3.3.0/synthea-with-dependencies.jar


In [64]:
import shutil
from subprocess import run, STDOUT

output_dir = here / "output"
if output_dir.exists():
    shutil.rmtree(output_dir)
    

n_patients = 100
seed = "1"
age = "60-85"

run([
    "docker",
    "run",
    "--rm",
    "-i",
    f"-v{here}:/io",
    "-w/io",
    java_image,
    "java",
    "-jar",
    synthea_jar.name,
    "-p",
    str(n_patients),
    "-s",
    str(seed),
    "-cs",
    str(seed),
    "-a",
    age,
], check=True, stderr=STDOUT)
    
    

org.mitre.synthea.X12.ExporterAdaptor
org.mitre.synthea.X12.ExporterAdaptor
SLF4J: No SLF4J providers were found.
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#noProviders for further details.
Scanned 90 modules and 444 submodules.
Loading submodule modules/allergies/allergy_panel.json
Loading submodule modules/allergies/drug_allergy_incidence.json
Loading submodule modules/allergies/environmental_allergy_incidence.json
Loading submodule modules/allergies/food_allergy_incidence.json
Loading submodule modules/allergies/immunotherapy.json
Loading submodule modules/allergies/outgrow_env_allergies.json
Loading submodule modules/allergies/outgrow_food_allergies.json
Loading submodule modules/allergies/severe_allergic_reaction.json
Loading submodule modules/anemia/anemia_sub.json
Loading submodule modules/breast_cancer/chemotherapy_breast.json
Loading submodule modules/breast_cancer/hormone_diagnosis.json
Loading submodule modules/br

CompletedProcess(args=['docker', 'run', '--rm', '-i', '-v/Users/minrk/dev/jpy/health/test-data:/io', '-w/io', 'eclipse-temurin:24-jre', 'java', '-jar', 'synthea.jar', '-p', '100', '-s', '1', '-cs', '1', '-a', '60-85'], returncode=0)

In [65]:
def has_code(entry, codes, system="http://loinc.org"):
    if entry['resourceType'] != 'Observation':
        return False
    if isinstance(codes, str):
        codes = [codes]
    if 'code' in entry:
        for coding in entry['code']['coding']:
            if coding['system'] == system and coding['code'] in codes:
                return True
    if 'component' in entry:
        for component in entry['component']:
            for coding in entry['code']['coding']:
                if coding['system'] == system and coding['code'] in codes:
                    return True
    return False

diastolic_code = '8462-4'
systolic_code = '8480-6'
glucose_code = '2339-0'

keep_codes = {
    diastolic_code, systolic_code, glucose_code,
}
def keep(entry):
    if entry['resource']['resourceType'] == 'Patient':
        return True
    if entry['resource']['resourceType'] == 'Observation' and has_code(entry['resource'], codes=keep_codes):
        return True
    return False


In [73]:
def thin_record(original_path):
    with original_path.open() as f:
        record = json.load(f)

    entries = record['entry']
    new_entries = list(filter(keep, entries))
    print(f"{original_path.name}: reduced {len(entries)} to {len(new_entries)}")
    if len(new_entries) > 50:
        record['entry'] = new_entries
        with (fhir_dir / original_path.name).open("w") as f:
            json.dump(record, f, sort_keys=True)
    else:
        print("Excluding patient without useful data")
    

In [74]:
fhir_dir = here / "fhir"
if fhir_dir.exists():
    shutil.rmtree(fhir_dir)
fhir_dir.mkdir(exist_ok=True)

for original in output_dir.glob("fhir/*.json"):
    thin_record(original)

Katerine813_Cara845_Nicolas769_8c4bfdab-605b-50a1-1c18-4d986719e66d.json: reduced 1016 to 1
Excluding patient without useful data
Kent912_Rempel203_865986ec-5784-4ff4-b59d-a9c02c5c845d.json: reduced 1617 to 11
Excluding patient without useful data
Misha8_Towne435_e441ceed-80e6-d35d-bdf2-e82247f10dae.json: reduced 4069 to 26
Excluding patient without useful data
Jayson808_Towne435_44c95613-f20c-dd4c-9125-05c82a9a2c09.json: reduced 1156 to 22
Excluding patient without useful data
Lashaun800_Charlene806_Weissnat378_0c4a29fe-bc92-ee21-4fec-b88c8474f9c2.json: reduced 1302 to 25
Excluding patient without useful data
Matilda472_Argelia29_Dietrich576_6c38ffc3-f070-b74f-81c9-a90f1c8ec3c2.json: reduced 2903 to 32
Excluding patient without useful data
Kathe603_Coral377_Kassulke119_d8f2b023-9078-cb75-45a7-3cf375588d3b.json: reduced 888 to 13
Excluding patient without useful data
Martina386_Selena146_Schumm995_944f8616-fb51-260f-77b9-0e0003d3d326.json: reduced 510 to 1
Excluding patient without use

In [75]:
!du -hs fhir/*

148K	fhir/Amalia471_Enríquez603_44c7c8a3-85fb-4736-4bd9-8a5640b5bbf8.json
 56K	fhir/Anibal473_Glover433_fcc90947-2e5f-e63a-0815-d22d499742db.json
 56K	fhir/Antione404_Gerhold939_f86ceb29-43dd-03c5-8857-0f147d160b79.json
112K	fhir/Bart73_Predovic534_184cf049-bb4e-91c1-0a44-41c9512eee0c.json
 72K	fhir/Berniece493_Minnie888_Pfeffer420_dc159f85-20e0-4573-dbad-c6cb001c7b56.json
168K	fhir/Carter549_Haag279_6252ef78-e442-3081-f63b-36435c505a7f.json
116K	fhir/Clint766_Ward668_8224be4b-6b94-3b95-9af4-3899490d2db8.json
 76K	fhir/Dalton260_Tromp100_bcb06b48-9557-166b-3335-f067a4abd99c.json
 80K	fhir/Dirk334_Botsford977_d95870fb-f087-6e0d-8b5a-0740b92ae585.json
128K	fhir/Doyle959_Schumm995_c7adee05-ed06-33af-f1f8-6ea07572ba8b.json
124K	fhir/Earlene410_Latosha740_Schumm995_e305a7bb-b72d-bdb2-860b-a0fceebf7116.json
 84K	fhir/Elane105_Caroyln232_Huels583_699ce5c6-eddf-8f6a-6b48-3ec9a2ec40ec.json
128K	fhir/Evonne919_Fahey393_f09be4f7-eb59-e9b5-2ecc-0c713c4b103a.json
180K	fhir/Foster87_Stamm704_d4039c3