# 01 – Generate 1 000 synthetic patients with Synthea
Run this notebook **only if** you want to re-create the raw data.

In [1]:
# check Java\
!java -version


openjdk version "21.0.8" 2025-07-15
OpenJDK Runtime Environment (build 21.0.8+9-Ubuntu-0ubuntu124.04.1)
OpenJDK 64-Bit Server VM (build 21.0.8+9-Ubuntu-0ubuntu124.04.1, mixed mode, sharing)


In [2]:
import os, urllib.request, pathlib

# absolute path **outside** the repo
jar_path = pathlib.Path.home() / "synthea_work" / "synthea.jar"
jar_path.parent.mkdir(exist_ok=True)

if not jar_path.exists():
    url = ("https://github.com/synthetichealth/synthea/releases/download/"
           "v3.2.0/synthea-with-dependencies.jar")
    urllib.request.urlretrieve(url, jar_path)

print("Synthea jar:", jar_path)

Synthea jar: /home/luka/synthea_work/synthea.jar


In [3]:
config = """\
{
  "exporter": {
    "csv.export": true,
    "csv.folder": "./synthea_csv",
    "years_of_history": 4
  },
  "generate": {
    "population": 1000,
    "alive_only": true,
    "min_age": 18,
    "max_age": 85
  },
  "geography": {
    "default_country": "Germany"
  }
}
"""

# save config next to the jar
config_path = jar_path.with_name("synthea_config.json")
config_path.write_text(config)
print("Config written to", config_path)

Config written to /home/luka/synthea_work/synthea_config.json


In [4]:
# run inside the jar folder so outputs land beside it
%cd {jar_path.parent}
!java -Xms2G -Xmx4G -jar {jar_path.name} -p 1000 -c {config_path.name}

/home/luka/synthea_work
SLF4J: No SLF4J providers were found.
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#noProviders for further details.
Scanned 84 modules and 140 submodules.
Loading submodule modules/allergies/allergy_panel.json
Loading submodule modules/allergies/drug_allergy_incidence.json
Loading submodule modules/allergies/environmental_allergy_incidence.json
Loading submodule modules/allergies/food_allergy_incidence.json
Loading submodule modules/allergies/immunotherapy.json
Loading submodule modules/allergies/outgrow_env_allergies.json
Loading submodule modules/allergies/outgrow_food_allergies.json
Loading submodule modules/allergies/severe_allergic_reaction.json
Loading submodule modules/anemia/anemia_sub.json
Loading submodule modules/breast_cancer/chemotherapy_breast.json
Loading submodule modules/breast_cancer/hormone_diagnosis.json
Loading submodule modules/breast_cancer/hormonetherapy_breast.json
Loading submo

In [1]:
import shutil, datetime, pathlib, os

# 1. notebook directory → repo raw folder
notebook_dir = pathlib.Path.cwd()              # …/etl_omop_fhir/notebooks
repo_raw = notebook_dir.parent / "data/raw"    # …/etl_omop_fhir/data/raw
repo_raw.mkdir(exist_ok=True)

# 2. source folder outside repo
csv_src = pathlib.Path.home() / "synthea_work/output_csv/csv"

# 3. copy
for fname in ("patients.csv", "conditions.csv", "medications.csv"):
    src = csv_src / fname
    dst = repo_raw / fname
    if src.exists():
        shutil.copy(src, dst)
        print("copied", src.name)
    else:
        print("⚠️  missing", src)

# 4. provenance
(repo_raw / "timestamp.txt").write_text(
    datetime.datetime.utcnow().replace(microsecond=0).isoformat() + "Z"
)

print("✅ artefacts ready in", repo_raw.resolve())

copied patients.csv
copied conditions.csv
copied medications.csv
✅ artefacts ready in /home/luka/Documents/dev/federated-learning/federated-learning-mini-project/etl_omop_fhir/data/raw


  datetime.datetime.utcnow().replace(microsecond=0).isoformat() + "Z"


In [3]:
import pandas as pd
pat = pd.read_csv(repo_raw / "patients.csv")
print(f"Rows: {len(pat)}, Columns: {pat.shape[1]}")
pat.head(2)

Rows: 1, Columns: 27


Unnamed: 0,Id,BIRTHDATE,DEATHDATE,SSN,DRIVERS,PASSPORT,PREFIX,FIRST,LAST,SUFFIX,...,CITY,STATE,COUNTY,FIPS,ZIP,LAT,LON,HEALTHCARE_EXPENSES,HEALTHCARE_COVERAGE,INCOME
0,534e7acb-4e82-365a-a7d9-ad4fdbe47c97,1977-06-07,,999-55-6733,S99920096,X20234169X,Mr.,Hollis7,Ebert178,,...,Melrose,Massachusetts,Middlesex County,25017,2176,42.448182,-71.040432,72678.23,5740.79,136774
