This notebook generates synthetic patient data using Synthea. Run it from top to bottom the first time: it creates a volume in the Unity Catalog destination that you configure, generates CSV files of mock EMR data and (optionally) TXT files of mock clinical notes, and finally loads the files in the volume into Delta tables. Generating mock EMR data and clinical notes for 100 patients takes about two minutes.

**In Cell 3:** check that JDK 17 is installed on your cluster. Synthea runs on Java Development Kit (JDK) 17, so use a cluster running DBR 16.0 or above, where JDK 17 is the default. As of June 30, 2025, Serverless compute does not include JDK 17.

In [0]:
%sh
# Print the Java version available to this cluster so it can be checked before Synthea is run.
java -version

In [0]:
# Register the notebook's text widgets. Each entry is (name, default value, label).
# The first three select the catalog, schema, and volume; the last three set the
# scope of the synthetic data that you want Synthea to generate.
for widget_name, widget_default, widget_label in [
    ("catalog_name", "", "Catalog Name"),
    ("schema_name", "", "Schema Name"),
    ("volume_name", "", "Volume Name"),
    ("num_patients", "100", "Number of Patients"),
    ("generate_notes", "false", "Generate Notes"),
    ("geographic_location", "Utah", "Geographic Location"),
]:
    dbutils.widgets.text(name=widget_name, defaultValue=widget_default, label=widget_label)

In [0]:
# Grab the widget values into notebook-level variables used by the later cells.

catalog_name = dbutils.widgets.get(name="catalog_name")
schema_name = dbutils.widgets.get(name="schema_name")
volume_name = dbutils.widgets.get(name="volume_name")

# The three destination widgets default to an empty string. Stop here with a
# clear message instead of building a path such as "/Volumes////" and failing
# later with a less obvious error.
missing_widgets = [
    label
    for label, value in (
        ("Catalog Name", catalog_name),
        ("Schema Name", schema_name),
        ("Volume Name", volume_name),
    )
    if not value.strip()
]
if missing_widgets:
    raise ValueError(
        "Set the following widget(s) before running the notebook: "
        + ", ".join(missing_widgets)
    )

# Root folder of the working volume; the trailing slash is relied on when
# file names are appended to it.
volume_path = f"/Volumes/{catalog_name}/{schema_name}/{volume_name}/"

# Widget values are strings; they are substituted as text into the Synthea
# configuration file and command line.
num_patients = dbutils.widgets.get(name="num_patients")
generate_notes = dbutils.widgets.get(name="generate_notes")
geographic_location = dbutils.widgets.get(name="geographic_location")


In [0]:
%sql
-- Create the catalog named by the catalog_name parameter if it does not exist,
-- then make it the current catalog for the statements that follow.
create catalog if not exists IDENTIFIER(:catalog_name);
use catalog IDENTIFIER(:catalog_name);

In [0]:
%sql
-- Create the schema named by the schema_name parameter if it does not exist,
-- then make it the current schema for the statements that follow.
create schema if not exists IDENTIFIER(:schema_name);
use schema IDENTIFIER(:schema_name);

In [0]:
%sql
-- Create the volume named by the volume_name parameter if it does not exist.
create volume if not exists IDENTIFIER(:volume_name);

In [0]:
# Download the most recent Synthea JAR published on the Synthea GitHub releases page
# and store it at the root of your volume

from urllib.request import urlretrieve

synthea_jar_url = "https://github.com/synthetichealth/synthea/releases/download/master-branch-latest/synthea-with-dependencies.jar"
synthea_jar_path = f"{volume_path}synthea-with-dependencies.jar"

urlretrieve(url=synthea_jar_url, filename=synthea_jar_path)

In [0]:
# Create a Synthea configuration file and write it to your volume.
# num_patients and generate_notes are substituted into the text as-is; CSV export
# is switched on, C-CDA and FHIR export are switched off, and output is written
# under ./output/ relative to the directory Synthea is run from.
config_file_text = (
f"""
generate.default_population = {num_patients}
exporter.clinical_note.export = {generate_notes}
exporter.ccda.export = false
exporter.fhir.export = false
exporter.csv.export = true
exporter.csv.folder_per_run = true
exporter.baseDirectory = ./output/
generate.append_numbers_to_person_names = false
""")

filename = f"{volume_path}synthea_config.txt"

# The with-block closes the file when it exits, so no explicit close() is needed.
with open(filename, "w") as f:
    f.write(config_file_text)

In [0]:
# Set up the Synthea JAR file to run with all of your configurations
def data_generator():
  """Run Synthea from the volume using the generated configuration file.

  Reads the notebook-level ``volume_path`` and ``geographic_location``
  variables. Synthea is started with ``volume_path`` as its working directory,
  so the JAR name and any relative output location resolve inside the volume.

  Returns:
    subprocess.CompletedProcess: the finished process with ``stdout`` and
    ``stderr`` captured as text. The exit status is not checked here; inspect
    ``returncode`` on the result.

  Raises:
    OSError: if ``java`` cannot be started or ``volume_path`` cannot be used
      as the working directory.
    ValueError: if ``geographic_location`` contains an unbalanced quote.
  """
  # Imported here so the function does not depend on another cell having
  # imported these modules first.
  import shlex
  import subprocess

  # Build an argument list instead of a shell string: widget text is never
  # interpreted by a shell, and paths containing spaces stay intact.
  # shlex.split keeps the previous word-splitting of the location value, so
  # "State City" is still passed as two arguments and a quoted "New York"
  # as one.
  synthea_args = [
    "java",
    "-jar",
    "synthea-with-dependencies.jar",
    "-c",
    f"{volume_path}synthea_config.txt",
    *shlex.split(geographic_location),
  ]
  result = subprocess.run(
    synthea_args,
    cwd=volume_path,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True,
  )
  return result

In [0]:
# Run the Synthea data generation
import subprocess
run_results = data_generator()

# The process output is captured rather than displayed, so a failed run would
# otherwise go unnoticed and later cells could pick up output from an earlier
# run. Stop here and show what Synthea reported.
if run_results.returncode != 0:
  raise RuntimeError(
    f"Synthea exited with code {run_results.returncode}.\n"
    f"stderr:\n{run_results.stderr}"
  )

In [0]:
# Source is the working volume (where Synthea was run); target is the "landing"
# volume in the same catalog and schema. Both paths end with a trailing slash.
source_volume_path = volume_path
target_volume_path = f"/Volumes/{catalog_name}/{schema_name}/landing/"

In [0]:
# Create the landing-zone volume, named "landing", in the configured catalog and
# schema if it does not already exist
spark.sql(f'CREATE VOLUME IF NOT EXISTS {catalog_name}.{schema_name}.landing')

In [0]:
# Copy the new files in /output/csv folders to landing zone.
# Each file is copied to <landing>/<file base name>/<run folder>_<file base name>.csv,
# and files whose destination already exists are skipped.

import os

# get directories and order by file name (timestamp) in ascending order (ensure correct processing order)
# NOTE: source_volume_path already ends with "/", so the listed path contains "//output/csv".
directories = dbutils.fs.ls(f"{source_volume_path}/output/csv")
directories_sorted = sorted(directories)

# for each directory, get files and copy them to landing (the source files are left in place)
for directory in directories_sorted:
  # presumably index 0 of a listing entry is its full path and index 1 its name -- confirm against the dbutils.fs.ls result type
  file_path = directory[0]
  # keep only the part of the name before the first "/"; the loop variable is rebound to that string here
  directory = directory[1].split('/')[0]
  files = spark.sql(f"LIST '{file_path}' ")
  # define file/directory to ignore
  # NOTE(review): file_exception is assigned but never read in this cell, so nothing is actually skipped because of it
  file_exception = 'data_quality_output_data_quality_output/'  
  print(f"Copying files from directory: {directory} \n source:{file_path}  \n target:{target_volume_path}")

  # get files in given directory
  for file in files.collect():
    # create a folder for the csv based off of file name
    # presumably column 0 of a LIST row is the file path and column 1 the file name -- confirm against the LIST result schema
    file_path = file[0]  # rebinds file_path from the directory path to this file's path
    file_time = file_path.split('/')[-2]  # name of the folder that contains the file
    directory_name = file[1].split('.')[0]  # file name up to its first "."
    file_name = file_time + '_' + file[1].split('.')[0]
    
    # check if file exists and copy file
    dst = f"{target_volume_path}{directory_name}/{file_name}.csv"

    if os.path.exists(dst):
      print(f'File already exists, skipping file: {file_name}.csv')
    else:
      print(f'Copying file: {file_name}.csv to target: {target_volume_path}')
      dbutils.fs.cp(f"{file_path}", dst)
  # printed once per source directory, after all of its files have been handled
  print(f'Successfully copied files to target \n target: {target_volume_path}')

In [0]:
# Load every CSV folder in the landing zone into its own Delta table

# Landing folder that holds one subfolder per CSV file name
base_path = f"/Volumes/{catalog_name}/{schema_name}/landing/"

# Keep only real subdirectories, leaving out the 'notes' folder
subdirectories = [
    entry
    for entry in os.listdir(base_path)
    if os.path.isdir(os.path.join(base_path, entry)) and entry != 'notes'
]

print(subdirectories)

# For each subfolder: read its CSV files (first row is the header) and
# overwrite the Delta table that carries the subfolder's name
for subdir in subdirectories:
    csv_reader = spark.read.format("csv").option("header", "true")
    df = csv_reader.load(os.path.join(base_path, subdir))

    delta_writer = df.write.format("delta").mode("overwrite")
    delta_writer.saveAsTable(f"{catalog_name}.{schema_name}.{subdir}")

In [0]:
# SKIP THIS CELL IF YOU CHOSE NOT TO GENERATE NOTES

# Copy note files from /output/notes into the landing zone, skipping any
# file whose destination already exists

source_path = f"/Volumes/{catalog_name}/{schema_name}/{volume_name}/output/notes"
target_path = f"/Volumes/{catalog_name}/{schema_name}/landing/notes/"

for note_entry in dbutils.fs.ls(source_path):
    note_source = note_entry.path
    # Last path segment is the file name
    file_name = note_source.rsplit('/', 1)[-1]
    dst = f"{target_path}{file_name}"

    if not os.path.exists(dst):
        print(f'Copying file: {file_name} to target: {target_path}')
        dbutils.fs.cp(note_source, dst, recurse=True)
    else:
        print(f'File already exists, skipping file: {file_name}')

print(f'Successfully copied files to target: {target_path}')

In [0]:
# SKIP THIS CELL IF YOU CHOSE NOT TO GENERATE NOTES

# Load the generated note files (txt format) into a single Delta table

# Folder that Synthea wrote the notes into
notes_path = f"/Volumes/{catalog_name}/{schema_name}/{volume_name}/output/notes/"

# Regular files only; anything else in the folder is ignored
note_files = [
    entry
    for entry in os.listdir(notes_path)
    if os.path.isfile(os.path.join(notes_path, entry))
]

# One (file name, full text) pair per note
data = []
for note_file in note_files:
    with open(os.path.join(notes_path, note_file), 'r') as note_handle:
        data.append((note_file, note_handle.read()))

# Build a two-column DataFrame and overwrite the notes table with it
notes_df = spark.createDataFrame(data, ["file_name", "file_text"])
notes_writer = notes_df.write.format("delta").mode("overwrite")
notes_writer.saveAsTable(f"{catalog_name}.{schema_name}.notes")