# Inspect the Synthea dataset

In [2]:
from pathlib import Path
import pandas as pd
from dotenv import load_dotenv
import json
# Load variables from the project's .env file and stop early if nothing was loaded.
# The call is deliberately kept outside an `assert`: assert statements are stripped
# when Python runs with -O, which would silently skip loading the environment.
if not load_dotenv():
    raise RuntimeError("load_dotenv() did not load any environment variables; is a .env file present?")



Load Synthea Data

In [8]:
# Load every Synthea CSV export into a dict of DataFrames keyed by table name
# (the file name without its .csv suffix).
data_dir = Path("data/synthea")
dataframes = {}

# Path.glob yields files in an arbitrary, filesystem-dependent order; sorting keeps
# the table order (and everything written/printed below) reproducible across machines.
for csv_file in sorted(data_dir.glob("*.csv")):
    key = csv_file.stem  # filename without .csv
    # NOTE(review): dtypes are inferred by pandas, so code-like columns can lose
    # formatting (the patients sample output shows ZIP as 2638.0); pass dtype= here
    # if the original text is needed.
    df = pd.read_csv(csv_file, header=0)
    dataframes[key] = df

# Fail here with a clear message rather than with a KeyError in a later cell.
if not dataframes:
    raise FileNotFoundError(f"no CSV files found in {data_dir.resolve()}")

# Write the first row of each non-empty table as a human-readable JSON sample
# to data/synthea/sample_structure.md.
with open(data_dir / "sample_structure.md", "w", encoding="utf-8") as f:
    for key, df in dataframes.items():
        if df.empty:
            continue
        # Missing values are NaN, which json.dumps would emit as the bare token
        # `NaN` (not valid JSON); map them to None so they are written as null.
        sample_row = {
            column: (None if pd.isna(value) else value)
            for column, value in df.iloc[0].to_dict().items()
        }
        f.write(f"## {key}\n\n")
        f.write(f"```json\n{json.dumps(sample_row, indent=2, ensure_ascii=False)}\n```")
        f.write("\n\n")


# Overview: one line per table with its alphabetically sorted column names.
for key, df in dataframes.items():
    print(f"{key}:\t{sorted(df.columns.tolist())}")


allergies:	['CODE', 'DESCRIPTION', 'ENCOUNTER', 'PATIENT', 'START', 'STOP']
careplans:	['CODE', 'DESCRIPTION', 'ENCOUNTER', 'Id', 'PATIENT', 'REASONCODE', 'REASONDESCRIPTION', 'START', 'STOP']
conditions:	['CODE', 'DESCRIPTION', 'ENCOUNTER', 'PATIENT', 'START', 'STOP']
devices:	['CODE', 'DESCRIPTION', 'ENCOUNTER', 'PATIENT', 'START', 'STOP', 'UDI']
encounters:	['BASE_ENCOUNTER_COST', 'CODE', 'DESCRIPTION', 'ENCOUNTERCLASS', 'Id', 'ORGANIZATION', 'PATIENT', 'PAYER', 'PAYER_COVERAGE', 'PROVIDER', 'REASONCODE', 'REASONDESCRIPTION', 'START', 'STOP', 'TOTAL_CLAIM_COST']
imaging_studies:	['BODYSITE_CODE', 'BODYSITE_DESCRIPTION', 'DATE', 'ENCOUNTER', 'Id', 'MODALITY_CODE', 'MODALITY_DESCRIPTION', 'PATIENT', 'SOP_CODE', 'SOP_DESCRIPTION']
immunizations:	['BASE_COST', 'CODE', 'DATE', 'DESCRIPTION', 'ENCOUNTER', 'PATIENT']
medications:	['BASE_COST', 'CODE', 'DESCRIPTION', 'DISPENSES', 'ENCOUNTER', 'PATIENT', 'PAYER', 'PAYER_COVERAGE', 'REASONCODE', 'REASONDESCRIPTION', 'START', 'STOP', 'TOTALCOS

In [5]:
# Compare the patient IDs referenced by the allergies table with the IDs in the
# patients table: containment in both directions, then the ratio of distinct
# allergy patient IDs to distinct patient IDs as a percentage.
all_patient_ids = set(dataframes["patients"]["Id"])
allergy_patient_ids = set(dataframes["allergies"]["PATIENT"])

# `a <= b` on sets is the subset test.
allergy_ids_are_known = allergy_patient_ids <= all_patient_ids
print(f"all allergy patient ids in patients table?: {allergy_ids_are_known}")

every_patient_has_allergy = all_patient_ids <= allergy_patient_ids
print(f"all patient ids in allergies table?: {every_patient_has_allergy}")

allergy_share_pct = len(allergy_patient_ids) / len(all_patient_ids) * 100
print(f"Percentage of patients that have an allergy?: {allergy_share_pct:.2f}%")


all allergy patient ids in patients table?: True
all patient ids in allergies table?: False
Percentage of patients that have an allergy?: 12.04%


In [6]:
# Which tables reference a patient through a PATIENT column?
patient_id_tables = [
    table_name
    for table_name, table in dataframes.items()
    if "PATIENT" in table.columns
]
print(f"Tables with patient IDs: {patient_id_tables}")

# Of those, which lack an ENCOUNTER column? The encounters table itself shows up
# here because it identifies each encounter through its own `Id` column.
tables_without_encounter = [
    table_name
    for table_name in patient_id_tables
    if "ENCOUNTER" not in dataframes[table_name].columns
]
print(f"tables with patient id and no encounter id: {tables_without_encounter}")

# Does any patient appear in more than one encounter row? True when there are
# more rows than distinct patient IDs.
encounter_patient_ids = dataframes["encounters"]["PATIENT"]
encounter_rows = len(encounter_patient_ids)
distinct_encounter_patients = len(set(encounter_patient_ids))
print(f"are there patients that are associated to multiple encounters?: {encounter_rows > distinct_encounter_patients}")

Tables with patient IDs: ['allergies', 'careplans', 'conditions', 'devices', 'encounters', 'imaging_studies', 'immunizations', 'medications', 'observations', 'payer_transitions', 'procedures', 'supplies']
tables with patient id and no encounter id: ['encounters', 'payer_transitions']
are there patients that are associated to multiple encounters?: True


In [8]:
# Show one randomly chosen patient record as an example of the table's contents.
patients = dataframes["patients"]
# A fixed seed makes the draw repeatable, so Restart & Run All shows the same
# patient every time instead of a different row on each execution.
patient = patients.sample(n=1, random_state=42)
print(patient.to_dict())

{'Id': {659: '40c83ae5-ba34-483e-b348-b558b44179ca'}, 'BIRTHDATE': {659: '1925-04-22'}, 'DEATHDATE': {659: '2003-10-27'}, 'SSN': {659: '999-22-4520'}, 'DRIVERS': {659: 'S99931941'}, 'PASSPORT': {659: 'X24586524X'}, 'PREFIX': {659: 'Mr.'}, 'FIRST': {659: 'Mickey576'}, 'LAST': {659: 'Maggio310'}, 'SUFFIX': {659: nan}, 'MAIDEN': {659: nan}, 'MARITAL': {659: 'M'}, 'RACE': {659: 'black'}, 'ETHNICITY': {659: 'nonhispanic'}, 'GENDER': {659: 'M'}, 'BIRTHPLACE': {659: 'Boston  Massachusetts  US'}, 'ADDRESS': {659: '304 Gorczany Mall Unit 40'}, 'CITY': {659: 'Dennis'}, 'STATE': {659: 'Massachusetts'}, 'COUNTY': {659: 'Barnstable County'}, 'ZIP': {659: 2638.0}, 'LAT': {659: 41.70711038322165}, 'LON': {659: -70.21703859774641}, 'HEALTHCARE_EXPENSES': {659: 1717582.61}, 'HEALTHCARE_COVERAGE': {659: 18747.24}}
