# Inspect the Synthea dataset

In [None]:
from pathlib import Path
import pandas as pd
from dotenv import load_dotenv
import json
# Stop the notebook right away when load_dotenv() returns a falsy result,
# instead of continuing without the expected environment configuration.
assert load_dotenv()

Load Synthea Data

In [None]:
# Load every CSV file found directly in data_dir into a DataFrame, keyed by
# the file name without its ".csv" suffix.
data_dir = Path("data/synthea")
dataframes = {}

# glob() yields paths in filesystem order, which is not guaranteed to be
# stable across runs or machines. Sorting keeps the generated markdown file
# and the printed listing below reproducible.
for csv_file in sorted(data_dir.glob("*.csv")):
    key = csv_file.stem  # filename without .csv
    df = pd.read_csv(csv_file, header=0)
    dataframes[key] = df

# Write the first row of each non-empty table as pretty-printed JSON, one
# markdown section per table, next to the CSV files. The path is derived from
# data_dir so the input and output locations cannot drift apart.
# NOTE: json.dumps serializes missing values as NaN by default, which is fine
# for a human-readable sample but is not strict JSON.
with open(data_dir / "sample_structure.md", "w", encoding="utf-8") as f:
    for key, df in dataframes.items():
        if df.empty:
            # No first row to show for this table.
            continue
        first_row = df.iloc[0].to_dict()
        f.write(f"## {key}\n\n")
        f.write(f"```json\n{json.dumps(first_row, indent=2, ensure_ascii=False)}\n```")
        f.write("\n\n")


# Print the alphabetically sorted column names of every loaded table.
for key, df in dataframes.items():
    print(f"{key}:\t{sorted(df.columns.tolist())}")


In [None]:
# Cross-check patient ids between the patients table ("Id" column) and the
# allergies table ("PATIENT" column), in both directions.
allergy_patient_ids = set(dataframes["allergies"]["PATIENT"])
all_patient_ids = set(dataframes["patients"]["Id"])

# Is every id referenced by an allergy row present in the patients table?
allergy_ids_are_known = allergy_patient_ids <= all_patient_ids
print(f"all allergy patient ids in patients table?: {allergy_ids_are_known}")

# Does every id from the patients table appear in the allergies table?
every_patient_listed = all_patient_ids <= allergy_patient_ids
print(f"all patient ids in allergies table?: {every_patient_listed}")

# Ratio of distinct allergy patient ids to distinct patient ids, in percent.
allergy_share = len(allergy_patient_ids) / len(all_patient_ids) * 100
print(f"Percentage of patients that have an allergy?: {allergy_share:.2f}%")


In [None]:
# Collect the tables that carry a PATIENT column.
patient_id_tables = [name for name in dataframes if "PATIENT" in dataframes[name].columns]
print(f"Tables with patient IDs: {patient_id_tables}")

# Of those, keep the ones that have no ENCOUNTER column.
patient_only_tables = [
    name for name in patient_id_tables if "ENCOUNTER" not in dataframes[name].columns
]
print(f"tables with patient id and no encounter id: {patient_only_tables}")

# Fewer distinct PATIENT values than rows in the encounters table means that
# at least one patient id is repeated across encounters.
encounter_patient_ids = dataframes["encounters"]["PATIENT"]
has_repeated_patient = len(set(encounter_patient_ids)) < len(encounter_patient_ids)
print(f"are there patients that are associated to multiple encounters?: {has_repeated_patient}")

In [None]:
# Draw one random row from the patients table and print it as a nested
# {column: {row index: value}} dictionary.
patients = dataframes["patients"]
patient = patients.sample(n=1)
print(patient.to_dict(orient="dict"))

In [None]:
# Print the first SAMPLE_ROWS values of every column in the encounters table.
# The values come from head(), i.e. the leading rows of the table in file
# order, not a random sample.
SAMPLE_ROWS = 10  # the previous comment said 5, but the code has always shown 10
encounter_df = dataframes["encounters"]

print("Encounters Table - Sample Values:\n")
print("=" * 80)

for col in encounter_df.columns:
    print(f"\n{col}:")
    samples = encounter_df[col].head(SAMPLE_ROWS).tolist()
    # Number the values starting at 1 for readability.
    for i, value in enumerate(samples, 1):
        print(f"  {i}. {value}")


In [None]:
# List every loaded table that has a column named "ENCOUNTER".
tables_with_encounter = [
    table for table in dataframes if "ENCOUNTER" in dataframes[table].columns
]
print(tables_with_encounter)