In [4]:
import requests
import zipfile
import os
import io
import pandas as pd

# 1. Download the file
url = "https://synthetichealth.github.io/synthea-sample-data/downloads/latest/synthea_sample_data_csv_latest.zip"
response = requests.get(url)
response.raise_for_status()  # Check if the request was successful

# 2. Extract the ZIP file into a temporary directory
with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
    # Create a temporary directory to store the files
    extracted_path = '/tmp/synthea_sample_data'
    os.makedirs(extracted_path, exist_ok=True)
    zip_ref.extractall(extracted_path)
    print(f"Extracted to {extracted_path}")

# 3. List files extracted
extracted_files = os.listdir(extracted_path)
print("Extracted files:", extracted_files)


# 4. Create DataFrames for each CSV file
csv_files = [
    "careplans.csv", 
    "conditions.csv", 
    "encounters.csv", 
    "medications.csv", 
    "observations.csv", 
    "patients.csv", 
    "procedures.csv", 
    "payers.csv", 
    "claims.csv",
    'organizations.csv',
    'providers.csv',
]

# Dictionary to store DataFrames
dataframes_100 = {}

# Read each CSV file into a unique DataFrame
for csv_file in csv_files:
    file_path = os.path.join(extracted_path, csv_file)
    if os.path.exists(file_path):
        # Load the CSV into a Pandas DataFrame
        dataframes_100[csv_file] = pd.read_csv(file_path)
        print(f"Loaded {csv_file}")
    else:
        print(f"{csv_file} not found in the extracted files.")

Extracted to /tmp/synthea_sample_data
Extracted files: ['allergies.csv', 'careplans.csv', 'claims.csv', 'claims_transactions.csv', 'conditions.csv', 'devices.csv', 'encounters.csv', 'imaging_studies.csv', 'immunizations.csv', 'medications.csv', 'observations.csv', 'organizations.csv', 'patients.csv', 'payers.csv', 'payer_transitions.csv', 'procedures.csv', 'providers.csv', 'supplies.csv']
Loaded careplans.csv
Loaded conditions.csv
Loaded encounters.csv
Loaded medications.csv
Loaded observations.csv
Loaded patients.csv
Loaded procedures.csv
Loaded payers.csv
Loaded claims.csv
Loaded organizations.csv
Loaded providers.csv


In [5]:
careplans_df_100 = dataframes_100.get('careplans.csv')
conditions_df_100 = dataframes_100.get('conditions.csv')
encounters_df_100 = dataframes_100.get('encounters.csv')
medications_df_100 = dataframes_100.get('medications.csv')
observations_df_100 = dataframes_100.get('observations.csv')
patients_df_100 = dataframes_100.get('patients.csv')
procedures_df_100 = dataframes_100.get('procedures.csv')
payers_df_100 = dataframes_100.get('payers.csv')
claims_df_100 = dataframes_100.get('claims.csv')
organizations_df_100 = dataframes_100.get('organizations.csv')
providers_df_100 = dataframes_100.get('providers.csv')

In [6]:
# cleaning patient and condition files for merging
#rename Id to patient_id
patients_df_100.rename(columns={'Id': 'patient_id',}, inplace=True)
display(patients_df_100.head())

#rename patient_id to patient_id
conditions_df_100.rename(columns={'PATIENT': 'patient_id', 'ENCOUNTER': 'encounter_id', 'CODE':'condition_code' ,'DESCRIPTION': 'condition_description', 'START':'condition_start'}, inplace=True)
conditions_df_100.head()

Unnamed: 0,patient_id,BIRTHDATE,DEATHDATE,SSN,DRIVERS,PASSPORT,PREFIX,FIRST,MIDDLE,LAST,...,CITY,STATE,COUNTY,FIPS,ZIP,LAT,LON,HEALTHCARE_EXPENSES,HEALTHCARE_COVERAGE,INCOME
0,dbaa48b1-720a-f5f2-9360-ae6bb9321037,2022-04-19,,999-46-8625,,,,Georgine810,Caroyln232,Jacobs452,...,Marlborough,Massachusetts,Middlesex County,25017.0,1752,42.30859,-71.567107,6002.42,0.0,60469
1,8313967b-6432-89aa-090f-4d2b81558360,1989-12-26,,999-31-6504,S99917070,X12534338X,Mrs.,Suzanna632,Karissa612,Fay398,...,Leominster,Massachusetts,Worcester County,25027.0,1420,42.615008,-71.775108,257357.78,523549.41,43633
2,c525e0a9-c37c-419c-db08-86080b4b774d,1988-07-06,,999-84-5991,S99966682,X57704278X,Mr.,Johnson679,Warren653,Mann644,...,Plymouth,Massachusetts,Plymouth County,25023.0,2360,41.896981,-70.66451,7576.94,161526.02,12746
3,ecb49d9e-4eb1-4743-237d-d8020ecd4f86,1943-10-22,,999-81-8785,S99942513,X26350147X,Mr.,Freeman822,Lamont867,Schowalter414,...,Lawrence,Massachusetts,Essex County,25009.0,1843,42.721493,-71.165165,22735.16,527081.23,14577
4,8bab136f-0ae3-e4be-3ef0-62d007e0d267,2004-02-29,,999-93-6499,S99940744,X71774583X,Mr.,Barrett790,Alfredo17,McDermott739,...,Boston,Massachusetts,Suffolk County,25025.0,2111,42.3117,-71.107352,3621.16,37226.18,24550


Unnamed: 0,condition_start,STOP,patient_id,encounter_id,SYSTEM,condition_code,condition_description
0,2022-04-19,2023-09-26,dbaa48b1-720a-f5f2-9360-ae6bb9321037,be04e13d-a16e-c330-3a9e-10f8db9a8aa1,http://snomed.info/sct,314529007,Medication review due (situation)
1,2024-03-26,2024-03-26,dbaa48b1-720a-f5f2-9360-ae6bb9321037,7fe8c1c8-f019-976e-4a70-e49dbaec8cd5,http://snomed.info/sct,314529007,Medication review due (situation)
2,2024-09-24,2024-09-24,dbaa48b1-720a-f5f2-9360-ae6bb9321037,1f72daa0-6d9d-4e3c-d317-031ab62f0431,http://snomed.info/sct,314529007,Medication review due (situation)
3,2025-03-25,2025-03-25,dbaa48b1-720a-f5f2-9360-ae6bb9321037,12f7aeb1-70ca-ed6f-2c61-2fbb7ac53d33,http://snomed.info/sct,314529007,Medication review due (situation)
4,2005-05-28,,8313967b-6432-89aa-090f-4d2b81558360,abf41c45-cb23-5525-633b-ba6929ef0ff6,http://snomed.info/sct,197927001,Recurrent urinary tract infection (disorder)


In [7]:
#merge patients with their conditions
patient_conditions = pd.merge(
    patients_df_100[["patient_id", "BIRTHDATE", "GENDER", "BIRTHDATE",'ZIP','INCOME']],
    conditions_df_100[["patient_id", "encounter_id","condition_code","condition_description", "condition_start"]],
    on="patient_id",
    how="left"
)

In [8]:
#cleaning procedures file for merging

procedures_df_100.rename(columns={'PATIENT': 'patient_id', 'ENCOUNTER': 'encounter_id', 'CODE':'procedure_code' ,'DESCRIPTION': 'procedure_description', 'START':'procedure_date', 'BASE_COST':'procedure_cost', 'REASONCODE':'reason_code','REASONDESCRIPTION':'reason_description'}, inplace=True)
#need to make the date column a datetime
procedures_df_100['procedure_date'] = pd.to_datetime(procedures_df_100['procedure_date'], errors='coerce')

procedures_df_100.head()


Unnamed: 0,procedure_date,STOP,patient_id,encounter_id,SYSTEM,procedure_code,procedure_description,procedure_cost,reason_code,reason_description
0,2023-09-26 00:36:40+00:00,2023-09-26T00:51:40Z,dbaa48b1-720a-f5f2-9360-ae6bb9321037,8be4f777-e319-7fa6-a795-808d2cdbba0f,http://snomed.info/sct,430193006,Medication reconciliation (procedure),239.94,,
1,2024-03-26 00:36:40+00:00,2024-03-26T00:51:40Z,dbaa48b1-720a-f5f2-9360-ae6bb9321037,7fe8c1c8-f019-976e-4a70-e49dbaec8cd5,http://snomed.info/sct,430193006,Medication reconciliation (procedure),215.7,,
2,2024-09-24 00:36:40+00:00,2024-09-24T00:51:40Z,dbaa48b1-720a-f5f2-9360-ae6bb9321037,1f72daa0-6d9d-4e3c-d317-031ab62f0431,http://snomed.info/sct,430193006,Medication reconciliation (procedure),215.7,,
3,2025-03-25 00:36:40+00:00,2025-03-25T00:51:40Z,dbaa48b1-720a-f5f2-9360-ae6bb9321037,12f7aeb1-70ca-ed6f-2c61-2fbb7ac53d33,http://snomed.info/sct,430193006,Medication reconciliation (procedure),215.7,,
4,2016-10-25 18:43:10+00:00,2016-10-25T18:58:10Z,8313967b-6432-89aa-090f-4d2b81558360,0ab8c1de-e107-474e-8c68-6fe49a046020,http://snomed.info/sct,252160004,Standard pregnancy test (procedure),4592.67,72892002.0,Normal pregnancy (finding)


In [9]:
# add in procedures that they were seen for
patient_procedures = pd.merge(
    patient_conditions,
    procedures_df_100[["patient_id", "encounter_id", "procedure_code", "procedure_description", "procedure_date","procedure_cost","reason_code","reason_description"]],
    on=["patient_id", "encounter_id"],
    how="left"
)

In [14]:
#clean observations file for merging
observations_df_100.rename(columns={'PATIENT': 'patient_id', 'ENCOUNTER': 'encounter_id', 'CODE':'observation_code' ,'DESCRIPTION': 'observation_description', 'DATE':'observation_date', 'VALUE':'observation_value','UNITS':'observation_units'}, inplace=True)
#need to make the date column a datetime
observations_df_100['observation_date'] = pd.to_datetime(observations_df_100['observation_date'], errors='coerce')
observations_df_100.head()

Unnamed: 0,observation_date,patient_id,encounter_id,CATEGORY,observation_code,observation_description,observation_value,observation_units,TYPE
0,2022-04-19 00:36:40+00:00,dbaa48b1-720a-f5f2-9360-ae6bb9321037,be04e13d-a16e-c330-3a9e-10f8db9a8aa1,vital-signs,8302-2,Body Height,50.5,cm,numeric
1,2022-04-19 00:36:40+00:00,dbaa48b1-720a-f5f2-9360-ae6bb9321037,be04e13d-a16e-c330-3a9e-10f8db9a8aa1,vital-signs,72514-3,Pain severity - 0-10 verbal numeric rating [Sc...,0.0,{score},numeric
2,2022-04-19 00:36:40+00:00,dbaa48b1-720a-f5f2-9360-ae6bb9321037,be04e13d-a16e-c330-3a9e-10f8db9a8aa1,vital-signs,29463-7,Body Weight,3.6,kg,numeric
3,2022-04-19 00:36:40+00:00,dbaa48b1-720a-f5f2-9360-ae6bb9321037,be04e13d-a16e-c330-3a9e-10f8db9a8aa1,vital-signs,77606-2,Weight-for-length Per age and sex,36.8,%,numeric
4,2022-04-19 00:36:40+00:00,dbaa48b1-720a-f5f2-9360-ae6bb9321037,be04e13d-a16e-c330-3a9e-10f8db9a8aa1,vital-signs,8289-1,Head Occipital-frontal circumference Percentile,33.7,%,numeric


In [18]:
# Merge the most recent observations into the patient_procedures dataframe
patient_procedures_with_observations = pd.merge(
    patient_procedures,
    observations_df_100[['patient_id','encounter_id', 'observation_date', 'observation_code', 'observation_description', 'observation_value', 'observation_units']],
    on=["patient_id", "encounter_id"],
    how='left'
)
patient_procedures_with_observations.head()

Unnamed: 0,patient_id,BIRTHDATE,GENDER,BIRTHDATE.1,ZIP,INCOME,encounter_id,condition_code,condition_description,condition_start,...,procedure_description,procedure_date,procedure_cost,reason_code,reason_description,observation_date,observation_code,observation_description,observation_value,observation_units
0,dbaa48b1-720a-f5f2-9360-ae6bb9321037,2022-04-19,F,2022-04-19,1752,60469,be04e13d-a16e-c330-3a9e-10f8db9a8aa1,314529007,Medication review due (situation),2022-04-19,...,,NaT,,,,2022-04-19 00:36:40+00:00,8302-2,Body Height,50.5,cm
1,dbaa48b1-720a-f5f2-9360-ae6bb9321037,2022-04-19,F,2022-04-19,1752,60469,be04e13d-a16e-c330-3a9e-10f8db9a8aa1,314529007,Medication review due (situation),2022-04-19,...,,NaT,,,,2022-04-19 00:36:40+00:00,72514-3,Pain severity - 0-10 verbal numeric rating [Sc...,0.0,{score}
2,dbaa48b1-720a-f5f2-9360-ae6bb9321037,2022-04-19,F,2022-04-19,1752,60469,be04e13d-a16e-c330-3a9e-10f8db9a8aa1,314529007,Medication review due (situation),2022-04-19,...,,NaT,,,,2022-04-19 00:36:40+00:00,29463-7,Body Weight,3.6,kg
3,dbaa48b1-720a-f5f2-9360-ae6bb9321037,2022-04-19,F,2022-04-19,1752,60469,be04e13d-a16e-c330-3a9e-10f8db9a8aa1,314529007,Medication review due (situation),2022-04-19,...,,NaT,,,,2022-04-19 00:36:40+00:00,77606-2,Weight-for-length Per age and sex,36.8,%
4,dbaa48b1-720a-f5f2-9360-ae6bb9321037,2022-04-19,F,2022-04-19,1752,60469,be04e13d-a16e-c330-3a9e-10f8db9a8aa1,314529007,Medication review due (situation),2022-04-19,...,,NaT,,,,2022-04-19 00:36:40+00:00,8289-1,Head Occipital-frontal circumference Percentile,33.7,%


## Now we have to merge medications and encounters to the dataframe

In [None]:
#rename columns for merging medications
medications_df_100.rename(columns={'PATIENT': 'patient_id', 'ENCOUNTER': 'encounter_id','PAYER':'payer_id', 'CODE':'medication_code' ,'DESCRIPTION': 'medication_description', 'START':'medication_start', 'BASE_COST':'medication_cost','PAYER_COVERAGE':'payer_coverage'}, inplace=True)

In [30]:
#rename columns in careplans for merging
careplans_df_100.rename(columns={'Id':'careplan_id','PATIENT': 'patient_id', 'ENCOUNTER': 'encounter_id','CODE':'careplan_code' ,'DESCRIPTION': 'careplan_description', 'START':'careplan_start',"REASONCODE":'careplan_reason_code',"REASONDESCRIPTION":'careplan_reason_description' }, inplace=True)
careplans_df_100.head()

Unnamed: 0,careplan_id,careplan_start,STOP,patient_id,encounter_id,careplan_code,careplan_description,careplan_reason_code,careplan_reason_description
0,a13c3a76-fd15-e00d-a6f7-2c3735fce001,2006-02-07,,8313967b-6432-89aa-090f-4d2b81558360,359aaf0b-6ce5-8a6d-ce7d-7dca74ab8a6c,718361005,Weight management program (regime/therapy),,
1,d7252d0d-0699-c4db-6978-b92fc97ccd13,2016-10-25,2017-05-30,8313967b-6432-89aa-090f-4d2b81558360,0ab8c1de-e107-474e-8c68-6fe49a046020,134435003,Routine antenatal care (regime/therapy),,
2,00fe4941-cebd-bbdc-c423-0f527155221d,2020-08-31,2020-10-11,8313967b-6432-89aa-090f-4d2b81558360,a7e46c3f-ee94-63bb-8e5d-d4f41233ab38,133901003,Burn care (regime/therapy),403190006.0,Epidermal burn of skin (disorder)
3,cf240348-ca10-2966-b25a-f1aad69c3731,2021-10-12,2022-05-24,8313967b-6432-89aa-090f-4d2b81558360,14813d1d-9f54-11d1-bb79-5b9201e9cc60,134435003,Routine antenatal care (regime/therapy),,
4,01bf593e-f898-3b0f-4d89-1403492059ea,2018-12-19,2019-01-28,c525e0a9-c37c-419c-db08-86080b4b774d,20b1f81d-9450-c725-8b7d-192dad4e02ea,773513001,Physiotherapy care plan (record artifact),44465007.0,Sprain of ankle (disorder)


In [55]:
#rename encounter columns for merging encounters
encounters_df_100.rename(columns={'Id':'encounter_id','PATIENT': 'patient_id', 'START': 'encounter_start',"PAYER":"payer_id" ,'END': 'encounter_end', 'CODE': 'encounter_code', 'DESCRIPTION': 'encounter_description', 'BASE_ENCOUNTER_COST': 'encounter_cost', 'ORGANIZATION': 'organization_id', 'PROVIDER': 'provider_id', "REASONCODE":"encounter_reason_code","REASONDESCRIPTION":"encounter_reason_description"}, inplace=True)
encounters_df_100.head()

Unnamed: 0,encounter_id,encounter_start,STOP,patient_id,organization_id,provider_id,payer_id,ENCOUNTERCLASS,encounter_code,encounter_description,encounter_cost,TOTAL_CLAIM_COST,PAYER_COVERAGE,encounter_reason_code,encounter_reason_description
0,be04e13d-a16e-c330-3a9e-10f8db9a8aa1,2022-04-19T00:36:40Z,2022-04-19T00:51:40Z,dbaa48b1-720a-f5f2-9360-ae6bb9321037,907b1d2f-2bf8-31b5-abc2-14d6b75faa96,22ae92ae-29c7-3d31-b626-84f83995331e,0133f751-9229-3cfd-815f-b6d4979bdd6a,wellness,410620009,Well child visit (procedure),136.8,347.38,0.0,,
1,1f391129-92f6-090b-d27e-48cdb42824c2,2022-05-24T00:36:40Z,2022-05-24T00:51:40Z,dbaa48b1-720a-f5f2-9360-ae6bb9321037,907b1d2f-2bf8-31b5-abc2-14d6b75faa96,22ae92ae-29c7-3d31-b626-84f83995331e,0133f751-9229-3cfd-815f-b6d4979bdd6a,wellness,410620009,Well child visit (procedure),136.8,272.8,0.0,,
2,e6f28acc-e581-ab40-7f30-bd9e72a18e61,2022-07-26T00:36:40Z,2022-07-26T00:51:40Z,dbaa48b1-720a-f5f2-9360-ae6bb9321037,907b1d2f-2bf8-31b5-abc2-14d6b75faa96,22ae92ae-29c7-3d31-b626-84f83995331e,0133f751-9229-3cfd-815f-b6d4979bdd6a,wellness,410620009,Well child visit (procedure),136.8,816.8,0.0,,
3,1b1ff37c-ddf8-96c0-4aef-7b0c01b2c845,2022-09-27T00:36:40Z,2022-09-27T00:51:40Z,dbaa48b1-720a-f5f2-9360-ae6bb9321037,907b1d2f-2bf8-31b5-abc2-14d6b75faa96,22ae92ae-29c7-3d31-b626-84f83995331e,0133f751-9229-3cfd-815f-b6d4979bdd6a,wellness,410620009,Well child visit (procedure),136.8,816.8,0.0,,
4,8538767f-d67e-976f-ec4d-5f9ade23c2c6,2022-12-27T00:36:40Z,2022-12-27T00:51:40Z,dbaa48b1-720a-f5f2-9360-ae6bb9321037,907b1d2f-2bf8-31b5-abc2-14d6b75faa96,22ae92ae-29c7-3d31-b626-84f83995331e,0133f751-9229-3cfd-815f-b6d4979bdd6a,wellness,410620009,Well child visit (procedure),136.8,816.8,0.0,,


In [60]:
# linking encounter info into this dataframe above
def create_patient_portfolio(patient_id):
    """Create a comprehensive patient portfolio by linking all relevant medical records."""
    # Validate patient exists
    if patient_id not in patients_df_100["patient_id"].values:
        raise ValueError(f"Patient ID {patient_id} not found in patients_df_100.")

    # Pre-filter all DataFrames for this patient (dramatically improves performance)
    patient_conditions = conditions_df_100[conditions_df_100["patient_id"] == patient_id]
    patient_procedures = procedures_df_100[procedures_df_100["patient_id"] == patient_id]
    patient_observations = observations_df_100[observations_df_100["patient_id"] == patient_id]
    patient_medications = medications_df_100[medications_df_100["patient_id"] == patient_id]
    patient_encounters = encounters_df_100[encounters_df_100["patient_id"] == patient_id]
    patient_careplans = careplans_df_100[careplans_df_100["patient_id"] == patient_id]

    # Initialize portfolio with demographics
    portfolio = {
        "demographics": patients_df_100[patients_df_100["patient_id"] == patient_id].iloc[0].to_dict(),
        "encounters": [],
        "careplans": []
    }

    # Process each encounter
    for _, encounter in patient_encounters.iterrows():
        enc_id = encounter["encounter_id"]
        
        encounter_entry = {
            "encounter_id": enc_id,
            "encounter_date": encounter["encounter_start"],
            "encounter_type": encounter["ENCOUNTERCLASS"],
            "encounter_code": encounter["encounter_code"],
            "encounter_description": encounter["encounter_description"],
            "encounter_cost": encounter["encounter_cost"],
            "encounter_hospital": encounter["organization_id"],
            "encounter_provider": encounter["provider_id"],
            "encounter_payer": encounter["payer_id"],
            "conditions": patient_conditions[patient_conditions["encounter_id"] == enc_id].to_dict('records'),
            "procedures": patient_procedures[patient_procedures["encounter_id"] == enc_id].to_dict('records'),
            "observations": patient_observations[patient_observations["encounter_id"] == enc_id].to_dict('records'),
            "medications": patient_medications[patient_medications["encounter_id"] == enc_id].to_dict('records')
        }
        portfolio["encounters"].append(encounter_entry)

    # Process care plans and link to conditions
    portfolio["careplans"] = patient_careplans.to_dict('records')
    
    for careplan in portfolio["careplans"]:
        careplan["linked_condition"] = patient_conditions[
            patient_conditions["condition_code"] == careplan["careplan_reason_code"]
        ].to_dict('records')

    return portfolio

In [68]:
import json

all_patients = patients_df_100["patient_id"].unique()
portfolios = {pid: create_patient_portfolio(pid) for pid in all_patients}

In [72]:
import json

# Define the output file path
output_file_path = '/tmp/patient_portfolios.json'

# Convert Timestamp objects to strings for JSON serialization
def convert_timestamps(obj):
    if isinstance(obj, pd.Timestamp):
        return obj.isoformat()
    if isinstance(obj, dict):
        return {k: convert_timestamps(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [convert_timestamps(i) for i in obj]
    return obj

# Save the portfolios dictionary to a JSON file
with open(output_file_path, 'w') as json_file:
    json.dump(convert_timestamps(portfolios), json_file, indent=4)

print(f"Patient portfolios saved to {output_file_path}")

Patient portfolios saved to /tmp/patient_portfolios.json
