In [2]:
# 0. Import necessary libraries
import requests
import zipfile
import os
import io
import pandas as pd


In [3]:
# 1. Download the file
url = "https://synthetichealth.github.io/synthea-sample-data/downloads/latest/synthea_sample_data_csv_latest.zip"
response = requests.get(url)
response.raise_for_status()  # Check if the request was successful


In [4]:
# 2. Extract the ZIP file into a temporary directory
with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
    # Create a temporary directory to store the files
    extracted_path = '/tmp/synthea_sample_data'
    os.makedirs(extracted_path, exist_ok=True)
    zip_ref.extractall(extracted_path)
    print(f"Extracted to {extracted_path}")
    

Extracted to /tmp/synthea_sample_data


In [5]:
# 3. List files extracted
extracted_files = os.listdir(extracted_path)
print("Extracted files:", extracted_files)


Extracted files: ['medications.csv', 'providers.csv', 'payer_transitions.csv', 'imaging_studies.csv', 'supplies.csv', 'payers.csv', 'claims.csv', 'allergies.csv', 'procedures.csv', 'organizations.csv', 'conditions.csv', 'careplans.csv', 'encounters.csv', 'devices.csv', 'immunizations.csv', 'claims_transactions.csv', 'patients.csv', 'observations.csv']


In [6]:
# 4. Create DataFrames for each CSV file
csv_files = [
    "careplans.csv",
    "conditions.csv",
    "encounters.csv",
    "medications.csv",
    "observations.csv",
    "patients.csv",
    "procedures.csv",
    "payers.csv",
    "claims.csv",
    'organizations.csv',
    'providers.csv',
]

# Dictionary to store DataFrames
dataframes_100 = {}

# Read each CSV file into a unique DataFrame
for csv_file in csv_files:
    file_path = os.path.join(extracted_path, csv_file)
    if os.path.exists(file_path):
        # Load the CSV into a Pandas DataFrame
        dataframes_100[csv_file] = pd.read_csv(file_path)
        print(f"Loaded {csv_file}")
    else:
        print(f"{csv_file} not found in the extracted files.")

careplans_df_100 = dataframes_100.get('careplans.csv')
conditions_df_100 = dataframes_100.get('conditions.csv')
encounters_df_100 = dataframes_100.get('encounters.csv')
medications_df_100 = dataframes_100.get('medications.csv')
observations_df_100 = dataframes_100.get('observations.csv')
patients_df_100 = dataframes_100.get('patients.csv')
procedures_df_100 = dataframes_100.get('procedures.csv')
payers_df_100 = dataframes_100.get('payers.csv')
claims_df_100 = dataframes_100.get('claims.csv')
organizations_df_100 = dataframes_100.get('organizations.csv')
providers_df_100 = dataframes_100.get('providers.csv')


Loaded careplans.csv
Loaded conditions.csv
Loaded encounters.csv
Loaded medications.csv
Loaded observations.csv
Loaded patients.csv
Loaded procedures.csv
Loaded payers.csv
Loaded claims.csv
Loaded organizations.csv
Loaded providers.csv


In [7]:
for name, df in dataframes_100.items():
    print(f"{name.replace('.csv', '')}_df_100 shape: {df.shape}")
    

careplans_df_100 shape: (425, 9)
conditions_df_100 shape: (5738, 7)
encounters_df_100 shape: (11501, 15)
medications_df_100 shape: (12345, 13)
observations_df_100 shape: (151440, 9)
patients_df_100 shape: (129, 28)
procedures_df_100 shape: (26998, 10)
payers_df_100 shape: (10, 22)
claims_df_100 shape: (23846, 31)
organizations_df_100 shape: (281, 11)
providers_df_100 shape: (281, 13)


In [8]:
encounters_df_100.columns


Index(['Id', 'START', 'STOP', 'PATIENT', 'ORGANIZATION', 'PROVIDER', 'PAYER',
       'ENCOUNTERCLASS', 'CODE', 'DESCRIPTION', 'BASE_ENCOUNTER_COST',
       'TOTAL_CLAIM_COST', 'PAYER_COVERAGE', 'REASONCODE',
       'REASONDESCRIPTION'],
      dtype='object')

In [9]:
display(encounters_df_100)


Unnamed: 0,Id,START,STOP,PATIENT,ORGANIZATION,PROVIDER,PAYER,ENCOUNTERCLASS,CODE,DESCRIPTION,BASE_ENCOUNTER_COST,TOTAL_CLAIM_COST,PAYER_COVERAGE,REASONCODE,REASONDESCRIPTION
0,be04e13d-a16e-c330-3a9e-10f8db9a8aa1,2022-04-19T00:36:40Z,2022-04-19T00:51:40Z,dbaa48b1-720a-f5f2-9360-ae6bb9321037,907b1d2f-2bf8-31b5-abc2-14d6b75faa96,22ae92ae-29c7-3d31-b626-84f83995331e,0133f751-9229-3cfd-815f-b6d4979bdd6a,wellness,410620009,Well child visit (procedure),136.80,347.38,0.00,,
1,1f391129-92f6-090b-d27e-48cdb42824c2,2022-05-24T00:36:40Z,2022-05-24T00:51:40Z,dbaa48b1-720a-f5f2-9360-ae6bb9321037,907b1d2f-2bf8-31b5-abc2-14d6b75faa96,22ae92ae-29c7-3d31-b626-84f83995331e,0133f751-9229-3cfd-815f-b6d4979bdd6a,wellness,410620009,Well child visit (procedure),136.80,272.80,0.00,,
2,e6f28acc-e581-ab40-7f30-bd9e72a18e61,2022-07-26T00:36:40Z,2022-07-26T00:51:40Z,dbaa48b1-720a-f5f2-9360-ae6bb9321037,907b1d2f-2bf8-31b5-abc2-14d6b75faa96,22ae92ae-29c7-3d31-b626-84f83995331e,0133f751-9229-3cfd-815f-b6d4979bdd6a,wellness,410620009,Well child visit (procedure),136.80,816.80,0.00,,
3,1b1ff37c-ddf8-96c0-4aef-7b0c01b2c845,2022-09-27T00:36:40Z,2022-09-27T00:51:40Z,dbaa48b1-720a-f5f2-9360-ae6bb9321037,907b1d2f-2bf8-31b5-abc2-14d6b75faa96,22ae92ae-29c7-3d31-b626-84f83995331e,0133f751-9229-3cfd-815f-b6d4979bdd6a,wellness,410620009,Well child visit (procedure),136.80,816.80,0.00,,
4,8538767f-d67e-976f-ec4d-5f9ade23c2c6,2022-12-27T00:36:40Z,2022-12-27T00:51:40Z,dbaa48b1-720a-f5f2-9360-ae6bb9321037,907b1d2f-2bf8-31b5-abc2-14d6b75faa96,22ae92ae-29c7-3d31-b626-84f83995331e,0133f751-9229-3cfd-815f-b6d4979bdd6a,wellness,410620009,Well child visit (procedure),136.80,816.80,0.00,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11496,f541977f-7fb0-0636-9918-2e3a9b5d1da4,2023-11-30T02:07:21Z,2023-11-30T02:22:21Z,59be6f33-cd77-774a-8e0f-3286eff96d1c,497f39dd-280e-3d58-af5b-c5e3a3a09b10,60ce9967-05b7-382c-bcb8-680744263e28,a735bf55-83e9-331a-899d-a82a60b9f60c,ambulatory,185345009,Encounter for symptom (procedure),85.55,85.55,68.44,444814009.0,Viral sinusitis (disorder)
11497,233c6634-ad07-4ac6-4b6d-ec2e222f3949,2024-02-21T21:07:21Z,2024-02-21T21:22:21Z,59be6f33-cd77-774a-8e0f-3286eff96d1c,497f39dd-280e-3d58-af5b-c5e3a3a09b10,60ce9967-05b7-382c-bcb8-680744263e28,a735bf55-83e9-331a-899d-a82a60b9f60c,ambulatory,185345009,Encounter for symptom (procedure),85.55,85.55,0.00,444814009.0,Viral sinusitis (disorder)
11498,21626156-6a86-0585-dcf3-0d04013e54ac,2024-08-09T12:07:21Z,2024-08-09T12:47:07Z,59be6f33-cd77-774a-8e0f-3286eff96d1c,390a2aa4-70b4-3a64-a12f-9bd777834c8d,45b68013-84b2-3345-8c37-2a459f531e8d,a735bf55-83e9-331a-899d-a82a60b9f60c,wellness,162673000,General examination of patient (procedure),136.80,860.16,683.87,,
11499,776574af-90b9-9d10-1e57-764f9cef8108,2024-08-16T12:07:21Z,2024-08-16T12:22:21Z,59be6f33-cd77-774a-8e0f-3286eff96d1c,497f39dd-280e-3d58-af5b-c5e3a3a09b10,60ce9967-05b7-382c-bcb8-680744263e28,a735bf55-83e9-331a-899d-a82a60b9f60c,ambulatory,390906007,Follow-up encounter (procedure),85.55,234.71,187.76,55822004.0,Hyperlipidemia (disorder)


In [10]:
### Summary statistics of how many encounters per organization for each patient
encounters_df_100.shape

(11501, 15)

In [11]:
# Rename encounter columns for merging encounters
encounters_df_100.rename(columns={'Id': 'encounter_id', 'PATIENT': 'patient_id', 'START': 'encounter_start', 'STOP': 'encounter_stop', 'PAYER': 'payer_id', 'ENCOUNTERCLASS': 'encounter_class', 'END': 'encounter_end', 'CODE': 'encounter_code', 'DESCRIPTION': 'encounter_description', 'BASE_ENCOUNTER_COST': 'encounter_cost', 'ORGANIZATION': 'organization_id', 'PROVIDER': 'provider_id', 'REASONCODE': 'encounter_reason_code', 'REASONDESCRIPTION': 'encounter_reason_description'}, inplace = True)
encounters_df_100.head()


Unnamed: 0,encounter_id,encounter_start,encounter_stop,patient_id,organization_id,provider_id,payer_id,encounter_class,encounter_code,encounter_description,encounter_cost,TOTAL_CLAIM_COST,PAYER_COVERAGE,encounter_reason_code,encounter_reason_description
0,be04e13d-a16e-c330-3a9e-10f8db9a8aa1,2022-04-19T00:36:40Z,2022-04-19T00:51:40Z,dbaa48b1-720a-f5f2-9360-ae6bb9321037,907b1d2f-2bf8-31b5-abc2-14d6b75faa96,22ae92ae-29c7-3d31-b626-84f83995331e,0133f751-9229-3cfd-815f-b6d4979bdd6a,wellness,410620009,Well child visit (procedure),136.8,347.38,0.0,,
1,1f391129-92f6-090b-d27e-48cdb42824c2,2022-05-24T00:36:40Z,2022-05-24T00:51:40Z,dbaa48b1-720a-f5f2-9360-ae6bb9321037,907b1d2f-2bf8-31b5-abc2-14d6b75faa96,22ae92ae-29c7-3d31-b626-84f83995331e,0133f751-9229-3cfd-815f-b6d4979bdd6a,wellness,410620009,Well child visit (procedure),136.8,272.8,0.0,,
2,e6f28acc-e581-ab40-7f30-bd9e72a18e61,2022-07-26T00:36:40Z,2022-07-26T00:51:40Z,dbaa48b1-720a-f5f2-9360-ae6bb9321037,907b1d2f-2bf8-31b5-abc2-14d6b75faa96,22ae92ae-29c7-3d31-b626-84f83995331e,0133f751-9229-3cfd-815f-b6d4979bdd6a,wellness,410620009,Well child visit (procedure),136.8,816.8,0.0,,
3,1b1ff37c-ddf8-96c0-4aef-7b0c01b2c845,2022-09-27T00:36:40Z,2022-09-27T00:51:40Z,dbaa48b1-720a-f5f2-9360-ae6bb9321037,907b1d2f-2bf8-31b5-abc2-14d6b75faa96,22ae92ae-29c7-3d31-b626-84f83995331e,0133f751-9229-3cfd-815f-b6d4979bdd6a,wellness,410620009,Well child visit (procedure),136.8,816.8,0.0,,
4,8538767f-d67e-976f-ec4d-5f9ade23c2c6,2022-12-27T00:36:40Z,2022-12-27T00:51:40Z,dbaa48b1-720a-f5f2-9360-ae6bb9321037,907b1d2f-2bf8-31b5-abc2-14d6b75faa96,22ae92ae-29c7-3d31-b626-84f83995331e,0133f751-9229-3cfd-815f-b6d4979bdd6a,wellness,410620009,Well child visit (procedure),136.8,816.8,0.0,,


In [12]:
# Group by patient and organization, count number of encounters
encounter_counts = (
    encounters_df_100
    .groupby(['patient_id', 'organization_id'])
    .size()
    .reset_index(name = 'org_encounter_count')
)

# Add total encounters per patient
encounter_counts['total_patient_encounters'] = (
    encounter_counts
    .groupby('patient_id')['org_encounter_count']
    .transform('sum')
)


In [13]:
encounter_counts

Unnamed: 0,patient_id,organization_id,org_encounter_count,total_patient_encounters
0,08969e90-844a-c907-4d3c-a4907951e35e,0592d4e5-a0fb-3143-83cd-8e5a6b066a96,8,535
1,08969e90-844a-c907-4d3c-a4907951e35e,0d24cc89-3256-330c-b36d-d8de8babef84,16,535
2,08969e90-844a-c907-4d3c-a4907951e35e,497f39dd-280e-3d58-af5b-c5e3a3a09b10,505,535
3,08969e90-844a-c907-4d3c-a4907951e35e,8ddbad9e-b5fc-3895-8a1c-17f32197a865,1,535
4,08969e90-844a-c907-4d3c-a4907951e35e,a036cdf2-bbc5-3ffd-987f-56e437fd8b74,5,535
...,...,...,...,...
467,f952d41f-c3ee-717c-4185-e02cd757ad7e,880fad59-9c38-3a21-a39f-ccc801502ab3,1,67
468,f952d41f-c3ee-717c-4185-e02cd757ad7e,a5577e5c-b886-37a0-8eee-8ff6feb7ea6b,50,67
469,fc65519a-cac6-cbd5-093c-b16158c8293f,655f2a4f-419c-39d3-8ed6-6c4117eb7e32,8,98
470,fc65519a-cac6-cbd5-093c-b16158c8293f,8ec2ca51-abb4-35dd-b90f-6c6aad6d2124,10,98


In [14]:
# View summary statistics
summary_stats_patient_org = encounter_counts['org_encounter_count'].describe()
print("Summary statistics for number of encounters by patient per organization:")
print(summary_stats_patient_org)


Summary statistics for number of encounters by patient per organization:
count    472.000000
mean      24.366525
std       79.189493
min        1.000000
25%        2.000000
50%        9.000000
75%       16.000000
max      804.000000
Name: org_encounter_count, dtype: float64


In [15]:
# Drop duplicates to keep one row per patient
patient_totals = encounter_counts[['patient_id', 'total_patient_encounters']].drop_duplicates()

# Now get summary statistics
summary_stats_patient = patient_totals['total_patient_encounters'].describe()
print(summary_stats_patient)


count    129.000000
mean      89.155039
std      153.754850
min        2.000000
25%       25.000000
50%       38.000000
75%       62.000000
max      813.000000
Name: total_patient_encounters, dtype: float64


#### Estimation of the agreggate cost of encounters by organization 
Total and average costs grouped by organization

In [28]:

costs_by_org = (
    encounters_df_100
    .groupby('organization_id')
    .agg(
        total_cost=('encounter_cost', 'sum'),
        average_cost=('encounter_cost', 'mean'),
        #number of encounter for each organization
        count=('encounter_id', 'count')
    )
    .reset_index()
    .sort_values(by='total_cost', ascending=False)
)

costs_by_org

Unnamed: 0,organization_id,total_cost,average_cost,count
70,497f39dd-280e-3d58-af5b-c5e3a3a09b10,124403.94,87.362317,1424
181,b8421363-9807-3b16-a146-95336eea5cfb,124265.00,86.959412,1429
203,d1d4d104-68cb-356b-9086-888de7d2826f,71137.27,88.479192,804
79,57e4e5fa-d68d-3caa-a5ad-37beb1b51207,64561.52,90.676292,712
206,d5920b72-ac1a-3ec4-81ca-e3b75cb7aadd,54423.34,87.779581,620
...,...,...,...,...
113,810c33cd-83fe-36e9-8fe6-ef78eb2c6fb9,110.92,110.920000,1
185,bdc3ee76-9cf3-316d-b202-a8da1ea3fa20,87.71,87.710000,1
188,c0132cf6-1703-3aad-89b4-cc2f2bbc123e,87.71,87.710000,1
122,880fad59-9c38-3a21-a39f-ccc801502ab3,87.71,87.710000,1


#### Estimation of the agreggate cost of encounters by the care plan reason code

In [18]:
# Rename key columns to merge encounters df with careplans df
careplans_df_100.rename(columns={
    'ENCOUNTER': 'encounter_id',
    'REASONCODE': 'careplan_reason_code',
    'REASONDESCRIPTION': 'careplan_reason_description'
}, inplace=True)

In [29]:
careplans_df_100

Unnamed: 0,Id,START,STOP,PATIENT,encounter_id,CODE,DESCRIPTION,careplan_reason_code,careplan_reason_description
0,a13c3a76-fd15-e00d-a6f7-2c3735fce001,2006-02-07,,8313967b-6432-89aa-090f-4d2b81558360,359aaf0b-6ce5-8a6d-ce7d-7dca74ab8a6c,718361005,Weight management program (regime/therapy),,
1,d7252d0d-0699-c4db-6978-b92fc97ccd13,2016-10-25,2017-05-30,8313967b-6432-89aa-090f-4d2b81558360,0ab8c1de-e107-474e-8c68-6fe49a046020,134435003,Routine antenatal care (regime/therapy),,
2,00fe4941-cebd-bbdc-c423-0f527155221d,2020-08-31,2020-10-11,8313967b-6432-89aa-090f-4d2b81558360,a7e46c3f-ee94-63bb-8e5d-d4f41233ab38,133901003,Burn care (regime/therapy),4.031900e+08,Epidermal burn of skin (disorder)
3,cf240348-ca10-2966-b25a-f1aad69c3731,2021-10-12,2022-05-24,8313967b-6432-89aa-090f-4d2b81558360,14813d1d-9f54-11d1-bb79-5b9201e9cc60,134435003,Routine antenatal care (regime/therapy),,
4,01bf593e-f898-3b0f-4d89-1403492059ea,2018-12-19,2019-01-28,c525e0a9-c37c-419c-db08-86080b4b774d,20b1f81d-9450-c725-8b7d-192dad4e02ea,773513001,Physiotherapy care plan (record artifact),4.446501e+07,Sprain of ankle (disorder)
...,...,...,...,...,...,...,...,...,...
420,cc947a1e-b3dc-0bda-1f12-b5c3ae331060,1966-03-25,,59be6f33-cd77-774a-8e0f-3286eff96d1c,46349625-c3f9-b5da-38e3-93a434e849b9,735985000,Diabetes self management plan (record artifact),7.146280e+08,Prediabetes (finding)
421,5461ce81-0b79-1273-6e79-9798b7c06809,1994-04-17,,59be6f33-cd77-774a-8e0f-3286eff96d1c,8376d569-12d3-1ce4-b1a5-9ae58d4f87a1,734163000,Care plan (record artifact),6.057300e+07,Aortic valve stenosis (disorder)
422,b3f6a258-501a-ec27-1986-0fb1ba009af2,1994-05-26,,59be6f33-cd77-774a-8e0f-3286eff96d1c,debd253d-6a5b-43f6-1b98-c439edf86c3a,736372004,Discharge care plan (record artifact),1.231000e+12,History of aortic valve replacement (situation)
423,08e1c507-ffca-ee00-3f5b-8633509e6428,2000-04-14,,59be6f33-cd77-774a-8e0f-3286eff96d1c,6fee159b-1494-9023-49bf-62748ff4e3c4,736285004,Hyperlipidemia clinical management plan (recor...,5.582200e+07,Hyperlipidemia (disorder)


In [20]:
# Merge encounters with careplans
encounters_with_careplan = pd.merge(
    encounters_df_100,
    careplans_df_100[['encounter_id', 'careplan_reason_code', 'careplan_reason_description']],
    on='encounter_id',
    how='left')  # We keep all encounters even if some have no careplan

In [30]:
# Group by careplan reason code and aggregate the costs
costs_by_reason_code = (
    encounters_with_careplan
    .groupby('careplan_reason_code')
    .agg(
        total_cost=('encounter_cost', 'sum'),
        number_of_encounters=('encounter_id', 'count')
    )
    .reset_index()
    .sort_values(by='total_cost', ascending=False))

# Merge back in the description
descriptions = (
    encounters_with_careplan[['careplan_reason_code', 'careplan_reason_description']]
    .drop_duplicates()
)
costs_by_reason_code = pd.merge(costs_by_reason_code, descriptions, on='careplan_reason_code', how='left')

In [31]:
costs_by_reason_code

Unnamed: 0,careplan_reason_code,total_cost,number_of_encounters,careplan_reason_description
0,714628000.0,8080.58,59,Prediabetes (finding)
1,59621000.0,5569.11,41,Essential hypertension (disorder)
2,55822000.0,1700.45,20,Hyperlipidemia (disorder)
3,88805010.0,1536.56,13,Chronic congestive heart failure (disorder)
4,44465010.0,1315.62,9,Sprain of ankle (disorder)
5,26929000.0,1048.93,8,Alzheimer's disease (disorder)
6,62106010.0,877.08,6,Concussion with no loss of consciousness (diso...
7,65966000.0,730.9,5,Fracture of forearm (disorder)
8,283371000.0,730.9,5,Laceration of forearm (disorder)
9,126906000.0,684.4,8,Neoplasm of prostate (disorder)
