In [3]:
# 0. Import necessary libraries
import requests
import zipfile
import os
import io
import pandas as pd


In [4]:
# 1. Download the Synthea sample-data archive (CSV flavour) into memory
url = "https://synthetichealth.github.io/synthea-sample-data/downloads/latest/synthea_sample_data_csv_latest.zip"
# requests never times out unless told to; without this a stalled server would
# hang the kernel forever. Tuple is (connect timeout, read timeout) in seconds.
response = requests.get(url, timeout=(10, 120))
response.raise_for_status()  # Raise requests.HTTPError on any 4xx/5xx reply


In [5]:
# 2. Unpack the downloaded archive into a scratch directory
archive_bytes = io.BytesIO(response.content)  # the ZIP is read from memory, not from disk
with zipfile.ZipFile(archive_bytes) as archive:
    # Target directory for the CSVs; created on first run, reused afterwards
    extracted_path = '/tmp/synthea_sample_data'
    os.makedirs(extracted_path, exist_ok=True)
    archive.extractall(path=extracted_path)
    print(f"Extracted to {extracted_path}")
    

Extracted to /tmp/synthea_sample_data


In [6]:
# 3. Show which files ended up in the extraction directory
extracted_files = os.listdir(extracted_path)
print(f"Extracted files: {extracted_files}")


Extracted files: ['medications.csv', 'providers.csv', 'payer_transitions.csv', 'imaging_studies.csv', 'supplies.csv', 'payers.csv', 'claims.csv', 'allergies.csv', 'procedures.csv', 'organizations.csv', 'conditions.csv', 'careplans.csv', 'encounters.csv', 'devices.csv', 'immunizations.csv', 'claims_transactions.csv', 'patients.csv', 'observations.csv']


In [9]:
# 4. Load the CSV tables used by this notebook into DataFrames
# Only these tables are read; any other file in the extraction directory is skipped.
csv_files = [
    "careplans.csv",
    "conditions.csv",
    "encounters.csv",
    "medications.csv",
    "observations.csv",
    "patients.csv",
    "procedures.csv",
    "payers.csv",
    "claims.csv",
    "organizations.csv",
    "providers.csv",
]

# Loaded tables keyed by their file name (e.g. "patients.csv")
dataframes_100 = {}

for csv_file in csv_files:
    file_path = os.path.join(extracted_path, csv_file)
    if not os.path.exists(file_path):
        # Best effort: report the gap and carry on with the remaining tables
        print(f"{csv_file} not found in the extracted files.")
        continue
    dataframes_100[csv_file] = pd.read_csv(file_path)
    print(f"Loaded {csv_file}")

# Per-table aliases; .get() yields None for any table that was not loaded above
careplans_df_100 = dataframes_100.get("careplans.csv")
conditions_df_100 = dataframes_100.get("conditions.csv")
encounters_df_100 = dataframes_100.get("encounters.csv")
medications_df_100 = dataframes_100.get("medications.csv")
observations_df_100 = dataframes_100.get("observations.csv")
patients_df_100 = dataframes_100.get("patients.csv")
procedures_df_100 = dataframes_100.get("procedures.csv")
payers_df_100 = dataframes_100.get("payers.csv")
claims_df_100 = dataframes_100.get("claims.csv")
organizations_df_100 = dataframes_100.get("organizations.csv")
providers_df_100 = dataframes_100.get("providers.csv")


Loaded careplans.csv
Loaded conditions.csv
Loaded encounters.csv
Loaded medications.csv
Loaded observations.csv
Loaded patients.csv
Loaded procedures.csv
Loaded payers.csv
Loaded claims.csv
Loaded organizations.csv
Loaded providers.csv


In [18]:
# Report the shape of every loaded table under its *_df_100 alias
for name, df in dataframes_100.items():
    table_label = name.replace('.csv', '') + "_df_100"
    print(f"{table_label} shape: {df.shape}")
    

careplans_df_100 shape: (425, 9)
conditions_df_100 shape: (5738, 7)
encounters_df_100 shape: (11501, 15)
medications_df_100 shape: (12345, 13)
observations_df_100 shape: (151440, 9)
patients_df_100 shape: (129, 28)
procedures_df_100 shape: (26998, 10)
payers_df_100 shape: (10, 22)
claims_df_100 shape: (23846, 31)
organizations_df_100 shape: (281, 11)
providers_df_100 shape: (281, 13)


In [13]:
# Inspect the encounter column names as loaded, before any renaming
encounters_df_100.columns


Index(['Id', 'START', 'STOP', 'PATIENT', 'ORGANIZATION', 'PROVIDER', 'PAYER',
       'ENCOUNTERCLASS', 'CODE', 'DESCRIPTION', 'BASE_ENCOUNTER_COST',
       'TOTAL_CLAIM_COST', 'PAYER_COVERAGE', 'REASONCODE',
       'REASONDESCRIPTION'],
      dtype='object')

In [14]:
# Preview the encounters table (the notebook shows a truncated head/tail view)
display(encounters_df_100)


Unnamed: 0,Id,START,STOP,PATIENT,ORGANIZATION,PROVIDER,PAYER,ENCOUNTERCLASS,CODE,DESCRIPTION,BASE_ENCOUNTER_COST,TOTAL_CLAIM_COST,PAYER_COVERAGE,REASONCODE,REASONDESCRIPTION
0,be04e13d-a16e-c330-3a9e-10f8db9a8aa1,2022-04-19T00:36:40Z,2022-04-19T00:51:40Z,dbaa48b1-720a-f5f2-9360-ae6bb9321037,907b1d2f-2bf8-31b5-abc2-14d6b75faa96,22ae92ae-29c7-3d31-b626-84f83995331e,0133f751-9229-3cfd-815f-b6d4979bdd6a,wellness,410620009,Well child visit (procedure),136.80,347.38,0.00,,
1,1f391129-92f6-090b-d27e-48cdb42824c2,2022-05-24T00:36:40Z,2022-05-24T00:51:40Z,dbaa48b1-720a-f5f2-9360-ae6bb9321037,907b1d2f-2bf8-31b5-abc2-14d6b75faa96,22ae92ae-29c7-3d31-b626-84f83995331e,0133f751-9229-3cfd-815f-b6d4979bdd6a,wellness,410620009,Well child visit (procedure),136.80,272.80,0.00,,
2,e6f28acc-e581-ab40-7f30-bd9e72a18e61,2022-07-26T00:36:40Z,2022-07-26T00:51:40Z,dbaa48b1-720a-f5f2-9360-ae6bb9321037,907b1d2f-2bf8-31b5-abc2-14d6b75faa96,22ae92ae-29c7-3d31-b626-84f83995331e,0133f751-9229-3cfd-815f-b6d4979bdd6a,wellness,410620009,Well child visit (procedure),136.80,816.80,0.00,,
3,1b1ff37c-ddf8-96c0-4aef-7b0c01b2c845,2022-09-27T00:36:40Z,2022-09-27T00:51:40Z,dbaa48b1-720a-f5f2-9360-ae6bb9321037,907b1d2f-2bf8-31b5-abc2-14d6b75faa96,22ae92ae-29c7-3d31-b626-84f83995331e,0133f751-9229-3cfd-815f-b6d4979bdd6a,wellness,410620009,Well child visit (procedure),136.80,816.80,0.00,,
4,8538767f-d67e-976f-ec4d-5f9ade23c2c6,2022-12-27T00:36:40Z,2022-12-27T00:51:40Z,dbaa48b1-720a-f5f2-9360-ae6bb9321037,907b1d2f-2bf8-31b5-abc2-14d6b75faa96,22ae92ae-29c7-3d31-b626-84f83995331e,0133f751-9229-3cfd-815f-b6d4979bdd6a,wellness,410620009,Well child visit (procedure),136.80,816.80,0.00,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11496,f541977f-7fb0-0636-9918-2e3a9b5d1da4,2023-11-30T02:07:21Z,2023-11-30T02:22:21Z,59be6f33-cd77-774a-8e0f-3286eff96d1c,497f39dd-280e-3d58-af5b-c5e3a3a09b10,60ce9967-05b7-382c-bcb8-680744263e28,a735bf55-83e9-331a-899d-a82a60b9f60c,ambulatory,185345009,Encounter for symptom (procedure),85.55,85.55,68.44,444814009.0,Viral sinusitis (disorder)
11497,233c6634-ad07-4ac6-4b6d-ec2e222f3949,2024-02-21T21:07:21Z,2024-02-21T21:22:21Z,59be6f33-cd77-774a-8e0f-3286eff96d1c,497f39dd-280e-3d58-af5b-c5e3a3a09b10,60ce9967-05b7-382c-bcb8-680744263e28,a735bf55-83e9-331a-899d-a82a60b9f60c,ambulatory,185345009,Encounter for symptom (procedure),85.55,85.55,0.00,444814009.0,Viral sinusitis (disorder)
11498,21626156-6a86-0585-dcf3-0d04013e54ac,2024-08-09T12:07:21Z,2024-08-09T12:47:07Z,59be6f33-cd77-774a-8e0f-3286eff96d1c,390a2aa4-70b4-3a64-a12f-9bd777834c8d,45b68013-84b2-3345-8c37-2a459f531e8d,a735bf55-83e9-331a-899d-a82a60b9f60c,wellness,162673000,General examination of patient (procedure),136.80,860.16,683.87,,
11499,776574af-90b9-9d10-1e57-764f9cef8108,2024-08-16T12:07:21Z,2024-08-16T12:22:21Z,59be6f33-cd77-774a-8e0f-3286eff96d1c,497f39dd-280e-3d58-af5b-c5e3a3a09b10,60ce9967-05b7-382c-bcb8-680744263e28,a735bf55-83e9-331a-899d-a82a60b9f60c,ambulatory,390906007,Follow-up encounter (procedure),85.55,234.71,187.76,55822004.0,Hyperlipidemia (disorder)


In [17]:
### Encounters per organization for each patient (summary statistics follow in later cells)
# Sanity check first: size of the encounters table before aggregating
encounters_df_100.shape

(11501, 15)

In [24]:
# Rename encounter columns to snake_case, encounter-prefixed names ahead of merging
encounter_column_names = {
    'Id': 'encounter_id',
    'PATIENT': 'patient_id',
    'START': 'encounter_start',
    'STOP': 'encounter_stop',
    'PAYER': 'payer_id',
    'ENCOUNTERCLASS': 'encounter_class',
    # NOTE(review): no 'END' column appears in the columns listed for this
    # extract; rename() skips unmatched keys by default, so this entry is inert here.
    'END': 'encounter_end',
    'CODE': 'encounter_code',
    'DESCRIPTION': 'encounter_description',
    'BASE_ENCOUNTER_COST': 'encounter_cost',
    'ORGANIZATION': 'organization_id',
    'PROVIDER': 'provider_id',
    'REASONCODE': 'encounter_reason_code',
    'REASONDESCRIPTION': 'encounter_reason_description',
}
# Mutates the frame in place, so dataframes_100['encounters.csv'] sees the new names too
encounters_df_100.rename(columns=encounter_column_names, inplace=True)
encounters_df_100.head()


Unnamed: 0,encounter_id,encounter_start,encounter_stop,patient_id,organization_id,provider_id,payer_id,encounter_class,encounter_code,encounter_description,encounter_cost,TOTAL_CLAIM_COST,PAYER_COVERAGE,encounter_reason_code,encounter_reason_description
0,be04e13d-a16e-c330-3a9e-10f8db9a8aa1,2022-04-19T00:36:40Z,2022-04-19T00:51:40Z,dbaa48b1-720a-f5f2-9360-ae6bb9321037,907b1d2f-2bf8-31b5-abc2-14d6b75faa96,22ae92ae-29c7-3d31-b626-84f83995331e,0133f751-9229-3cfd-815f-b6d4979bdd6a,wellness,410620009,Well child visit (procedure),136.8,347.38,0.0,,
1,1f391129-92f6-090b-d27e-48cdb42824c2,2022-05-24T00:36:40Z,2022-05-24T00:51:40Z,dbaa48b1-720a-f5f2-9360-ae6bb9321037,907b1d2f-2bf8-31b5-abc2-14d6b75faa96,22ae92ae-29c7-3d31-b626-84f83995331e,0133f751-9229-3cfd-815f-b6d4979bdd6a,wellness,410620009,Well child visit (procedure),136.8,272.8,0.0,,
2,e6f28acc-e581-ab40-7f30-bd9e72a18e61,2022-07-26T00:36:40Z,2022-07-26T00:51:40Z,dbaa48b1-720a-f5f2-9360-ae6bb9321037,907b1d2f-2bf8-31b5-abc2-14d6b75faa96,22ae92ae-29c7-3d31-b626-84f83995331e,0133f751-9229-3cfd-815f-b6d4979bdd6a,wellness,410620009,Well child visit (procedure),136.8,816.8,0.0,,
3,1b1ff37c-ddf8-96c0-4aef-7b0c01b2c845,2022-09-27T00:36:40Z,2022-09-27T00:51:40Z,dbaa48b1-720a-f5f2-9360-ae6bb9321037,907b1d2f-2bf8-31b5-abc2-14d6b75faa96,22ae92ae-29c7-3d31-b626-84f83995331e,0133f751-9229-3cfd-815f-b6d4979bdd6a,wellness,410620009,Well child visit (procedure),136.8,816.8,0.0,,
4,8538767f-d67e-976f-ec4d-5f9ade23c2c6,2022-12-27T00:36:40Z,2022-12-27T00:51:40Z,dbaa48b1-720a-f5f2-9360-ae6bb9321037,907b1d2f-2bf8-31b5-abc2-14d6b75faa96,22ae92ae-29c7-3d31-b626-84f83995331e,0133f751-9229-3cfd-815f-b6d4979bdd6a,wellness,410620009,Well child visit (procedure),136.8,816.8,0.0,,


In [29]:
# Count encounters for every (patient, organization) pair
pair_sizes = encounters_df_100.groupby(['patient_id', 'organization_id']).size()
encounter_counts = pair_sizes.reset_index(name='org_encounter_count')

# Attach each patient's overall encounter total to all of that patient's rows
counts_by_patient = encounter_counts.groupby('patient_id')['org_encounter_count']
encounter_counts['total_patient_encounters'] = counts_by_patient.transform('sum')


In [30]:
# One row per (patient, organization) pair, with that patient's overall total alongside
encounter_counts

Unnamed: 0,patient_id,organization_id,org_encounter_count,total_patient_encounters
0,08969e90-844a-c907-4d3c-a4907951e35e,0592d4e5-a0fb-3143-83cd-8e5a6b066a96,8,535
1,08969e90-844a-c907-4d3c-a4907951e35e,0d24cc89-3256-330c-b36d-d8de8babef84,16,535
2,08969e90-844a-c907-4d3c-a4907951e35e,497f39dd-280e-3d58-af5b-c5e3a3a09b10,505,535
3,08969e90-844a-c907-4d3c-a4907951e35e,8ddbad9e-b5fc-3895-8a1c-17f32197a865,1,535
4,08969e90-844a-c907-4d3c-a4907951e35e,a036cdf2-bbc5-3ffd-987f-56e437fd8b74,5,535
...,...,...,...,...
467,f952d41f-c3ee-717c-4185-e02cd757ad7e,880fad59-9c38-3a21-a39f-ccc801502ab3,1,67
468,f952d41f-c3ee-717c-4185-e02cd757ad7e,a5577e5c-b886-37a0-8eee-8ff6feb7ea6b,50,67
469,fc65519a-cac6-cbd5-093c-b16158c8293f,655f2a4f-419c-39d3-8ed6-6c4117eb7e32,8,98
470,fc65519a-cac6-cbd5-093c-b16158c8293f,8ec2ca51-abb4-35dd-b90f-6c6aad6d2124,10,98


In [32]:
# Distribution of encounter counts across (patient, organization) pairs
org_counts = encounter_counts['org_encounter_count']
summary_stats_patient_org = org_counts.describe()
print(
    "Summary statistics for number of encounters by patient per organization:",
    summary_stats_patient_org,
    sep="\n",
)


Summary statistics for number of encounters by patient per organization:
count    472.000000
mean      24.366525
std       79.189493
min        1.000000
25%        2.000000
50%        9.000000
75%       16.000000
max      804.000000
Name: org_encounter_count, dtype: float64


In [33]:
# Collapse to one row per patient: the total repeats on every organization row,
# so de-duplicating the (patient, total) pairs leaves a single row each
patient_total_columns = ['patient_id', 'total_patient_encounters']
patient_totals = encounter_counts[patient_total_columns].drop_duplicates()

# Distribution of total encounters per patient
total_encounters = patient_totals['total_patient_encounters']
summary_stats_patient = total_encounters.describe()
print(summary_stats_patient)


count    129.000000
mean      89.155039
std      153.754850
min        2.000000
25%       25.000000
50%       38.000000
75%       62.000000
max      813.000000
Name: total_patient_encounters, dtype: float64
