In [1]:
# @title Synthea Patient Generator (Fixed Version)
import os
from IPython.display import clear_output

# Configuration
num_patients = 100  # @param {type:"integer"}
state = "Massachusetts"  # @param ["Massachusetts", "California", "New York", "Texas", "Florida"]
age_range = "30-85"  # @param {type:"string"}
seed = 12345  # @param {type:"integer"}

# Install Java
!sudo apt-get update
!sudo apt-get install -y openjdk-11-jdk-headless
clear_output()
print("✅ Java installed")

# Download Synthea
!wget -q https://github.com/synthetichealth/synthea/releases/download/master-branch-latest/synthea-with-dependencies.jar
clear_output()
print("✅ Synthea downloaded")

# Generate patients (using proper string substitution)
!java -jar synthea-with-dependencies.jar \
  -p {num_patients} \
  -s {seed} \
  -a "{age_range}" \
  --exporter.baseDirectory "./output" \
  --exporter.fhir.export=true \
  --exporter.csv.export=False \
  {state}

# Verify output
if os.path.exists("./output/fhir"):
    print(f"\n🎉 Success! Generated {len(os.listdir('./output/fhir'))} FHIR records")
    !ls -lh "./output/fhir" | head -5
else:
    print("\n❌ Generation failed. Common fixes:")
    print("1. Try reducing patient count (start with 10)")
    print("2. Check Java version:")
    !java -version
    print("3. Disk space:")
    !df -h

✅ Synthea downloaded
SLF4J: No SLF4J providers were found.
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#noProviders for further details.
Scanned 88 modules and 152 submodules.
Loading submodule modules/allergies/allergy_panel.json
Loading submodule modules/allergies/drug_allergy_incidence.json
Loading submodule modules/allergies/environmental_allergy_incidence.json
Loading submodule modules/allergies/food_allergy_incidence.json
Loading submodule modules/allergies/immunotherapy.json
Loading submodule modules/allergies/outgrow_env_allergies.json
Loading submodule modules/allergies/outgrow_food_allergies.json
Loading submodule modules/allergies/severe_allergic_reaction.json
Loading submodule modules/anemia/anemia_sub.json
Loading submodule modules/breast_cancer/chemotherapy_breast.json
Loading submodule modules/breast_cancer/hormone_diagnosis.json
Loading submodule modules/breast_cancer/hormonetherapy_breast.json
Loading submodul

In [3]:
import json
import pandas as pd



# Load JSON file
with open("/content/output/fhir/Adolfo777_Jaskolski867_d172514a-6703-7f8e-b76d-a9ddc6e49a5b.json", "r") as f:
    data = json.load(f)

# Extract patient entries
patients = [
    entry["resource"] for entry in data["entry"]
    if entry["resource"]["resourceType"] == "Patient"
]

# Transform to a structured list
patient_records = []
for patient in patients:
    patient_records.append({
        "id": patient.get("id"),
        "name": " ".join(patient["name"][0].get("given", [])) + " " + patient["name"][0].get("family", ""),
        "gender": patient.get("gender"),
        "birthDate": patient.get("birthDate"),
        "race": next((ext["extension"][1]["valueString"]
                     for ext in patient.get("extension", [])
                     if "us-core-race" in ext.get("url", "")), None),
        "ethnicity": next((ext["extension"][1]["valueString"]
                          for ext in patient.get("extension", [])
                          if "us-core-ethnicity" in ext.get("url", "")), None),
        "birthSex": next((ext.get("valueCode")
                         for ext in patient.get("extension", [])
                         if "us-core-birthsex" in ext.get("url", "")), None),
        "address": ", ".join(patient.get("address", [{}])[0].get("line", [])),
        "city": patient.get("address", [{}])[0].get("city", ""),
        "state": patient.get("address", [{}])[0].get("state", ""),
        "postalCode": patient.get("address", [{}])[0].get("postalCode", ""),
        "phone": next((tel["value"] for tel in patient.get("telecom", []) if tel["system"] == "phone"), None)
    })

# Convert to DataFrame
df_patients = pd.DataFrame(patient_records)

print(df_patients.head())

                                     id                    name gender  \
0  d172514a-6703-7f8e-b76d-a9ddc6e49a5b  Adolfo777 Jaskolski867   male   

    birthDate   race               ethnicity birthSex  \
0  1981-08-11  White  Not Hispanic or Latino        M   

                     address       city state postalCode         phone  
0  327 Jerde Common Suite 22  Westfield    MA      01086  555-149-9272  


In [17]:
import os
import glob
import json
import pandas as pd

data_path = "/content/output/fhir/*.json"

patients, encounters, procedures, conditions, medications, costs, observations, providers, organizations, payers = [], [], [], [], [], [], [], [], [], []

# Loop through each file
for filepath in glob.glob(data_path):
    with open(filepath, "r") as f:
        data = json.load(f)

    for entry in data["entry"]:
        resource = entry["resource"]
        res_type = resource["resourceType"]

        if res_type == "Patient":
            patients.append({
                "patient_id": resource.get("id"),
                "name": " ".join(resource["name"][0].get("given", [])) + " " + resource["name"][0].get("family", ""),
                "gender": resource.get("gender"),
                "birthDate": resource.get("birthDate"),
                "race": next((ext["extension"][1]["valueString"]
                              for ext in resource.get("extension", [])
                              if "us-core-race" in ext.get("url", "")), None),
                "ethnicity": next((ext["extension"][1]["valueString"]
                                   for ext in resource.get("extension", [])
                                   if "us-core-ethnicity" in ext.get("url", "")), None),
                "birthSex": next((ext.get("valueCode")
                                  for ext in resource.get("extension", [])
                                  if "us-core-birthsex" in ext.get("url", "")), None),
                "address": ", ".join(resource.get("address", [{}])[0].get("line", [])),
                "city": resource.get("address", [{}])[0].get("city", ""),
                "state": resource.get("address", [{}])[0].get("state", ""),
                "postalCode": resource.get("address", [{}])[0].get("postalCode", ""),
                "phone": next((tel["value"] for tel in resource.get("telecom", []) if tel["system"] == "phone"), None)
            })

        elif res_type == "Encounter":
            # Get service provider info
            service_provider_ref = resource.get("serviceProvider", {}).get("reference", "")
            org_id = None
            if "Organization/" in service_provider_ref:
                org_id = service_provider_ref.split("Organization/")[-1]
            elif "?" in service_provider_ref:  # Handle query format
                parts = service_provider_ref.split("|")
                if len(parts) > 1:
                    org_id = parts[-1]

            # Get participant info
            participant_info = {
                "provider_id": None,
                "provider_name": None,
                "provider_npi": None
            }

            participants = resource.get("participant", [])
            if participants:
                # Get first participant (typically the primary provider)
                participant = participants[0]
                individual_ref = participant.get("individual", {}).get("reference", "")
                provider_name = participant.get("individual", {}).get("display", "")

                # Extract provider ID and NPI
                if "Practitioner/" in individual_ref:
                    participant_info["provider_id"] = individual_ref.split("Practitioner/")[-1]
                elif "?" in individual_ref:  # Handle query format
                    parts = individual_ref.split("|")
                    if len(parts) > 1:
                        participant_info["provider_npi"] = parts[-1]
                        participant_info["provider_id"] = parts[-1]  # Use NPI as ID if available

                participant_info["provider_name"] = provider_name


            encounters.append({
                "encounter_id": resource.get("id"),
                "patient_id": resource.get("subject", {}).get("reference", "").split(":")[-1],
                "org_id": org_id,
                "provider_id": participant_info["provider_id"],
                "provider_name": participant_info["provider_name"],
                "provider_npi": participant_info["provider_npi"],
                "status": resource.get("status"),
                "type": resource.get("type", [{}])[0].get("text"),
                "start": resource.get("period", {}).get("start"),
                "end": resource.get("period", {}).get("end"),
                "location": resource.get("location", [{}])[0].get("location", {}).get("display")
            })

        elif res_type == "Procedure":
            procedures.append({
                "procedure_id": resource.get("id"),
                "patient_id": resource.get("subject", {}).get("reference", "").split(":")[-1],
                "code": resource.get("code", {}).get("text"),
                "status": resource.get("status"),
                "performed": resource.get("performedDateTime")
            })

        elif res_type == "Condition":
            conditions.append({
                "condition_id": resource.get("id"),
                "patient_id": resource.get("subject", {}).get("reference", "").split(":")[-1],
                "code": resource.get("code", {}).get("text"),
                "clinicalStatus": resource.get("clinicalStatus", {}).get("text"),
                "onsetDateTime": resource.get("onsetDateTime")
            })

        elif res_type == "MedicationRequest":
            medications.append({
                "med_id": resource.get("id"),
                "patient_id": resource.get("subject", {}).get("reference", "").split(":")[-1],
                "medication": resource.get("medicationCodeableConcept", {}).get("text"),
                "status": resource.get("status"),
                "intent": resource.get("intent"),
                "authoredOn": resource.get("authoredOn")
            })

        elif res_type == "Claim":
            for item in resource.get("item", []):
                costs.append({
                    "claim_id": resource.get("id"),
                    "patient_id": resource.get("patient", {}).get("reference", "").split(":")[-1],
                    "productOrService": item.get("productOrService", {}).get("text"),
                    "amount": item.get("net", {}).get("value"),
                    "currency": item.get("net", {}).get("currency")
                })

        elif res_type == "Observation":
            observations.append({
                "observation_id": resource.get("id"),
                "patient_id": resource.get("subject", {}).get("reference", "").split(":")[-1],
                "code": resource.get("code", {}).get("text"),
                "status": resource.get("status"),
                "value": resource.get("valueQuantity", {}).get("value"),
                "unit": resource.get("valueQuantity", {}).get("unit"),
                "effectiveDateTime": resource.get("effectiveDateTime")
            })

        elif res_type == "Organization":
            organizations.append({
                "organization_id": resource.get("id"),
                "name": resource.get("name"),
                "type": resource.get("type", [{}])[0].get("text", None),
                "address": ", ".join(resource.get("address", [{}])[0].get("line", [])),
                "city": resource.get("address", [{}])[0].get("city", ""),
                "state": resource.get("address", [{}])[0].get("state", ""),
                "postalCode": resource.get("address", [{}])[0].get("postalCode", "")[:5],
                "country": resource.get("address", [{}])[0].get("country", ""),
                "phone": next(
                    (telecom["value"] for telecom in resource.get("telecom", []) if telecom["system"] == "phone"), None
                )
            })



# Convert lists to DataFrames
df_patients = pd.DataFrame(patients)
df_encounters = pd.DataFrame(encounters).merge(df_patients, on="patient_id", how="left")
df_procedures = pd.DataFrame(procedures).merge(df_patients, on="patient_id", how="left")
df_conditions = pd.DataFrame(conditions).merge(df_patients, on="patient_id", how="left")
df_medications = pd.DataFrame(medications).merge(df_patients, on="patient_id", how="left")
df_costs = pd.DataFrame(costs).merge(df_patients, on="patient_id", how="left")
df_observations = pd.DataFrame(observations).merge(df_patients, on="patient_id", how="left")
df_organizations = pd.DataFrame(organizations)
df_payers = pd.DataFrame(payers)



# Merge organization info with encounters
df_encounters = df_encounters.merge(df_organizations, left_on="org_id", right_on="organization_id", how="left", suffixes=('', '_org'))


#merge payers with encounters
# df_encounters = df_encounters.merge(df_payers, on=["patient_id", "claim_id"], how="left"
# )

# # Alternative: Merge by patient_id only (if claim_id isn't reliable)
# df_encounters = df_encounters.merge(
#     df_payers.drop_duplicates("patient_id"),
#     on="patient_id",
#     how="left"
# )



# Preview merged data
print("Patients:", df_patients.shape)
print("Encounters:", df_encounters.shape)
print("Medications:", df_medications.shape)
print("Conditions:", df_conditions.shape)
print("Procedures:", df_procedures.shape)
print("Observations:", df_observations.shape)
print("Costs:", df_costs.shape)
print("Organizations:", df_organizations.shape)

Patients: (128, 12)
Encounters: (8242, 31)
Medications: (8138, 17)
Conditions: (5736, 16)
Procedures: (25270, 16)
Observations: (79664, 18)
Costs: (64153, 16)
Organizations: (330, 9)


In [21]:
df_encounters.columns

Index(['encounter_id', 'patient_id', 'org_id', 'provider_id', 'provider_name',
       'provider_npi', 'status', 'type', 'start', 'end', 'location', 'name',
       'gender', 'birthDate', 'race', 'ethnicity', 'birthSex', 'address',
       'city', 'state', 'postalCode', 'phone', 'organization_id', 'name_org',
       'type_org', 'address_org', 'city_org', 'state_org', 'postalCode_org',
       'country', 'phone_org'],
      dtype='object')

In [23]:
df_organizations

Unnamed: 0,organization_id,name,type,address,city,state,postalCode,country,phone
0,74ab949d-17ac-3309-83a0-13b4405c66aa,Fitchburg Outpatient Clinic,Healthcare Provider,881 Main Street,Fitchburg,MA,01420,US,978-342-9781 Or 978-342-9781
1,17a4bae5-8b64-34d7-8144-b428be027bd0,NURSE ON CALL,Healthcare Provider,"512 MAIN STREET, SUITE 211",SHREWSBURY,MA,01545,US,5088451232
2,4112b8b1-59df-3255-a7ca-f42ee0a4cb2e,CAPE HERITAGE REHABILITATION & HEALTH CARE CENTER,Healthcare Provider,37 ROUTE 6A,SANDWICH,MA,02563,US,5088888222
3,ef5390b4-cba5-3d83-96db-af783a7adb43,MERRIMACK VALLEY HOSPICE INC,Healthcare Provider,360 MERRIMACK ST,MALDEN,MA,02148,US,9785524000
4,77645e49-3f69-3d1d-bb93-dc65210e2fac,SOUTHCOAST HOSPITALS GROUP INC,Healthcare Provider,363 HIGHLAND AVE,FALL RIVER,MA,02720,US,5086793131
...,...,...,...,...,...,...,...,...,...
325,c67ca7ed-4769-3b6c-a0a9-993f1227ff10,CAPE ANN MEDICAL CENTER LLC,Healthcare Provider,1 BLACKBURN DR,GLOUCESTER,MA,01930,US,9782811500
326,ba07b31b-bf14-39b8-be95-4c5ca5062daa,SPAULDING HOSPITAL FOR CONTINUING MED CARE-CAMB,Healthcare Provider,1575 CAMBRIDGE ST,CAMBRIDGE,MA,02138,US,6178764344
327,6aae7a31-90df-3455-ad8d-81f8cf2d21e8,"BROCKTON HOSPITAL, INC.",Healthcare Provider,680 CENTRE ST,BROCKTON,MA,02302,US,5089417000
328,14036728-b721-3e4b-8755-8ee8ab6ed15e,"LONGFELLOW PRIMARY CARE, PC",Healthcare Provider,625 MOUNT AUBURN ST,CAMBRIDGE,MA,02138,US,6174924545


In [24]:
df_payers

In [25]:
import os
import glob
import json
import pandas as pd

data_path = "/content/output/fhir/*.json"

patients, encounters, procedures, conditions, medications, claims, observations, providers, organizations, payers = [], [], [], [], [], [], [], [], [], []

# Loop through each file
for filepath in glob.glob(data_path):
    with open(filepath, "r") as f:
        data = json.load(f)

    for entry in data["entry"]:
        resource = entry["resource"]
        res_type = resource["resourceType"]

        if res_type == "Patient":
            patients.append({
                "patient_id": resource.get("id"),
                "name": " ".join(resource["name"][0].get("given", [])) + " " + resource["name"][0].get("family", ""),
                "gender": resource.get("gender"),
                "birthDate": resource.get("birthDate"),
                "race": next((ext["extension"][1]["valueString"]
                              for ext in resource.get("extension", [])
                              if "us-core-race" in ext.get("url", "")), None),
                "ethnicity": next((ext["extension"][1]["valueString"]
                                   for ext in resource.get("extension", [])
                                   if "us-core-ethnicity" in ext.get("url", "")), None),
                "birthSex": next((ext.get("valueCode")
                                  for ext in resource.get("extension", [])
                                  if "us-core-birthsex" in ext.get("url", "")), None),
                "address": ", ".join(resource.get("address", [{}])[0].get("line", [])),
                "city": resource.get("address", [{}])[0].get("city", ""),
                "state": resource.get("address", [{}])[0].get("state", ""),
                "postalCode": resource.get("address", [{}])[0].get("postalCode", ""),
                "phone": next((tel["value"] for tel in resource.get("telecom", []) if tel["system"] == "phone"), None)
            })

        elif res_type == "Encounter":
            # Get service provider info
            service_provider_ref = resource.get("serviceProvider", {}).get("reference", "")
            org_id = None
            if "Organization/" in service_provider_ref:
                org_id = service_provider_ref.split("Organization/")[-1]
            elif "?" in service_provider_ref:  # Handle query format
                parts = service_provider_ref.split("|")
                if len(parts) > 1:
                    org_id = parts[-1]

            # Get participant info
            participant_info = {
                "provider_id": None,
                "provider_name": None,
                "provider_npi": None
            }

            participants = resource.get("participant", [])
            if participants:
                # Get first participant (typically the primary provider)
                participant = participants[0]
                individual_ref = participant.get("individual", {}).get("reference", "")
                provider_name = participant.get("individual", {}).get("display", "")

                # Extract provider ID and NPI
                if "Practitioner/" in individual_ref:
                    participant_info["provider_id"] = individual_ref.split("Practitioner/")[-1]
                elif "?" in individual_ref:  # Handle query format
                    parts = individual_ref.split("|")
                    if len(parts) > 1:
                        participant_info["provider_npi"] = parts[-1]
                        participant_info["provider_id"] = parts[-1]  # Use NPI as ID if available

                participant_info["provider_name"] = provider_name

            encounters.append({
                "encounter_id": resource.get("id"),
                "patient_id": resource.get("subject", {}).get("reference", "").split(":")[-1],
                "org_id": org_id,
                "provider_id": participant_info["provider_id"],
                "provider_name": participant_info["provider_name"],
                "provider_npi": participant_info["provider_npi"],
                "status": resource.get("status"),
                "type": resource.get("type", [{}])[0].get("text"),
                "start": resource.get("period", {}).get("start"),
                "end": resource.get("period", {}).get("end"),
                "location": resource.get("location", [{}])[0].get("location", {}).get("display")
            })

        elif res_type == "Procedure":
            procedures.append({
                "procedure_id": resource.get("id"),
                "patient_id": resource.get("subject", {}).get("reference", "").split(":")[-1],
                "code": resource.get("code", {}).get("text"),
                "status": resource.get("status"),
                "performed": resource.get("performedDateTime")
            })

        elif res_type == "Condition":
            conditions.append({
                "condition_id": resource.get("id"),
                "patient_id": resource.get("subject", {}).get("reference", "").split(":")[-1],
                "code": resource.get("code", {}).get("text"),
                "clinicalStatus": resource.get("clinicalStatus", {}).get("text"),
                "onsetDateTime": resource.get("onsetDateTime")
            })

        elif res_type == "MedicationRequest":
            medications.append({
                "med_id": resource.get("id"),
                "patient_id": resource.get("subject", {}).get("reference", "").split(":")[-1],
                "medication": resource.get("medicationCodeableConcept", {}).get("text"),
                "status": resource.get("status"),
                "intent": resource.get("intent"),
                "authoredOn": resource.get("authoredOn")
            })

        elif res_type == "Claim":
            for item in resource.get("item", []):
                claims.append({
                    "claim_id": resource.get("id"),
                    "patient_id": resource.get("patient", {}).get("reference", "").split(":")[-1],
                    "productOrService": item.get("productOrService", {}).get("text"),
                    "amount": item.get("net", {}).get("value"),
                    "currency": item.get("net", {}).get("currency")
                })

        elif res_type == "Observation":
            observations.append({
                "observation_id": resource.get("id"),
                "patient_id": resource.get("subject", {}).get("reference", "").split(":")[-1],
                "code": resource.get("code", {}).get("text"),
                "status": resource.get("status"),
                "value": resource.get("valueQuantity", {}).get("value"),
                "unit": resource.get("valueQuantity", {}).get("unit"),
                "effectiveDateTime": resource.get("effectiveDateTime")
            })

        elif res_type == "Organization":
            organizations.append({
                "organization_id": resource.get("id"),
                "name": resource.get("name"),
                "type": resource.get("type", [{}])[0].get("text", None),
                "address": ", ".join(resource.get("address", [{}])[0].get("line", [])),
                "city": resource.get("address", [{}])[0].get("city", ""),
                "state": resource.get("address", [{}])[0].get("state", ""),
                "postalCode": resource.get("address", [{}])[0].get("postalCode", "")[:5],
                "country": resource.get("address", [{}])[0].get("country", ""),
                "phone": next(
                    (telecom["value"] for telecom in resource.get("telecom", []) if telecom["system"] == "phone"), None
                )
            })

        elif res_type == "ExplanationOfBenefit":
            # Extract payer information
            patient_id = resource.get("patient", {}).get("reference", "").split(":")[-1]
            claim_id = resource.get("id")

            # Get insurer (payer) information
            insurer_display = resource.get("insurer", {}).get("display", "Unknown")

            # Get coverage information
            coverage_display = None
            insurance = resource.get("insurance", [])
            if insurance:
                coverage_display = insurance[0].get("coverage", {}).get("display", insurer_display)

            # Get total claim amount
            total_amount = None
            total = resource.get("total", [])
            if total:
                total_amount = total[0].get("amount", {}).get("value")

            payers.append({
                "patient_id": patient_id,
                "claim_id": claim_id,
                "payer_name": insurer_display,
                "coverage_name": coverage_display,
                "claim_total": total_amount,
                "claim_currency": "USD"  # Assuming USD as default
            })

# Convert lists to DataFrames
df_patients = pd.DataFrame(patients)
df_encounters = pd.DataFrame(encounters)
df_procedures = pd.DataFrame(procedures)
df_conditions = pd.DataFrame(conditions)
df_medications = pd.DataFrame(medications)
df_costs = pd.DataFrame(costs)
df_observations = pd.DataFrame(observations)
df_organizations = pd.DataFrame(organizations)
df_payers = pd.DataFrame(payers)

# Merge organization info with encounters
df_encounters = df_encounters.merge(
    df_organizations,
    left_on="org_id",
    right_on="organization_id",
    how="left",
    suffixes=('', '_org')
)

# Merge payer info with encounters
# First try to merge by both patient_id and claim_id (if claim_id is available)
if 'claim_id' in df_encounters.columns:
    df_encounters = df_encounters.merge(
        df_payers,
        on=["patient_id", "claim_id"],
        how="left"
    )
else:
    # Fallback: merge by patient_id only (most recent payer)
    df_recent_payers = df_payers.sort_values("claim_id").drop_duplicates("patient_id", keep="last")
    df_encounters = df_encounters.merge(
        df_recent_payers,
        on="patient_id",
        how="left"
    )

# Preview merged data
print("Patients:", df_patients.shape)
print("Encounters:", df_encounters.shape)
print("Medications:", df_medications.shape)
print("Conditions:", df_conditions.shape)
print("Procedures:", df_procedures.shape)
print("Observations:", df_observations.shape)
print("Costs:", df_costs.shape)
print("Organizations:", df_organizations.shape)
print("Payers:", df_payers.shape)

Patients: (128, 12)
Encounters: (8242, 25)
Medications: (8138, 6)
Conditions: (5736, 5)
Procedures: (25270, 5)
Observations: (79664, 7)
Costs: (64153, 5)
Organizations: (330, 9)
Payers: (16380, 6)


In [32]:
df_encounters.columns

Index(['encounter_id', 'patient_id', 'org_id', 'provider_id', 'provider_name',
       'provider_npi', 'status', 'type', 'start', 'end', 'location',
       'organization_id', 'name', 'type_org', 'address', 'city', 'state',
       'postalCode', 'country', 'phone', 'claim_id', 'payer_name',
       'coverage_name', 'claim_total', 'claim_currency'],
      dtype='object')

In [49]:
df_conditions

Unnamed: 0,condition_id,patient_id,code,clinicalStatus,onsetDateTime,encounter_id,start
0,2af4d2c1-3e15-2daa-1da3-bb75ec2e011d,9f23872b-9e3a-b3d2-1cab-0a3efe3db08b,Received higher education (finding),,1971-09-01T22:06:13+00:00,9b20d68a-2d93-35fd-3cc1-208852447d6a,1971-09-01T21:24:38+00:00
1,2af4d2c1-3e15-2daa-1da3-bb75ec2e011d,9f23872b-9e3a-b3d2-1cab-0a3efe3db08b,Received higher education (finding),,1971-09-01T22:06:13+00:00,347024ae-7b9d-91bf-38a6-fb7a43b9586f,1972-09-06T21:24:38+00:00
2,2af4d2c1-3e15-2daa-1da3-bb75ec2e011d,9f23872b-9e3a-b3d2-1cab-0a3efe3db08b,Received higher education (finding),,1971-09-01T22:06:13+00:00,25ff653d-a982-7781-ba4d-e22cef8d6d2c,1973-02-12T21:24:38+00:00
3,2af4d2c1-3e15-2daa-1da3-bb75ec2e011d,9f23872b-9e3a-b3d2-1cab-0a3efe3db08b,Received higher education (finding),,1971-09-01T22:06:13+00:00,2110d365-6cac-c474-a88b-30270ec16ef3,1975-09-10T21:24:38+00:00
4,2af4d2c1-3e15-2daa-1da3-bb75ec2e011d,9f23872b-9e3a-b3d2-1cab-0a3efe3db08b,Received higher education (finding),,1971-09-01T22:06:13+00:00,3aac4b9d-9f6e-3642-7f01-827717b36c7b,1976-07-21T21:24:38+00:00
...,...,...,...,...,...,...,...
670599,b018d6d4-6190-7c3f-4131-ebaceca7de96,ab7b50f7-e308-53e5-a1c2-2ad66e720595,Victim of intimate partner abuse (finding),,1992-04-28T17:47:40+00:00,4f0a99a0-4087-ee29-3a84-ab2c24e142aa,1991-12-31T12:53:41+00:00
670600,b018d6d4-6190-7c3f-4131-ebaceca7de96,ab7b50f7-e308-53e5-a1c2-2ad66e720595,Victim of intimate partner abuse (finding),,1992-04-28T17:47:40+00:00,f3fb6aaa-bf7b-082f-7b58-ca4b1431e820,1992-04-28T16:53:41+00:00
670601,b018d6d4-6190-7c3f-4131-ebaceca7de96,ab7b50f7-e308-53e5-a1c2-2ad66e720595,Victim of intimate partner abuse (finding),,1992-04-28T17:47:40+00:00,117077fa-892f-d256-cae4-d6b392b24773,1992-09-01T16:53:41+00:00
670602,b018d6d4-6190-7c3f-4131-ebaceca7de96,ab7b50f7-e308-53e5-a1c2-2ad66e720595,Victim of intimate partner abuse (finding),,1992-04-28T17:47:40+00:00,1ace25da-de64-a450-ed65-1c112df6845a,1992-09-08T16:53:41+00:00


In [50]:
df_observations

Unnamed: 0,observation_id,patient_id,code,status,value,unit,effectiveDateTime,encounter_id,start
0,d98b4995-555b-62fb-b563-297d26ec253c,9f23872b-9e3a-b3d2-1cab-0a3efe3db08b,Hemoglobin A1c/Hemoglobin.total in Blood,final,5.91,%,2015-09-16T21:24:38+00:00,9b20d68a-2d93-35fd-3cc1-208852447d6a,1971-09-01T21:24:38+00:00
1,d98b4995-555b-62fb-b563-297d26ec253c,9f23872b-9e3a-b3d2-1cab-0a3efe3db08b,Hemoglobin A1c/Hemoglobin.total in Blood,final,5.91,%,2015-09-16T21:24:38+00:00,347024ae-7b9d-91bf-38a6-fb7a43b9586f,1972-09-06T21:24:38+00:00
2,d98b4995-555b-62fb-b563-297d26ec253c,9f23872b-9e3a-b3d2-1cab-0a3efe3db08b,Hemoglobin A1c/Hemoglobin.total in Blood,final,5.91,%,2015-09-16T21:24:38+00:00,25ff653d-a982-7781-ba4d-e22cef8d6d2c,1973-02-12T21:24:38+00:00
3,d98b4995-555b-62fb-b563-297d26ec253c,9f23872b-9e3a-b3d2-1cab-0a3efe3db08b,Hemoglobin A1c/Hemoglobin.total in Blood,final,5.91,%,2015-09-16T21:24:38+00:00,2110d365-6cac-c474-a88b-30270ec16ef3,1975-09-10T21:24:38+00:00
4,d98b4995-555b-62fb-b563-297d26ec253c,9f23872b-9e3a-b3d2-1cab-0a3efe3db08b,Hemoglobin A1c/Hemoglobin.total in Blood,final,5.91,%,2015-09-16T21:24:38+00:00,3aac4b9d-9f6e-3642-7f01-827717b36c7b,1976-07-21T21:24:38+00:00
...,...,...,...,...,...,...,...,...,...
18405418,9d41aca9-d871-b23b-dd31-823c7b6a8440,ab7b50f7-e308-53e5-a1c2-2ad66e720595,Cause of Death [US Standard Certificate of Death],final,,,1992-09-22T16:53:41+00:00,4f0a99a0-4087-ee29-3a84-ab2c24e142aa,1991-12-31T12:53:41+00:00
18405419,9d41aca9-d871-b23b-dd31-823c7b6a8440,ab7b50f7-e308-53e5-a1c2-2ad66e720595,Cause of Death [US Standard Certificate of Death],final,,,1992-09-22T16:53:41+00:00,f3fb6aaa-bf7b-082f-7b58-ca4b1431e820,1992-04-28T16:53:41+00:00
18405420,9d41aca9-d871-b23b-dd31-823c7b6a8440,ab7b50f7-e308-53e5-a1c2-2ad66e720595,Cause of Death [US Standard Certificate of Death],final,,,1992-09-22T16:53:41+00:00,117077fa-892f-d256-cae4-d6b392b24773,1992-09-01T16:53:41+00:00
18405421,9d41aca9-d871-b23b-dd31-823c7b6a8440,ab7b50f7-e308-53e5-a1c2-2ad66e720595,Cause of Death [US Standard Certificate of Death],final,,,1992-09-22T16:53:41+00:00,1ace25da-de64-a450-ed65-1c112df6845a,1992-09-08T16:53:41+00:00


In [54]:
# prompt: using the costs dataframe, create a total costs per pation column that sums the "amount" for each patient

df_costs['total_costs_per_patient'] = df_costs.groupby('patient_id')['amount'].transform('sum')
df_costs

Unnamed: 0,claim_id,patient_id,productOrService,amount,currency,total_costs_per_patient
0,e664a3b8-fc13-8f93-6e61-f3f6821125f3,9f23872b-9e3a-b3d2-1cab-0a3efe3db08b,General examination of patient (procedure),,,73037.21
1,e664a3b8-fc13-8f93-6e61-f3f6821125f3,9f23872b-9e3a-b3d2-1cab-0a3efe3db08b,Received higher education (finding),,,73037.21
2,e664a3b8-fc13-8f93-6e61-f3f6821125f3,9f23872b-9e3a-b3d2-1cab-0a3efe3db08b,Lack of access to transportation (finding),,,73037.21
3,e28fe5fd-0a21-35a5-feb1-42afa5bc16e5,9f23872b-9e3a-b3d2-1cab-0a3efe3db08b,General examination of patient (procedure),,,73037.21
4,e28fe5fd-0a21-35a5-feb1-42afa5bc16e5,9f23872b-9e3a-b3d2-1cab-0a3efe3db08b,Transport problem (finding),,,73037.21
...,...,...,...,...,...,...
64148,d77ffcca-e7ae-0b04-830f-8325efcaff11,ab7b50f7-e308-53e5-a1c2-2ad66e720595,Hospice care (regime/therapy),431.40,USD,159193.46
64149,d77ffcca-e7ae-0b04-830f-8325efcaff11,ab7b50f7-e308-53e5-a1c2-2ad66e720595,Hospice care (regime/therapy),431.40,USD,159193.46
64150,d77ffcca-e7ae-0b04-830f-8325efcaff11,ab7b50f7-e308-53e5-a1c2-2ad66e720595,Hospice care (regime/therapy),431.40,USD,159193.46
64151,544f0ad7-705e-7c74-c194-14e4f6b2e116,ab7b50f7-e308-53e5-a1c2-2ad66e720595,Death Certification,,,159193.46


In [55]:
df_procedures

Unnamed: 0,procedure_id,patient_id,code,status,performed,encounter_id,start
0,f9a285ec-91e3-e6b6-6732-afb23e40cf92,9f23872b-9e3a-b3d2-1cab-0a3efe3db08b,Assessment of health and social care needs (pr...,completed,,9b20d68a-2d93-35fd-3cc1-208852447d6a,1971-09-01T21:24:38+00:00
1,f9a285ec-91e3-e6b6-6732-afb23e40cf92,9f23872b-9e3a-b3d2-1cab-0a3efe3db08b,Assessment of health and social care needs (pr...,completed,,347024ae-7b9d-91bf-38a6-fb7a43b9586f,1972-09-06T21:24:38+00:00
2,f9a285ec-91e3-e6b6-6732-afb23e40cf92,9f23872b-9e3a-b3d2-1cab-0a3efe3db08b,Assessment of health and social care needs (pr...,completed,,25ff653d-a982-7781-ba4d-e22cef8d6d2c,1973-02-12T21:24:38+00:00
3,f9a285ec-91e3-e6b6-6732-afb23e40cf92,9f23872b-9e3a-b3d2-1cab-0a3efe3db08b,Assessment of health and social care needs (pr...,completed,,2110d365-6cac-c474-a88b-30270ec16ef3,1975-09-10T21:24:38+00:00
4,f9a285ec-91e3-e6b6-6732-afb23e40cf92,9f23872b-9e3a-b3d2-1cab-0a3efe3db08b,Assessment of health and social care needs (pr...,completed,,3aac4b9d-9f6e-3642-7f01-827717b36c7b,1976-07-21T21:24:38+00:00
...,...,...,...,...,...,...,...
3749313,e8552e9c-e0aa-f631-5084-d4ce4a2f19ac,ab7b50f7-e308-53e5-a1c2-2ad66e720595,Hospice care (regime/therapy),completed,,4f0a99a0-4087-ee29-3a84-ab2c24e142aa,1991-12-31T12:53:41+00:00
3749314,e8552e9c-e0aa-f631-5084-d4ce4a2f19ac,ab7b50f7-e308-53e5-a1c2-2ad66e720595,Hospice care (regime/therapy),completed,,f3fb6aaa-bf7b-082f-7b58-ca4b1431e820,1992-04-28T16:53:41+00:00
3749315,e8552e9c-e0aa-f631-5084-d4ce4a2f19ac,ab7b50f7-e308-53e5-a1c2-2ad66e720595,Hospice care (regime/therapy),completed,,117077fa-892f-d256-cae4-d6b392b24773,1992-09-01T16:53:41+00:00
3749316,e8552e9c-e0aa-f631-5084-d4ce4a2f19ac,ab7b50f7-e308-53e5-a1c2-2ad66e720595,Hospice care (regime/therapy),completed,,1ace25da-de64-a450-ed65-1c112df6845a,1992-09-08T16:53:41+00:00


In [59]:
df_payers

Unnamed: 0,patient_id,claim_id,payer_name,coverage_name,claim_total,claim_currency
0,9f23872b-9e3a-b3d2-1cab-0a3efe3db08b,f15bf7aa-e1f9-2f36-d577-103d1fef3727,NO_INSURANCE,NO_INSURANCE,704.20,USD
1,9f23872b-9e3a-b3d2-1cab-0a3efe3db08b,2593af82-3ffc-2d7e-3575-77b88efb06ba,Medicaid,Medicaid,704.20,USD
2,9f23872b-9e3a-b3d2-1cab-0a3efe3db08b,215bd8ef-0bba-05d4-13e5-4bf3042c136b,Medicaid,Medicaid,1005.38,USD
3,9f23872b-9e3a-b3d2-1cab-0a3efe3db08b,91c793df-1863-f997-488e-19e7094814eb,NO_INSURANCE,NO_INSURANCE,778.78,USD
4,9f23872b-9e3a-b3d2-1cab-0a3efe3db08b,f8bf7e60-0f8a-0fe3-3441-7a99166228af,NO_INSURANCE,NO_INSURANCE,652.95,USD
...,...,...,...,...,...,...
16375,ab7b50f7-e308-53e5-a1c2-2ad66e720595,062702ae-3ef3-4284-c3c1-a954e7916af3,Medicare,Medicare,20.63,USD
16376,ab7b50f7-e308-53e5-a1c2-2ad66e720595,2ab4880e-3953-5b0a-7f89-9bd3474b0e5f,Medicare,Medicare,91.72,USD
16377,ab7b50f7-e308-53e5-a1c2-2ad66e720595,c5f5abe2-7852-6e62-35b3-46891efb0c8b,Medicare,Medicare,4100.73,USD
16378,ab7b50f7-e308-53e5-a1c2-2ad66e720595,99c020a0-4d40-1fd5-8b08-201da295551a,Medicare,Medicare,5314.33,USD
