In [1]:
# @title Synthea Patient Generator (Fixed Version)
import os
from IPython.display import clear_output

# Configuration
num_patients = 5  # @param {type:"integer"}
state = "Massachusetts"  # @param ["Massachusetts", "California", "New York", "Texas", "Florida"]
age_range = "30-85"  # @param {type:"string"}
seed = 12345  # @param {type:"integer"}

# Install Java
!sudo apt-get update
!sudo apt-get install -y openjdk-11-jdk-headless
clear_output()
print("✅ Java installed")

# Download Synthea
!wget -q https://github.com/synthetichealth/synthea/releases/download/master-branch-latest/synthea-with-dependencies.jar
clear_output()
print("✅ Synthea downloaded")

# Generate patients (using proper string substitution)
!java -jar synthea-with-dependencies.jar \
  -p {num_patients} \
  -s {seed} \
  -a "{age_range}" \
  --exporter.baseDirectory "./output" \
  --exporter.fhir.export=False \
  --exporter.csv.export=True \
  {state}

# Verify output
if os.path.exists("./output/fhir"):
    print(f"\n🎉 Success! Generated {len(os.listdir('./output/fhir'))} FHIR records")
    !ls -lh "./output/fhir" | head -5
else:
    print("\n❌ Generation failed. Common fixes:")
    print("1. Try reducing patient count (start with 10)")
    print("2. Check Java version:")
    !java -version
    print("3. Disk space:")
    !df -h

✅ Synthea downloaded
SLF4J: No SLF4J providers were found.
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#noProviders for further details.
Scanned 88 modules and 152 submodules.
Loading submodule modules/allergies/allergy_panel.json
Loading submodule modules/allergies/drug_allergy_incidence.json
Loading submodule modules/allergies/environmental_allergy_incidence.json
Loading submodule modules/allergies/food_allergy_incidence.json
Loading submodule modules/allergies/immunotherapy.json
Loading submodule modules/allergies/outgrow_env_allergies.json
Loading submodule modules/allergies/outgrow_food_allergies.json
Loading submodule modules/allergies/severe_allergic_reaction.json
Loading submodule modules/anemia/anemia_sub.json
Loading submodule modules/breast_cancer/chemotherapy_breast.json
Loading submodule modules/breast_cancer/hormone_diagnosis.json
Loading submodule modules/breast_cancer/hormonetherapy_breast.json
Loading submodul

In [4]:
import os
import glob
import json
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, regexp_extract, when, lit, array_join, concat_ws
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, ArrayType


data_path = "/content/output/fhir/*.json"

patients, encounters, procedures, conditions, medications, claims, observations, providers, organizations, payers = [], [], [], [], [], [], [], [], [], []

# Loop through each file
for filepath in glob.glob(data_path):
    with open(filepath, "r") as f:
        data = json.load(f)

    for entry in data["entry"]:
        resource = entry["resource"]
        res_type = resource["resourceType"]

        if res_type == "Patient":
            patients.append({
                "patient_id": resource.get("id"),
                "name": " ".join(resource["name"][0].get("given", [])) + " " + resource["name"][0].get("family", ""),
                "gender": resource.get("gender"),
                "birthDate": resource.get("birthDate"),
                "race": next((ext["extension"][1]["valueString"]
                              for ext in resource.get("extension", [])
                              if "us-core-race" in ext.get("url", "")), None),
                "ethnicity": next((ext["extension"][1]["valueString"]
                                   for ext in resource.get("extension", [])
                                   if "us-core-ethnicity" in ext.get("url", "")), None),
                "birthSex": next((ext.get("valueCode")
                                  for ext in resource.get("extension", [])
                                  if "us-core-birthsex" in ext.get("url", "")), None),
                "address": ", ".join(resource.get("address", [{}])[0].get("line", [])),
                "city": resource.get("address", [{}])[0].get("city", ""),
                "state": resource.get("address", [{}])[0].get("state", ""),
                "postalCode": resource.get("address", [{}])[0].get("postalCode", ""),
                "phone": next((tel["value"] for tel in resource.get("telecom", []) if tel["system"] == "phone"), None)
            })

        elif res_type == "Encounter":
            # Get service provider info
            service_provider_ref = resource.get("serviceProvider", {}).get("reference", "")
            org_id = None
            if "Organization/" in service_provider_ref:
                org_id = service_provider_ref.split("Organization/")[-1]
            elif "?" in service_provider_ref:  # Handle query format
                parts = service_provider_ref.split("|")
                if len(parts) > 1:
                    org_id = parts[-1]

            # Get participant info
            participant_info = {
                "provider_id": None,
                "provider_name": None,
                "provider_npi": None
            }

            participants = resource.get("participant", [])
            if participants:
                # Get first participant (typically the primary provider)
                participant = participants[0]
                individual_ref = participant.get("individual", {}).get("reference", "")
                provider_name = participant.get("individual", {}).get("display", "")

                # Extract provider ID and NPI
                if "Practitioner/" in individual_ref:
                    participant_info["provider_id"] = individual_ref.split("Practitioner/")[-1]
                elif "?" in individual_ref:  # Handle query format
                    parts = individual_ref.split("|")
                    if len(parts) > 1:
                        participant_info["provider_npi"] = parts[-1]
                        participant_info["provider_id"] = parts[-1]  # Use NPI as ID if available

                participant_info["provider_name"] = provider_name

            encounters.append({
                "encounter_id": resource.get("id"),
                "patient_id": resource.get("subject", {}).get("reference", "").split(":")[-1],
                "org_id": org_id,
                "provider_id": participant_info["provider_id"],
                "provider_name": participant_info["provider_name"],
                "provider_npi": participant_info["provider_npi"],
                "status": resource.get("status"),
                "type": resource.get("type", [{}])[0].get("text"),
                "start": resource.get("period", {}).get("start"),
                "end": resource.get("period", {}).get("end"),
                "location": resource.get("location", [{}])[0].get("location", {}).get("display")
            })

        elif res_type == "Procedure":
            procedures.append({
                "procedure_id": resource.get("id"),
                "patient_id": resource.get("subject", {}).get("reference", "").split(":")[-1],
                "procedure_code": resource.get("code", {}).get("text"),
                "status": resource.get("status"),
                "performed": resource.get("performedDateTime")
            })

        elif res_type == "Condition":
            conditions.append({
                "condition_id": resource.get("id"),
                "patient_id": resource.get("subject", {}).get("reference", "").split(":")[-1],
                "condition_code": resource.get("code", {}).get("text"),
                "clinicalStatus": resource.get("clinicalStatus", {}).get("text"),
                "onsetDateTime": resource.get("onsetDateTime")
            })

        elif res_type == "MedicationRequest":
            medications.append({
                "med_id": resource.get("id"),
                "patient_id": resource.get("subject", {}).get("reference", "").split(":")[-1],
                "medication": resource.get("medicationCodeableConcept", {}).get("text"),
                "status": resource.get("status"),
                "intent": resource.get("intent"),
                "authoredOn": resource.get("authoredOn")
            })

        elif res_type == "Claim":
            for item in resource.get("item", []):
                claims.append({
                    "claim_id": resource.get("id"),
                    "patient_id": resource.get("patient", {}).get("reference", "").split(":")[-1],
                    "productOrService": item.get("productOrService", {}).get("text"),
                    "amount": item.get("net", {}).get("value"),
                    "currency": item.get("net", {}).get("currency")
                })

        elif res_type == "Observation":
            observations.append({
                "observation_id": resource.get("id"),
                "patient_id": resource.get("subject", {}).get("reference", "").split(":")[-1],
                "observation_code": resource.get("code", {}).get("text"),
                "status": resource.get("status"),
                "value": resource.get("valueQuantity", {}).get("value"),
                "unit": resource.get("valueQuantity", {}).get("unit"),
                "effectiveDateTime": resource.get("effectiveDateTime")
            })

        elif res_type == "Organization":
            organizations.append({
                "organization_id": resource.get("id"),
                "org_name": resource.get("name"),
                "type": resource.get("type", [{}])[0].get("text", None),
                "address": ", ".join(resource.get("address", [{}])[0].get("line", [])),
                "city": resource.get("address", [{}])[0].get("city", ""),
                "state": resource.get("address", [{}])[0].get("state", ""),
                "postalCode": resource.get("address", [{}])[0].get("postalCode", "")[:5],
                "country": resource.get("address", [{}])[0].get("country", ""),
                "phone": next(
                    (telecom["value"] for telecom in resource.get("telecom", []) if telecom["system"] == "phone"), None
                )
            })

        elif res_type == "ExplanationOfBenefit":
            # Extract payer information
            patient_id = resource.get("patient", {}).get("reference", "").split(":")[-1]
            claim_id = resource.get("id")

            # Get insurer (payer) information
            insurer_display = resource.get("insurer", {}).get("display", "Unknown")

            # Get coverage information
            coverage_display = None
            insurance = resource.get("insurance", [])
            if insurance:
                coverage_display = insurance[0].get("coverage", {}).get("display", insurer_display)

            # Get total claim amount
            total_amount = None
            total = resource.get("total", [])
            if total:
                total_amount = total[0].get("amount", {}).get("value")

            payers.append({
                "patient_id": patient_id,
                "claim_id": claim_id,
                "payer_name": insurer_display,
                "coverage_name": coverage_display,
                "claim_total": total_amount,
                "claim_currency": "USD"  # Assuming USD as default
            })

# Convert lists to DataFrames
df_patients = pd.DataFrame(patients)
df_encounters = pd.DataFrame(encounters)
df_procedures = pd.DataFrame(procedures)
df_conditions = pd.DataFrame(conditions)
df_medications = pd.DataFrame(medications)
df_costs = pd.DataFrame(claims)
df_observations = pd.DataFrame(observations)
df_organizations = pd.DataFrame(organizations)
df_payers = pd.DataFrame(payers)

# Merge organization info with encounters
df_encounters = df_encounters.merge(
    df_organizations,
    left_on="org_id",
    right_on="organization_id",
    how="left",
    suffixes=('', '_org')
)

# Merge payer info with encounters
# First try to merge by both patient_id and claim_id (if claim_id is available)
if 'claim_id' in df_encounters.columns:
    df_encounters = df_encounters.merge(
        df_payers,
        on=["patient_id", "claim_id"],
        how="left"
    )
else:
    # Fallback: merge by patient_id only (most recent payer)
    df_recent_payers = df_payers.sort_values("claim_id").drop_duplicates("patient_id", keep="last")
    df_encounters = df_encounters.merge(
        df_recent_payers,
        on="patient_id",
        how="left"
    )

# Preview merged data
print("Patients:", df_patients.shape)
print("Encounters:", df_encounters.shape)
print("Medications:", df_medications.shape)
print("Conditions:", df_conditions.shape)
print("Procedures:", df_procedures.shape)
print("Observations:", df_observations.shape)
print("Costs:", df_costs.shape)
print("Organizations:", df_organizations.shape)
print("Payers:", df_payers.shape)

Patients: (7, 12)
Encounters: (286, 25)
Medications: (263, 6)
Conditions: (280, 5)
Procedures: (989, 5)
Observations: (2601, 7)
Costs: (2722, 5)
Organizations: (27, 9)
Payers: (549, 6)


In [17]:
df_procedures

Unnamed: 0,procedure_id,patient_id,procedure_code,status,performed
0,551eda1f-95b0-32ba-e093-84cb78314dd7,96665272-a2d8-a8ee-401f-1c25debfece7,Electroencephalogram (procedure),completed,
1,c6dc2c55-91b4-3e19-72e8-1b4040cbc1ed,96665272-a2d8-a8ee-401f-1c25debfece7,Medication reconciliation (procedure),completed,
2,ec4b4808-4a5f-7e0d-2bf2-cfd5044bd38b,96665272-a2d8-a8ee-401f-1c25debfece7,Assessment of health and social care needs (pr...,completed,
3,29d16a2b-b244-95ac-55ec-c7b462b8a0c5,96665272-a2d8-a8ee-401f-1c25debfece7,Medication reconciliation (procedure),completed,
4,c785b5e8-9e86-fd59-91bf-4c824c5e228e,96665272-a2d8-a8ee-401f-1c25debfece7,Sleep apnea assessment (procedure),completed,
...,...,...,...,...,...
984,77c1f573-defe-57e9-812e-34d65b147965,9df4460a-2f66-2d07-de9e-0afaf84bb157,Depression screening (procedure),completed,
985,7ac98171-e40e-ce64-8a55-509b32df5396,9df4460a-2f66-2d07-de9e-0afaf84bb157,Depression screening (procedure),completed,
986,977b14ec-8988-2a9e-9fb6-04077b91863a,9df4460a-2f66-2d07-de9e-0afaf84bb157,Assessment of substance use (procedure),completed,
987,c9ecb67e-8add-79af-464d-708fd33125c4,9df4460a-2f66-2d07-de9e-0afaf84bb157,Screening for drug abuse (procedure),completed,


In [21]:
#creating df above into spark dataframes...
spark = SparkSession.builder.appName("PandasToSpark").getOrCreate()

# Convert to Spark DataFrame
spark_df_encounters = spark.createDataFrame(df_encounters)
spark_df_patients = spark.createDataFrame(df_patients)

procedure_schema = StructType([
    StructField("procedure_id", StringType(), True),
    StructField("patient_id", StringType(), True),
    StructField("procedure_code", StringType(), True),
    StructField("status", StringType(), True),
    StructField("performed", StringType(), True) # Assuming 'performed' might have mixed types as well
])

spark_df_procedures = spark.createDataFrame(df_procedures,schema=procedure_schema)

condition_schema = StructType([
    StructField("condition_id", StringType(), True),
    StructField("patient_id", StringType(), True),
    StructField("condition_code", StringType(), True),
    StructField("clinicalStatus", StringType(), True),
    StructField("onsetDateTime", StringType(), True)
])
spark_df_conditions = spark.createDataFrame(df_conditions,schema=condition_schema)
spark_df_medications = spark.createDataFrame(df_medications)
spark_df_costs = spark.createDataFrame(df_costs)
spark_df_observations = spark.createDataFrame(df_observations)
spark_df_organizations = spark.createDataFrame(df_organizations)
spark_df_payers = spark.createDataFrame(df_payers)


In [8]:
#getting predicted cost based on org, insurance and procedure code

# Merge all relevant data
df_analysis = df_encounters.merge(
    df_patients[['patient_id', 'birthDate', 'gender', 'race', 'ethnicity']],
    on='patient_id',
    how='left'
)

# Merge with procedures
df_analysis = df_analysis.merge(
    df_procedures[['patient_id', 'procedure_code', 'status']],
    on=['patient_id'],
    how='left',
    suffixes=('', '_proc')
)

# Merge with conditions
df_analysis = df_analysis.merge(
    df_conditions[['patient_id', 'condition_code', 'clinicalStatus']],
    on='patient_id',
    how='left',
    suffixes=('', '_cond')
)

# Calculate patient age
df_analysis['birthDate'] = pd.to_datetime(df_analysis['birthDate'], utc=True) # make birthDate timezone aware
df_analysis['encounter_date'] = pd.to_datetime(df_analysis['start'], utc=True) # make encounter_date timezone aware
df_analysis['age'] = (df_analysis['encounter_date'] - df_analysis['birthDate']).dt.days / 365.25


# Clean cost data (assuming claim_total is our target variable)
df_analysis['claim_total'] = pd.to_numeric(df_analysis['claim_total'], errors='coerce')
df_analysis = df_analysis.dropna(subset=['claim_total'])

# Select relevant features
features = [
    'patient_id','age', 'gender', 'race', 'ethnicity',
    'type', 'provider_name', 'org_name',  # encounter type and hospital
    'payer_name', 'coverage_name',        # insurance info
    'procedure_code',                          # procedure code
    'condition_code',                          # condition code
    'claim_total'                         # target variable
]

df_model = df_analysis[features].copy()

In [None]:

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Preprocess categorical variables
categorical_features = ['gender', 'race', 'ethnicity', 'type', 'org_name',
                       'payer_name', 'coverage_name', 'procedure_code', 'condition_code']
numeric_features = ['age']

# Create transformers
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Split data
X = df_model.drop('claim_total', axis=1)
y = df_model['claim_total']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Train model
model.fit(X_train, y_train)

In [None]:
from sklearn.ensemble import RandomForestRegressor

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

model.fit(X_train, y_train)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42))
])

model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Make predictions
y_pred = model.predict(X_test)

# Evaluate
print(f"MAE: {mean_absolute_error(y_test, y_pred):.2f}")
print(f"MSE: {mean_squared_error(y_test, y_pred):.2f}")
print(f"R2 Score: {r2_score(y_test, y_pred):.2f}")

MAE: 0.00
MSE: 0.00
R2 Score: 1.00


In [None]:
# Create example data for prediction
example_data = pd.DataFrame({
    'age': [45],
    'gender': ['male'],
    'race': ['white'],
    'ethnicity': ['non-hispanic'],
    'type': ['emergency'],
    'org_name': ['General Hospital'],
    'payer_name': ['Aetna'],
    'coverage_name': ['Aetna'],
    'procedure_code': ['Dental care'],
    'condition_code': ['Fracture']
})

# Predict cost
predicted_cost = model.predict(example_data)
print(f"Predicted cost: ${predicted_cost[0]:.2f}")

Predicted cost: $413.71


In [14]:
# prompt: make df_model a spark dataframe

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("PandasToSpark").getOrCreate()
df_model = spark.createDataFrame(df_model)


In [15]:
from pyspark.sql.functions import mean

# Group by hospital, insurance, and procedure code → compute average claim cost
cost_by_hospital_insurance = df_model.groupBy(
    "org_name",
    "payer_name",
    "procedure_code"
).agg(
    mean("claim_total").alias("avg_claim_total")  # Rename the aggregated column
)

# Show the results (similar to .head() in Pandas)
cost_by_hospital_insurance.show(5)

cost_by_hospital_insurance_pd = cost_by_hospital_insurance.toPandas()
cost_by_hospital_insurance_pd.head()
cost_by_hospital_insurance.orderBy("avg_claim_total", ascending=False).show(5)

+--------------------+----------+--------------------+------------------+
|            org_name|payer_name|      procedure_code|   avg_claim_total|
+--------------------+----------+--------------------+------------------+
|Fitchburg Outpati...|  Medicare|Removal of suprag...| 36.88317579250772|
|EXCEL HOME CARE S...|  Medicare|Medication reconc...|0.9099999999999742|
|EXCEL HOME CARE S...|  Medicare|Postoperative car...|0.9100000000000066|
|URGENT CARE SPECI...|  Medicare|Discussion about ...|0.9099999999999975|
|ENCOMPASS HEALTH ...|  Medicare|Restoration of to...|0.9100000000000221|
+--------------------+----------+--------------------+------------------+
only showing top 5 rows

+--------------------+-------------+--------------------+-----------------+
|            org_name|   payer_name|      procedure_code|  avg_claim_total|
+--------------------+-------------+--------------------+-----------------+
|BERKSHIRE MEDICAL...|Dual Eligible|Depression screen...|7932.570000000273|
|BERK

In [None]:
#most expensive procedures
expensive_procedures = df_model.groupby(['procedure_code', 'payer_name'])['claim_total'].mean().sort_values(ascending=False)

In [None]:
expensive_procedures.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,claim_total
procedure_code,payer_name,Unnamed: 2_level_1
Dental care (regime/therapy),NO_INSURANCE,934.74
Dental consultation and report (procedure),NO_INSURANCE,934.74
Oral health education (procedure),NO_INSURANCE,934.74
Patient referral for dental care (procedure),NO_INSURANCE,934.74
Removal of subgingival plaque and calculus from all teeth using dental instrument (procedure),NO_INSURANCE,934.74
