In [1]:
import os
import pandas as pd
import seaborn as sns

In [None]:
# !pip install pyspark

In [2]:
# standard libraries
import pandas as pd
import matplotlib.pyplot as plt

# PySpark libraries
from pyspark.sql import SparkSession
import pyspark.sql.functions as sql_f
from pyspark.sql.types import *
from pyspark.sql.functions import to_date, datediff, floor, col, avg, substring

# DL/ML libraries
from sklearn.metrics import accuracy_score
spark = SparkSession.builder.getOrCreate()


In [3]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import unix_timestamp

for working in GCP

## 1) Loading the data

In [None]:
# @title Synthea Patient Generator (CSV Version)
import os
from IPython.display import clear_output

# Configuration
num_patients = 10  # @param {type:"integer"}
state = "Massachusetts"  # @param ["Massachusetts", "California", "New York", "Texas", "Florida"]
age_range = "30-85"  # @param {type:"string"}
seed = 12345  # @param {type:"integer"}

# Install Java
!sudo apt-get update
!sudo apt-get install -y openjdk-11-jdk-headless
clear_output()
print("Java installed!!!")

# Download Synthea
!wget -q https://github.com/synthetichealth/synthea/releases/download/master-branch-latest/synthea-with-dependencies.jar
clear_output()
print("Synthea downloaded!!!")

# Generate patients (using proper string substitution)
!java -jar synthea-with-dependencies.jar \
  -p {num_patients} \
  -s {seed} \
  -a "{age_range}" \
  --exporter.baseDirectory "./output" \
  --exporter.fhir.export=False \
  --exporter.csv.export=True \
  {state}

# Verify output
csv_output_path = "./output/csv"
if os.path.exists(csv_output_path):
    csv_files = [f for f in os.listdir(csv_output_path) if f.endswith('.csv')]
    if csv_files:
        print(f"\nSuccess! Generated {len(csv_files)} CSV files:")
        for file in csv_files[:5]:  # Show first 5 files
            print(f"- {file}")
        print(f"\nTotal records across all CSV files: {num_patients} patients")
    else:
        print("\n⚠ CSV directory exists but contains no CSV files")
else:
    print("\nGeneration failed. Common fixes:")
    print("1. Try reducing patient count (start with 10)")
    print("2. Check Java version:")
    !java -version
    print("3. Disk space:")
    !df -h
    

In [4]:
# @title Complete Synthea Batch Generator with Concatenation
import os
import pandas as pd
from IPython.display import clear_output
import time
import glob

# ===== Configuration =====
total_patients = 100  # @param {type:"integer"}
batch_size = 10  # @param {type:"integer"}
state = "Massachusetts"  # @param ["Massachusetts", "California", "New York", "Texas", "Florida"]
age_range = "30-85"  # @param {type:"string"}
base_seed = 12345  # @param {type:"integer"}
output_dir = "./output"  # @param {type:"string"}

# ===== Initial Setup =====
def setup_environment():
    """Install Java and download Synthea if not already present"""
    if not os.path.exists("/usr/bin/java"):
        print("Installing Java...")
        !sudo apt-get update -qq > /dev/null
        !sudo apt-get install -y openjdk-11-jdk-headless > /dev/null
        clear_output()
        print("✓ Java installed")
    
    if not os.path.exists("synthea-with-dependencies.jar"):
        print("Downloading Synthea...")
        !wget -q https://github.com/synthetichealth/synthea/releases/download/master-branch-latest/synthea-with-dependencies.jar
        clear_output()
        print("✓ Synthea downloaded")
    
    os.makedirs(output_dir, exist_ok=True)

# ===== Batch Generation =====
def generate_batch(batch_num, patients_in_batch, current_seed):
    """Generate one batch of synthetic patients"""
    try:
        print(f"Batch {batch_num}: Generating {patients_in_batch} patients (seed: {current_seed})")
        
        !java -jar synthea-with-dependencies.jar \
          -p {patients_in_batch} \
          -s {current_seed} \
          -a "{age_range}" \
          --exporter.baseDirectory "{output_dir}" \
          --exporter.fhir.export=False \
          --exporter.csv.export=True \
          --exporter.csv.folder_per_run=true \
          {state}
        
        return True
    except Exception as e:
        print(f"Error in batch {batch_num}: {str(e)}")
        return False

# ===== File Concatenation =====
def concatenate_all_csvs():
    """Combine all generated CSV files by their type"""
    csv_files = glob.glob(f"{output_dir}/**/*.csv", recursive=True)
    
    if not csv_files:
        print("No CSV files found to concatenate")
        return None
    
    # Dictionary to hold combined DataFrames
    combined_data = {}
    
    for filepath in csv_files:
        filename = os.path.basename(filepath)
        file_type = filename.split('.')[0]  # 'patients', 'encounters', etc.
        
        try:
            # Read current CSV
            df = pd.read_csv(filepath)
            
            # Combine with existing data for this type
            if file_type in combined_data:
                combined_data[file_type] = pd.concat([combined_data[file_type], df], ignore_index=True)
            else:
                combined_data[file_type] = df
                
        except Exception as e:
            print(f"⚠ Could not process {filename}: {str(e)}")
    
    # Save combined files
    combined_dir = f"{output_dir}/combined"
    os.makedirs(combined_dir, exist_ok=True)
    
    for file_type, df in combined_data.items():
        output_path = f"{combined_dir}/{file_type}.csv"
        df.to_csv(output_path, index=False)
        print(f"✓ Saved {len(df)} records to {output_path}")
    
    return combined_data

# ===== Main Execution =====
if __name__ == "__main__":
    # Setup environment
    setup_environment()
    
    # Batch processing
    start_time = time.time()
    completed = 0
    batch_num = 1
    
    print(f"Starting generation of {total_patients} patients in batches of {batch_size}...")
    
    while completed < total_patients:
        current_batch_size = min(batch_size, total_patients - completed)
        current_seed = base_seed + completed
        
        # Retry logic (3 attempts per batch)
        success = False
        for attempt in range(3):
            if generate_batch(batch_num, current_batch_size, current_seed):
                success = True
                break
            time.sleep(5)  # Wait before retry
        
        if success:
            completed += current_batch_size
            progress = completed / total_patients * 100
            print(f"Progress: {completed}/{total_patients} ({progress:.1f}%)")
            batch_num += 1
        else:
            print(f"❌ Failed batch {batch_num} after 3 attempts")
            break
    
    # Concatenate results
    print("\nCombining all CSV files...")
    combined_data = concatenate_all_csvs()
    
    # Final report
    elapsed = (time.time() - start_time) / 60
    print(f"\n{'='*40}")
    print(f"COMPLETED IN {elapsed:.1f} MINUTES")
    print(f"Total patients generated: {completed}/{total_patients}")
    
    if combined_data:
        print("\nCOMBINED FILES SUMMARY:")
        for file_type, df in combined_data.items():
            print(f"- {file_type}.csv: {len(df)} records")
    else:
        print("\nNo files were combined")

Starting generation of 100 patients in batches of 10...
Batch 1: Generating 10 patients (seed: 12345)
SLF4J: No SLF4J providers were found.
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#noProviders for further details.
Scanned 88 modules and 152 submodules.
Loading submodule modules/allergies/allergy_panel.json
Loading submodule modules/allergies/drug_allergy_incidence.json
Loading submodule modules/allergies/environmental_allergy_incidence.json
Loading submodule modules/allergies/food_allergy_incidence.json
Loading submodule modules/allergies/immunotherapy.json
Loading submodule modules/allergies/outgrow_env_allergies.json
Loading submodule modules/allergies/outgrow_food_allergies.json
Loading submodule modules/allergies/severe_allergic_reaction.json
Loading submodule modules/anemia/anemia_sub.json
Loading submodule modules/breast_cancer/chemotherapy_breast.json
Loading submodule modules/breast_cancer/hormone_diagnosis.json
Lo

Loading Lookup Table: ace_arb_sacubitril_valsartan_product_distribution.csv
Loading Lookup Table: ace_arb_telmisartan_product_distribution.csv
Loading Lookup Table: ace_arb_valsartan_product_distribution.csv
Loading submodule modules/medications/beta_blocker.json
Loading Lookup Table: beta_blocker_ingredient_distribution.csv
Loading Lookup Table: beta_blocker_atenolol_product_distribution.csv
Loading Lookup Table: beta_blocker_bisoprolol_product_distribution.csv
Loading Lookup Table: beta_blocker_bisoprolol_hydrochlorothiazide_product_distribution.csv
Loading Lookup Table: beta_blocker_carvedilol_product_distribution.csv
Loading Lookup Table: beta_blocker_labetalol_product_distribution.csv
Loading Lookup Table: beta_blocker_metoprolol_product_distribution.csv
Loading Lookup Table: beta_blocker_nebivolol_product_distribution.csv
Loading Lookup Table: beta_blocker_propranolol_product_distribution.csv
Loading submodule modules/medications/ear_infection_antibiotic.json
Loading submodule mo

2 -- Rasheeda241 Mirian768 Dach178 (35 y/o F) Boston, Massachusetts  (50927)
1 -- Lorenzo669 Urrutia540 (67 y/o M) Chicopee, Massachusetts  (145814)
3 -- Jacque955 Jin479 Bailey598 (62 y/o F) Shrewsbury, Massachusetts DECEASED (99326)
3 -- Berneice173 Willia886 Rowe323 (68 y/o F) Shrewsbury, Massachusetts  (100084)
4 -- Keven605 Daugherty69 (50 y/o M) Dalton, Massachusetts  (73435)
6 -- Darrell400 Muller251 (50 y/o M) Lowell, Massachusetts  (69104)
7 -- Claudia969 Escobedo608 (35 y/o F) Lynn, Massachusetts  (51109)
5 -- Angela104 Funk324 (81 y/o F) South Yarmouth, Massachusetts  (118831)
9 -- Alfred550 West559 (31 y/o M) Belmont, Massachusetts  (46408)
8 -- Luther918 MacGyver246 (47 y/o M) Boston, Massachusetts DECEASED (66110)
10 -- Frederick289 Breitenberg711 (55 y/o M) Whitman, Massachusetts DECEASED (83355)
8 -- Donn979 Hackett68 (81 y/o M) Boston, Massachusetts  (114778)
10 -- Michale231 Lueilwitz711 (47 y/o M) Whitman, Massachusetts DECEASED (67513)
10 -- Genaro214 Murphy561 (73 

Loading submodule modules/hiv/hiv_baseline.json
Loading submodule modules/hiv/hiv_cd4.json
Loading Lookup Table: hiv_stage.csv
Loading submodule modules/hiv/hiv_oi_prophylaxis.json
Loading submodule modules/hiv/hiv_screening.json
Loading submodule modules/hiv/hiv_viral_load.json
Loading submodule modules/hiv/stop_all_art_meds.json
Loading submodule modules/injuries/broken_jaw.json
Loading submodule modules/lung_cancer/lung_cancer_probabilities.json
Progress: 20/100 (20.0%)
Batch 3: Generating 10 patients (seed: 12365)
SLF4J: No SLF4J providers were found.
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#noProviders for further details.
Scanned 88 modules and 152 submodules.
Loading submodule modules/allergies/allergy_panel.json
Loading submodule modules/allergies/drug_allergy_incidence.json
Loading submodule modules/allergies/environmental_allergy_incidence.json
Loading submodule modules/allergies/food_allergy_incidence.json
Loadi

Loading Lookup Table: ace_arb_hydrochlorothiazide_lisinopril_product_distribution.csv
Loading Lookup Table: ace_arb_hydrochlorothiazide_losartan_product_distribution.csv
Loading Lookup Table: ace_arb_hydrochlorothiazide_valsartan_product_distribution.csv
Loading Lookup Table: ace_arb_irbesartan_product_distribution.csv
Loading Lookup Table: ace_arb_lisinopril_product_distribution.csv
Loading Lookup Table: ace_arb_losartan_product_distribution.csv
Loading Lookup Table: ace_arb_quinapril_product_distribution.csv
Loading Lookup Table: ace_arb_ramipril_product_distribution.csv
Loading Lookup Table: ace_arb_sacubitril_valsartan_product_distribution.csv
Loading Lookup Table: ace_arb_telmisartan_product_distribution.csv
Loading Lookup Table: ace_arb_valsartan_product_distribution.csv
Loading submodule modules/medications/beta_blocker.json
Loading Lookup Table: beta_blocker_ingredient_distribution.csv
Loading Lookup Table: beta_blocker_atenolol_product_distribution.csv
Loading Lookup Table: be

Loading module modules/vhd_aortic.json
Loading Lookup Table: vhd_as.csv
Loading Lookup Table: vhd_ar.csv
Loading module modules/vhd_mitral.json
Loading Lookup Table: vhd_mr.csv
Loading Lookup Table: vhd_ms.csv
Loading module modules/vhd_pulmonic.json
Loading Lookup Table: vhd_ps.csv
Loading Lookup Table: vhd_pr.csv
Loading module modules/vhd_tricuspid.json
Loading Lookup Table: vhd_tr.csv
Loading Lookup Table: vhd_ts.csv
Loading module modules/wellness_encounters.json
Running with options:
Population: 10
Seed: 12365
Provider Seed:1746371683018
Reference Time: 1746371683018
Location: Massachusetts
Min Age: 30
Max Age: 85
2 -- Felix524 Heller342 (57 y/o M) Abington, Massachusetts DECEASED (85708)
Progress: 30/100 (30.0%)
Batch 4: Generating 10 patients (seed: 12375)
SLF4J: No SLF4J providers were found.
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#noProviders for further details.
Scanned 88 modules and 152 submodules.
Loading su

Loading submodule modules/medications/ace_arb.json
Loading Lookup Table: ace_arb_ingredient_distribution.csv
Loading Lookup Table: ace_arb_amlodipine_benazepril_product_distribution.csv
Loading Lookup Table: ace_arb_benazepril_product_distribution.csv
Loading Lookup Table: ace_arb_benazepril_hydrochlorothiazide_product_distribution.csv
Loading Lookup Table: ace_arb_enalapril_product_distribution.csv
Loading Lookup Table: ace_arb_hydrochlorothiazide_lisinopril_product_distribution.csv
Loading Lookup Table: ace_arb_hydrochlorothiazide_losartan_product_distribution.csv
Loading Lookup Table: ace_arb_hydrochlorothiazide_valsartan_product_distribution.csv
Loading Lookup Table: ace_arb_irbesartan_product_distribution.csv
Loading Lookup Table: ace_arb_lisinopril_product_distribution.csv
Loading Lookup Table: ace_arb_losartan_product_distribution.csv
Loading Lookup Table: ace_arb_quinapril_product_distribution.csv
Loading Lookup Table: ace_arb_ramipril_product_distribution.csv
Loading Lookup Ta

Loading module modules/veteran_hyperlipidemia.json
Loading module modules/veteran_lung_cancer.json
Loading module modules/veteran_mdd.json
Loading module modules/veteran_prostate_cancer.json
Loading module modules/veteran_ptsd.json
Loading module modules/veteran_self_harm.json
Loading module modules/veteran_substance_abuse_conditions.json
Loading module modules/veteran_substance_abuse_treatment.json
Loading module modules/vhd_aortic.json
Loading Lookup Table: vhd_as.csv
Loading Lookup Table: vhd_ar.csv
Loading module modules/vhd_mitral.json
Loading Lookup Table: vhd_mr.csv
Loading Lookup Table: vhd_ms.csv
Loading module modules/vhd_pulmonic.json
Loading Lookup Table: vhd_ps.csv
Loading Lookup Table: vhd_pr.csv
Loading module modules/vhd_tricuspid.json
Loading Lookup Table: vhd_tr.csv
Loading Lookup Table: vhd_ts.csv
Loading module modules/wellness_encounters.json
Running with options:
Population: 10
Seed: 12375
Provider Seed:1746371709452
Reference Time: 1746371709452
Location: Massach

Loading submodule modules/heart/operative_status.json
Loading submodule modules/heart/or_blood.json
Loading Lookup Table: or_blood_anemia_check.csv
Loading Lookup Table: or_blood_platelet_check.csv
Loading Lookup Table: or_blood_plasma_check.csv
Loading submodule modules/heart/savrepair/operative_status.json
Loading submodule modules/heart/savreplace/operative_status.json
Loading submodule modules/heart/stemi_fibrinolytic.json
Loading submodule modules/heart/stemi_pathway.json
Loading submodule modules/heart/tavr/alt_access.json
Loading submodule modules/heart/tavr/operation.json
Loading submodule modules/heart/tavr/operative_status.json
Loading submodule modules/heart/tavr/outcomes.json
Loading submodule modules/heart/tavr/postop.json
Loading submodule modules/heart/vhd_risks.json
Loading submodule modules/hiv/art_sequence.json
Loading submodule modules/hiv/art_sequence_1987_1994.json
Loading submodule modules/hiv/art_sequence_1995_1996.json
Loading submodule modules/hiv/art_sequence_

Loading Lookup Table: hiv_diagnosis_later.csv
Loading module modules/home_health_treatment.json
Loading module modules/home_hospice_snf.json
Loading module modules/homelessness.json
Loading module modules/hospice_treatment.json
Loading module modules/hypertension.json
Loading module modules/hypothyroidism.json
Loading module modules/injuries.json
Loading module modules/kidney_transplant.json
Loading module modules/lung_cancer.json
Loading module modules/lupus.json
Loading module modules/mTBI.json
Loading module modules/med_rec.json
Loading module modules/mend_program.json
Loading module modules/metabolic_syndrome_care.json
Loading module modules/metabolic_syndrome_disease.json
Loading module modules/myocardial_infarction.json
Loading module modules/opioid_addiction.json
Loading module modules/osteoarthritis.json
Loading module modules/osteoporosis.json
Loading module modules/pregnancy.json
Loading module modules/prescribing_opioids_for_chronic_pain_and_treatment_of_oud.json
Loading mod

Loading submodule modules/heart/avrr/sequence.json
Loading submodule modules/heart/cabg/cabg_referral.json
Loading submodule modules/heart/cabg/details.json
Loading Lookup Table: cabg_details_operative_approach.csv
Loading Lookup Table: cabg_details_num_grafts.csv
Loading Lookup Table: cabg_details_num_art_cond.csv
Loading submodule modules/heart/cabg/icu_meds_devices.json
Loading submodule modules/heart/cabg/labs_common.json
Loading submodule modules/heart/cabg/operation.json
Loading submodule modules/heart/cabg/or_intraop.json
Loading submodule modules/heart/cabg/or_labs_meds.json
Loading submodule modules/heart/cabg/outcomes.json
Loading submodule modules/heart/cabg/postop.json
Loading submodule modules/heart/cabg/postop_blood.json
Loading submodule modules/heart/cabg/preoperative.json
Loading submodule modules/heart/cabg_sequence.json
Loading submodule modules/heart/cardiac_labs.json
Loading submodule modules/heart/chf_lab_work.json
Loading submodule modules/heart/chf_lvad.json
Loa

Loading module modules/bone_marrow_transplant.json
Loading module modules/breast_cancer.json
Loading module modules/bronchitis.json
Loading module modules/cerebral_palsy.json
Loading module modules/chronic_kidney_disease.json
Loading module modules/colorectal_cancer.json
Loading module modules/congestive_heart_failure.json
Loading module modules/contraceptive_maintenance.json
Loading module modules/contraceptives.json
Loading module modules/copd.json
Loading module modules/covid19.json
Loading Lookup Table: covid19_prob.csv
Loading module modules/cystic_fibrosis.json
Loading module modules/dementia.json
Loading module modules/dental_and_oral_examination.json
Loading module modules/dentures.json
Loading module modules/dermatitis.json
Loading module modules/dialysis.json
Loading module modules/ear_infections.json
Loading module modules/epilepsy.json
Loading module modules/female_reproduction.json
Loading module modules/fibromyalgia.json
Loading module modules/food_allergies.json
Loading 

Loading submodule modules/dermatitis/mid_severe_eczema_obs.json
Loading submodule modules/dermatitis/moderate_cd_obs.json
Loading submodule modules/dermatitis/severe_cd_obs.json
Loading submodule modules/dme/wheelchair.json
Loading submodule modules/dme/wheelchair_end.json
Loading submodule modules/encounter/anxiety_screening.json
Loading submodule modules/encounter/depression_screening.json
Loading submodule modules/encounter/fall_risk_screening.json
Loading submodule modules/encounter/hark_screening.json
Loading submodule modules/encounter/hospital_basic_labs.json
Loading submodule modules/encounter/sdoh_hrsn.json
Loading submodule modules/encounter/substance_use_screening.json
Loading submodule modules/encounter/vitals.json
Loading submodule modules/heart/acs_anticoagulant.json
Loading submodule modules/heart/acs_antiplatelet.json
Loading submodule modules/heart/acs_arrival_medications.json
Loading submodule modules/heart/acs_discharge_meds.json
Loading submodule modules/heart/avrr/

Loading submodule modules/metabolic_syndrome/medications.json
Loading submodule modules/snf/skilled_nursing_facility.json
Loading submodule modules/surgery/general_anesthesia.json
Loading submodule modules/total_joint_replacement/functional_status_assessments.json
Loading submodule modules/uti/abx_tx.json
Loading submodule modules/uti/ambulatory_eval.json
Loading submodule modules/uti/ambulatory_path.json
Loading submodule modules/uti/ed_bundle.json
Loading submodule modules/uti/ed_eval.json
Loading submodule modules/uti/ed_path.json
Loading submodule modules/uti/gu_pregnancy_check.json
Loading submodule modules/uti/hpi.json
Loading submodule modules/uti/lab_follow_up.json
Loading submodule modules/uti/labs.json
Loading submodule modules/uti/telemed_path.json
Loading submodule modules/veterans/veteran_suicide_probabilities.json
Loading submodule modules/weight_loss/mend_week.json
Loading module modules/acute_myeloid_leukemia.json
Loading Lookup Table: AML.csv
Loading module modules/all

1 -- Selena146 Fanny438 Blanda868 (62 y/o F) North Attleborough, Massachusetts  (93226)
2 -- Alejandra902 Andrea7 Leal292 (78 y/o F) Uxbridge, Massachusetts  (111849)
4 -- Tierra831 Brown30 (36 y/o F) Weymouth, Massachusetts  (52347)
3 -- Ozie87 Ozie87 Homenick806 (41 y/o F) Leominster, Massachusetts  (59906)
6 -- Dong972 Douglas31 (66 y/o M) Weymouth, Massachusetts  (95347)
5 -- Eusebia552 Kirlin939 (45 y/o F) Worcester, Massachusetts DECEASED (101229)
7 -- Mac103 Thiel172 (40 y/o M) Westford, Massachusetts  (56065)
5 -- Alfreda3 Shaunna800 Veum823 (46 y/o F) Worcester, Massachusetts DECEASED (69802)
8 -- Demetrice140 Rebecca981 Bergnaum523 (76 y/o F) Melrose, Massachusetts  (115638)
5 -- Chasidy481 Hwa978 Wiza601 (61 y/o F) Worcester, Massachusetts  (97550)
9 -- Kerrie266 Mills423 (51 y/o F) Worcester, Massachusetts  (75891)
10 -- Trey250 White193 (35 y/o M) Wareham, Massachusetts  (50747)
Records: total=12, alive=10, dead=2
RNG=10
Clinician RNG=5641
Progress: 90/100 (90.0%)
Batch 10

Loading submodule modules/hiv/hiv_screening.json
Loading submodule modules/hiv/hiv_viral_load.json
Loading submodule modules/hiv/stop_all_art_meds.json
Loading submodule modules/injuries/broken_jaw.json
Loading submodule modules/lung_cancer/lung_cancer_probabilities.json
Loading submodule modules/medications/ace_arb.json
Loading Lookup Table: ace_arb_ingredient_distribution.csv
Loading Lookup Table: ace_arb_amlodipine_benazepril_product_distribution.csv
Loading Lookup Table: ace_arb_benazepril_product_distribution.csv
Loading Lookup Table: ace_arb_benazepril_hydrochlorothiazide_product_distribution.csv
Loading Lookup Table: ace_arb_enalapril_product_distribution.csv
Loading Lookup Table: ace_arb_hydrochlorothiazide_lisinopril_product_distribution.csv
Loading Lookup Table: ace_arb_hydrochlorothiazide_losartan_product_distribution.csv
Loading Lookup Table: ace_arb_hydrochlorothiazide_valsartan_product_distribution.csv
Loading Lookup Table: ace_arb_irbesartan_product_distribution.csv
Load

Loading module modules/stroke.json
Loading module modules/total_joint_replacement.json
Loading module modules/trigger_bone_marrow_transplant.json
Loading module modules/urinary_tract_infections.json
Loading Lookup Table: uti.csv
Loading Lookup Table: uti_recurrence.csv
Loading module modules/veteran.json
Loading module modules/veteran_hyperlipidemia.json
Loading module modules/veteran_lung_cancer.json
Loading module modules/veteran_mdd.json
Loading module modules/veteran_prostate_cancer.json
Loading module modules/veteran_ptsd.json
Loading module modules/veteran_self_harm.json
Loading module modules/veteran_substance_abuse_conditions.json
Loading module modules/veteran_substance_abuse_treatment.json
Loading module modules/vhd_aortic.json
Loading Lookup Table: vhd_as.csv
Loading Lookup Table: vhd_ar.csv
Loading module modules/vhd_mitral.json
Loading Lookup Table: vhd_mr.csv
Loading Lookup Table: vhd_ms.csv
Loading module modules/vhd_pulmonic.json
Loading Lookup Table: vhd_ps.csv
Loading

  df = pd.read_csv(filepath)


⚠ Could not process providers.csv: No columns to parse from file
⚠ Could not process organizations.csv: No columns to parse from file
⚠ Could not process payers.csv: No columns to parse from file


  combined_data[file_type] = pd.concat([combined_data[file_type], df], ignore_index=True)


⚠ Could not process providers.csv: No columns to parse from file
⚠ Could not process organizations.csv: No columns to parse from file
⚠ Could not process payers.csv: No columns to parse from file
⚠ Could not process providers.csv: No columns to parse from file
⚠ Could not process organizations.csv: No columns to parse from file
⚠ Could not process payers.csv: No columns to parse from file
⚠ Could not process providers.csv: No columns to parse from file
⚠ Could not process organizations.csv: No columns to parse from file
⚠ Could not process payers.csv: No columns to parse from file


  combined_data[file_type] = pd.concat([combined_data[file_type], df], ignore_index=True)
  combined_data[file_type] = pd.concat([combined_data[file_type], df], ignore_index=True)


⚠ Could not process providers.csv: No columns to parse from file
⚠ Could not process organizations.csv: No columns to parse from file
⚠ Could not process payers.csv: No columns to parse from file


  combined_data[file_type] = pd.concat([combined_data[file_type], df], ignore_index=True)


⚠ Could not process providers.csv: No columns to parse from file
⚠ Could not process organizations.csv: No columns to parse from file
⚠ Could not process payers.csv: No columns to parse from file
⚠ Could not process providers.csv: No columns to parse from file
⚠ Could not process organizations.csv: No columns to parse from file
⚠ Could not process payers.csv: No columns to parse from file
⚠ Could not process providers.csv: No columns to parse from file
⚠ Could not process organizations.csv: No columns to parse from file
⚠ Could not process payers.csv: No columns to parse from file
⚠ Could not process providers.csv: No columns to parse from file
⚠ Could not process organizations.csv: No columns to parse from file
⚠ Could not process payers.csv: No columns to parse from file


  combined_data[file_type] = pd.concat([combined_data[file_type], df], ignore_index=True)


⚠ Could not process providers.csv: No columns to parse from file
⚠ Could not process organizations.csv: No columns to parse from file
⚠ Could not process payers.csv: No columns to parse from file


  combined_data[file_type] = pd.concat([combined_data[file_type], df], ignore_index=True)


⚠ Could not process providers.csv: No columns to parse from file
⚠ Could not process organizations.csv: No columns to parse from file
⚠ Could not process payers.csv: No columns to parse from file


  combined_data[file_type] = pd.concat([combined_data[file_type], df], ignore_index=True)


⚠ Could not process providers.csv: No columns to parse from file
⚠ Could not process organizations.csv: No columns to parse from file
⚠ Could not process payers.csv: No columns to parse from file


  combined_data[file_type] = pd.concat([combined_data[file_type], df], ignore_index=True)


⚠ Could not process providers.csv: No columns to parse from file
⚠ Could not process organizations.csv: No columns to parse from file
⚠ Could not process payers.csv: No columns to parse from file


  combined_data[file_type] = pd.concat([combined_data[file_type], df], ignore_index=True)


⚠ Could not process providers.csv: No columns to parse from file
⚠ Could not process organizations.csv: No columns to parse from file
⚠ Could not process payers.csv: No columns to parse from file


  combined_data[file_type] = pd.concat([combined_data[file_type], df], ignore_index=True)
  combined_data[file_type] = pd.concat([combined_data[file_type], df], ignore_index=True)


⚠ Could not process providers.csv: No columns to parse from file
⚠ Could not process organizations.csv: No columns to parse from file
⚠ Could not process payers.csv: No columns to parse from file


  combined_data[file_type] = pd.concat([combined_data[file_type], df], ignore_index=True)


⚠ Could not process providers.csv: No columns to parse from file
⚠ Could not process organizations.csv: No columns to parse from file
⚠ Could not process payers.csv: No columns to parse from file
⚠ Could not process providers.csv: No columns to parse from file
⚠ Could not process organizations.csv: No columns to parse from file
⚠ Could not process payers.csv: No columns to parse from file
⚠ Could not process providers.csv: No columns to parse from file
⚠ Could not process organizations.csv: No columns to parse from file
⚠ Could not process payers.csv: No columns to parse from file
✓ Saved 1079905 records to ./output/combined/observations.csv
✓ Saved 71423 records to ./output/combined/medications.csv
✓ Saved 7593 records to ./output/combined/devices.csv
✓ Saved 3310 records to ./output/combined/providers.csv
✓ Saved 57084 records to ./output/combined/imaging_studies.csv
✓ Saved 203346 records to ./output/combined/procedures.csv
✓ Saved 3310 records to ./output/combined/organizations.csv

In [5]:
# Create HDFS directory (with error suppression if it exists)
!hdfs dfs -mkdir -p /synthea_output 2> /dev/null

# Move combined files to HDFS (only the final combined CSVs, not the temp files)
!hdfs dfs -put -f ./output/combined/*.csv /synthea_output/

# Clean up local files (with confirmation)
!rm -rf ./output 2> /dev/null && echo "Local files cleaned up" || echo "Error cleaning local files"

Local files cleaned up


## 2) Creating the spark dataframes 

In [6]:
# Path to the Synthea CSVs on Hadoop
path = '/synthea_output/'


In [7]:
# Patient
observations = spark.read.csv(path+"observations.csv", header=True)
patient = spark.read.csv(path+"patients.csv", header=True) 

# Medical
careplans = spark.read.csv(path+"careplans.csv", header=True)
conditions = spark.read.csv(path+"conditions.csv", header=True)
procedures=spark.read.csv(path+"procedures.csv", header=True)
encounters = spark.read.csv(path+"encounters.csv", header=True)
medications = spark.read.csv(path+"medications.csv", header=True)

# Insurance and hospital
payer_transitions=spark.read.csv(path+"payer_transitions.csv", header=True)
payers=spark.read.csv(path+"payers.csv", header=True)
providers=spark.read.csv(path+"providers.csv", header=True)
organizations=spark.read.csv(path+"organizations.csv", header=True)


                                                                                

## 3) Cleaning dataframes and renaming variables 

In [8]:
# Renaming columns

patient = (
    patient.withColumnRenamed("Id", "patient_id")
           .withColumnRenamed("MARITAL", "patient_marital")
           .withColumnRenamed("RACE", "patient_race")
           .withColumnRenamed("ETHNICITY", "patient_ethnicity")
           .withColumnRenamed("GENDER", "patient_gender")
           .withColumnRenamed("ZIP", "patient_zip")
)

encounters = (
    encounters.withColumnRenamed("PATIENT", "patient_id")
              .withColumnRenamed("Id", "encounter_id")
              .withColumnRenamed("DESCRIPTION", "encounter_discription")
              .withColumnRenamed("CODE", "encounter_code")
              .withColumnRenamed("START", "encounter_start")
              .withColumn("encounter_start", to_date("encounter_start"))
              .withColumnRenamed("STOP", "encounter_stop")
              .withColumn("encounter_stop", to_date("encounter_stop"))
              .withColumn("PATIENT COST", col("TOTAL_CLAIM_COST") - col("PAYER_COVERAGE"))
              .withColumnRenamed("PAYER", "payer_id")
              .withColumnRenamed("ORGANIZATION", "organization_id")
              .withColumnRenamed("PROVIDER", "provider_id")
)

careplans = (
    careplans.withColumnRenamed("PATIENT", "patient_id")
             .withColumnRenamed("Id", "careplan_id")
             .withColumnRenamed("ENCOUNTER", "encounter_id")
             .withColumnRenamed("DESCRIPTION", "careplan_descriptions")
             .withColumnRenamed("CODE", "careplan_code")
)

procedures = (
    procedures.withColumnRenamed("PATIENT", "patient_id")
              .withColumnRenamed("ENCOUNTER", "encounter_id")
              .withColumnRenamed("DESCRIPTION", "procedure_descriptions")
              .withColumnRenamed("CODE", "procedure_code")
              .withColumnRenamed("DATE", "procedure_date")
              .withColumnRenamed("BASE_COST", "procedure_cost")
)

conditions = (
    conditions.withColumnRenamed("PATIENT", "patient_id")
              .withColumnRenamed("ENCOUNTER", "encounter_id")
              .withColumnRenamed("DESCRIPTION", "condition_description")
              .withColumnRenamed("CODE", "condition_code")
              .withColumnRenamed("START", "condition_start")
              .withColumnRenamed("END", "condition_end")
)

observations = (
    observations.withColumnRenamed("PATIENT", "patient_id")
                .withColumnRenamed("ENCOUNTER", "encounter_id")
                .withColumnRenamed("DATE", "observation_date")
                .withColumn("observation_date", to_date("observation_date"))
)

medications = (
    medications.withColumnRenamed("START", "medication_start")
               .withColumn("medication_start", to_date("medication_start"))
               .withColumnRenamed("STOP", "medication_stop")
               .withColumn("medication_stop", to_date("medication_stop"))
               .withColumnRenamed("PATIENT", "patient_id")
               .withColumnRenamed("PAYER", "payer_id")
               .withColumnRenamed("ENCOUNTER", "encounter_id")
               .withColumnRenamed("CODE", "medication_code")
               .withColumnRenamed("DESCRIPTION", "medication_description")
)

payer_transitions = (
    payer_transitions.withColumnRenamed("PATIENT", "patient_id")
                     .withColumnRenamed("PAYER", "payer_id")
)

payers = (
    payers.withColumnRenamed("Id", "payer_id")
          .withColumnRenamed("NAME", "payer_name")
          .withColumnRenamed("OWNERSHIP", "payer_ownership")
)

providers = (
    providers.withColumnRenamed("Id", "provider_id")
             .withColumnRenamed("SPECIALITY", "provider_specialty")
)

organizations = (
    organizations.withColumnRenamed("Id", "organization_id")
                 .withColumnRenamed("NAME", "organization_name")
                 .withColumnRenamed("ZIP", "organization_zip")
)

organizations = organizations.withColumn("organization_zip", 
                                      substring(col("organization_zip").cast("string"), 1, 5))

In [9]:
# Merge together dataframes on various id fields 
# The combined-encounters database will be the basis for our future analysis
encounters = (
    encounters
    .join(payers.select("payer_id", "payer_name", "payer_ownership"), on="payer_id", how="left")
    .join(organizations.select("organization_id", "organization_name", "organization_zip"), on="organization_id", how="left")
    .join(providers.select("provider_id", "provider_specialty"), on="provider_id", how="left")
    .join(procedures.select("encounter_id", "procedure_descriptions", "procedure_code"), on="encounter_id", how="left")
    .join(patient.select("patient_id", "BIRTHDATE", "patient_marital", "patient_race", "patient_ethnicity", "patient_gender", "patient_zip"), on="patient_id", how="left")
    .withColumn("age_at_encounter", floor(datediff(col("encounter_start"), col("BIRTHDATE")) / 365.25))
)


In [10]:
encounters.show(1)

25/05/04 15:30:29 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 16:>                                                         (0 + 1) / 1]

+--------------------+--------------------+--------------------+--------------------+--------------------+---------------+--------------+--------------+--------------+---------------------+-------------------+----------------+--------------+----------+-----------------+------------+----------+---------------+--------------------+----------------+------------------+----------------------+--------------+----------+---------------+------------+-----------------+--------------+-----------+----------------+
|          patient_id|        encounter_id|         provider_id|     organization_id|            payer_id|encounter_start|encounter_stop|ENCOUNTERCLASS|encounter_code|encounter_discription|BASE_ENCOUNTER_COST|TOTAL_CLAIM_COST|PAYER_COVERAGE|REASONCODE|REASONDESCRIPTION|PATIENT COST|payer_name|payer_ownership|   organization_name|organization_zip|provider_specialty|procedure_descriptions|procedure_code| BIRTHDATE|patient_marital|patient_race|patient_ethnicity|patient_gender|patient_zip|ag

                                                                                

In [15]:
encounters.columns

['patient_id',
 'encounter_id',
 'provider_id',
 'organization_id',
 'payer_id',
 'encounter_start',
 'encounter_stop',
 'ENCOUNTERCLASS',
 'encounter_code',
 'encounter_discription',
 'BASE_ENCOUNTER_COST',
 'TOTAL_CLAIM_COST',
 'PAYER_COVERAGE',
 'REASONCODE',
 'REASONDESCRIPTION',
 'PATIENT COST',
 'payer_name',
 'payer_ownership',
 'organization_name',
 'organization_zip',
 'provider_specialty',
 'procedure_descriptions',
 'procedure_code',
 'BIRTHDATE',
 'patient_marital',
 'patient_race',
 'patient_ethnicity',
 'patient_gender',
 'patient_zip',
 'age_at_encounter']



## Preparing Data

In [1]:
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

# Select relevant columns and cast numeric fields
modeling_df = encounters.select(
    col("PATIENT COST").cast("double").alias("label"),
    col("age_at_encounter").cast("double"),
    col("patient_marital"),
    col("patient_race"),
    col("patient_ethnicity"),
    col("patient_gender"),
    col("ENCOUNTERCLASS"),
    col("provider_specialty"),
    col("payer_ownership"),
    col("payer_name"),
    col("organization_zip"),
    col("organization_name"),
    col("procedure_code"),
    col("encounter_discription"),
    col("REASONDESCRIPTION")
).na.drop()  # Remove rows with null values

# Index categorical columns
categorical_cols = ['patient_marital', 'patient_race', 'patient_ethnicity', 
                   'patient_gender', 'ENCOUNTERCLASS', 'provider_specialty',
                   'payer_ownership',"payer_name","organization_name", 'organization_zip', 'procedure_code',
                    "encounter_discription", "REASONDESCRIPTION"]

indexers = [StringIndexer(inputCol=col, outputCol=col+"_index", handleInvalid="keep") 
            for col in categorical_cols]

# One-hot encode categorical columns
encoder = OneHotEncoder(
    inputCols=[col+"_index" for col in categorical_cols],
    outputCols=[col+"_encoded" for col in categorical_cols]
)

# Assemble all features into a single vector
numeric_cols = ['age_at_encounter']
assembler = VectorAssembler(
    inputCols=numeric_cols + [col+"_encoded" for col in categorical_cols],
    outputCol="features"
)

# Create pipeline for feature transformation
pipeline = Pipeline(stages=indexers + [encoder, assembler])
transformed_df = pipeline.fit(modeling_df).transform(modeling_df)

# Split data into training and test sets
train_data, test_data = transformed_df.randomSplit([0.8, 0.2], seed=42)

# OPTIMIZE PARTITIONS AND CACHE - PLACE HERE
train_data = train_data.repartition(100).cache()
test_data = test_data.repartition(50).cache()

# Verify caching worked (optional action to trigger persistence)
print(f"Training data count: {train_data.count()}")
print(f"Test data count: {test_data.count()}")


NameError: name 'encounters' is not defined

## Build Individual Models

In [None]:
from pyspark.ml.regression import RandomForestRegressor, GBTRegressor, LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# 1. Random Forest
rf = RandomForestRegressor(
    featuresCol="features",
    labelCol="label",
    numTrees=100,
    maxDepth=5,
    seed=42
)
rf_model = rf.fit(train_data)

# 2. Gradient Boosted Trees
gbt = GBTRegressor(
    featuresCol="features",
    labelCol="label",
    maxIter=100,
    maxDepth=5,
    seed=42
)
gbt_model = gbt.fit(train_data)

# 3. Linear Regression
lr = LinearRegression(
    featuresCol="features",
    labelCol="label",
    regParam=0.3,
    elasticNetParam=0.8
)
lr_model = lr.fit(train_data)

## Create an Ensemble Model

In [None]:
from pyspark.sql.functions import lit
from pyspark.ml.regression import GBTRegressor

# Get predictions from each model
rf_predictions = rf_model.transform(test_data).select("label", col("prediction").alias("rf_pred"))
gbt_predictions = gbt_model.transform(test_data).select("label", col("prediction").alias("gbt_pred"))
lr_predictions = lr_model.transform(test_data).select("label", col("prediction").alias("lr_pred"))

# Join all predictions together
ensemble_data = rf_predictions.join(gbt_predictions, "label").join(lr_predictions, "label")

# Create features vector for the ensemble model
ensemble_assembler = VectorAssembler(
    inputCols=["rf_pred", "gbt_pred", "lr_pred"],
    outputCol="ensemble_features"
)
ensemble_data = ensemble_assembler.transform(ensemble_data)

# Train a meta-model (using GBT here)
meta_model = GBTRegressor(
    featuresCol="ensemble_features",
    labelCol="label",
    maxIter=100,
    maxDepth=3,
    seed=42
).fit(ensemble_data)

## Evaluate the Models

In [None]:
# Function to evaluate models
def evaluate_model(model, data, model_name):
    predictions = model.transform(data)
    evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction")
    
    rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})
    r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})
    
    print(f"{model_name} Performance:")
    print(f"RMSE: {rmse}")
    print(f"R2: {r2}\n")

# Evaluate individual models
evaluate_model(rf_model, test_data, "Random Forest")
evaluate_model(gbt_model, test_data, "Gradient Boosted Trees")
evaluate_model(lr_model, test_data, "Linear Regression")

# Evaluate ensemble model
ensemble_predictions = meta_model.transform(ensemble_data)
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction")
rmse = evaluator.evaluate(ensemble_predictions, {evaluator.metricName: "rmse"})
r2 = evaluator.evaluate(ensemble_predictions, {evaluator.metricName: "r2"})

print("Ensemble Model Performance:")
print(f"RMSE: {rmse}")
print(f"R2: {r2}")

## save and deploy models

In [None]:
# Save models for future use
model_path = "/models/patient_cost_prediction/"
rf_model.save(model_path + "random_forest")
gbt_model.save(model_path + "gradient_boosted_trees")
lr_model.save(model_path + "linear_regression")
meta_model.save(model_path + "ensemble_model")