In [1]:
import os
import pandas as pd
import time
import glob
from IPython.display import clear_output


from pyspark.sql import SparkSession
import pyspark.sql.functions as sql_f
from pyspark.sql.types import *
from pyspark.sql.functions import to_date, datediff, floor, col, avg, substring, when, length, lpad, monotonically_increasing_id, expr, unix_timestamp
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, VectorIndexer
from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from concurrent.futures import ThreadPoolExecutor

# Obtain the notebook-wide Spark session: reuses an already active session if
# there is one, otherwise builds a new one with default settings.
spark = SparkSession.builder.getOrCreate()

In [2]:
!pip install pyspark



In [3]:
#Function to measure execution time...
def time_execution(task_name, func):
    """Run ``func`` once, print how long it took, and return its result and duration.

    Only the work performed inside ``func()`` is timed. Lazily evaluated
    operations (for example Spark transformations that are not followed by an
    action) return before any data is processed, so their reported time only
    covers building the plan.

    Args:
        task_name: Label used in the printed timing message.
        func: Zero-argument callable to execute.

    Returns:
        Tuple ``(result, duration)``: the value returned by ``func`` and the
        elapsed time in seconds as a float.
    """
    # perf_counter() is monotonic and high resolution, so the measured interval
    # cannot be distorted by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    result = func()
    duration = time.perf_counter() - start_time
    print(f"{task_name} executed in {duration:.2f} seconds")
    return result, duration


In [9]:
# @title Synthea Patient Generator (CSV Version)
import os
from IPython.display import clear_output

#configuration
num_patients = 100  # @param {type:"integer"}
state = "Massachusetts"  # @param ["Massachusetts", "California", "New York", "Texas", "Florida"]
age_range = "30-85"  # @param {type:"string"}
seed = 12345  # @param {type:"integer"}

#install Java
!sudo apt-get update
!sudo apt-get install -y openjdk-11-jdk-headless
clear_output()
print("Java installed")

#download Synthea
!wget -q https://github.com/synthetichealth/synthea/releases/download/master-branch-latest/synthea-with-dependencies.jar
clear_output()
print("Synthea downloaded")

#generate patients (using proper string substitution)
!java -jar synthea-with-dependencies.jar \
  -p {num_patients} \
  -s {seed} \
  -a "{age_range}" \
  --exporter.baseDirectory "./output" \
  --exporter.fhir.export=False \
  --exporter.csv.export=True \
  {state}

#verify output
csv_output_path = "./output/csv"
if os.path.exists(csv_output_path):
    csv_files = [f for f in os.listdir(csv_output_path) if f.endswith('.csv')]
    if csv_files:
        print(f"\nSuccess! Generated {len(csv_files)} CSV files:")
        for file in csv_files[:5]:
            print(f"- {file}")
        print(f"\nTotal records across all CSV files: {num_patients} patients")
    else:
        print("\nCSV directory exists but contains no CSV files")
else:
    print("\nGeneration failed. Common fixes:")
    print("1. Try reducing patient count (start with 10)")
    print("2. Check Java version:")
    !java -version
    print("3. Disk space:")
    !df -h

Synthea downloaded
SLF4J: No SLF4J providers were found.
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#noProviders for further details.
Scanned 88 modules and 152 submodules.
Loading submodule modules/allergies/allergy_panel.json
Loading submodule modules/allergies/drug_allergy_incidence.json
Loading submodule modules/allergies/environmental_allergy_incidence.json
Loading submodule modules/allergies/food_allergy_incidence.json
Loading submodule modules/allergies/immunotherapy.json
Loading submodule modules/allergies/outgrow_env_allergies.json
Loading submodule modules/allergies/outgrow_food_allergies.json
Loading submodule modules/allergies/severe_allergic_reaction.json
Loading submodule modules/anemia/anemia_sub.json
Loading submodule modules/breast_cancer/chemotherapy_breast.json
Loading submodule modules/breast_cancer/hormone_diagnosis.json
Loading submodule modules/breast_cancer/hormonetherapy_breast.json
Loading submodule 

## 1) Creating the Spark dataframes

In [10]:
# Folder that the CSV loading cell reads from.
# When the data lives on HDFS, point this at '/synthea_output/' instead.
#
# Current value: local output directory of the generator cell on a Colab CPU runtime.
path = "/content/output/csv/"

In [11]:
def _read_synthea_csv(table_name):
    """Read ``<path><table_name>.csv`` into a Spark DataFrame, treating the first row as the header."""
    return spark.read.csv(path + table_name + ".csv", header=True)


#Patient files
observations = _read_synthea_csv("observations")
patient = _read_synthea_csv("patients")

#Medical files
careplans = _read_synthea_csv("careplans")
conditions = _read_synthea_csv("conditions")
procedures = _read_synthea_csv("procedures")
encounters = _read_synthea_csv("encounters")
medications = _read_synthea_csv("medications")

#Insurance and hospital files
payer_transitions = _read_synthea_csv("payer_transitions")
payers = _read_synthea_csv("payers")
providers = _read_synthea_csv("providers")
organizations = _read_synthea_csv("organizations")

## 2) Cleaning dataframes and renaming variables

In [12]:
#Renaming columns
def _rename_columns(frame, mapping):
    """Rename columns of ``frame`` following ``mapping`` (old name -> new name), in order."""
    for old_name, new_name in mapping.items():
        frame = frame.withColumnRenamed(old_name, new_name)
    return frame


patient = _rename_columns(patient, {
    "Id": "patient_id",
    "MARITAL": "patient_marital",
    "RACE": "patient_race",
    "ETHNICITY": "patient_ethnicity",
    "GENDER": "patient_gender",
    "ZIP": "patient_zip",
})

# Encounters: rename, convert start/stop to dates, and derive the patient's
# out-of-pocket amount as total claim cost minus payer coverage.
encounters = (
    _rename_columns(encounters, {
        "PATIENT": "patient_id",
        "Id": "encounter_id",
        "DESCRIPTION": "encounter_discription",
        "CODE": "encounter_code",
        "START": "encounter_start",
        "STOP": "encounter_stop",
        "PAYER": "payer_id",
        "ORGANIZATION": "organization_id",
        "PROVIDER": "provider_id",
    })
    .withColumn("encounter_start", to_date("encounter_start"))
    .withColumn("encounter_stop", to_date("encounter_stop"))
    .withColumn("PATIENT COST", col("TOTAL_CLAIM_COST") - col("PAYER_COVERAGE"))
)

careplans = _rename_columns(careplans, {
    "PATIENT": "patient_id",
    "Id": "careplan_id",
    "ENCOUNTER": "encounter_id",
    "DESCRIPTION": "careplan_descriptions",
    "CODE": "careplan_code",
})

procedures = _rename_columns(procedures, {
    "PATIENT": "patient_id",
    "ENCOUNTER": "encounter_id",
    "DESCRIPTION": "procedure_descriptions",
    "CODE": "procedure_code",
    "DATE": "procedure_date",
    "BASE_COST": "procedure_cost",
})

conditions = _rename_columns(conditions, {
    "PATIENT": "patient_id",
    "ENCOUNTER": "encounter_id",
    "DESCRIPTION": "condition_description",
    "CODE": "condition_code",
    "START": "condition_start",
    "END": "condition_end",
})

# Observations: besides the renames, parse the date and add a numeric copy of
# VALUE (obs_value) while leaving the original VALUE column in place.
observations = (
    _rename_columns(observations, {
        "PATIENT": "patient_id",
        "ENCOUNTER": "encounter_id",
        "DATE": "observation_date",
        "CODE": "observation_code",
        "DESCRIPTION": "observation_description",
    })
    .withColumn("observation_date", to_date("observation_date"))
    .withColumn("obs_value", col("VALUE").cast("double"))
)

medications = (
    _rename_columns(medications, {
        "START": "medication_start",
        "STOP": "medication_stop",
        "PATIENT": "patient_id",
        "PAYER": "payer_id",
        "ENCOUNTER": "encounter_id",
        "CODE": "medication_code",
        "DESCRIPTION": "medication_description",
    })
    .withColumn("medication_start", to_date("medication_start"))
    .withColumn("medication_stop", to_date("medication_stop"))
)

payer_transitions = _rename_columns(payer_transitions, {
    "PATIENT": "patient_id",
    "PAYER": "payer_id",
})

payers = _rename_columns(payers, {
    "Id": "payer_id",
    "NAME": "payer_name",
    "OWNERSHIP": "payer_ownership",
})

providers = _rename_columns(providers, {
    "Id": "provider_id",
    "SPECIALITY": "provider_specialty",
})

organizations = _rename_columns(organizations, {
    "Id": "organization_id",
    "NAME": "organization_name",
    "ZIP": "organization_zip",
})
# Treat the ZIP as text so that leading zeros can be represented at all.
organizations = organizations.withColumn(
    "organization_zip",
    col("organization_zip").cast("string")
)

#adding leading 0's to zip codes to retain their information
# Any value of 1-4 characters has lost leading zeros (not only the 4-character
# case), so all of them are left-padded to five characters. Values of five or
# more characters are passed through unchanged, because lpad would cut longer
# strings down to the target length. Nulls are also left as they are.
organizations = organizations.withColumn(
    "organization_zip",
    when(
        length(col("organization_zip")).between(1, 4),
        lpad(col("organization_zip"), 5, "0"),
    ).otherwise(col("organization_zip"))
)

In [13]:
#Merge together dataframes on various id fields that will be used for ML modeling...
# Each lookup keeps only its join key plus the attributes wanted downstream.
payer_lookup = payers.select("payer_id", "payer_name", "payer_ownership")
organization_lookup = organizations.select("organization_id", "organization_name", "organization_zip")
provider_lookup = providers.select("provider_id", "provider_specialty")
procedure_lookup = procedures.select("encounter_id", "procedure_descriptions", "procedure_code")
patient_lookup = patient.select(
    "patient_id", "BIRTHDATE", "patient_marital", "patient_race",
    "patient_ethnicity", "patient_gender", "patient_zip",
)

# Age in whole years at the start of the encounter (days since birth / 365.25, rounded down).
encounter_age = floor(datediff(col("encounter_start"), col("BIRTHDATE")) / 365.25)

# NOTE(review): if procedures holds several rows per encounter_id, the left join
# on encounter_id repeats that encounter (and its cost) once per procedure.
# Confirm this is intended before the rows are split into train and test sets.
# NOTE(review): this cell overwrites `encounters`, so it is meant to run once
# per load of the data.
encounters = (
    encounters
    .join(payer_lookup, on="payer_id", how="left")
    .join(organization_lookup, on="organization_id", how="left")
    .join(provider_lookup, on="provider_id", how="left")
    .join(procedure_lookup, on="encounter_id", how="left")
    .join(patient_lookup, on="patient_id", how="left")
    .withColumn("age_at_encounter", encounter_age)
)


In [14]:
# Print the first 5 rows of the joined encounters frame as a sanity check.
encounters.show(5)

+--------------------+--------------------+--------------------+--------------------+--------------------+---------------+--------------+--------------+--------------+---------------------+-------------------+----------------+--------------+----------+--------------------+------------------+------------+---------------+--------------------+----------------+------------------+----------------------+--------------+----------+---------------+------------+-----------------+--------------+-----------+----------------+
|          patient_id|        encounter_id|         provider_id|     organization_id|            payer_id|encounter_start|encounter_stop|ENCOUNTERCLASS|encounter_code|encounter_discription|BASE_ENCOUNTER_COST|TOTAL_CLAIM_COST|PAYER_COVERAGE|REASONCODE|   REASONDESCRIPTION|      PATIENT COST|  payer_name|payer_ownership|   organization_name|organization_zip|provider_specialty|procedure_descriptions|procedure_code| BIRTHDATE|patient_marital|patient_race|patient_ethnicity|patient

In [15]:
# List the column names available after the renames and joins.
encounters.columns

['patient_id',
 'encounter_id',
 'provider_id',
 'organization_id',
 'payer_id',
 'encounter_start',
 'encounter_stop',
 'ENCOUNTERCLASS',
 'encounter_code',
 'encounter_discription',
 'BASE_ENCOUNTER_COST',
 'TOTAL_CLAIM_COST',
 'PAYER_COVERAGE',
 'REASONCODE',
 'REASONDESCRIPTION',
 'PATIENT COST',
 'payer_name',
 'payer_ownership',
 'organization_name',
 'organization_zip',
 'provider_specialty',
 'procedure_descriptions',
 'procedure_code',
 'BIRTHDATE',
 'patient_marital',
 'patient_race',
 'patient_ethnicity',
 'patient_gender',
 'patient_zip',
 'age_at_encounter']

In [21]:
# Number of rows in the patient frame.
patient.count()

123

In [22]:
# Number of rows in the encounters frame.
encounters.count()

27607

## Preparing Data

In [17]:
# Modeling frame: the patient's cost is the regression label; the remaining
# columns are the candidate predictors. Rows with any null are dropped, as are
# rows whose label is exactly zero.
modeling_df = (
    encounters.select(
        col("PATIENT COST").cast("double").alias("label"),
        col("age_at_encounter").cast("double"),
        col("patient_marital"),
        col("patient_race"),
        col("patient_ethnicity"),
        col("patient_gender"),
        col("ENCOUNTERCLASS"),
        col("payer_ownership"),
        col("payer_name"),
        col("organization_zip"),
        col("organization_name"),
        col("procedure_code"),
        #col("encounter_discription"), # don't use for RF, LR and GBT models
        col("encounter_code"),
        #col("REASONDESCRIPTION"), # don't use for RF, LR and GBT models
    )
    .na.drop()
    # Filter on the projected "label" column. The previous filter referred to
    # "PATIENT COST", which the select above no longer outputs, so it only
    # worked through the analyzer's missing-column fallback.
    .filter(col("label") != 0)
)


#define categorical and numeric columns
categorical_cols = ['patient_marital', 'patient_race', 'patient_ethnicity',
                   'patient_gender', 'ENCOUNTERCLASS',
                   'payer_ownership',"payer_name","organization_name", "organization_zip", 'procedure_code',"encounter_code"]
numeric_cols = ['age_at_encounter']

target_col = "label"

#create feature engineering pipeline stages
# The loop variable is called `name` so it does not shadow the imported `col` function.
# handleInvalid="keep" assigns values not seen during fit to an extra index.
indexers = [StringIndexer(inputCol=name, outputCol=name+"_index", handleInvalid="keep")
            for name in categorical_cols]

encoder = OneHotEncoder(
    inputCols=[name+"_index" for name in categorical_cols],
    outputCols=[name+"_encoded" for name in categorical_cols],
    dropLast=True
)

# Final feature vector: numeric columns first, then the one-hot encoded categoricals.
assembler = VectorAssembler(
    inputCols=numeric_cols + [name+"_encoded" for name in categorical_cols],
    outputCol="features"
)

#cache after feature engineering
# NOTE(review): the pipeline is fitted on the full modeling frame, i.e. before
# the train/test split performed later in the notebook.
feature_pipeline = Pipeline(stages=indexers + [encoder, assembler])
feature_model = feature_pipeline.fit(modeling_df)
feature_df = feature_model.transform(modeling_df).select("features", target_col).cache()

In [19]:
# Split the features into training (80%) and test (20%) dataframes, then show
# how many partitions the training dataframe has.
split_task = lambda: feature_df.randomSplit([0.8, 0.2], seed=53)
train_df, test_df = time_execution("Data splitting", split_task)[0]
train_df.rdd.getNumPartitions()

Data splitting executed in 0.04 seconds


1

In [30]:
# Spread the training data over 12 partitions so it can be distributed across
# the cluster - only in GCP.
train_df = train_df.repartition(12)
partition_total = train_df.rdd.getNumPartitions()
print(f"Training Data Partitions: {partition_total}")

Training Data Partitions: 12


## Random Forest Model

In [31]:
#Random Forest
# Evaluator shared by the regression cells; the metric is chosen per call.
reg_evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction")

rf_settings = dict(
    featuresCol="features",
    labelCol="label",
    numTrees=30,  # Reduced for initial testing
    maxDepth=10,
    subsamplingRate=0.7,
    featureSubsetStrategy='sqrt',
    seed=42,
)
rf = RandomForestRegressor(**rf_settings)

# Fit on the training split and record how long the fit took.
rf_model, rf_time = time_execution("Random Forest training", lambda: rf.fit(train_df))

# Score the held-out split.
rf_predictions = rf_model.transform(test_df)


print("Random Forest Regression Results:")
for metric_label, metric_name in (("RMSE:", "rmse"), ("R2:", "r2")):
    print(metric_label, reg_evaluator.evaluate(rf_predictions, {reg_evaluator.metricName: metric_name}))

Random Forest training executed in 19.10 seconds
Random Forest Regression Results:
RMSE: 2065.6919852673072
R2: 0.5606392676844362


In [32]:
# Show the first 20 test rows with the Random Forest predictions next to the labels.
rf_predictions.show(20)

+--------------------+------------------+------------------+
|            features|             label|        prediction|
+--------------------+------------------+------------------+
|(772,[0,1,5,10,12...|            159.11|1056.6596846971015|
|(772,[0,1,5,10,12...|            366.88|1056.6596846971015|
|(772,[0,1,5,10,12...|            366.88| 1075.774969015747|
|(772,[0,1,5,10,12...| 966.1899999999996| 951.4431662587205|
|(772,[0,1,5,10,12...| 966.1899999999996| 951.4431662587205|
|(772,[0,1,5,10,12...| 621.0699999999997| 951.4431662587205|
|(772,[0,1,5,10,12...| 621.0699999999997| 972.0125721108092|
|(772,[0,1,5,10,12...| 966.1899999999996| 951.4431662587205|
|(772,[0,1,5,10,12...|1214.0899999999997| 972.0125721108092|
|(772,[0,1,5,10,12...|           3785.16|1014.2827314984464|
|(772,[0,1,5,10,12...| 966.1899999999996| 951.4431662587205|
|(772,[0,1,5,10,12...|             35.78| 951.4431662587205|
|(772,[0,1,5,10,12...| 707.3499999999999| 951.4431662587205|
|(772,[0,1,5,10,12...|  

## Linear Regression Model

In [33]:
#Linear Regression
# Elastic-net regularised linear model (regParam=0.01, elasticNetParam=0.5).
lr_settings = dict(
    featuresCol="features",
    labelCol="label",
    regParam=0.01,
    elasticNetParam=0.5,
)
lr = LinearRegression(**lr_settings)

# Fit on the training split and record how long the fit took.
lr_model, lr_time = time_execution("Linear Regression training", lambda: lr.fit(train_df))

# Score the held-out split.
lr_predictions = lr_model.transform(test_df)

print("\nLinear Regression Results:")
for metric_label, metric_name in (("RMSE:", "rmse"), ("R2:", "r2")):
    print(metric_label, reg_evaluator.evaluate(lr_predictions, {reg_evaluator.metricName: metric_name}))


Linear Regression training executed in 4.25 seconds

Linear Regression Results:
RMSE: 1376.1755848398639
R2: 0.8049985648240643


In [34]:
# Show the first 20 test rows with the Linear Regression predictions next to the labels.
lr_predictions.show(20)

+--------------------+------------------+-------------------+
|            features|             label|         prediction|
+--------------------+------------------+-------------------+
|(772,[0,1,5,10,12...|            159.11|  1053.190380140531|
|(772,[0,1,5,10,12...|            366.88| 1057.2223397885036|
|(772,[0,1,5,10,12...|            366.88|  867.9964205616278|
|(772,[0,1,5,10,12...| 966.1899999999996| 1170.3497475843376|
|(772,[0,1,5,10,12...| 966.1899999999996| 1133.5501313059895|
|(772,[0,1,5,10,12...| 621.0699999999997| 1139.5929206671872|
|(772,[0,1,5,10,12...| 621.0699999999997| 1219.4545367742026|
|(772,[0,1,5,10,12...| 966.1899999999996| 1313.5499807117208|
|(772,[0,1,5,10,12...|1214.0899999999997| 3546.4659981682603|
|(772,[0,1,5,10,12...|           3785.16|  3556.545897288191|
|(772,[0,1,5,10,12...| 966.1899999999996| 1262.1701812251067|
|(772,[0,1,5,10,12...|             35.78|-331.25411736921114|
|(772,[0,1,5,10,12...| 707.3499999999999|  791.8390873772562|
|(772,[0

## Gradient Boosted Tree Model

In [35]:
#Gradient Boosted Trees Regressor
gbt_settings = dict(
    featuresCol="features",
    labelCol="label",
    maxIter=20,  # Reduced iterations
    maxDepth=10,
    stepSize=0.1,
    subsamplingRate=0.7,
    seed=42,
)
gbt = GBTRegressor(**gbt_settings)

# Fit on the training split and record how long the fit took.
gbt_model, gbt_time = time_execution("GBT training", lambda: gbt.fit(train_df))

# Score the held-out split.
gbt_predictions = gbt_model.transform(test_df)

print("\nGBT Regression Results:")
for metric_label, metric_name in (("RMSE:", "rmse"), ("R2:", "r2")):
    print(metric_label, reg_evaluator.evaluate(gbt_predictions, {reg_evaluator.metricName: metric_name}))

GBT training executed in 166.57 seconds

GBT Regression Results:
RMSE: 753.9394982940056
R2: 0.9414720782767539


In [36]:
# Show the first 20 test rows with the GBT predictions next to the labels.
gbt_predictions.show(20)

+--------------------+------------------+------------------+
|            features|             label|        prediction|
+--------------------+------------------+------------------+
|(772,[0,1,5,10,12...|            159.11|308.72660710057716|
|(772,[0,1,5,10,12...|            366.88|308.72660710057716|
|(772,[0,1,5,10,12...|            366.88|308.72660710057716|
|(772,[0,1,5,10,12...| 966.1899999999996| 716.7576718266738|
|(772,[0,1,5,10,12...| 966.1899999999996| 716.7576718266738|
|(772,[0,1,5,10,12...| 621.0699999999997| 671.7438589028733|
|(772,[0,1,5,10,12...| 621.0699999999997| 670.0519120891942|
|(772,[0,1,5,10,12...| 966.1899999999996| 716.7576718266738|
|(772,[0,1,5,10,12...|1214.0899999999997| 2353.115421521892|
|(772,[0,1,5,10,12...|           3785.16| 2582.363173891127|
|(772,[0,1,5,10,12...| 966.1899999999996| 716.7576718266738|
|(772,[0,1,5,10,12...|             35.78| 353.3470183047824|
|(772,[0,1,5,10,12...| 707.3499999999999| 675.9594316838468|
|(772,[0,1,5,10,12...|  

## Ensemble of ML Models

In [39]:
# Start total execution timer
total_start = time.time()

# Split the training data into 3 equally weighted parts, one per model
split_start = time.time()
rf_df, lr_df, gbt_df = train_df.randomSplit([1.0, 1.0, 1.0], seed=42)
split_time = time.time() - split_start
print(f"Data splitting completed in {split_time:.2f} seconds")

# Define model training functions
def train_rf():
    """Fit a Random Forest regressor on ``rf_df``, print the fit time, and return the model."""
    start = time.time()
    rf = RandomForestRegressor(featuresCol="features", labelCol="label", numTrees=30, maxDepth=10, seed=42)
    model = rf.fit(rf_df)
    print(f"Random Forest trained in {time.time() - start:.2f} seconds")
    return model

def train_lr():
    """Fit a Linear Regression model on ``lr_df``, print the fit time, and return the model."""
    start = time.time()
    lr = LinearRegression(featuresCol="features", labelCol="label", regParam=0.01, elasticNetParam=0.5)
    model = lr.fit(lr_df)
    print(f"Linear Regression trained in {time.time() - start:.2f} seconds")
    return model

def train_gbt():
    """Fit a GBT regressor on ``gbt_df``, print the fit time, and return the model."""
    start = time.time()
    gbt = GBTRegressor(featuresCol="features", labelCol="label", maxIter=10, maxDepth=5, stepSize=0.1, seed=42)
    model = gbt.fit(gbt_df)
    print(f"GBT trained in {time.time() - start:.2f} seconds")
    return model

# Train models in parallel with timing
# NOTE: this rebinds rf_model / lr_model / gbt_model from the single-model cells.
train_start = time.time()
with ThreadPoolExecutor() as executor:
    rf_future = executor.submit(train_rf)
    lr_future = executor.submit(train_lr)
    gbt_future = executor.submit(train_gbt)

    # .result() blocks until the fit finishes and re-raises any training error.
    rf_model = rf_future.result()
    lr_model = lr_future.result()
    gbt_model = gbt_future.result()
train_time = time.time() - train_start
print(f"Total model training time (parallel): {train_time:.2f} seconds")

# Make predictions and add row indices
# All three models score the SAME frame one after another, so every row carries
# its own rf/lr/gbt prediction. The earlier approach generated
# monotonically_increasing_id() separately on three frames and joined on it;
# those ids depend on partitioning and are not guaranteed to line up row for row.
pred_start = time.time()
test_indexed = test_df.withColumn("row_idx", monotonically_increasing_id())
rf_scored = rf_model.transform(test_indexed).withColumnRenamed("prediction", "rf_pred")
lr_scored = lr_model.transform(rf_scored).withColumnRenamed("prediction", "lr_pred")
gbt_scored = gbt_model.transform(lr_scored).withColumnRenamed("prediction", "gbt_pred")

# Per-model views, kept under their original names and column sets.
rf_pred = rf_scored.select("rf_pred", "label", "row_idx")
lr_pred = lr_scored.select("lr_pred", "row_idx")
gbt_pred = gbt_scored.select("gbt_pred", "row_idx")
pred_time = time.time() - pred_start
print(f"Prediction generation completed in {pred_time:.2f} seconds")

# Combine the predictions (already aligned per row, so no join is required)
# and average them with equal weights.
join_start = time.time()
ensemble_df = (
    gbt_scored
    .select("row_idx", "rf_pred", "label", "lr_pred", "gbt_pred")
    .withColumn("ensemble_prediction", expr("(rf_pred + lr_pred + gbt_pred)/3"))
)
join_time = time.time() - join_start
print(f"Prediction joining completed in {join_time:.2f} seconds")

# Evaluate ensemble
# NOTE: this rebinds the shared reg_evaluator to read "ensemble_prediction";
# re-run the Random Forest cell before re-evaluating a single model with it.
eval_start = time.time()
reg_evaluator = RegressionEvaluator(labelCol="label", predictionCol="ensemble_prediction")

print("\nEnsemble Results:")
print("RMSE:", reg_evaluator.evaluate(ensemble_df, {reg_evaluator.metricName: "rmse"}))
print("R2:", reg_evaluator.evaluate(ensemble_df, {reg_evaluator.metricName: "r2"}))
eval_time = time.time() - eval_start
print(f"Evaluation completed in {eval_time:.2f} seconds")

# Total execution time
total_time = time.time() - total_start
print(f"\nTotal execution time: {total_time:.2f} seconds")

Data splitting completed in 0.03 seconds
Linear Regression trained in 5.95 seconds
Random Forest trained in 32.87 seconds
GBT trained in 51.32 seconds
Total model training time (parallel): 51.32 seconds
Prediction generation completed in 0.45 seconds
Prediction joining completed in 0.35 seconds

Ensemble Results:
RMSE: 1017.64950697458
R2: 0.8933682434264739
Evaluation completed in 1.27 seconds

Total execution time: 53.42 seconds


In [40]:
# Show the first 20 rows of per-model predictions, the label and the averaged ensemble prediction.
ensemble_df.show(20)

+-------+-----------------+------------------+------------------+------------------+-------------------+
|row_idx|          rf_pred|             label|           lr_pred|          gbt_pred|ensemble_prediction|
+-------+-----------------+------------------+------------------+------------------+-------------------+
|      0|549.1465514190207|            159.11|1459.7899504012769| 641.1343049640785|   883.356935594792|
|      1|549.1465514190207|            366.88|1472.2208120749153| 641.1343049640785|  887.5005561526715|
|      2|549.1465514190207|            366.88| 975.0567226156006| 565.7872453552908|  696.6635064633041|
|      3|468.1070799523851| 966.1899999999996|1300.3471160425865| 609.6135401343477|  792.6892453764398|
|      4|468.1070799523851| 966.1899999999996|1219.3500321736951| 609.6135401343477|  765.6902174201426|
|      5|468.1070799523851| 621.0699999999997|1205.7156134439253| 609.6135401343477|  761.1454111768861|
|      6|468.1070799523851| 621.0699999999997|1415.0543