In [1]:
import time

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col, when, count, desc, rank, row_number, sum as _sum
from pyspark.sql.types import *
from pyspark.sql.window import Window

from sedona.spark import *

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1460,application_1732639283265_1421,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Case No1

2 executors
8g memory
4 core

In [2]:
# Request the Case 1 executor layout: 2 executors with 8g of memory and 4 cores each.
# NOTE(review): the startup output of this notebook reports that a SparkSession is
# already available as `spark`, and getOrCreate() returns an existing session when
# there is one -- confirm in the Spark UI that these executor settings took effect.
executor_settings = {
    "spark.executor.instances": "2",
    "spark.executor.memory": "8g",
    "spark.executor.cores": "4",
}
session_builder = SparkSession.builder.appName("Query 5")
for setting_name, setting_value in executor_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)
spark = session_builder.getOrCreate()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
# Input locations on S3: two crime-report extracts and the police station list.
crime_data_2010_2019 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
crime_data_2020 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv"
police_stations = "s3://initial-notebook-data-bucket-dblab-905418150721/LA_Police_Stations.csv"

# Every CSV is read with a header row and with Spark inferring the column types.
csv_options = {"header": True, "inferSchema": True}
crimes_2010_2019_df = spark.read.csv(crime_data_2010_2019, **csv_options)
crimes_2020_onward_df = spark.read.csv(crime_data_2020, **csv_options)

# union() appends rows by column position, so both extracts must share one column layout.
crime_df = crimes_2010_2019_df.union(crimes_2020_onward_df)
police_stations_df = spark.read.csv(police_stations, **csv_options)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# The Case 1 timer starts here, before any spatial work is set up.
start_time_1 = time.time()

# Build the Sedona context on top of the current Spark session.
sedona = SedonaContext.create(spark)

# Attach a point geometry to every row: (LON, LAT) for crimes, (X, Y) for stations.
crime_points = ST_Point("LON", "LAT")
station_points = ST_Point("X", "Y")
crime_df = crime_df.withColumn("geom", crime_points)
police_stations_df = police_stations_df.withColumn("geom", station_points)

# Print both schemas (crimes first) to confirm the new "geom" column is present.
for frame in (crime_df, police_stations_df):
    frame.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- DR_NO: integer (nullable = true)
 |-- Date Rptd: string (nullable = true)
 |-- DATE OCC: string (nullable = true)
 |-- TIME OCC: integer (nullable = true)
 |-- AREA : integer (nullable = true)
 |-- AREA NAME: string (nullable = true)
 |-- Rpt Dist No: integer (nullable = true)
 |-- Part 1-2: integer (nullable = true)
 |-- Crm Cd: integer (nullable = true)
 |-- Crm Cd Desc: string (nullable = true)
 |-- Mocodes: string (nullable = true)
 |-- Vict Age: integer (nullable = true)
 |-- Vict Sex: string (nullable = true)
 |-- Vict Descent: string (nullable = true)
 |-- Premis Cd: integer (nullable = true)
 |-- Premis Desc: string (nullable = true)
 |-- Weapon Used Cd: integer (nullable = true)
 |-- Weapon Desc: string (nullable = true)
 |-- Status: string (nullable = true)
 |-- Status Desc: string (nullable = true)
 |-- Crm Cd 1: integer (nullable = true)
 |-- Crm Cd 2: integer (nullable = true)
 |-- Crm Cd 3: integer (nullable = true)
 |-- Crm Cd 4: integer (nullable = true)
 |-- 

In [5]:
# Pair every crime with every police station and compute the spherical distance
# between the two point geometries.
# NOTE(review): no rows are filtered out before the join -- confirm that records with
# missing or placeholder coordinates are meant to take part in the calculation.
crime_station_pairs_df = crime_df.crossJoin(police_stations_df)
sphere_distance = ST_DistanceSphere(crime_df["geom"], police_stations_df["geom"])
crime_distances_df = crime_station_pairs_df.withColumn("distance", sphere_distance)

# Preview the first four crime/station pairs together with their distance.
crime_distances_df.show(4)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+--------------------+--------------------+--------+-----+---------+-----------+--------+------+--------------------+--------------+--------+--------+------------+---------+--------------------+--------------+-----------+------+------------+--------+--------+--------+--------+--------------------+------------+-------+---------+--------------------+--------------+----------------+---+-----------+--------------------+----+--------------------+------------------+
|  DR_NO|           Date Rptd|            DATE OCC|TIME OCC|AREA |AREA NAME|Rpt Dist No|Part 1-2|Crm Cd|         Crm Cd Desc|       Mocodes|Vict Age|Vict Sex|Vict Descent|Premis Cd|         Premis Desc|Weapon Used Cd|Weapon Desc|Status| Status Desc|Crm Cd 1|Crm Cd 2|Crm Cd 3|Crm Cd 4|            LOCATION|Cross Street|    LAT|      LON|                geom|             X|               Y|FID|   DIVISION|            LOCATION|PREC|                geom|          distance|
+-------+--------------------+--------------------+---

In [6]:
# Order the stations of each crime (one window partition per DR_NO) from nearest to
# farthest. "DIVISION" is a secondary sort key so that an exact tie on distance is
# always resolved the same way; row_number() over tied rows is otherwise free to
# number them in any order, which would make the chosen station vary between runs.
window_spec = Window.partitionBy("DR_NO").orderBy("distance", "DIVISION")

# Number the stations inside each partition and keep only the nearest one (rank 1).
closest_station_df = (
    crime_distances_df.withColumn("rank", row_number().over(window_spec))
    .filter(col("rank") == 1)
    .select("DR_NO", "DIVISION", "distance")
)

# Show the closest police stations
closest_station_df.show()


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+---------+------------------+
|    DR_NO| DIVISION|          distance|
+---------+---------+------------------+
| 91231932|SOUTHWEST|2257.2533220281193|
|100100523|  CENTRAL| 1300.733040644066|
|100100608|  RAMPART|1140.0611312527299|
|100100751|  CENTRAL|221.00932828621197|
|100100853|  CENTRAL|1089.0413188582168|
|100100857|  CENTRAL| 698.8861729382769|
|100100925|  CENTRAL| 271.6366601173468|
|100100950|  CENTRAL| 521.3846970614009|
|100100980|  CENTRAL| 724.6837344896162|
|100101125|  CENTRAL| 674.4711458026111|
|100101136|  CENTRAL|1983.2012674561922|
|100101283|  CENTRAL| 313.3676894974406|
|100101422|  CENTRAL| 911.6183493737415|
|100101429|  CENTRAL| 552.5738180020111|
|100101685|  CENTRAL|1721.0833118567502|
|100104055|  CENTRAL|431.77897375232095|
|100104099|  CENTRAL| 271.6366601173468|
|100104146|  CENTRAL| 526.1458976350942|
|100104254|  CENTRAL| 271.6366601173468|
|100104272|  CENTRAL| 886.7059823960111|
+---------+---------+------------------+
only showing top

In [7]:
# Per-station summary over the crimes whose nearest station it is: the mean distance
# and the number of crimes, sorted by crime count in descending order.
# The aggregates are declared in display order (DIVISION, Average_Distance,
# Number_of_Crimes), so no separate column-reordering select is needed.
station_crime_stats_df = (
    closest_station_df.groupBy("DIVISION")
    .agg(
        avg("distance").alias("Average_Distance"),
        count("DR_NO").alias("Number_of_Crimes"),
    )
    .orderBy(col("Number_of_Crimes").desc())
)

# Show the final results
station_crime_stats_df.show()

# Stop the Case 1 timer once the final table has been displayed.
end_time_1 = time.time()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------+------------------+----------------+
|        DIVISION|  Average_Distance|Number_of_Crimes|
+----------------+------------------+----------------+
|       HOLLYWOOD|2076.2639601787287|          224340|
|        VAN NUYS| 2953.369742819791|          210134|
|       SOUTHWEST| 2191.398805780876|          188901|
|        WILSHIRE| 2592.665532978779|          185996|
|     77TH STREET|1716.5449719701091|          171827|
|         OLYMPIC|1723.6036971780964|          170897|
| NORTH HOLLYWOOD| 2643.006094141568|          167854|
|         PACIFIC|3850.0706553079117|          161359|
|         CENTRAL| 992.4764374568801|          153871|
|         RAMPART| 1534.534187919017|          152736|
|       SOUTHEAST|2421.8662158881825|          152176|
|     WEST VALLEY| 3035.671216314059|          138643|
|         TOPANGA| 3296.954841755584|          138217|
|        FOOTHILL| 4250.921708425003|          134896|
|          HARBOR|3702.5615993564984|          126747|
|      HOL

In [42]:
# Wall-clock duration of Case 1, measured from start_time_1 to end_time_1.
elapsed_time_2_8_4 = end_time_1 - start_time_1
print("2 executors 8g memory 4 core: {:.2f} seconds".format(elapsed_time_2_8_4))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

2 executors 8g memory 4 core: 43.96 seconds

## Case No2

4 executors
4g memory
2 core

In [43]:
# Request the Case 2 executor layout: 4 executors with 4g of memory and 2 cores each.
# NOTE(review): `spark` already exists from the earlier cells of this notebook, and
# getOrCreate() returns an existing session when there is one -- confirm in the
# Spark UI that these executor settings took effect.
executor_settings = {
    "spark.executor.instances": "4",
    "spark.executor.memory": "4g",
    "spark.executor.cores": "2",
}
session_builder = SparkSession.builder.appName("Query 5")
for setting_name, setting_value in executor_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)
spark = session_builder.getOrCreate()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [44]:
# Input locations on S3: two crime-report extracts and the police station list.
crime_data_2010_2019 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
crime_data_2020 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv"
police_stations = "s3://initial-notebook-data-bucket-dblab-905418150721/LA_Police_Stations.csv"

# Every CSV is read with a header row and with Spark inferring the column types.
csv_options = {"header": True, "inferSchema": True}
crimes_2010_2019_df = spark.read.csv(crime_data_2010_2019, **csv_options)
crimes_2020_onward_df = spark.read.csv(crime_data_2020, **csv_options)

# union() appends rows by column position, so both extracts must share one column layout.
crime_df = crimes_2010_2019_df.union(crimes_2020_onward_df)
police_stations_df = spark.read.csv(police_stations, **csv_options)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [45]:
# The Case 2 timer starts here, before any spatial work is set up.
start_time_2 = time.time()

# Build the Sedona context on top of the current Spark session.
sedona = SedonaContext.create(spark)

# Attach a point geometry to every row: (LON, LAT) for crimes, (X, Y) for stations.
crime_points = ST_Point("LON", "LAT")
station_points = ST_Point("X", "Y")
crime_df = crime_df.withColumn("geom", crime_points)
police_stations_df = police_stations_df.withColumn("geom", station_points)

# Print both schemas (crimes first) to confirm the new "geom" column is present.
for frame in (crime_df, police_stations_df):
    frame.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- DR_NO: integer (nullable = true)
 |-- Date Rptd: string (nullable = true)
 |-- DATE OCC: string (nullable = true)
 |-- TIME OCC: integer (nullable = true)
 |-- AREA : integer (nullable = true)
 |-- AREA NAME: string (nullable = true)
 |-- Rpt Dist No: integer (nullable = true)
 |-- Part 1-2: integer (nullable = true)
 |-- Crm Cd: integer (nullable = true)
 |-- Crm Cd Desc: string (nullable = true)
 |-- Mocodes: string (nullable = true)
 |-- Vict Age: integer (nullable = true)
 |-- Vict Sex: string (nullable = true)
 |-- Vict Descent: string (nullable = true)
 |-- Premis Cd: integer (nullable = true)
 |-- Premis Desc: string (nullable = true)
 |-- Weapon Used Cd: integer (nullable = true)
 |-- Weapon Desc: string (nullable = true)
 |-- Status: string (nullable = true)
 |-- Status Desc: string (nullable = true)
 |-- Crm Cd 1: integer (nullable = true)
 |-- Crm Cd 2: integer (nullable = true)
 |-- Crm Cd 3: integer (nullable = true)
 |-- Crm Cd 4: integer (nullable = true)
 |-- 

In [46]:
# Pair every crime with every police station and compute the spherical distance
# between the two point geometries.
# NOTE(review): no rows are filtered out before the join -- confirm that records with
# missing or placeholder coordinates are meant to take part in the calculation.
crime_station_pairs_df = crime_df.crossJoin(police_stations_df)
sphere_distance = ST_DistanceSphere(crime_df["geom"], police_stations_df["geom"])
crime_distances_df = crime_station_pairs_df.withColumn("distance", sphere_distance)

# Preview the first four crime/station pairs together with their distance.
crime_distances_df.show(4)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+--------------------+--------------------+--------+-----+---------+-----------+--------+------+--------------------+--------------+--------+--------+------------+---------+--------------------+--------------+-----------+------+------------+--------+--------+--------+--------+--------------------+------------+-------+---------+--------------------+--------------+----------------+---+-----------+--------------------+----+--------------------+------------------+
|  DR_NO|           Date Rptd|            DATE OCC|TIME OCC|AREA |AREA NAME|Rpt Dist No|Part 1-2|Crm Cd|         Crm Cd Desc|       Mocodes|Vict Age|Vict Sex|Vict Descent|Premis Cd|         Premis Desc|Weapon Used Cd|Weapon Desc|Status| Status Desc|Crm Cd 1|Crm Cd 2|Crm Cd 3|Crm Cd 4|            LOCATION|Cross Street|    LAT|      LON|                geom|             X|               Y|FID|   DIVISION|            LOCATION|PREC|                geom|          distance|
+-------+--------------------+--------------------+---

In [47]:
# Order the stations of each crime (one window partition per DR_NO) from nearest to
# farthest. "DIVISION" is a secondary sort key so that an exact tie on distance is
# always resolved the same way; row_number() over tied rows is otherwise free to
# number them in any order, which would make the chosen station vary between runs.
window_spec = Window.partitionBy("DR_NO").orderBy("distance", "DIVISION")

# Number the stations inside each partition and keep only the nearest one (rank 1).
closest_station_df = (
    crime_distances_df.withColumn("rank", row_number().over(window_spec))
    .filter(col("rank") == 1)
    .select("DR_NO", "DIVISION", "distance")
)

# Show the closest police stations
closest_station_df.show()


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+---------+------------------+
|    DR_NO| DIVISION|          distance|
+---------+---------+------------------+
| 91231932|SOUTHWEST|2257.2533220281193|
|100100523|  CENTRAL| 1300.733040644066|
|100100608|  RAMPART|1140.0611312527299|
|100100751|  CENTRAL|221.00932828621197|
|100100842|  RAMPART|1151.6898086067715|
|100100853|  CENTRAL|1089.0413188582168|
|100100857|  CENTRAL| 698.8861729382769|
|100100925|  CENTRAL| 271.6366601173468|
|100100950|  CENTRAL| 521.3846970614009|
|100100980|  CENTRAL| 724.6837344896162|
|100101125|  CENTRAL| 674.4711458026111|
|100101136|  CENTRAL|1983.2012674561922|
|100101182|  CENTRAL| 371.9092348906933|
|100101283|  CENTRAL| 313.3676894974406|
|100101422|  CENTRAL| 911.6183493737415|
|100101429|  CENTRAL| 552.5738180020111|
|100101685|  CENTRAL|1721.0833118567502|
|100104055|  CENTRAL|431.77897375232095|
|100104099|  CENTRAL| 271.6366601173468|
|100104146|  CENTRAL| 526.1458976350942|
+---------+---------+------------------+
only showing top

In [48]:
# Per-station summary over the crimes whose nearest station it is: the mean distance
# and the number of crimes, sorted by crime count in descending order.
# The aggregates are declared in display order (DIVISION, Average_Distance,
# Number_of_Crimes), so no separate column-reordering select is needed.
station_crime_stats_df = (
    closest_station_df.groupBy("DIVISION")
    .agg(
        avg("distance").alias("Average_Distance"),
        count("DR_NO").alias("Number_of_Crimes"),
    )
    .orderBy(col("Number_of_Crimes").desc())
)

# Show the final results
station_crime_stats_df.show()

# Stop the Case 2 timer once the final table has been displayed.
end_time_2 = time.time()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------+------------------+----------------+
|        DIVISION|  Average_Distance|Number_of_Crimes|
+----------------+------------------+----------------+
|       HOLLYWOOD| 2076.263960178729|          224340|
|        VAN NUYS|2953.3697428197906|          210134|
|       SOUTHWEST| 2191.398805780876|          188901|
|        WILSHIRE| 2592.665532978779|          185996|
|     77TH STREET|1716.5449719701087|          171827|
|         OLYMPIC|1723.6036971780966|          170897|
| NORTH HOLLYWOOD| 2643.006094141569|          167854|
|         PACIFIC| 3850.070655307912|          161359|
|         CENTRAL| 992.4764374568799|          153871|
|         RAMPART|1534.5341879190173|          152736|
|       SOUTHEAST| 2421.866215888182|          152176|
|     WEST VALLEY|3035.6712163140596|          138643|
|         TOPANGA|3296.9548417555843|          138217|
|        FOOTHILL| 4250.921708425003|          134896|
|          HARBOR|3702.5615993564984|          126747|
|      HOL

In [49]:
# Wall-clock duration of Case 2, measured from start_time_2 to end_time_2.
elapsed_time_4_4_2 = end_time_2 - start_time_2
print("4 executors 4g memory 2 core: {:.2f} seconds".format(elapsed_time_4_4_2))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

4 executors 4g memory 2 core: 21.06 seconds

## Case No3

8 executors
2g memory
1 core

In [50]:
# Request the Case 3 executor layout: 8 executors with 2g of memory and 1 core each.
# NOTE(review): `spark` already exists from the earlier cells of this notebook, and
# getOrCreate() returns an existing session when there is one -- confirm in the
# Spark UI that these executor settings took effect.
executor_settings = {
    "spark.executor.instances": "8",
    "spark.executor.memory": "2g",
    "spark.executor.cores": "1",
}
session_builder = SparkSession.builder.appName("Query 5")
for setting_name, setting_value in executor_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)
spark = session_builder.getOrCreate()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [90]:
# Input locations on S3: two crime-report extracts and the police station list.
crime_data_2010_2019 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
crime_data_2020 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv"
police_stations = "s3://initial-notebook-data-bucket-dblab-905418150721/LA_Police_Stations.csv"

# Every CSV is read with a header row and with Spark inferring the column types.
csv_options = {"header": True, "inferSchema": True}
crimes_2010_2019_df = spark.read.csv(crime_data_2010_2019, **csv_options)
crimes_2020_onward_df = spark.read.csv(crime_data_2020, **csv_options)

# union() appends rows by column position, so both extracts must share one column layout.
crime_df = crimes_2010_2019_df.union(crimes_2020_onward_df)
police_stations_df = spark.read.csv(police_stations, **csv_options)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [91]:
# The Case 3 timer starts here, before any spatial work is set up.
start_time_3 = time.time()

# Build the Sedona context on top of the current Spark session.
sedona = SedonaContext.create(spark)

# Attach a point geometry to every row: (LON, LAT) for crimes, (X, Y) for stations.
crime_points = ST_Point("LON", "LAT")
station_points = ST_Point("X", "Y")
crime_df = crime_df.withColumn("geom", crime_points)
police_stations_df = police_stations_df.withColumn("geom", station_points)

# Print both schemas (crimes first) to confirm the new "geom" column is present.
for frame in (crime_df, police_stations_df):
    frame.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- DR_NO: integer (nullable = true)
 |-- Date Rptd: string (nullable = true)
 |-- DATE OCC: string (nullable = true)
 |-- TIME OCC: integer (nullable = true)
 |-- AREA : integer (nullable = true)
 |-- AREA NAME: string (nullable = true)
 |-- Rpt Dist No: integer (nullable = true)
 |-- Part 1-2: integer (nullable = true)
 |-- Crm Cd: integer (nullable = true)
 |-- Crm Cd Desc: string (nullable = true)
 |-- Mocodes: string (nullable = true)
 |-- Vict Age: integer (nullable = true)
 |-- Vict Sex: string (nullable = true)
 |-- Vict Descent: string (nullable = true)
 |-- Premis Cd: integer (nullable = true)
 |-- Premis Desc: string (nullable = true)
 |-- Weapon Used Cd: integer (nullable = true)
 |-- Weapon Desc: string (nullable = true)
 |-- Status: string (nullable = true)
 |-- Status Desc: string (nullable = true)
 |-- Crm Cd 1: integer (nullable = true)
 |-- Crm Cd 2: integer (nullable = true)
 |-- Crm Cd 3: integer (nullable = true)
 |-- Crm Cd 4: integer (nullable = true)
 |-- 

In [92]:
# Pair every crime with every police station and compute the spherical distance
# between the two point geometries.
# NOTE(review): no rows are filtered out before the join -- confirm that records with
# missing or placeholder coordinates are meant to take part in the calculation.
crime_station_pairs_df = crime_df.crossJoin(police_stations_df)
sphere_distance = ST_DistanceSphere(crime_df["geom"], police_stations_df["geom"])
crime_distances_df = crime_station_pairs_df.withColumn("distance", sphere_distance)

# Preview the first four crime/station pairs together with their distance.
crime_distances_df.show(4)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+--------------------+--------------------+--------+-----+---------+-----------+--------+------+--------------------+--------------+--------+--------+------------+---------+--------------------+--------------+-----------+------+------------+--------+--------+--------+--------+--------------------+------------+-------+---------+--------------------+--------------+----------------+---+-----------+--------------------+----+--------------------+------------------+
|  DR_NO|           Date Rptd|            DATE OCC|TIME OCC|AREA |AREA NAME|Rpt Dist No|Part 1-2|Crm Cd|         Crm Cd Desc|       Mocodes|Vict Age|Vict Sex|Vict Descent|Premis Cd|         Premis Desc|Weapon Used Cd|Weapon Desc|Status| Status Desc|Crm Cd 1|Crm Cd 2|Crm Cd 3|Crm Cd 4|            LOCATION|Cross Street|    LAT|      LON|                geom|             X|               Y|FID|   DIVISION|            LOCATION|PREC|                geom|          distance|
+-------+--------------------+--------------------+---

In [93]:
# Order the stations of each crime (one window partition per DR_NO) from nearest to
# farthest. "DIVISION" is a secondary sort key so that an exact tie on distance is
# always resolved the same way; row_number() over tied rows is otherwise free to
# number them in any order, which would make the chosen station vary between runs.
window_spec = Window.partitionBy("DR_NO").orderBy("distance", "DIVISION")

# Number the stations inside each partition and keep only the nearest one (rank 1).
closest_station_df = (
    crime_distances_df.withColumn("rank", row_number().over(window_spec))
    .filter(col("rank") == 1)
    .select("DR_NO", "DIVISION", "distance")
)

# Show the closest police stations
closest_station_df.show()


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+---------+------------------+
|    DR_NO| DIVISION|          distance|
+---------+---------+------------------+
| 91231932|SOUTHWEST|2257.2533220281193|
|100100523|  CENTRAL| 1300.733040644066|
|100100608|  RAMPART|1140.0611312527299|
|100100751|  CENTRAL|221.00932828621197|
|100100853|  CENTRAL|1089.0413188582168|
|100100857|  CENTRAL| 698.8861729382769|
|100100925|  CENTRAL| 271.6366601173468|
|100100950|  CENTRAL| 521.3846970614009|
|100100980|  CENTRAL| 724.6837344896162|
|100101125|  CENTRAL| 674.4711458026111|
|100101136|  CENTRAL|1983.2012674561922|
|100101283|  CENTRAL| 313.3676894974406|
|100101422|  CENTRAL| 911.6183493737415|
|100101429|  CENTRAL| 552.5738180020111|
|100101685|  CENTRAL|1721.0833118567502|
|100104055|  CENTRAL|431.77897375232095|
|100104099|  CENTRAL| 271.6366601173468|
|100104146|  CENTRAL| 526.1458976350942|
|100104254|  CENTRAL| 271.6366601173468|
|100104272|  CENTRAL| 886.7059823960111|
+---------+---------+------------------+
only showing top

In [94]:
# Per-station summary over the crimes whose nearest station it is: the mean distance
# and the number of crimes, sorted by crime count in descending order.
# The aggregates are declared in display order (DIVISION, Average_Distance,
# Number_of_Crimes), so no separate column-reordering select is needed.
station_crime_stats_df = (
    closest_station_df.groupBy("DIVISION")
    .agg(
        avg("distance").alias("Average_Distance"),
        count("DR_NO").alias("Number_of_Crimes"),
    )
    .orderBy(col("Number_of_Crimes").desc())
)

# Show the final results
station_crime_stats_df.show()

# Stop the Case 3 timer once the final table has been displayed.
end_time_3 = time.time()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------+------------------+----------------+
|        DIVISION|  Average_Distance|Number_of_Crimes|
+----------------+------------------+----------------+
|       HOLLYWOOD|2076.2639601787287|          224340|
|        VAN NUYS| 2953.369742819791|          210134|
|       SOUTHWEST| 2191.398805780876|          188901|
|        WILSHIRE|2592.6655329787786|          185996|
|     77TH STREET|1716.5449719701082|          171827|
|         OLYMPIC|1723.6036971780966|          170897|
| NORTH HOLLYWOOD|2643.0060941415677|          167854|
|         PACIFIC| 3850.070655307912|          161359|
|         CENTRAL| 992.4764374568799|          153871|
|         RAMPART|1534.5341879190166|          152736|
|       SOUTHEAST|2421.8662158881816|          152176|
|     WEST VALLEY|  3035.67121631406|          138643|
|         TOPANGA|3296.9548417555843|          138217|
|        FOOTHILL| 4250.921708425002|          134896|
|          HARBOR| 3702.561599356499|          126747|
|      HOL

In [95]:
# Wall-clock duration of Case 3, measured from start_time_3 to end_time_3.
elapsed_time_8_2_1 = end_time_3 - start_time_3
print("8 executors 2g memory 1 core: {:.2f} seconds".format(elapsed_time_8_2_1))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

8 executors 2g memory 1 core: 16.92 seconds

## Comparison

In [96]:
# Print the three measured durations one after another, in case order, for comparison.
timing_report = (
    ("2 executors 8g memory 4 core", elapsed_time_2_8_4),
    ("4 executors 4g memory 2 core", elapsed_time_4_4_2),
    ("8 executors 2g memory 1 core", elapsed_time_8_2_1),
)
for configuration, seconds in timing_report:
    print(f"{configuration}: {seconds:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

2 executors 8g memory 4 core: 43.96 seconds
4 executors 4g memory 2 core: 21.06 seconds
8 executors 2g memory 1 core: 16.92 seconds

In [97]:
# Pick the smallest of the three measured durations.
all_elapsed_times = [elapsed_time_2_8_4, elapsed_time_4_4_2, elapsed_time_8_2_1]
min_elapsed_time = min(all_elapsed_times)

# Report it unrounded.
print("The minimum elapsed time is:  {}".format(min_elapsed_time))


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

The minimum elapsed time is:  16.915506601333618