In [1]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window
import time  # <<< timing
from sedona.register import SedonaRegistrator

# Query 4: for every crime, find the nearest police division, then report per
# division the number of crimes assigned to it and their average distance (km).
#
# NOTE(review): the cell output reports "SparkSession available as 'spark'", i.e. a
# session already exists when this code runs. Executor instances/cores/memory are
# normally fixed when the application is launched, so the .config(...) values below
# may be ignored by getOrCreate() -- confirm in the Spark UI "Environment" tab, or
# set them before the session starts (e.g. a sparkmagic `%%configure -f` cell).
spark = (
    SparkSession.builder
    .appName("Query4 - Nearest division per crime (Sedona)")
    .config("spark.executor.instances", "2")
    .config("spark.executor.cores", "1")
    .config("spark.executor.memory", "2g")
    .getOrCreate()
)

_T0 = time.time()  # <<< start timer

# Register the Sedona ST_* SQL functions on this session.
SedonaRegistrator.registerAll(spark)


crime_path_1 = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv"   # 2010–2019
crime_path_2 = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv"   # 2020–2025
division_path = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Police_Stations.csv"                    # division data

crime_df_1 = spark.read.option("header", "true").csv(crime_path_1)
crime_df_2 = spark.read.option("header", "true").csv(crime_path_2)
crime_df = crime_df_1.unionByName(crime_df_2)

division_df = spark.read.option("header", "true").csv(division_path)

# Crime columns: DR_NO, LAT, LON
crime_df = (
    crime_df
    .withColumnRenamed("DR_NO", "crime_id")
    .withColumn("crime_lat", F.col("LAT").cast("double"))
    .withColumn("crime_lon", F.col("LON").cast("double"))
    # Null Coords filter (also drops values that failed the cast to double)
    .filter(F.col("crime_lat").isNotNull() & F.col("crime_lon").isNotNull())
    # Null Island (0,0) filter
    .filter(~((F.col("crime_lat") == 0.0) & (F.col("crime_lon") == 0.0)))
)

# Division columns: DIVISION, X=lon, Y=lat
division_df = (
    division_df
    .withColumnRenamed("DIVISION", "division")
    .withColumn("division_lat", F.col("Y").cast("double"))
    .withColumn("division_lon", F.col("X").cast("double"))
    # Same null / (0,0) filters as for the crime coordinates
    .filter(F.col("division_lat").isNotNull() & F.col("division_lon").isNotNull())
    .filter(~((F.col("division_lat") == 0.0) & (F.col("division_lon") == 0.0)))
)

# Sedona Geometries
# ST_Point takes (x, y) = (lon, lat); the columns are already doubles, so no
# extra cast is needed inside the expression.

crime_df = crime_df.withColumn(
    "crime_geom",
    F.expr("ST_Point(crime_lon, crime_lat)")
)

division_df = division_df.withColumn(
    "division_geom",
    F.expr("ST_Point(division_lon, division_lat)")
)

# Distance Join
# Every crime is paired with every division. The distance is the great-circle
# distance: a planar ST_Distance on lon/lat yields degrees, and a single
# degrees->km factor is wrong for longitude away from the equator, which can
# both distort the averages and select the wrong "nearest" division.
# ST_DistanceSphere returns metres for lon/lat points.
# NOTE(review): assumes a Sedona release where ST_DistanceSphere expects
# lon/lat order (documented for >= 1.5) -- TODO confirm the cluster version.

crime_div_df = (
    crime_df.crossJoin(division_df)
    .withColumn(
        "distance",
        F.expr("ST_DistanceSphere(crime_geom, division_geom)") / F.lit(1000.0)  # metres to kilometers
    )
)

print("=== PLAN: CROSS JOIN + DISTANCE COMPUTATION (Sedona) ===")
crime_div_df.explain(mode="formatted")

# Nearest Division per Crime
# Rank the divisions of each crime by distance and keep the closest one.
# The division name is a tiebreaker so equal distances resolve deterministically.

w = Window.partitionBy("crime_id").orderBy(
    F.col("distance").asc(),
    F.col("division").asc(),
)

nearest_df = (
    crime_div_df
    .withColumn("rn", F.row_number().over(w))
    .filter(F.col("rn") == 1)
    .drop("rn")
)

print("=== PLAN: WINDOW (nearest division per crime) ===")
nearest_df.explain(mode="formatted")

# Final Group - Avg distance calculation

result_df = (
    nearest_df
    .groupBy("division")
    .agg(
        F.round(F.avg("distance"), 3).alias("average_distance"),
        F.count("*").alias("#")
    )
    .orderBy(F.col("#").desc())
)

print("=== PLAN: FINAL AGGREGATION (count + avg distance) ===")
result_df.explain(mode="extended")

# count() and show() are two separate actions; without caching, the whole
# cross join + window pipeline would be executed twice and the timing below
# would measure both runs. The aggregated result is small (one row per
# division), so cache it: count() materialises it and show() reads the cache.
result_df.cache()

total = result_df.count()
result_df.show(total, truncate=False)
TOTAL_SEC = time.time() - _T0
print(f"\n>>> [timing] TOTAL EXECUTION TIME: {TOTAL_SEC:.2f}s")

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1759,application_1765289937462_1743,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

=== PLAN: CROSS JOIN + DISTANCE COMPUTATION (Sedona) ===
== Physical Plan ==
AdaptiveSparkPlan (18)
+- Project (17)
   +- Project (16)
      +- BroadcastNestedLoopJoin Cross BuildRight (15)
         :- Union (9)
         :  :- Project (4)
         :  :  +- Project (3)
         :  :     +- Filter (2)
         :  :        +- Scan csv  (1)
         :  +- Project (8)
         :     +- Project (7)
         :        +- Filter (6)
         :           +- Scan csv  (5)
         +- BroadcastExchange (14)
            +- Project (13)
               +- Project (12)
                  +- Filter (11)
                     +- Scan csv  (10)


(1) Scan csv 
Output [28]: [DR_NO#42, Date Rptd#43, DATE OCC#44, TIME OCC#45, AREA#46, AREA NAME#47, Rpt Dist No#48, Part 1-2#49, Crm Cd#50, Crm Cd Desc#51, Mocodes#52, Vict Age#53, Vict Sex#54, Vict Descent#55, Premis Cd#56, Premis Desc#57, Weapon Used Cd#58, Weapon Desc#59, Status#60, Status Desc#61, Crm Cd 1#62, Crm Cd 2#63, Crm Cd 3#64, Crm Cd 4#65, LOCATION#6

In [1]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window
import time  # <<< timing
from sedona.register import SedonaRegistrator

# Query 4: for every crime, find the nearest police division, then report per
# division the number of crimes assigned to it and their average distance (km).
#
# NOTE(review): the cell output reports "SparkSession available as 'spark'", i.e. a
# session already exists when this code runs. Executor instances/cores/memory are
# normally fixed when the application is launched, so the .config(...) values below
# may be ignored by getOrCreate() -- confirm in the Spark UI "Environment" tab, or
# set them before the session starts (e.g. a sparkmagic `%%configure -f` cell).
spark = (
    SparkSession.builder
    .appName("Query4 - Nearest division per crime (Sedona)")
    .config("spark.executor.instances", "2")
    .config("spark.executor.cores", "2")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

_T0 = time.time()  # <<< start timer

# Register the Sedona ST_* SQL functions on this session.
SedonaRegistrator.registerAll(spark)


crime_path_1 = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv"   # 2010–2019
crime_path_2 = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv"   # 2020–2025
division_path = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Police_Stations.csv"                    # division data

crime_df_1 = spark.read.option("header", "true").csv(crime_path_1)
crime_df_2 = spark.read.option("header", "true").csv(crime_path_2)
crime_df = crime_df_1.unionByName(crime_df_2)

division_df = spark.read.option("header", "true").csv(division_path)

# Crime columns: DR_NO, LAT, LON
crime_df = (
    crime_df
    .withColumnRenamed("DR_NO", "crime_id")
    .withColumn("crime_lat", F.col("LAT").cast("double"))
    .withColumn("crime_lon", F.col("LON").cast("double"))
    # Null Coords filter (also drops values that failed the cast to double)
    .filter(F.col("crime_lat").isNotNull() & F.col("crime_lon").isNotNull())
    # Null Island (0,0) filter
    .filter(~((F.col("crime_lat") == 0.0) & (F.col("crime_lon") == 0.0)))
)

# Division columns: DIVISION, X=lon, Y=lat
division_df = (
    division_df
    .withColumnRenamed("DIVISION", "division")
    .withColumn("division_lat", F.col("Y").cast("double"))
    .withColumn("division_lon", F.col("X").cast("double"))
    # Same null / (0,0) filters as for the crime coordinates
    .filter(F.col("division_lat").isNotNull() & F.col("division_lon").isNotNull())
    .filter(~((F.col("division_lat") == 0.0) & (F.col("division_lon") == 0.0)))
)

# Sedona Geometries
# ST_Point takes (x, y) = (lon, lat); the columns are already doubles, so no
# extra cast is needed inside the expression.

crime_df = crime_df.withColumn(
    "crime_geom",
    F.expr("ST_Point(crime_lon, crime_lat)")
)

division_df = division_df.withColumn(
    "division_geom",
    F.expr("ST_Point(division_lon, division_lat)")
)

# Distance Join
# Every crime is paired with every division. The distance is the great-circle
# distance: a planar ST_Distance on lon/lat yields degrees, and a single
# degrees->km factor is wrong for longitude away from the equator, which can
# both distort the averages and select the wrong "nearest" division.
# ST_DistanceSphere returns metres for lon/lat points.
# NOTE(review): assumes a Sedona release where ST_DistanceSphere expects
# lon/lat order (documented for >= 1.5) -- TODO confirm the cluster version.

crime_div_df = (
    crime_df.crossJoin(division_df)
    .withColumn(
        "distance",
        F.expr("ST_DistanceSphere(crime_geom, division_geom)") / F.lit(1000.0)  # metres to kilometers
    )
)

print("=== PLAN: CROSS JOIN + DISTANCE COMPUTATION (Sedona) ===")
crime_div_df.explain(mode="formatted")

# Nearest Division per Crime
# Rank the divisions of each crime by distance and keep the closest one.
# The division name is a tiebreaker so equal distances resolve deterministically.

w = Window.partitionBy("crime_id").orderBy(
    F.col("distance").asc(),
    F.col("division").asc(),
)

nearest_df = (
    crime_div_df
    .withColumn("rn", F.row_number().over(w))
    .filter(F.col("rn") == 1)
    .drop("rn")
)

print("=== PLAN: WINDOW (nearest division per crime) ===")
nearest_df.explain(mode="formatted")

# Final Group - Avg distance calculation

result_df = (
    nearest_df
    .groupBy("division")
    .agg(
        F.round(F.avg("distance"), 3).alias("average_distance"),
        F.count("*").alias("#")
    )
    .orderBy(F.col("#").desc())
)

print("=== PLAN: FINAL AGGREGATION (count + avg distance) ===")
result_df.explain(mode="extended")

# count() and show() are two separate actions; without caching, the whole
# cross join + window pipeline would be executed twice and the timing below
# would measure both runs. The aggregated result is small (one row per
# division), so cache it: count() materialises it and show() reads the cache.
result_df.cache()

total = result_df.count()
result_df.show(total, truncate=False)
TOTAL_SEC = time.time() - _T0
print(f"\n>>> [timing] TOTAL EXECUTION TIME: {TOTAL_SEC:.2f}s")

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1727,application_1765289937462_1711,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

=== PLAN: CROSS JOIN + DISTANCE COMPUTATION (Sedona) ===
== Physical Plan ==
AdaptiveSparkPlan (18)
+- Project (17)
   +- Project (16)
      +- BroadcastNestedLoopJoin Cross BuildRight (15)
         :- Union (9)
         :  :- Project (4)
         :  :  +- Project (3)
         :  :     +- Filter (2)
         :  :        +- Scan csv  (1)
         :  +- Project (8)
         :     +- Project (7)
         :        +- Filter (6)
         :           +- Scan csv  (5)
         +- BroadcastExchange (14)
            +- Project (13)
               +- Project (12)
                  +- Filter (11)
                     +- Scan csv  (10)


(1) Scan csv 
Output [28]: [DR_NO#42, Date Rptd#43, DATE OCC#44, TIME OCC#45, AREA#46, AREA NAME#47, Rpt Dist No#48, Part 1-2#49, Crm Cd#50, Crm Cd Desc#51, Mocodes#52, Vict Age#53, Vict Sex#54, Vict Descent#55, Premis Cd#56, Premis Desc#57, Weapon Used Cd#58, Weapon Desc#59, Status#60, Status Desc#61, Crm Cd 1#62, Crm Cd 2#63, Crm Cd 3#64, Crm Cd 4#65, LOCATION#6

In [1]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window
import time  # <<< timing
from sedona.register import SedonaRegistrator

# Query 4: for every crime, find the nearest police division, then report per
# division the number of crimes assigned to it and their average distance (km).
#
# NOTE(review): the cell output reports "SparkSession available as 'spark'", i.e. a
# session already exists when this code runs. Executor instances/cores/memory are
# normally fixed when the application is launched, so the .config(...) values below
# may be ignored by getOrCreate() -- confirm in the Spark UI "Environment" tab, or
# set them before the session starts (e.g. a sparkmagic `%%configure -f` cell).
spark = (
    SparkSession.builder
    .appName("Query4 - Nearest division per crime (Sedona)")
    .config("spark.executor.instances", "2")
    .config("spark.executor.cores", "4")
    .config("spark.executor.memory", "8g")
    .getOrCreate()
)

_T0 = time.time()  # <<< start timer

# Register the Sedona ST_* SQL functions on this session.
SedonaRegistrator.registerAll(spark)


crime_path_1 = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv"   # 2010–2019
crime_path_2 = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv"   # 2020–2025
division_path = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Police_Stations.csv"                    # division data

crime_df_1 = spark.read.option("header", "true").csv(crime_path_1)
crime_df_2 = spark.read.option("header", "true").csv(crime_path_2)
crime_df = crime_df_1.unionByName(crime_df_2)

division_df = spark.read.option("header", "true").csv(division_path)

# Crime columns: DR_NO, LAT, LON
crime_df = (
    crime_df
    .withColumnRenamed("DR_NO", "crime_id")
    .withColumn("crime_lat", F.col("LAT").cast("double"))
    .withColumn("crime_lon", F.col("LON").cast("double"))
    # Null Coords filter (also drops values that failed the cast to double)
    .filter(F.col("crime_lat").isNotNull() & F.col("crime_lon").isNotNull())
    # Null Island (0,0) filter
    .filter(~((F.col("crime_lat") == 0.0) & (F.col("crime_lon") == 0.0)))
)

# Division columns: DIVISION, X=lon, Y=lat
division_df = (
    division_df
    .withColumnRenamed("DIVISION", "division")
    .withColumn("division_lat", F.col("Y").cast("double"))
    .withColumn("division_lon", F.col("X").cast("double"))
    # Same null / (0,0) filters as for the crime coordinates
    .filter(F.col("division_lat").isNotNull() & F.col("division_lon").isNotNull())
    .filter(~((F.col("division_lat") == 0.0) & (F.col("division_lon") == 0.0)))
)

# Sedona Geometries
# ST_Point takes (x, y) = (lon, lat); the columns are already doubles, so no
# extra cast is needed inside the expression.

crime_df = crime_df.withColumn(
    "crime_geom",
    F.expr("ST_Point(crime_lon, crime_lat)")
)

division_df = division_df.withColumn(
    "division_geom",
    F.expr("ST_Point(division_lon, division_lat)")
)

# Distance Join
# Every crime is paired with every division. The distance is the great-circle
# distance: a planar ST_Distance on lon/lat yields degrees, and a single
# degrees->km factor is wrong for longitude away from the equator, which can
# both distort the averages and select the wrong "nearest" division.
# ST_DistanceSphere returns metres for lon/lat points.
# NOTE(review): assumes a Sedona release where ST_DistanceSphere expects
# lon/lat order (documented for >= 1.5) -- TODO confirm the cluster version.

crime_div_df = (
    crime_df.crossJoin(division_df)
    .withColumn(
        "distance",
        F.expr("ST_DistanceSphere(crime_geom, division_geom)") / F.lit(1000.0)  # metres to kilometers
    )
)

print("=== PLAN: CROSS JOIN + DISTANCE COMPUTATION (Sedona) ===")
crime_div_df.explain(mode="formatted")

# Nearest Division per Crime
# Rank the divisions of each crime by distance and keep the closest one.
# The division name is a tiebreaker so equal distances resolve deterministically.

w = Window.partitionBy("crime_id").orderBy(
    F.col("distance").asc(),
    F.col("division").asc(),
)

nearest_df = (
    crime_div_df
    .withColumn("rn", F.row_number().over(w))
    .filter(F.col("rn") == 1)
    .drop("rn")
)

print("=== PLAN: WINDOW (nearest division per crime) ===")
nearest_df.explain(mode="formatted")

# Final Group - Avg distance calculation

result_df = (
    nearest_df
    .groupBy("division")
    .agg(
        F.round(F.avg("distance"), 3).alias("average_distance"),
        F.count("*").alias("#")
    )
    .orderBy(F.col("#").desc())
)

print("=== PLAN: FINAL AGGREGATION (count + avg distance) ===")
result_df.explain(mode="extended")

# count() and show() are two separate actions; without caching, the whole
# cross join + window pipeline would be executed twice and the timing below
# would measure both runs. The aggregated result is small (one row per
# division), so cache it: count() materialises it and show() reads the cache.
result_df.cache()

total = result_df.count()
result_df.show(total, truncate=False)
TOTAL_SEC = time.time() - _T0
print(f"\n>>> [timing] TOTAL EXECUTION TIME: {TOTAL_SEC:.2f}s")

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1731,application_1765289937462_1715,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

=== PLAN: CROSS JOIN + DISTANCE COMPUTATION (Sedona) ===
== Physical Plan ==
AdaptiveSparkPlan (18)
+- Project (17)
   +- Project (16)
      +- BroadcastNestedLoopJoin Cross BuildRight (15)
         :- Union (9)
         :  :- Project (4)
         :  :  +- Project (3)
         :  :     +- Filter (2)
         :  :        +- Scan csv  (1)
         :  +- Project (8)
         :     +- Project (7)
         :        +- Filter (6)
         :           +- Scan csv  (5)
         +- BroadcastExchange (14)
            +- Project (13)
               +- Project (12)
                  +- Filter (11)
                     +- Scan csv  (10)


(1) Scan csv 
Output [28]: [DR_NO#42, Date Rptd#43, DATE OCC#44, TIME OCC#45, AREA#46, AREA NAME#47, Rpt Dist No#48, Part 1-2#49, Crm Cd#50, Crm Cd Desc#51, Mocodes#52, Vict Age#53, Vict Sex#54, Vict Descent#55, Premis Cd#56, Premis Desc#57, Weapon Used Cd#58, Weapon Desc#59, Status#60, Status Desc#61, Crm Cd 1#62, Crm Cd 2#63, Crm Cd 3#64, Crm Cd 4#65, LOCATION#6