# Query 5

In [7]:
from sedona.spark import *
from pyspark import SparkContext
from pyspark.sql.functions import col
from pyspark.sql import SparkSession
from pyspark.sql.functions import min, count, avg
import time
import logging

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:
def init_spark(app_name, cores, memory, executors):
    global spark
    spark = SparkSession.builder \
        .appName(app_name) \
        .config("spark.master", "yarn") \
        .config("spark.executor.instances", executors) \
        .config("spark.executor.cores", cores) \
        .config("spark.executor.memory", memory) \
        .getOrCreate()
    
    sedona = SedonaContext.create(spark)
    
    return spark, sedona

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
def load_data(spark):
    crime_data = spark.read.csv(
        "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv",
        header=True,
        inferSchema=True,
    )
    police_stations = spark.read.csv(
        "s3://initial-notebook-data-bucket-dblab-905418150721/LA_Police_Stations.csv",
        header=True,
        inferSchema=True,
    )
    return crime_data, police_stations


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
def process_query5(crime_data, police_stations):
    # Add geometry columns
    police_stations = police_stations.withColumn("station_point", ST_Point(col("X"), col("Y")))
    crime_data = crime_data.withColumn("crime_point", ST_Point(col("LON"), col("LAT")))
    
    # Cross join and calculate distances
    crime_distances = crime_data.crossJoin(police_stations) \
        .withColumn("distance", ST_Distance(col("crime_point"), col("station_point")))

    # Find closest police station for each crime
    closest_station_crimes = crime_distances.groupBy("DR_NO").agg(
        min("distance").alias("minimum_distance")
    )
    
    # Filter for closest stations
    crime_distances = crime_distances.withColumnRenamed("DR_NO", "crime_DR_NO")
    closest_station_final = crime_distances.join(
        closest_station_crimes,
        ((crime_distances["crime_DR_NO"] == closest_station_crimes["DR_NO"]) &
         (crime_distances["distance"] == closest_station_crimes["minimum_distance"]))
    ).select("DR_NO", "DIVISION", "distance")
    
    # Group by police division and calculate metrics
    result = closest_station_final.groupBy("DIVISION").agg(
        count("DR_NO").alias("crime_count"),
        avg("distance").alias("average_distance")
    ).orderBy(col("crime_count").desc())
    
    return result

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [11]:
def run_query5(config_name, cores, memory, executors):
    print(f"Running Query 5 with configuration: {config_name}")
    spark, sedona = init_spark(f"Query 5 - {config_name}", cores, memory, executors)
    crime_data, police_stations = load_data(spark)
    start_time = time.time()
    result = process_query5(crime_data, police_stations)
    
    result.show(truncate=False)
    end_time = time.time()
    
    print(f"Execution Time: {end_time - start_time:.2f} seconds\n")

    return result

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [12]:
configs = [
    {"name": "2 executors × 4 cores/8GB memory", "cores": 4, "memory": "8g", "executors": 2},
    {"name": "4 executors × 2 cores/4GB memory", "cores": 2, "memory": "4g", "executors": 4},
    {"name": "8 executors × 1 core/2GB memory", "cores": 1, "memory": "2g", "executors": 8},
]

# Execute Query 5 for all configurations
results = {}
for config in configs:
    results[config["name"]] = run_query5(config["name"], config["cores"], config["memory"], config["executors"])

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Running Query 5 with configuration: 2 executors ? 4 cores/8GB memory
+----------------+-----------+--------------------+
|DIVISION        |crime_count|average_distance    |
+----------------+-----------+--------------------+
|VAN NUYS        |148946     |0.028563399035369776|
|HOLLYWOOD       |140927     |0.02044406038065546 |
|SOUTHWEST       |133420     |0.021628874258637708|
|WILSHIRE        |132967     |0.02632706717398013 |
|OLYMPIC         |119636     |0.017338152570481335|
|NORTH HOLLYWOOD |118938     |0.02630801200455022 |
|77TH STREET     |116946     |0.016632946064630547|
|SOUTHEAST       |105162     |0.024035833114151407|
|PACIFIC         |104090     |0.037408008860875315|
|TOPANGA         |103828     |0.03233359516748021 |
|RAMPART         |95479      |0.014934307019218182|
|CENTRAL         |93531      |0.00949548266780149 |
|WEST VALLEY     |90132      |0.02898937155246247 |
|HARBOR          |89176      |1.2869668064902091  |
|FOOTHILL        |87497      |0.041252498599392