In [1]:
from sedona.spark import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count, desc
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number
import time


# Initialize Spark session
spark = SparkSession \
    .builder \
    .appName("Query 5 - Crime Analysis with Police Stations") \
    .config("spark.executor.instances", "2") \
    .config("spark.executor.memory", "8g") \
    .config("spark.executor.cores", "4") \
    .getOrCreate()

# Create Sedona context
sedona = SedonaContext.create(spark)

# Start timing
start_time = time.time()

# Load datasets
crime_data = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/", header=True)
station_data = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/LA_Police_Stations.csv", header=True)

# Filter out Null Island records and create geometries for crime data
crime_data = crime_data.filter((col("LAT").isNotNull()) & (col("LON").isNotNull()) & (col("LAT") != 0) & (col("LON") != 0))
crime_data = crime_data.withColumn("crime_geometry", ST_Point(col("LON").cast("double"), col("LAT").cast("double")))

# Create geometries for police stations using X and Y columns
station_data = station_data.filter((col("X").isNotNull()) & (col("Y").isNotNull()))
station_data = station_data.withColumn("station_geometry", ST_Point(col("X").cast("double"), col("Y").cast("double")))

# Broadcast police stations for join efficiency
station_data_broadcast = station_data.select("station_geometry", "DIVISION").alias("stations")

# Join crime data with police stations to calculate distances
crime_with_distances = crime_data.crossJoin(station_data_broadcast) \
    .withColumn("distance", ST_Distance(col("crime_geometry"), col("stations.station_geometry")))

# Find the closest station for each crime using DR_NO as the unique identifier
window_spec = Window.partitionBy("DR_NO").orderBy(col("distance").asc())
closest_station = crime_with_distances.withColumn("row_number", row_number().over(window_spec)) \
    .filter(col("row_number") == 1) \
    .select("DR_NO", "DIVISION", "distance")

# Aggregate results by station
results = closest_station.groupBy("DIVISION").agg(
    avg("distance").alias("average_distance"),
    count("DR_NO").alias("#")
)

# Sort results by number of incidents
sorted_results = results.orderBy(desc("#"))

# Display the results
sorted_results.show()

# Stop timing and print out the execution duration
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")

# Stop Spark session
#spark.stop()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
2025,application_1732639283265_1985,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------+--------------------+------+
|        DIVISION|    average_distance|     #|
+----------------+--------------------+------+
|       HOLLYWOOD|0.020437790725485645|213080|
|        VAN NUYS|0.028653154590629126|211457|
|        WILSHIRE|0.026312166557481597|198150|
|       SOUTHWEST| 0.02157700118424315|186742|
|         OLYMPIC| 0.01729162112331338|180463|
| NORTH HOLLYWOOD| 0.02611521422256773|171159|
|     77TH STREET|0.016584871496068184|167323|
|         PACIFIC|0.037495777088312074|157468|
|         CENTRAL|0.009868086849235298|154474|
|       SOUTHEAST|0.024150127195506466|151999|
|         RAMPART|0.014730484635455721|149675|
|         TOPANGA| 0.03243890335156792|147167|
|     WEST VALLEY|0.028973607196407476|130933|
|          HARBOR| 0.03468867635155542|126749|
|        FOOTHILL| 0.04125740608010437|122515|
|WEST LOS ANGELES|0.029842606564859788|121074|
|      HOLLENBECK| 0.02640744523588532|119329|
|          NEWTON|0.015890866822603905|109078|
|         MIS

In [2]:
from sedona.spark import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count, desc
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number
import time

# Initialize Spark session
spark = SparkSession \
    .builder \
    .appName("Query 5 - Crime Analysis with Police Stations") \
    .config("spark.executor.instances", "4") \
    .config("spark.executor.memory", "4g") \
    .config("spark.executor.cores", "2") \
    .getOrCreate()

# Create Sedona context
sedona = SedonaContext.create(spark)

# Start timing
start_time = time.time()

# Load datasets
crime_data = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/", header=True)
station_data = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/LA_Police_Stations.csv", header=True)

# Filter out Null Island records and create geometries for crime data
crime_data = crime_data.filter((col("LAT").isNotNull()) & (col("LON").isNotNull()) & (col("LAT") != 0) & (col("LON") != 0))
crime_data = crime_data.withColumn("crime_geometry", ST_Point(col("LON").cast("double"), col("LAT").cast("double")))

# Create geometries for police stations using X and Y columns
station_data = station_data.filter((col("X").isNotNull()) & (col("Y").isNotNull()))
station_data = station_data.withColumn("station_geometry", ST_Point(col("X").cast("double"), col("Y").cast("double")))

# Broadcast police stations for join efficiency
station_data_broadcast = station_data.select("station_geometry", "DIVISION").alias("stations")

# Join crime data with police stations to calculate distances
crime_with_distances = crime_data.crossJoin(station_data_broadcast) \
    .withColumn("distance", ST_Distance(col("crime_geometry"), col("stations.station_geometry")))

# Find the closest station for each crime using DR_NO as the unique identifier
window_spec = Window.partitionBy("DR_NO").orderBy(col("distance").asc())
closest_station = crime_with_distances.withColumn("row_number", row_number().over(window_spec)) \
    .filter(col("row_number") == 1) \
    .select("DR_NO", "DIVISION", "distance")

# Aggregate results by station
results = closest_station.groupBy("DIVISION").agg(
    avg("distance").alias("average_distance"),
    count("DR_NO").alias("#")
)

# Sort results by number of incidents
sorted_results = results.orderBy(desc("#"))

# Display the results
sorted_results.show()

# Stop timing and print out the execution duration
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")

# Stop Spark session
#spark.stop()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------+--------------------+------+
|        DIVISION|    average_distance|     #|
+----------------+--------------------+------+
|       HOLLYWOOD| 0.02043779072548565|213080|
|        VAN NUYS|0.028653154590629115|211457|
|        WILSHIRE| 0.02631216655748158|198150|
|       SOUTHWEST|0.021577001184243147|186742|
|         OLYMPIC| 0.01729162112331338|180463|
| NORTH HOLLYWOOD| 0.02611521422256773|171159|
|     77TH STREET|0.016584871496068188|167323|
|         PACIFIC| 0.03749577708831209|157468|
|         CENTRAL|  0.0098680868492353|154474|
|       SOUTHEAST|0.024150127195506466|151999|
|         RAMPART|0.014730484635455721|149675|
|         TOPANGA| 0.03243890335156791|147167|
|     WEST VALLEY| 0.02897360719640747|130933|
|          HARBOR| 0.03468867635155542|126749|
|        FOOTHILL| 0.04125740608010438|122515|
|WEST LOS ANGELES|0.029842606564859795|121074|
|      HOLLENBECK| 0.02640744523588532|119329|
|          NEWTON|0.015890866822603905|109078|
|         MIS

In [3]:
from sedona.spark import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count, desc
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number
import time

# Initialize Spark session
spark = SparkSession \
    .builder \
    .appName("Query 5 - Crime Analysis with Police Stations") \
    .config("spark.executor.instances", "8") \
    .config("spark.executor.memory", "2g") \
    .config("spark.executor.cores", "1") \
    .getOrCreate()

# Create Sedona context
sedona = SedonaContext.create(spark)

# Start timing
start_time = time.time()

# Load datasets
crime_data = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/", header=True)
station_data = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/LA_Police_Stations.csv", header=True)

# Filter out Null Island records and create geometries for crime data
crime_data = crime_data.filter((col("LAT").isNotNull()) & (col("LON").isNotNull()) & (col("LAT") != 0) & (col("LON") != 0))
crime_data = crime_data.withColumn("crime_geometry", ST_Point(col("LON").cast("double"), col("LAT").cast("double")))

# Create geometries for police stations using X and Y columns
station_data = station_data.filter((col("X").isNotNull()) & (col("Y").isNotNull()))
station_data = station_data.withColumn("station_geometry", ST_Point(col("X").cast("double"), col("Y").cast("double")))

# Broadcast police stations for join efficiency
station_data_broadcast = station_data.select("station_geometry", "DIVISION").alias("stations")

# Join crime data with police stations to calculate distances
crime_with_distances = crime_data.crossJoin(station_data_broadcast) \
    .withColumn("distance", ST_Distance(col("crime_geometry"), col("stations.station_geometry")))

# Find the closest station for each crime using DR_NO as the unique identifier
window_spec = Window.partitionBy("DR_NO").orderBy(col("distance").asc())
closest_station = crime_with_distances.withColumn("row_number", row_number().over(window_spec)) \
    .filter(col("row_number") == 1) \
    .select("DR_NO", "DIVISION", "distance")

# Aggregate results by station
results = closest_station.groupBy("DIVISION").agg(
    avg("distance").alias("average_distance"),
    count("DR_NO").alias("#")
)

# Sort results by number of incidents
sorted_results = results.orderBy(desc("#"))

# Display the results
sorted_results.show()

# Stop timing and print out the execution duration
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")

# Stop Spark session
#spark.stop()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------+--------------------+------+
|        DIVISION|    average_distance|     #|
+----------------+--------------------+------+
|       HOLLYWOOD| 0.02043779072548565|213080|
|        VAN NUYS| 0.02865315459062912|211457|
|        WILSHIRE|0.026312166557481587|198150|
|       SOUTHWEST| 0.02157700118424315|186742|
|         OLYMPIC| 0.01729162112331338|180463|
| NORTH HOLLYWOOD|0.026115214222567722|171159|
|     77TH STREET|0.016584871496068184|167323|
|         PACIFIC|0.037495777088312074|157468|
|         CENTRAL|0.009868086849235298|154474|
|       SOUTHEAST|0.024150127195506462|151999|
|         RAMPART|0.014730484635455718|149675|
|         TOPANGA| 0.03243890335156791|147167|
|     WEST VALLEY|0.028973607196407472|130933|
|          HARBOR|0.034688676351555425|126749|
|        FOOTHILL| 0.04125740608010438|122515|
|WEST LOS ANGELES| 0.02984260656485979|121074|
|      HOLLENBECK|0.026407445235885316|119329|
|          NEWTON|  0.0158908668226039|109078|
|         MIS