# Query 5

In [1]:
from pyspark.sql import SparkSession
from sedona.spark import *
from pyspark.sql.functions import col
from pyspark.sql import SparkSession
from pyspark.sql.functions import min, count, avg

# Initialize Spark session with specified configurations
# 2 executor x 4 cores/8GB memory
spark = SparkSession.builder \
    .appName("Query 5 Resource Experiment") \
    .config("spark.executor.instances", 2) \
    .config("spark.executor.cores", 4) \
    .config("spark.executor.memory", "8g") \
    .config("spark.driver.memory", "16g") \
    .getOrCreate()

# 4 executors × 2 cores/4GB memory
spark = SparkSession.builder \
    .config("spark.executor.instances", 4) \
    .config("spark.executor.cores", 2) \
    .config("spark.executor.memory", "4g") \
    .getOrCreate()

# 8 executors × 1 core/2GB memory
spark = SparkSession.builder \
    .config("spark.executor.instances", 8) \
    .config("spark.executor.cores", 1) \
    .config("spark.executor.memory", "2g") \
    .getOrCreate()

# Create sedona context
sedona = SedonaContext.create(spark)

# Load the crime data, LA income and RE codes csv 
crime_data = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv", header=True, inferSchema=True)
police_stations = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/LA_Police_Stations.csv", header=True, inferSchema=True)

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
378,application_1732639283265_0347,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Geometry
Calculate the location point of the police stations and the crime scenes

In [2]:
print(police_stations)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[X: double, Y: double, FID: int, DIVISION: string, LOCATION: string, PREC: int]

In [3]:
# Add a column containing the police station location points 
police_stations = police_stations.withColumn("station_point", ST_Point(col("X"), col("Y")))
crime_data = crime_data.withColumn("crime_point", ST_Point(col("LON"), col("LAT")))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# Cross join crimes with police stations to calculate distances
crime_distances = crime_data.crossJoin(police_stations)
crime_distances = crime_distances.withColumn("distance", ST_Distance(col("crime_point"), col("station_point")))
print(crime_distances)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[DR_NO: int, Date Rptd: string, DATE OCC: string, TIME OCC: int, AREA : int, AREA NAME: string, Rpt Dist No: int, Part 1-2: int, Crm Cd: int, Crm Cd Desc: string, Mocodes: string, Vict Age: int, Vict Sex: string, Vict Descent: string, Premis Cd: int, Premis Desc: string, Weapon Used Cd: int, Weapon Desc: string, Status: string, Status Desc: string, Crm Cd 1: int, Crm Cd 2: int, Crm Cd 3: int, Crm Cd 4: int, LOCATION: string, Cross Street: string, LAT: double, LON: double, crime_point: udt, X: double, Y: double, FID: int, DIVISION: string, LOCATION: string, PREC: int, station_point: udt, distance: double]

In [5]:
# Find the closest police station for each crime
closest_station_crimes = crime_distances.groupBy("DR_NO").agg(
    min("distance").alias("minimum_distance")
)
print(closest_station_crimes)
print(crime_distances)
closest_station_crimes.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[DR_NO: int, minimum_distance: double]
DataFrame[DR_NO: int, Date Rptd: string, DATE OCC: string, TIME OCC: int, AREA : int, AREA NAME: string, Rpt Dist No: int, Part 1-2: int, Crm Cd: int, Crm Cd Desc: string, Mocodes: string, Vict Age: int, Vict Sex: string, Vict Descent: string, Premis Cd: int, Premis Desc: string, Weapon Used Cd: int, Weapon Desc: string, Status: string, Status Desc: string, Crm Cd 1: int, Crm Cd 2: int, Crm Cd 3: int, Crm Cd 4: int, LOCATION: string, Cross Street: string, LAT: double, LON: double, crime_point: udt, X: double, Y: double, FID: int, DIVISION: string, LOCATION: string, PREC: int, station_point: udt, distance: double]
+---------+--------------------+
|    DR_NO|    minimum_distance|
+---------+--------------------+
|130208730|0.009986557225297343|
|130208787| 0.02931662261625151|
|130208832|0.001899079096345...|
|130209074|0.016979636963838463|
|130209107|0.014425854158906992|
|130209166|0.027496466082455765|
|130209167|0.010310265448288187|
|

In [7]:
# Rename DR_NO in one DataFrame to avoid ambiguity
crime_distances = crime_distances.withColumnRenamed("DR_NO", "crime_DR_NO")

# Filter the rows of the joint crimes and distances to only keep
# the closest station for each crime 
closest_station_final = crime_distances.join(closest_station_crimes, 
    ((crime_distances["crime_DR_NO"] == closest_station_crimes["DR_NO"]) &
     (crime_distances["distance"] == closest_station_crimes["minimum_distance"])))

# Only keep the columns we need for the final group and count 
closest_station_final = closest_station_final.select("DR_NO", "division", "distance")

print(closest_station_final)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[DR_NO: int, division: string, distance: double]

In [8]:
from pyspark.sql.functions import min, count

# Group the closest_station_final by police station and calculate the count of 
# crimes commited closest to them and the average distance 

result = closest_station_final.groupBy("division").agg(
    count("DR_NO").alias("#"), 
    avg("distance").alias("average distance")
).orderBy(col("#").desc())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [9]:
result.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------+------+--------------------+
|        division|     #|    average distance|
+----------------+------+--------------------+
|        VAN NUYS|148946|0.028563399035369762|
|       HOLLYWOOD|140927|0.020444060380655465|
|       SOUTHWEST|133420|0.021628874258637697|
|        WILSHIRE|132967| 0.02632706717398014|
|         OLYMPIC|119636|0.017338152570481328|
| NORTH HOLLYWOOD|118938|0.026308012004550223|
|     77TH STREET|116946| 0.01663294606463055|
|       SOUTHEAST|105162| 0.02403583311415142|
|         PACIFIC|104090| 0.03740800886087529|
|         TOPANGA|103828|  0.0323335951674802|
|         RAMPART| 95479|0.014934307019218166|
|         CENTRAL| 93531| 0.00949548266780149|
|     WEST VALLEY| 90132|0.028989371552462475|
|          HARBOR| 89176|  1.2869668064902087|
|        FOOTHILL| 87497| 0.04125249859939292|
|      HOLLENBECK| 81541|0.026781603358714164|
|WEST LOS ANGELES| 79670|0.030343153555294683|
|         MISSION| 78753|0.035104180437474077|
|       NORTH