In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, avg, min as spark_min
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, min_by
from sedona.register import SedonaRegistrator
from sedona.sql.types import GeometryType
from sedona.spark import *
import time

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
3000,application_1732639283265_2958,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [2]:
def run_query_5(executors, executor_cores, executor_memory):
    """Run the "crimes closest to each police station" query and time it.

    Every crime record is paired with every police station, the nearest
    station per crime is selected, and the number of crimes plus their
    average distance is reported per station division, ordered by crime
    count (descending).

    Args:
        executors: Value for ``spark.executor.instances`` (converted with ``str``).
        executor_cores: Value for ``spark.executor.cores`` (converted with ``str``).
        executor_memory: Value for ``spark.executor.memory``, e.g. ``"8g"``.

    Returns:
        None. The result table and the measured execution time are printed.
    """
    # NOTE(review): getOrCreate() returns an already-running session if one
    # exists, in which case the executor settings below may not be applied to
    # it -- confirm the resource configuration in the Spark UI for each run.
    spark = SparkSession.builder \
        .appName("CrimeAmountCloseToStation") \
        .master("yarn") \
        .config("spark.executor.instances", str(executors)) \
        .config("spark.executor.cores", str(executor_cores)) \
        .config("spark.executor.memory", executor_memory) \
        .config("spark.sql.extensions", "org.apache.sedona.sql.SedonaSqlExtensions") \
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
        .config("spark.kryo.registrator", "org.apache.sedona.core.serde.SedonaKryoRegistrator") \
        .getOrCreate()

    try:
        # SedonaContext.create registers Sedona's functions and types on the
        # session; it supersedes the deprecated SedonaRegistrator.registerAll.
        sedona = SedonaContext.create(spark)

        print(f"\nRunning query with {executors} executor(s), {executor_cores} core(s) and {executor_memory} memory per executor...")

        start_time_df = time.time()

        # Load 2010s and 2020s crime datasets
        crime_raw_1 = sedona.read.csv(
            "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv",
            header=True)
        crime_raw_2 = sedona.read.csv(
            "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv",
            header=True)

        # Combine the datasets (positional union) and create the crime point column
        crimes = (crime_raw_1
                  .union(crime_raw_2)
                  .select(
                      ST_Point("LON", "LAT").alias("crime_point"),
                      col("DR_NO").alias("crime_id")))

        # Load the police station dataset and create a station point column
        stations = (sedona
                    .read
                    .csv(
                        "s3://initial-notebook-data-bucket-dblab-905418150721/LA_Police_Stations.csv",
                        header=True)
                    .select(ST_Point("X", "Y").alias("station_point"),
                            col("DIVISION").alias("division"),
                            col("FID").alias("station_id")))

        # Pair crimes with all stations and compute distances
        crime_station_distances = (crimes.crossJoin(stations)
                                   .select(
                                       col("crime_id"),
                                       col("station_id"),
                                       ST_DistanceSphere(col("crime_point"), col("station_point")).alias("distance")
                                   ))

        # Find the closest station for each crime
        closest_stations = crime_station_distances \
            .groupBy("crime_id") \
            .agg(
                spark_min(col("distance")).alias("min_distance"),
                min_by("station_id", "distance").alias("closest_station_id")
            )

        # Count crimes and calculate average distances per station
        station_stats = closest_stations \
            .groupBy("closest_station_id") \
            .agg(
                count("*").alias("crime_count"),
                avg("min_distance").alias("avg_distance")
            )

        # Attach the division name and order by crime count. The result is
        # cached so the pipeline is evaluated once and show() below reuses it.
        results = (station_stats
                   .join(
                       stations,
                       stations["station_id"] == station_stats["closest_station_id"])
                   .select(
                       col("division"),
                       col("avg_distance").alias("average_distance"),
                       col("crime_count").alias("#"),
                   )
                   .orderBy(col("#").desc())
                   .cache())

        # DataFrame transformations are lazy: nothing above has executed yet.
        # count() is the action that runs the query, so it must happen before
        # the clock is stopped for the measurement to be meaningful.
        result_rows = results.count()

        end_time_df = time.time()
        execution_time_df = end_time_df - start_time_df

        # Print the results (served from the cached DataFrame)
        print("The amount of crimes committed closer to that station than any other and their average distance from it")
        results.show(n=result_rows, truncate=False)
        print(f"Total execution time: {execution_time_df:.2f} seconds")
    finally:
        # Always stop the session, even on failure, so that no cached state or
        # leftover configuration affects subsequent runs. spark.stop() also
        # stops the underlying SparkContext.
        spark.stop()


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
# Benchmark configuration: 2 executors, 4 cores and 8g of memory per executor
executors, cores, memory = 2, 4, "8g"
run_query_5(executors, cores, memory)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…


Running query with 2 executor(s), 4 core(s) and 8g memory per executor...
The amount of crimes committed closer to that station than any other and their average distance from it
+----------------+------------------+------+
|division        |average_distance  |#     |
+----------------+------------------+------+
|HOLLYWOOD       |2076.263960178729 |224340|
|VAN NUYS        |2953.369742819791 |210134|
|SOUTHWEST       |2191.3988057808765|188901|
|WILSHIRE        |2592.6655329787786|185996|
|77TH STREET     |1716.544971970108 |171827|
|OLYMPIC         |1723.6036971780964|170897|
|NORTH HOLLYWOOD |2643.0060941415686|167854|
|PACIFIC         |3850.070655307912 |161359|
|CENTRAL         |992.4764374568797 |153871|
|RAMPART         |1534.5341879190164|152736|
|SOUTHEAST       |2421.866215888182 |152176|
|WEST VALLEY     |3035.6712163140596|138643|
|TOPANGA         |3296.954841755584 |138217|
|FOOTHILL        |4250.921708425003 |134896|
|HARBOR          |3702.5615993564966|126747|
|HOLLENBECK

In [4]:
# Benchmark configuration: 4 executors, 2 cores and 4g of memory per executor
executors, cores, memory = 4, 2, "4g"
run_query_5(executors, cores, memory)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…


Running query with 4 executor(s), 2 core(s) and 4g memory per executor...
The amount of crimes committed closer to that station than any other and their average distance from it
+----------------+------------------+------+
|division        |average_distance  |#     |
+----------------+------------------+------+
|HOLLYWOOD       |2076.2639601787296|224340|
|VAN NUYS        |2953.3697428197916|210134|
|SOUTHWEST       |2191.398805780876 |188901|
|WILSHIRE        |2592.665532978779 |185996|
|77TH STREET     |1716.5449719701076|171827|
|OLYMPIC         |1723.6036971780964|170897|
|NORTH HOLLYWOOD |2643.0060941415686|167854|
|PACIFIC         |3850.0706553079117|161359|
|CENTRAL         |992.4764374568801 |153871|
|RAMPART         |1534.5341879190166|152736|
|SOUTHEAST       |2421.8662158881825|152176|
|WEST VALLEY     |3035.6712163140596|138643|
|TOPANGA         |3296.9548417555848|138217|
|FOOTHILL        |4250.921708425002 |134896|
|HARBOR          |3702.561599356497 |126747|
|HOLLENBECK

In [3]:
# Benchmark configuration: 8 executors, 1 core and 2g of memory per executor
executors, cores, memory = 8, 1, "2g"
run_query_5(executors, cores, memory)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…


Running query with 8 executor(s), 1 core(s) and 2g memory per executor...
The amount of crimes committed closer to that station than any other and their average distance from it
+----------------+------------------+------+
|division        |average_distance  |#     |
+----------------+------------------+------+
|HOLLYWOOD       |2076.263960178729 |224340|
|VAN NUYS        |2953.3697428197916|210134|
|SOUTHWEST       |2191.398805780877 |188901|
|WILSHIRE        |2592.665532978779 |185996|
|77TH STREET     |1716.5449719701076|171827|
|OLYMPIC         |1723.6036971780964|170897|
|NORTH HOLLYWOOD |2643.006094141569 |167854|
|PACIFIC         |3850.070655307912 |161359|
|CENTRAL         |992.4764374568801 |153871|
|RAMPART         |1534.5341879190164|152736|
|SOUTHEAST       |2421.866215888182 |152176|
|WEST VALLEY     |3035.6712163140583|138643|
|TOPANGA         |3296.9548417555843|138217|
|FOOTHILL        |4250.921708425001 |134896|
|HARBOR          |3702.561599356497 |126747|
|HOLLENBECK