In [4]:
from sedona.spark import *
from pyspark.sql.functions import col, count, row_number, min, sum, avg, monotonically_increasing_id
from pyspark.sql import SparkSession, Window
import time

# Create spark Session
spark = SparkSession \
    .builder \
    .appName("Query 5 with 4 core/8 Gb memory") \
    .config("spark.executor.instances", "2") \
    .config("spark.executor.cores", "4") \
    .config("spark.executor.memory", "8g") \
    .getOrCreate()

# Create sedona context
sedona = SedonaContext.create(spark)

# Read crime and police stations dataframes
crime_path_1 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
crime_path_2 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv"
ps_path = "s3://initial-notebook-data-bucket-dblab-905418150721/LA_Police_Stations.csv"

crime_df_1 = spark.read.csv(crime_path_1, header=True, inferSchema=True)
crime_df_2 = spark.read.csv(crime_path_2, header=True, inferSchema=True)
ps_df = spark.read.csv(ps_path, header=True, inferSchema=True)

# Start time
start_time = time.time()

# Create crime dataframe combining 2 crime dataframes
crime_df = crime_df_1.union(crime_df_2)

# We filter crime dataframe with victims of positive age
# we create a key attribute called crime_id
# we calculate the crime_location using crime coordinates
# we select the crime_id and the crime_location
crime_df = crime_df.filter(col("Vict Age") > 0) \
   .withColumn("Crime_Location", ST_Point("LAT", "LON")) \
   .withColumn("Crime_id", monotonically_increasing_id()) \
   .select("Crime_id", "Crime_Location")

# we calculate the ps_location using the police station coordinates
# we select the division and the ps_location
# since the division attribute can be used as a key attribute
ps_df = ps_df.withColumn("PS_Location", ST_Point("X", "Y")) \
             .select("DIVISION", "PS_Location")

# we make a new dataframe called df
# which is the crossjoin of crime_df and ps_df
# to find all possible combination between both dataframes
# and we calculate the distance between each location
df = crime_df.join(ps_df) \
             .withColumn("Distance_km", ST_DistanceSphere("Crime_Location", "PS_Location")/1000) \
             .withColumnRenamed("DIVISION", "division")

# we calculate for each crime the minimum distance
# to find the closest police station for each crime
# and we filter the df such that
# for each crime we keep the closest police station
# and the distance between them
windowSpec = Window.partitionBy("Crime_id").orderBy("Distance_km")
df = df.withColumn("num_row", row_number().over(windowSpec))

df = df.filter(col("num_row") == 1) \
       .select("Crime_id", "division", "Distance_km")

# then we broupby the division attribute
# and calculate the count and the average distance such that
# for each police station we keep how many crimes occured closer
# to that police station and the average distance between them
df = df.groupBy("division") \
       .agg(count("*").alias("#"),
            avg("Distance_km").alias("average_distance"))

# finally we select the division, average_distance and the count '#'
# sorted by '#'
result_df = df.select("division", "average_distance", "#").sort("#", ascending=False)

result_df.show(truncate=False)
result_df.printSchema()

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+------------------+-------+
|division  |average_distance  |#      |
+----------+------------------+-------+
|HARBOR    |10896.762642401442|2476518|
|HOLLENBECK|12571.896424760367|2579   |
+----------+------------------+-------+

root
 |-- division: string (nullable = true)
 |-- average_distance: double (nullable = true)
 |-- #: long (nullable = false)

Time taken: 24.62 seconds

In [5]:
# Create spark Session
spark = SparkSession \
    .builder \
    .appName("Query 5 with 2 core/4 Gb memory") \
    .config("spark.executor.instances", "4") \
    .config("spark.executor.cores", "2") \
    .config("spark.executor.memory", "4g") \
    .getOrCreate()

# Create sedona context
sedona = SedonaContext.create(spark)

# Read crime and police stations dataframes
crime_path_1 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
crime_path_2 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv"
ps_path = "s3://initial-notebook-data-bucket-dblab-905418150721/LA_Police_Stations.csv"

crime_df_1 = spark.read.csv(crime_path_1, header=True, inferSchema=True)
crime_df_2 = spark.read.csv(crime_path_2, header=True, inferSchema=True)
ps_df = spark.read.csv(ps_path, header=True, inferSchema=True)

start_time = time.time()
crime_df = crime_df_1.union(crime_df_2)

crime_df = crime_df.filter(col("Vict Age") > 0) \
   .withColumn("Crime_Location", ST_Point("LAT", "LON")) \
   .withColumn("Crime_id", monotonically_increasing_id()) \
   .select("Crime_id", "Crime_Location")

ps_df = ps_df.withColumn("PS_Location", ST_Point("X", "Y")) \
             .select("DIVISION", "PS_Location")

df = crime_df.join(ps_df) \
             .withColumn("Distance_km", ST_DistanceSphere("Crime_Location", "PS_Location")/1000) \
             .withColumnRenamed("DIVISION", "division")

windowSpec = Window.partitionBy("Crime_id").orderBy("Distance_km")
df = df.withColumn("num_row", row_number().over(windowSpec))

df = df.filter(col("num_row") == 1) \
       .select("Crime_id", "division", "Distance_km")

df = df.groupBy("division") \
       .agg(count("*").alias("#"),
            avg("Distance_km").alias("average_distance"))

result_df = df.select("division", "average_distance", "#").sort("#", ascending=False)

result_df.show(truncate=False)
result_df.printSchema()

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+------------------+-------+
|division  |average_distance  |#      |
+----------+------------------+-------+
|HARBOR    |10896.762642401449|2476518|
|HOLLENBECK|12571.896424760362|2579   |
+----------+------------------+-------+

root
 |-- division: string (nullable = true)
 |-- average_distance: double (nullable = true)
 |-- #: long (nullable = false)

Time taken: 17.85 seconds

In [6]:
# Create spark Session
spark = SparkSession \
    .builder \
    .appName("Query 5 with 2 core/4 Gb memory") \
    .config("spark.executor.instances", "8") \
    .config("spark.executor.cores", "1") \
    .config("spark.executor.memory", "2g") \
    .getOrCreate()

# Create sedona context
sedona = SedonaContext.create(spark)

# Read crime and police stations dataframes
crime_path_1 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
crime_path_2 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv"
ps_path = "s3://initial-notebook-data-bucket-dblab-905418150721/LA_Police_Stations.csv"

crime_df_1 = spark.read.csv(crime_path_1, header=True, inferSchema=True)
crime_df_2 = spark.read.csv(crime_path_2, header=True, inferSchema=True)
ps_df = spark.read.csv(ps_path, header=True, inferSchema=True)

start_time = time.time()
crime_df = crime_df_1.union(crime_df_2)

crime_df = crime_df.filter(col("Vict Age") > 0) \
   .withColumn("Crime_Location", ST_Point("LAT", "LON")) \
   .withColumn("Crime_id", monotonically_increasing_id()) \
   .select("Crime_id", "Crime_Location")

ps_df = ps_df.withColumn("PS_Location", ST_Point("X", "Y")) \
             .select("DIVISION", "PS_Location")

df = crime_df.join(ps_df) \
             .withColumn("Distance_km", ST_DistanceSphere("Crime_Location", "PS_Location")/1000) \
             .withColumnRenamed("DIVISION", "division")

windowSpec = Window.partitionBy("Crime_id").orderBy("Distance_km")
df = df.withColumn("num_row", row_number().over(windowSpec))

df = df.filter(col("num_row") == 1) \
       .select("Crime_id", "division", "Distance_km")

df = df.groupBy("division") \
       .agg(count("*").alias("#"),
            avg("Distance_km").alias("average_distance"))

result_df = df.select("division", "average_distance", "#").sort("#", ascending=False)

result_df.show(truncate=False)
result_df.printSchema()

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+------------------+-------+
|division  |average_distance  |#      |
+----------+------------------+-------+
|HARBOR    |10896.76264240145 |2476518|
|HOLLENBECK|12571.896424760362|2579   |
+----------+------------------+-------+

root
 |-- division: string (nullable = true)
 |-- average_distance: double (nullable = true)
 |-- #: long (nullable = false)

Time taken: 6.99 seconds