In [7]:
%%configure -f
{
    "conf": {
        "spark.executor.instances": "2",
        "spark.executor.memory": "8g",
        "spark.executor.cores": "4"
    }
}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
3669,application_1732639283265_3615,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
3608,application_1732639283265_3564,pyspark,idle,Link,Link,,
3612,application_1732639283265_3568,pyspark,idle,Link,Link,,
3619,application_1732639283265_3575,pyspark,idle,Link,Link,,
3624,application_1732639283265_3580,pyspark,idle,Link,Link,,
3630,application_1732639283265_3586,pyspark,idle,Link,Link,,
3634,application_1732639283265_3590,pyspark,idle,Link,Link,,
3637,application_1732639283265_3593,pyspark,idle,Link,Link,,
3641,application_1732639283265_3597,pyspark,idle,Link,Link,,
3658,application_1732639283265_3604,pyspark,idle,Link,Link,,
3659,application_1732639283265_3605,pyspark,idle,Link,Link,,


In [8]:
from sedona.spark import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import date_format,row_number, to_timestamp, round, format_number, concat, lit ,when, col, count , sum ,regexp_replace ,trim,udf,avg
from pyspark.sql.types import FloatType,IntegerType,StringType
from pyspark.sql.window import Window

# Query 5 (DataFrame API): assign every crime to its nearest police station,
# report per-division crime counts and mean distance, and log the elapsed
# time of the run together with the executor settings.
import time

spark = SparkSession.builder.appName("Query 5 DF").getOrCreate()
# Clear anything cached in this session (presumably so that timings of
# successive runs are not skewed by earlier cached data).
spark.catalog.clearCache()
# Create the Sedona context on top of the Spark session so the ST_* functions
# used below are available.
sedona = SedonaContext.create(spark)

results = []

# Read the executor settings once; they are reused for the printout and for
# the timing record written at the end.
conf = spark.sparkContext.getConf()
executor_instances = conf.get("spark.executor.instances")
executor_memory = conf.get("spark.executor.memory")
executor_cores = conf.get("spark.executor.cores")
print("Executor Instances:", executor_instances)
print("Executor Memory:", executor_memory)
print("Executor Cores:", executor_cores)

# Start timing. perf_counter() is monotonic, so the measured interval cannot
# be distorted by system clock adjustments the way time.time() can.
start_time = time.perf_counter()

# Crime data
crime_data_df = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_*.csv",
    header=True,
    inferSchema=True,
)
# Build an ST point for each crime. Records located at (0, 0) ("Null Island")
# are dropped: a row is kept when at least one coordinate is non-zero.
crime_point = (
    crime_data_df
    .filter((col("LON") != 0) | (col("LAT") != 0))
    .withColumn("point_crime", ST_Point("LON", "LAT"))
    .select("DR_NO", "point_crime")
)
print("Number of crimes in the start: ", crime_point.count())

# Police stations
police_path = "s3://initial-notebook-data-bucket-dblab-905418150721/LA_Police_Stations.csv"
police_df = spark.read.csv(police_path, header=True, inferSchema=True)
# Build an ST point for each police station.
police_df_with_point = (
    police_df
    .withColumn("point_police", ST_Point("X", "Y"))
    .select("point_police", "Division")
)

# Pair every station with every crime and compute the distance between them.
# The spherical distance is divided by 1000 to express it in kilometres.
police_and_crimes = (
    police_df_with_point
    .crossJoin(crime_point)
    .withColumn("distance_km", ST_DistanceSphere("point_crime", "point_police") / 1000)
    .drop("point_police")
)

# Rank the (crime, station) pairs of each crime by distance and keep rank 1,
# i.e. the nearest station. "Division" is a secondary sort key so that an
# exact distance tie is resolved the same way on every run instead of
# arbitrarily.
window_spec = Window.partitionBy("DR_NO").orderBy("distance_km", "Division")
crime_with_nearest_police = (
    police_and_crimes
    .withColumn("rank", row_number().over(window_spec))
    .filter(col("rank") == 1)
    .select("Division", "distance_km")
)

# Per division: mean distance (formatted to 4 decimals) and number of crimes.
final_result = crime_with_nearest_police.groupBy("Division").agg(
    format_number(avg("distance_km"), 4).alias("average_distance_km"),
    count("*").alias("crime_count"),
)
# Sanity check: this total should match the count printed at the start.
print("Number of crimes in the end: ", final_result.agg(sum("crime_count")).collect()[0][0])
final_result.orderBy(col("crime_count").desc()).show()

# Stop timing.
end_time = time.perf_counter()
exec_time = end_time - start_time
results.append({
    "Executor instances": executor_instances,
    "cores": executor_cores,
    "memory": executor_memory,
    "time": exec_time
})
# `results` is a list of dicts; turn it into a DataFrame so it can be written
# with the Spark writer.
results_df = spark.createDataFrame(results)

# Append this run's record to the JSON log on S3.
results_df.write.mode("append").json("s3://groups-bucket-dblab-905418150721/group12/q5_results.json")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Executor Instances: 2
Executor Memory: 8g
Executor Cores: 4
Number of crimes in the start:  3109880
Number of crimes in the end:  3109880
+----------------+-------------------+-----------+
|        Division|average_distance_km|crime_count|
+----------------+-------------------+-----------+
|       HOLLYWOOD|             2.0763|     224340|
|        VAN NUYS|             2.9534|     210134|
|       SOUTHWEST|             2.1914|     188901|
|        WILSHIRE|             2.5927|     185996|
|     77TH STREET|             1.7165|     171827|
|         OLYMPIC|             1.7236|     170897|
| NORTH HOLLYWOOD|             2.6430|     167854|
|         PACIFIC|             3.8501|     161359|
|         CENTRAL|             0.9925|     153871|
|         RAMPART|             1.5345|     152736|
|       SOUTHEAST|             2.4219|     152176|
|     WEST VALLEY|             3.0357|     138643|
|         TOPANGA|             3.2970|     138217|
|        FOOTHILL|             4.2509|     134

In [9]:
# Read the Query 5 timing records stored at this S3 path and list them
# fastest-first. The path is a plain string: it has no placeholders, so the
# f-prefix was unnecessary.
log_path = "s3://groups-bucket-dblab-905418150721/group12/q5_results.json"
logs = spark.read.json(log_path)
logs.select("Executor instances", "cores", "memory", "time") \
    .orderBy(col("time").asc()) \
    .show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------------+-----+------+------------------+
|Executor instances|cores|memory|              time|
+------------------+-----+------+------------------+
|                 8|    1|    2g| 48.10812711715698|
|                 2|    4|    8g| 96.63097214698792|
|                 4|    2|    4g|109.10038208961487|
+------------------+-----+------+------------------+