# Query 4
## DataFrame API

In [1]:
%%configure -f
{
    "conf":{
        "spark.executor.instances": "2",
        "spark.executor.memory": "2g",
        "spark.executor.cores": "1"
    }
}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1805,application_1765289937462_1789,pyspark,idle,Link,Link,,
1807,application_1765289937462_1791,pyspark,starting,,,,


In [2]:
# The Spark session was started with the settings from the %%configure cell above; now import everything this query needs
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType
from pyspark.sql.functions import col, row_number, avg, round, count
from sedona.spark import *
from pyspark.sql.window import Window
import time

# Create a Sedona context on top of the session-provided `spark` object so the
# ST_* spatial functions used below can run. The returned `sedona` handle is not
# referenced again in this cell (the reads below go through `spark`).
sedona = SedonaContext.create(spark)

# Start the wall-clock timer; it is read again right after the results are shown.
start_time = time.time()

# Column layout of the crime CSV files, in file order: (column name, Spark type).
crime_columns = [
    ("DR_NO", IntegerType),
    ("Date Rptd", StringType),
    ("DATE OCC", StringType),
    ("TIME OCC", IntegerType),
    ("AREA", IntegerType),
    ("AREA NAME", StringType),
    ("Rpt Dist No", IntegerType),
    ("Part 1-2", IntegerType),
    ("Crm Cd", IntegerType),
    ("Crm Cd Desc", StringType),
    ("Mocodes", StringType),
    ("Vict Age", IntegerType),
    ("Vict Sex", StringType),
    ("Vict Descent", StringType),
    ("Premis Cd", IntegerType),
    ("Premis Desc", StringType),
    ("Weapon Used Cd", IntegerType),
    ("Weapon Desc", StringType),
    ("Status", StringType),
    ("Status Desc", StringType),
    ("Crm Cd 1", IntegerType),
    ("Crm Cd 2", IntegerType),
    ("Crm Cd 3", IntegerType),
    ("Crm Cd 4", IntegerType),
    ("LOCATION", StringType),
    ("Cross Street", StringType),
    ("LAT", DoubleType),
    ("LON", DoubleType),
]

# Explicit schema for the crime data, so Spark does not have to infer types.
crime_data_full_schema = StructType(
    [StructField(column_name, spark_type()) for column_name, spark_type in crime_columns]
)

# Load the main dataset from S3; the first line of each file is treated as a header.
crime_data_path = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/"
crime_data_full_df = (
    spark.read
    .schema(crime_data_full_schema)
    .option("header", True)
    .csv(crime_data_path)
)

# Row-level predicates used to discard crimes without a usable location.
has_coordinates = col("LAT").isNotNull() & col("LON").isNotNull()
is_null_island = (col("LAT") == 0) & (col("LON") == 0)

# Keep only the crime id and its coordinates, drop rows with missing or (0, 0)
# coordinates, then attach a point geometry built as (x = LON, y = LAT).
crime_geo_df = (
    crime_data_full_df
    .select("DR_NO", "LAT", "LON")
    .where(has_coordinates)
    .where(~is_null_island)
    .withColumn("crime_geom", ST_Point("LON", "LAT"))
)

# Explicit schema of the police stations CSV, columns in file order.
stations_schema = (
    StructType()
    .add("X", DoubleType())           # Longitude
    .add("Y", DoubleType())           # Latitude
    .add("FID", IntegerType())
    .add("DIVISION", StringType())
    .add("LOCATION", StringType())
    .add("PREC", IntegerType())
)

# Read the stations file (first line is a header) with the schema above.
stations_path = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Police_Stations.csv"
stations_raw_df = (
    spark.read
    .schema(stations_schema)
    .option("header", True)
    .csv(stations_path)
)

# Keep the division name and coordinates under clearer names, then attach a
# point geometry built as (x = station_lon, y = station_lat).
stations_df = (
    stations_raw_df
    .select(
        col("DIVISION").alias("division"),
        col("Y").alias("station_lat"),
        col("X").alias("station_lon"),
    )
    .withColumn("station_geom", ST_Point("station_lon", "station_lat"))
)

# Pair every crime with every police station (Cartesian product).
cross_joined_df = crime_geo_df.crossJoin(stations_df)

# Spherical distance between the crime point and the station point, divided by
# 1000 (NOTE(review): ST_DistanceSphere is documented to return metres, which
# would make this kilometres - confirm against the Sedona version in use).
distance_km = ST_DistanceSphere("crime_geom", "station_geom") / 1000

# One row per (crime, station) pair, carrying only what the ranking step needs.
distance_df = (
    cross_joined_df
    .withColumn("distance", distance_km)
    .select("DR_NO", "division", "distance")
)

# Find the closest station to each crime.
# Stations are ranked per crime (one window partition per DR_NO) from nearest to
# farthest. row_number() picks an arbitrary row among equal sort keys, so
# "division" is used as a secondary key: if two stations are exactly equidistant
# from a crime, the same one is chosen on every run and the per-division counts
# stay reproducible.
# NOTE(review): partitioning by DR_NO assumes DR_NO identifies a single crime;
# duplicated DR_NO values would be collapsed into one row here - confirm.
window_crime = Window.partitionBy("DR_NO").orderBy(
    col("distance").asc(),
    col("division").asc(),
)

# Keep only the top-ranked (closest) station for each DR_NO. The helper column
# "min_rank" is left in the result, where it is always 1.
closest_station_df = (
    distance_df
    .withColumn("min_rank", row_number().over(window_crime))
    .filter(col("min_rank") == 1)
)

# Per-station aggregates. `round` here is pyspark.sql.functions.round (imported
# at the top of the cell), not the Python builtin.
average_distance = round(avg(col("distance")), 3).alias("average_distance")
crime_count = count(col("DR_NO")).alias("#")

# For each division: mean distance to the crimes it is closest to (3 decimals)
# and how many such crimes there are, with the busiest division first.
final_result_df = (
    closest_station_df
    .groupBy("division")
    .agg(average_distance, crime_count)
    .orderBy(col("#").desc())
)

# Show up to 21 rows of the result (presumably one per division - confirm).
# Spark evaluates lazily, so this show() is the action that actually runs the query.
final_result_df.show(21)

# Stop the timer: elapsed wall-clock seconds since start_time. Because it is read
# after show(), it covers the query run but not the explain() call below.
execution_time = time.time() - start_time
print(f"\nExecution time: {execution_time} seconds")

# Print the physical plan Spark chose for the final DataFrame.
final_result_df.explain()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1808,application_1765289937462_1792,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------+----------------+------+
|        division|average_distance|     #|
+----------------+----------------+------+
|       HOLLYWOOD|           2.074|224124|
|        VAN NUYS|           2.939|208129|
|       SOUTHWEST|           2.191|189119|
|        WILSHIRE|           2.593|186383|
|     77TH STREET|           1.717|170620|
| NORTH HOLLYWOOD|           2.642|168096|
|         OLYMPIC|           1.729|162805|
|         PACIFIC|           3.853|162027|
|         CENTRAL|           0.993|154689|
|         RAMPART|           1.534|153204|
|       SOUTHEAST|           2.444|143803|
|     WEST VALLEY|           3.022|136622|
|        FOOTHILL|            4.26|132482|
|         TOPANGA|           3.297|131054|
|          HARBOR|           3.702|127071|
|      HOLLENBECK|           2.677|116235|
|WEST LOS ANGELES|            2.79|115969|
|          NEWTON|           1.635|111392|
|       NORTHEAST|           3.623|108243|
|         MISSION|           3.676| 97926|
|      DEVO

## Other configurations

In [None]:
%%configure -f
{
    "conf":{
        "spark.executor.instances": "2",
        "spark.executor.memory": "4g",
        "spark.executor.cores": "2"
    }
}

In [25]:
%%configure -f
{
    "conf":{
        "spark.executor.instances": "2",
        "spark.executor.memory": "8g",
        "spark.executor.cores": "4"
    }
}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1699,application_1765289937462_1683,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1693,application_1765289937462_1677,pyspark,idle,Link,Link,,
1695,application_1765289937462_1679,pyspark,idle,Link,Link,,
1699,application_1765289937462_1683,pyspark,idle,Link,Link,,✔
1701,,pyspark,starting,,,,
