# Query 5

## Import and describe data

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import time

# Executor sizing for this run (Spark config values are passed as strings).
EXECUTORS = "8"
CORES = "1"
MEMORY = "2g"

# Build (or reuse) the Spark session with the executor settings above.
spark = (
    SparkSession.builder
    .appName("Query5")
    .config("spark.executor.instances", EXECUTORS)
    .config("spark.executor.cores", CORES)
    .config("spark.executor.memory", MEMORY)
    .getOrCreate()
)

# LA Police Stations data
laps_data_path = "s3://initial-notebook-data-bucket-dblab-905418150721/LA_Police_Stations.csv"
laps_data = spark.read.csv(laps_data_path, header=True, inferSchema=True)

# Crime Data: two CSV exports (2010-2019 and 2020-present)
crime_data_path_1 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
crime_data_path_2 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv"

crime_data_1 = spark.read.csv(crime_data_path_1, header=True, inferSchema=True)
# Normalize the 'AREA ' column name (note the trailing space) in the first file.
crime_data_1 = crime_data_1.withColumnRenamed('AREA ', 'AREA')
crime_data_2 = spark.read.csv(crime_data_path_2, header=True, inferSchema=True)

# Stack both exports and drop "null island" rows, i.e. rows where LON and LAT
# are both exactly 0. All columns are kept for the exploratory cells.
crime_data = crime_data_1.union(crime_data_2).filter(
    ~((col('LON') == 0) & (col('LAT') == 0))
)

In [None]:
# Print the inferred schema of each dataset under its own heading.
for heading, frame in (
    ("Crime Data:", crime_data),
    ("\nLA Police Stations Data:", laps_data),
):
    print(heading)
    frame.printSchema()

# Preview the first rows of the police stations table.
laps_data.show()

## Query

In [None]:
from sedona.register.geo_registrator import SedonaRegistrator
from pyspark.sql.types import DoubleType
from pyspark.sql.window import Window

def runQuery():
    """Run Query 5 once: crimes per nearest police division and mean distance.

    Reads both crime CSV exports and the LA police stations CSV from S3,
    pairs every crime with every station, keeps for each crime location the
    station(s) at minimum distance, and aggregates per ``DIVISION`` the
    number of crimes and the mean distance in kilometres (rounded to 3
    decimals), ordered by crime count descending. The result is shown on
    stdout and then written as CSV (with header, overwrite mode) to S3.

    Distances are great-circle distances computed with Sedona's
    ``ST_DistanceSphere`` (metres) and converted to kilometres. The previous
    ``ST_Distance(...) * 111`` treated one degree of longitude as 111 km,
    which overstates east-west separation away from the equator and can
    pick the wrong nearest station.

    Returns:
        float: Wall-clock seconds from just before the inputs are read until
        ``result.show()`` returns. Sedona registration and the final CSV
        write happen outside the timed window.
    """
    # Register Sedona's SQL functions (ST_Point, ST_DistanceSphere) on the session.
    SedonaRegistrator.registerAll(spark)

    # Start timing
    start_time = time.time()

    # Load and filter Crime Data
    crime_data_path_1 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
    crime_data_path_2 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv"
    crime_data_1 = spark.read.csv(crime_data_path_1, header=True, inferSchema=True) \
        .withColumnRenamed('AREA ', 'AREA')
    crime_data_2 = spark.read.csv(crime_data_path_2, header=True, inferSchema=True)
    crime_data = (
        crime_data_1.union(crime_data_2)
        .select('DR_NO', 'LAT', 'LON')
        # Drop null island (0,0) entries, then rows with a missing coordinate.
        .filter((col('LON') != 0) | (col('LAT') != 0))
        .filter(col('LAT').isNotNull() & col('LON').isNotNull())
        .withColumn('LAT', col('LAT').cast(DoubleType()))
        .withColumn('LON', col('LON').cast(DoubleType()))
    )

    # Load and filter LA Police Stations data (X is used as longitude, Y as latitude).
    laps_data_path = "s3://initial-notebook-data-bucket-dblab-905418150721/LA_Police_Stations.csv"
    laps_data = (
        spark.read.csv(laps_data_path, header=True, inferSchema=True)
        .select("FID", "X", "Y", "DIVISION")
        .withColumn("LAT", col("Y").cast(DoubleType()))
        .withColumn("LON", col("X").cast(DoubleType()))
    )

    # Build point geometries in (lon, lat) order for both DataFrames.
    crime_data = crime_data.withColumn("geometry", expr("ST_Point(LON, LAT)"))
    laps_data = laps_data.withColumn("geometry", expr("ST_Point(LON, LAT)"))

    # Cross join every crime with every station and compute the great-circle
    # distance in kilometres (ST_DistanceSphere returns metres).
    # NOTE(review): ST_DistanceSphere needs a Sedona release that ships it and
    # reads points in lon/lat order - confirm against the cluster's version.
    crossed = crime_data.alias("c").join(
        laps_data.alias("l"),
        how="cross"
    ).withColumn(
        "distance",
        expr("ST_DistanceSphere(c.geometry, l.geometry) / 1000")
    )

    # Find the closest station for each crime. Crimes sharing identical
    # coordinates share one window partition and therefore the same minimum.
    # If two stations are exactly equidistant, both rows pass the filter.
    crime_with_closest_station = crossed.withColumn(
        "min_distance",
        min("distance").over(Window.partitionBy("c.LAT", "c.LON"))
    ).filter(col("distance") == col("min_distance"))

    # Group results per division, most crimes first; round the mean last.
    result = (
        crime_with_closest_station.groupBy("l.DIVISION")
        .agg(
            count("*").alias("crime_count"),
            avg("distance").alias("mean_distance"),
        )
        .orderBy(col("crime_count").desc())
        .withColumn("mean_distance", round("mean_distance", 3))
    )

    # Show the result (this action is what the timer measures).
    result.show()

    # Stop timing and print out the execution duration
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Time taken: {elapsed_time:.2f} seconds")

    # Save Results (not included in the reported time)
    output_path = "s3://groups-bucket-dblab-905418150721/group7/q5_results"
    result.write \
        .option("header", True) \
        .mode("overwrite") \
        .csv(f"{output_path}/results")

    return elapsed_time

In [None]:
# Repeat the query num_exp times and report the mean of the returned timings.
num_exp = 10
proc_time = 0
for run_idx in range(num_exp):
    print("Working on experiment no ", run_idx)
    run_seconds = runQuery()
    # Accumulate each run's share of the average directly.
    proc_time += run_seconds / num_exp

print(f"Average Processing Time: {proc_time:.2f}")