# DataFrame - Query5 - 8 executors × 1 core/2 GB memory

## Import Data from csv files

In [1]:
from sedona.spark import *
from pyspark.sql.functions import col, lower, when
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum as _sum
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import time

start_time = time.time()

spark = SparkSession.builder \
    .appName("GeoJSON read") \
    .config("spark.executor.instances", "8") \
    .config("spark.executor.cores", "1") \
    .config("spark.executor.memory", "2g") \
    .config("spark.driver.memory", "4g") \
    .getOrCreate()

# Create sedona context
sedona = SedonaContext.create(spark)


Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
3492,application_1732639283265_3448,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [2]:
#load data from crimes
data_path = 's3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv'

df = spark.read.csv(data_path, header=True, inferSchema=True)

data2_path = 's3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv'
df2 = spark.read.csv(data2_path,header = True, inferSchema = True)

df_combined = df.union(df2)
df_combined.columns
df_combined.count()
df_combined.printSchema()



FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- DR_NO: integer (nullable = true)
 |-- Date Rptd: string (nullable = true)
 |-- DATE OCC: string (nullable = true)
 |-- TIME OCC: integer (nullable = true)
 |-- AREA : integer (nullable = true)
 |-- AREA NAME: string (nullable = true)
 |-- Rpt Dist No: integer (nullable = true)
 |-- Part 1-2: integer (nullable = true)
 |-- Crm Cd: integer (nullable = true)
 |-- Crm Cd Desc: string (nullable = true)
 |-- Mocodes: string (nullable = true)
 |-- Vict Age: integer (nullable = true)
 |-- Vict Sex: string (nullable = true)
 |-- Vict Descent: string (nullable = true)
 |-- Premis Cd: integer (nullable = true)
 |-- Premis Desc: string (nullable = true)
 |-- Weapon Used Cd: integer (nullable = true)
 |-- Weapon Desc: string (nullable = true)
 |-- Status: string (nullable = true)
 |-- Status Desc: string (nullable = true)
 |-- Crm Cd 1: integer (nullable = true)
 |-- Crm Cd 2: integer (nullable = true)
 |-- Crm Cd 3: integer (nullable = true)
 |-- Crm Cd 4: integer (nullable = true)
 |-- 

In [3]:
#load data from LA income
data3_path = 's3://initial-notebook-data-bucket-dblab-905418150721/LA_Police_Stations.csv'
df3 = spark.read.csv(data3_path,header = True, inferSchema = True)
df3.columns
df3.count()
df3.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- X: double (nullable = true)
 |-- Y: double (nullable = true)
 |-- FID: integer (nullable = true)
 |-- DIVISION: string (nullable = true)
 |-- LOCATION: string (nullable = true)
 |-- PREC: integer (nullable = true)

In [4]:
df3 = df3.withColumn("geom_police", ST_Point("X", "Y"))
df_combined = df_combined.withColumn("geom_crimes", ST_Point("LON", "LAT"))

normalized_df3 = df3.withColumn("area_name", lower(col("DIVISION")))
normalized_df_combined = df_combined.withColumn("division", lower(col("AREA NAME")))

normalized_df3=normalized_df3.select("area_name","geom_police")
normalized_df_combined=normalized_df_combined.select("division","geom_crimes")

normalized_df_combined = normalized_df_combined.withColumn(
    "division",
    when(col("division") == "n hollywood", "north hollywood")
    .when(col("division") == "west la", "west los angeles")
    .otherwise(col("division"))
)
# Ενσωμάτωση της θέσης του αστυνομικού τμήματος στο normalized_df_combined
normalized_df_combined = normalized_df_combined.join(
    normalized_df3,
    normalized_df_combined["division"] == normalized_df3["area_name"],
    "left"
).drop("area_name")

normalized_df_combined=normalized_df_combined.withColumn("distance",ST_DistanceSphere("geom_police", "geom_crimes"))

normalized_df_combined = normalized_df_combined.groupBy("division").agg(
    F.avg("distance").alias("average_distance"),
    F.count("geom_crimes").alias("#")
)

normalized_df_combined=normalized_df_combined.select("division","average_distance","#")
normalized_df_combined = normalized_df_combined.orderBy(col("#").desc())

normalized_df_combined.show(truncate=False,n=25)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------+------------------+------+
|division        |average_distance  |#     |
+----------------+------------------+------+
|77th street     |14667.404665088141|206981|
|southwest       |11919.183392928362|192367|
|pacific         |23217.83789733331 |171166|
|central         |19800.237298414475|166946|
|north hollywood |16209.757573536048|164710|
|southeast       |18117.102515811494|161256|
|hollywood       |34031.54247813018 |151053|
|newton          |12954.564101781052|148886|
|olympic         |16840.905966885024|145135|
|mission         |20202.014726011774|143777|
|northeast       |12730.983942391815|142833|
|van nuys        |14156.993167365055|142327|
|topanga         |9801.35072273564  |138708|
|devonshire      |18727.924888877005|138044|
|wilshire        |18742.94184561197 |136374|
|rampart         |16044.94987524845 |136104|
|west los angeles|13881.883428494077|134369|
|harbor          |15430.634370288282|133031|
|west valley     |11522.972959349754|131585|
|hollenbec