## Time Measurement

In [16]:
# To log our application's execution time:
import time

# decorator to calculate duration
# taken by any function.
def measure_time(func):
    # added arguments inside the inner1,
    # if function takes any arguments,
    # can be added like this.
    def inner1(*args, **kwargs):

        # Start timing
        start_time = time.time()

        func(*args, **kwargs)

        # Stop timing and print out the execution duration
        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"Time taken by {func.__name__}: {elapsed_time:.2f} seconds")

    return inner1

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Query 1

In [15]:
crime_data_19 = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv", header=True, inferSchema=True)
crime_data_20 = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv", header=True, inferSchema=True)
crime_data = crime_data_19.union(crime_data_20)

An error was encountered:
Invalid status code '404' from http://ec2-35-159-120-182.eu-central-1.compute.amazonaws.com:8998/sessions/2206 with error payload: {"msg":"Session '2206' not found."}


In [87]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Query 1") \
    .config("spark.executor.instances", "4") \
    .config("spark.executor.memory", "2g") \
    .config("spark.executor.cores", "2") \
    .getOrCreate()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [90]:
## DataFrame APIs ##

from pyspark.sql.functions import col, when

filtered_data = crime_data.filter(col("Crm Cd Desc").like("%AGGRAVATED ASSAULT%"))

start_time_df = time.time()
updated_df = filtered_data.withColumn(
    "Age_Group",
    when(col("Vict Age") < 18, "Children")
    .when((col("Vict Age") >= 18) & (col("Vict Age") < 25), "Young Adults")
    .when((col("Vict Age") >= 25) & (col("Vict Age") < 65), "Adults")
    .otherwise("Seniors")
)

age_group_counts = updated_df.groupBy("Age_Group").count()
age_group_counts = age_group_counts.orderBy(col("count").desc())
age_group_counts.show()
end_time_df = time.time()
print("DataFrame API execution time: ", end_time_df - start_time_df)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------+------+
|   Age_Group| count|
+------------+------+
|      Adults|121093|
|Young Adults| 33605|
|    Children| 15928|
|     Seniors|  5985|
+------------+------+

DataFrame API execution time:  12.480517148971558

In [91]:
crime_rdd = filtered_data.rdd

start_time_rdd = time.time()

def get_age_group(age):
    if age < 18:
        return "Children"
    elif 18 <= age < 25:
        return "Young Adults"
    elif 25 <= age < 65:
        return "Adults"
    else:
        return "Seniors"

age_group_rdd = crime_rdd.map(lambda x: (get_age_group(x['Vict Age']), 1))
age_group_count = age_group_rdd.reduceByKey(lambda a, b: a + b)

sorted_age_group_count = age_group_count.sortBy(lambda x: x[1], ascending=False)
for age_group, count in sorted_age_group_count.collect():
    print(age_group, "->", count)
end_time_rdd = time.time()
print("RDD API execution time: ", end_time_rdd - start_time_rdd)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Adults -> 121093
Young Adults -> 33605
Children -> 15928
Seniors -> 5985
RDD API execution time:  7.831646680831909

# Query 2

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, StringType, BooleanType
from pyspark.sql.functions import col, udf, sum, max, min, avg, count, mean, when, monotonically_increasing_id, dense_rank, window
from pyspark.sql.window import Window

spark = SparkSession \
    .builder \
    .appName("Query 2: 3 Police Stations for each year with biggest rate of closed cases") \
    .getOrCreate() 


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
crimes_2010_19_df = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv", header=True)
crimes_2020_24_df = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv", header=True)
crimes_df = crimes_2010_19_df.union(crimes_2020_24_df)
crimes_df.printSchema()
print("Number of Rows (Crime DataFrame)")
crimes_df.count()
# print('Crime data')
# crimes_df.show(3)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- DR_NO: string (nullable = true)
 |-- Date Rptd: string (nullable = true)
 |-- DATE OCC: string (nullable = true)
 |-- TIME OCC: string (nullable = true)
 |-- AREA : string (nullable = true)
 |-- AREA NAME: string (nullable = true)
 |-- Rpt Dist No: string (nullable = true)
 |-- Part 1-2: string (nullable = true)
 |-- Crm Cd: string (nullable = true)
 |-- Crm Cd Desc: string (nullable = true)
 |-- Mocodes: string (nullable = true)
 |-- Vict Age: string (nullable = true)
 |-- Vict Sex: string (nullable = true)
 |-- Vict Descent: string (nullable = true)
 |-- Premis Cd: string (nullable = true)
 |-- Premis Desc: string (nullable = true)
 |-- Weapon Used Cd: string (nullable = true)
 |-- Weapon Desc: string (nullable = true)
 |-- Status: string (nullable = true)
 |-- Status Desc: string (nullable = true)
 |-- Crm Cd 1: string (nullable = true)
 |-- Crm Cd 2: string (nullable = true)
 |-- Crm Cd 3: string (nullable = true)
 |-- Crm Cd 4: string (nullable = true)
 |-- LOCATION: str

In [4]:
# UDF - User Defined Functions definitions

def extract_year(date_occ: str) -> str:
    '''returns year from DATE OCC column'''
    return date_occ.split("/")[2].split(" ")[0]

def is_closed_case(case: str) -> int:
    '''returns 1 if an incident is a closed case in police department based on Status Desc else returns 0'''
    return 0 if (case=='Invest Cont' or case=='UNK') else 1

def percentage(closed: int, total: int) -> float:
    return (closed/total)*100

# print(is_closed_case('c'))
# print(extract_year("01/01/2010 12:00:..."))
extract_year_udf = udf(extract_year, StringType())
is_closed_case_udf = udf(is_closed_case, IntegerType())
percentage_udf = udf(percentage, FloatType())

# register functions for SQL
spark.udf.register("extract_year", extract_year)
spark.udf.register("is_closed_case", is_closed_case)
spark.udf.register("percentage", percentage)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

<function percentage at 0x7fb059b661f0>

In [5]:
# s3://groups-bucket-dblab-905418150721/group46/query2/
# s3://groups-bucket-dblab-905418150721/group46/query2-single-parquet/
# Write results to S3 -> 
#    1. create the output directory in your S3 bucket
#    2. change your group number below 
#    3. and uncomment
group_number = "46"
s3_path = "s3://groups-bucket-dblab-905418150721/group"+group_number+"/query2-single-parquet/"
# Repartition the DataFrame to a single partition so that it will possible be written in one parquet file
single_partition_df = crimes_df.repartition(1) 
single_partition_df.write.mode("overwrite").parquet(s3_path)
crimes_df_from_parquet = spark.read.parquet(s3_path)
# crimes_df_from_parquet.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Query 2 - DataFrame API

In [6]:
@measure_time
def query2_dataframe(df, debug = False):
    '''Returns the DF that is result of the query 2 using DataFrame API'''

    modified_df = df.select("DR_NO", "DATE OCC","AREA NAME", "Status Desc") \
        .withColumn("year", extract_year_udf(col("DATE OCC"))) \
        .withColumn("precinct", col("AREA NAME")) \
        .withColumn("is_closed_case", is_closed_case_udf(col("Status Desc")))
    if debug: modified_df.show(3)

    grouped_df = modified_df.groupBy("year", "precinct") \
        .agg( \
             count("*").alias("total_cases"), \
             sum("is_closed_case").alias("closed_cases"), \
             percentage_udf(col("closed_cases"), col("total_cases")).alias("closed_case_rate") \
            )
    if debug: grouped_df.show(3)

    # Define a window and make partitions by year in order to assign specific rank values later to the rows
    windowSpec = Window.partitionBy("year").orderBy(col("closed_case_rate").desc())
    ranked_df = grouped_df.withColumn("#", dense_rank().over(windowSpec))
    if debug: ranked_df.show(3)

    # Project specific columns, Select Top 3 for each year and sort in ascending order for year and rank (#)
    result = ranked_df.select("year", "precinct","closed_case_rate", "#") \
        .filter(col("#") <= 3) \
        .orderBy(["year", "#"], ascending=[True,True])

    result.show(50)



FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Comparison of query execution time with input from CSV or Parquet

In [7]:
@measure_time
def query2_csv(execute_query = True):
    crimes_2010_19_df = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv", header=True)
    crimes_2020_24_df = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv", header=True)
    crimes_df = crimes_2010_19_df.union(crimes_2020_24_df)
    # crimes_df.show(3)
    if execute_query: query2_dataframe(crimes_df)

@measure_time
def query2_parquet(execute_query = True):
    crimes_df_from_parquet = spark.read.parquet(s3_path)
    # crimes_df_from_parquet.show(3)
    if execute_query: query2_dataframe(crimes_df_from_parquet)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:
# Call the function that loads DF from CSV and then executes the query 
query2_csv()
# Call the function that loads DF from parquet and then executes the query 
query2_parquet()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----+-----------+----------------+---+
|year|   precinct|closed_case_rate|  #|
+----+-----------+----------------+---+
|2010|    Rampart|       32.847134|  1|
|2010|    Olympic|        31.51529|  2|
|2010|     Harbor|       29.360283|  3|
|2011|    Olympic|       35.040062|  1|
|2011|    Rampart|        32.49645|  2|
|2011|     Harbor|       28.513363|  3|
|2012|    Olympic|       34.297085|  1|
|2012|    Rampart|       32.460003|  2|
|2012|     Harbor|       29.509586|  3|
|2013|    Olympic|        33.58218|  1|
|2013|    Rampart|       32.106037|  2|
|2013|     Harbor|       29.723639|  3|
|2014|   Van Nuys|       32.021523|  1|
|2014|West Valley|       31.497547|  2|
|2014|    Mission|        31.22494|  3|
|2015|   Van Nuys|        32.26514|  1|
|2015|    Mission|       30.463762|  2|
|2015|   Foothill|       30.353003|  3|
|2016|   Van Nuys|        32.19452|  1|
|2016|West Valley|       31.401464|  2|
|2016|   Foothill|       29.908648|  3|
|2017|   Van Nuys|       32.055428|  1|


## Query 2 - SQL API

In [9]:
@measure_time
def query2_sql(df, debug = False):
    '''Returns the DF that is result of the query 2 using SQL API'''

    df.createOrReplaceTempView("crimes")

    query = """
        SELECT `DR_NO`, `DATE OCC`, `AREA NAME`, `Status Desc`, extract_year(`DATE OCC`) as year, `AREA NAME` as precinct, is_closed_case(`Status Desc`) as is_closed_case 
        FROM crimes
    """

    modified_crimes = spark.sql(query)
    if debug: modified_crimes.show(3)
    modified_crimes.createOrReplaceTempView("modified_crimes")

    query = """
        SELECT year, precinct, count(*) as total_cases, CAST(sum(is_closed_case) AS INT) as closed_cases, CAST(percentage(`closed_cases`,`total_cases`) AS DECIMAL(10,6)) as closed_case_rate
        FROM modified_crimes
        GROUP BY year, precinct
    """

    grouped_crimes = spark.sql(query)
    if debug: grouped_crimes.show(3)
    grouped_crimes.createOrReplaceTempView("grouped_crimes")

    query = """
        SELECT year, precinct,  total_cases, closed_cases, closed_case_rate, DENSE_RANK() OVER(PARTITION BY year ORDER BY closed_case_rate DESC) as `#`
        FROM grouped_crimes
        ORDER BY year, closed_case_rate DESC
    """

    ranked_crimes = spark.sql(query)
    if debug: ranked_crimes.show(3)
    ranked_crimes.createOrReplaceTempView("ranked_crimes")

    query = """
        SELECT year, precinct, closed_case_rate, `#`
        FROM ranked_crimes
        WHERE `#` <= 3
        ORDER BY year, `#`
    """

    result = spark.sql(query)
    result.show(50)
    result.createOrReplaceTempView("result")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Comparison of query execution time using DataFrame or SQL API

In [10]:
# Execute the Query 2 with the DF API
query2_dataframe(crimes_df)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----+-----------+----------------+---+
|year|   precinct|closed_case_rate|  #|
+----+-----------+----------------+---+
|2010|    Rampart|       32.847134|  1|
|2010|    Olympic|        31.51529|  2|
|2010|     Harbor|       29.360283|  3|
|2011|    Olympic|       35.040062|  1|
|2011|    Rampart|        32.49645|  2|
|2011|     Harbor|       28.513363|  3|
|2012|    Olympic|       34.297085|  1|
|2012|    Rampart|       32.460003|  2|
|2012|     Harbor|       29.509586|  3|
|2013|    Olympic|        33.58218|  1|
|2013|    Rampart|       32.106037|  2|
|2013|     Harbor|       29.723639|  3|
|2014|   Van Nuys|       32.021523|  1|
|2014|West Valley|       31.497547|  2|
|2014|    Mission|        31.22494|  3|
|2015|   Van Nuys|        32.26514|  1|
|2015|    Mission|       30.463762|  2|
|2015|   Foothill|       30.353003|  3|
|2016|   Van Nuys|        32.19452|  1|
|2016|West Valley|       31.401464|  2|
|2016|   Foothill|       29.908648|  3|
|2017|   Van Nuys|       32.055428|  1|


In [11]:
# Execute Query 2 with the SQL API
query2_sql(crimes_df)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----+-----------+----------------+---+
|year|   precinct|closed_case_rate|  #|
+----+-----------+----------------+---+
|2010|    Rampart|       32.847134|  1|
|2010|    Olympic|       31.515290|  2|
|2010|     Harbor|       29.360283|  3|
|2011|    Olympic|       35.040060|  1|
|2011|    Rampart|       32.496447|  2|
|2011|     Harbor|       28.513362|  3|
|2012|    Olympic|       34.297085|  1|
|2012|    Rampart|       32.460005|  2|
|2012|     Harbor|       29.509586|  3|
|2013|    Olympic|       33.582179|  1|
|2013|    Rampart|       32.106038|  2|
|2013|     Harbor|       29.723639|  3|
|2014|   Van Nuys|       32.021524|  1|
|2014|West Valley|       31.497548|  2|
|2014|    Mission|       31.224940|  3|
|2015|   Van Nuys|       32.265141|  1|
|2015|    Mission|       30.463763|  2|
|2015|   Foothill|       30.353002|  3|
|2016|   Van Nuys|       32.194518|  1|
|2016|West Valley|       31.401464|  2|
|2016|   Foothill|       29.908647|  3|
|2017|   Van Nuys|       32.055427|  1|


# Query 3

In [1]:
from sedona.spark import *
from pyspark.sql.functions import col
from pyspark.sql import SparkSession

# Create spark Session
spark = SparkSession.builder \
    .appName("GeoJSON read") \
    .getOrCreate()

# Create sedona context
sedona = SedonaContext.create(spark)
# Read the file from s3
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"
blocks_df = sedona.read.format("geojson") \
            .option("multiLine", "true").load(geojson_path) \
            .selectExpr("explode(features) as features") \
            .select("features.*")
# Formatting magic
flattened_df = blocks_df.select( \
                [col(f"properties.{col_name}").alias(col_name) for col_name in \
                blocks_df.schema["properties"].dataType.fieldNames()] + ["geometry"]) \
            .drop("properties") \
            .drop("type")
# Print schema
flattened_df.printSchema()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
2265,application_1732639283265_2225,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- BG10: string (nullable = true)
 |-- BG10FIP10: string (nullable = true)
 |-- BG12: string (nullable = true)
 |-- CB10: string (nullable = true)
 |-- CEN_FIP13: string (nullable = true)
 |-- CITY: string (nullable = true)
 |-- CITYCOM: string (nullable = true)
 |-- COMM: string (nullable = true)
 |-- CT10: string (nullable = true)
 |-- CT12: string (nullable = true)
 |-- CTCB10: string (nullable = true)
 |-- HD_2012: long (nullable = true)
 |-- HD_NAME: string (nullable = true)
 |-- HOUSING10: long (nullable = true)
 |-- LA_FIP10: string (nullable = true)
 |-- OBJECTID: long (nullable = true)
 |-- POP_2010: long (nullable = true)
 |-- PUMA10: string (nullable = true)
 |-- SPA_2012: long (nullable = true)
 |-- SPA_NAME: string (nullable = true)
 |-- SUP_DIST: string (nullable = true)
 |-- SUP_LABEL: string (nullable = true)
 |-- ShapeSTArea: double (nullable = true)
 |-- ShapeSTLength: double (nullable = true)
 |-- ZCTA10: string (nullable = true)
 |-- geometry: geometry (nulla

In [2]:
crime_data = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv", header=True, inferSchema=True)
census_data = spark.read.json("s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson")
income_data = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/LA_income_2015.csv", header=True, inferSchema=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
sedona = SedonaContext.create(spark)
# Read the file from s3
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"
blocks_df = sedona.read.format("geojson") \
            .option("multiLine", "true").load(geojson_path) \
            .selectExpr("explode(features) as features") \
            .select("features.*")

flat_census_data = blocks_df.select( \
                [col(f"properties.{col_name}").alias(col_name) for col_name in \
                blocks_df.schema["properties"].dataType.fieldNames()] + ["geometry"]) \
            .drop("properties") \
            .drop("type")
# Print schema
flat_census_data.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- BG10: string (nullable = true)
 |-- BG10FIP10: string (nullable = true)
 |-- BG12: string (nullable = true)
 |-- CB10: string (nullable = true)
 |-- CEN_FIP13: string (nullable = true)
 |-- CITY: string (nullable = true)
 |-- CITYCOM: string (nullable = true)
 |-- COMM: string (nullable = true)
 |-- CT10: string (nullable = true)
 |-- CT12: string (nullable = true)
 |-- CTCB10: string (nullable = true)
 |-- HD_2012: long (nullable = true)
 |-- HD_NAME: string (nullable = true)
 |-- HOUSING10: long (nullable = true)
 |-- LA_FIP10: string (nullable = true)
 |-- OBJECTID: long (nullable = true)
 |-- POP_2010: long (nullable = true)
 |-- PUMA10: string (nullable = true)
 |-- SPA_2012: long (nullable = true)
 |-- SPA_NAME: string (nullable = true)
 |-- SUP_DIST: string (nullable = true)
 |-- SUP_LABEL: string (nullable = true)
 |-- ShapeSTArea: double (nullable = true)
 |-- ShapeSTLength: double (nullable = true)
 |-- ZCTA10: string (nullable = true)
 |-- geometry: geometry (nulla

In [5]:
flat_census_data.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

109279

In [4]:
LA_areas = flat_census_data.filter(col("CITY") == "Los Angeles")
LA_areas.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

30637

In [5]:
from pyspark.sql.functions import col, sum, avg

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
joined_df = LA_areas.join(income_data, LA_areas["ZCTA10"] == income_data["Zip Code"], "inner")
joined_df.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- BG10: string (nullable = true)
 |-- BG10FIP10: string (nullable = true)
 |-- BG12: string (nullable = true)
 |-- CB10: string (nullable = true)
 |-- CEN_FIP13: string (nullable = true)
 |-- CITY: string (nullable = true)
 |-- CITYCOM: string (nullable = true)
 |-- COMM: string (nullable = true)
 |-- CT10: string (nullable = true)
 |-- CT12: string (nullable = true)
 |-- CTCB10: string (nullable = true)
 |-- HD_2012: long (nullable = true)
 |-- HD_NAME: string (nullable = true)
 |-- HOUSING10: long (nullable = true)
 |-- LA_FIP10: string (nullable = true)
 |-- OBJECTID: long (nullable = true)
 |-- POP_2010: long (nullable = true)
 |-- PUMA10: string (nullable = true)
 |-- SPA_2012: long (nullable = true)
 |-- SPA_NAME: string (nullable = true)
 |-- SUP_DIST: string (nullable = true)
 |-- SUP_LABEL: string (nullable = true)
 |-- ShapeSTArea: double (nullable = true)
 |-- ShapeSTLength: double (nullable = true)
 |-- ZCTA10: string (nullable = true)
 |-- geometry: geometry (nulla

In [7]:
joined_df.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- BG10: string (nullable = true)
 |-- BG10FIP10: string (nullable = true)
 |-- BG12: string (nullable = true)
 |-- CB10: string (nullable = true)
 |-- CEN_FIP13: string (nullable = true)
 |-- CITY: string (nullable = true)
 |-- CITYCOM: string (nullable = true)
 |-- COMM: string (nullable = true)
 |-- CT10: string (nullable = true)
 |-- CT12: string (nullable = true)
 |-- CTCB10: string (nullable = true)
 |-- HD_2012: long (nullable = true)
 |-- HD_NAME: string (nullable = true)
 |-- HOUSING10: long (nullable = true)
 |-- LA_FIP10: string (nullable = true)
 |-- OBJECTID: long (nullable = true)
 |-- POP_2010: long (nullable = true)
 |-- PUMA10: string (nullable = true)
 |-- SPA_2012: long (nullable = true)
 |-- SPA_NAME: string (nullable = true)
 |-- SUP_DIST: string (nullable = true)
 |-- SUP_LABEL: string (nullable = true)
 |-- ShapeSTArea: double (nullable = true)
 |-- ShapeSTLength: double (nullable = true)
 |-- ZCTA10: string (nullable = true)
 |-- geometry: geometry (nulla

In [22]:
joined_df.select("Estimated Median Income").show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----------------------+
|Estimated Median Income|
+-----------------------+
|                $84,679|
|                $50,879|
|                $50,879|
|                $50,879|
|                $50,879|
|                $84,679|
|                $50,879|
|                $50,879|
|                $50,879|
|                $50,879|
|                $50,879|
|                $50,879|
|                $50,879|
|                $41,569|
|                $41,569|
|                $41,569|
|                $41,569|
|                $41,569|
|                $38,330|
|                $50,879|
+-----------------------+
only showing top 20 rows

In [8]:
from pyspark.sql.functions import regexp_replace

joined_df = joined_df.withColumn(
    "Estimated Median Income",
    regexp_replace(col("Estimated Median Income"), "[\\$,]", ""))
    
joined_df = joined_df.withColumn(
    "Estimated Median Income",
    col("Estimated Median Income").cast("double")
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [9]:
joined_df.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- BG10: string (nullable = true)
 |-- BG10FIP10: string (nullable = true)
 |-- BG12: string (nullable = true)
 |-- CB10: string (nullable = true)
 |-- CEN_FIP13: string (nullable = true)
 |-- CITY: string (nullable = true)
 |-- CITYCOM: string (nullable = true)
 |-- COMM: string (nullable = true)
 |-- CT10: string (nullable = true)
 |-- CT12: string (nullable = true)
 |-- CTCB10: string (nullable = true)
 |-- HD_2012: long (nullable = true)
 |-- HD_NAME: string (nullable = true)
 |-- HOUSING10: long (nullable = true)
 |-- LA_FIP10: string (nullable = true)
 |-- OBJECTID: long (nullable = true)
 |-- POP_2010: long (nullable = true)
 |-- PUMA10: string (nullable = true)
 |-- SPA_2012: long (nullable = true)
 |-- SPA_NAME: string (nullable = true)
 |-- SUP_DIST: string (nullable = true)
 |-- SUP_LABEL: string (nullable = true)
 |-- ShapeSTArea: double (nullable = true)
 |-- ShapeSTLength: double (nullable = true)
 |-- ZCTA10: string (nullable = true)
 |-- geometry: geometry (nulla

In [10]:
joined_df.select("Estimated Median Income").show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----------------------+
|Estimated Median Income|
+-----------------------+
|                84679.0|
|                50879.0|
|                50879.0|
|                50879.0|
|                50879.0|
|                84679.0|
|                50879.0|
|                50879.0|
|                50879.0|
|                50879.0|
|                50879.0|
|                50879.0|
|                50879.0|
|                41569.0|
|                41569.0|
|                41569.0|
|                41569.0|
|                41569.0|
|                38330.0|
|                50879.0|
+-----------------------+
only showing top 20 rows

In [11]:
joined_df.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

30539

In [12]:
from pyspark.sql.functions import col, sum as spark_sum, avg

LA_areas = joined_df.groupBy("COMM").agg(
                spark_sum("POP_2010").alias("total_population"),
                spark_sum("HOUSING10").alias("total_housing"),
                avg("Estimated Median Income").alias("average_income_per_house"),
                ST_Union_Aggr("geometry").alias("geometry"))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [13]:
LA_areas.select("COMM", "total_population", "total_housing", "average_income_per_house").show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------------+----------------+-------------+------------------------+
|              COMM|total_population|total_housing|average_income_per_house|
+------------------+----------------+-------------+------------------------+
|    Toluca Terrace|            1301|          541|                 48499.0|
|      Elysian Park|            5267|         1993|        35151.9801980198|
|          Longwood|            4210|         1474|                 38330.0|
|     Green Meadows|           19821|         5204|      30573.460674157304|
|  Cadillac-Corning|            6665|         2215|                 62425.2|
|          Mid-city|           14339|         6692|                 46571.0|
|   Lincoln Heights|           31144|         9316|       37990.15120274914|
|          Van Nuys|           86019|        29170|      43827.914666666664|
|    Gramercy Place|           10361|         3941|                 39269.0|
| Faircrest Heights|            3443|         1356|       52901.33802816901|

In [14]:
LA_areas.printSchema()
LA_areas.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- COMM: string (nullable = true)
 |-- total_population: long (nullable = true)
 |-- total_housing: long (nullable = true)
 |-- average_income_per_house: double (nullable = true)
 |-- geometry: geometry (nullable = true)

139

In [15]:
LA_areas.filter((col("total_population") == 0) | col("total_population").isNull()).count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

0

In [16]:
result_df = LA_areas.withColumn(
    "median_income_per_person",
    (col("total_housing") * col("average_income_per_house")) / col("total_population")
)

result_df.count()
result_df.select("COMM", "median_income_per_person").show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------------+------------------------+
|              COMM|median_income_per_person|
+------------------+------------------------+
|    Toluca Terrace|      20167.531898539586|
|      Elysian Park|      13301.290399592457|
|          Longwood|      13420.052256532066|
|     Green Meadows|      8027.0566242023415|
|  Cadillac-Corning|       20745.95918979745|
|          Mid-city|       21734.64899923286|
|   Lincoln Heights|       11363.86618946863|
|          Van Nuys|       14862.53351964876|
|    Gramercy Place|      14936.698098639128|
| Faircrest Heights|      20834.799409293402|
|     Boyle Heights|        8559.56907427354|
|  Lafayette Square|       16615.79380044813|
|     Granada Hills|      26715.151041954494|
|       North Hills|      16218.058419279136|
|        Northridge|       22904.10107292212|
|   Wilshire Center|      16115.224904545199|
|    Jefferson Park|      10258.852338413031|
|    Vermont Square|       8324.469247147457|
|Cloverdale/Cochran|      14893.43

In [20]:
sorted_df = result_df.orderBy(col("median_income_per_person").desc())
sorted_df.select("COMM", "median_income_per_person").show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------------+------------------------+
|               COMM|median_income_per_person|
+-------------------+------------------------+
|  Pacific Palisades|        70526.2203104497|
|      Beverly Crest|       66513.90150799365|
|   Marina Peninsula|       65235.69402813004|
|Palisades Highlands|       65048.95354904471|
|            Bel Air|       63259.97685510228|
|  Mandeville Canyon|       61443.86522911051|
|          Brentwood|      60696.777650004915|
|            Carthay|      50282.692104378286|
|             Venice|       46575.69192582585|
|       Century City|       45707.53601562712|
|      Playa Del Rey|         45522.596580114|
|        Playa Vista|      44472.100292884345|
|    Hollywood Hills|      43713.597155829746|
|        Studio City|       42206.35394275496|
|   West Los Angeles|       40983.06782689424|
|      South Carthay|      39642.419795898146|
|             Encino|       39546.65508835928|
|       Miracle Mile|       38981.93388699816|
|        Ranc

In [24]:
from sedona.spark import *
from pyspark.sql.functions import col
from pyspark.sql import SparkSession

crime_data = crime_data.withColumn("geometry", ST_Point("LON", "LAT"))
crime_data.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- DR_NO: integer (nullable = true)
 |-- Date Rptd: string (nullable = true)
 |-- DATE OCC: string (nullable = true)
 |-- TIME OCC: integer (nullable = true)
 |-- AREA : integer (nullable = true)
 |-- AREA NAME: string (nullable = true)
 |-- Rpt Dist No: integer (nullable = true)
 |-- Part 1-2: integer (nullable = true)
 |-- Crm Cd: integer (nullable = true)
 |-- Crm Cd Desc: string (nullable = true)
 |-- Mocodes: string (nullable = true)
 |-- Vict Age: integer (nullable = true)
 |-- Vict Sex: string (nullable = true)
 |-- Vict Descent: string (nullable = true)
 |-- Premis Cd: integer (nullable = true)
 |-- Premis Desc: string (nullable = true)
 |-- Weapon Used Cd: integer (nullable = true)
 |-- Weapon Desc: string (nullable = true)
 |-- Status: string (nullable = true)
 |-- Status Desc: string (nullable = true)
 |-- Crm Cd 1: integer (nullable = true)
 |-- Crm Cd 2: integer (nullable = true)
 |-- Crm Cd 3: integer (nullable = true)
 |-- Crm Cd 4: integer (nullable = true)
 |-- 

In [34]:
joined_crimes_df = result_df \
    .join(crime_data, ST_Within(result_df.geometry, crime_data.geometry), "inner")
joined_crimes_df.printSchema()
joined_crimes_df.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- COMM: string (nullable = true)
 |-- total_population: long (nullable = true)
 |-- total_housing: long (nullable = true)
 |-- average_income_per_house: double (nullable = true)
 |-- geometry: geometry (nullable = true)
 |-- median_income_per_person: double (nullable = true)
 |-- DR_NO: integer (nullable = true)
 |-- Date Rptd: string (nullable = true)
 |-- DATE OCC: string (nullable = true)
 |-- TIME OCC: integer (nullable = true)
 |-- AREA : integer (nullable = true)
 |-- AREA NAME: string (nullable = true)
 |-- Rpt Dist No: integer (nullable = true)
 |-- Part 1-2: integer (nullable = true)
 |-- Crm Cd: integer (nullable = true)
 |-- Crm Cd Desc: string (nullable = true)
 |-- Mocodes: string (nullable = true)
 |-- Vict Age: integer (nullable = true)
 |-- Vict Sex: string (nullable = true)
 |-- Vict Descent: string (nullable = true)
 |-- Premis Cd: integer (nullable = true)
 |-- Premis Desc: string (nullable = true)
 |-- Weapon Used Cd: integer (nullable = true)
 |-- Weapon De

# Query 4

## Configurations

In [3]:
%%configure -f
{
    "conf": {
        "spark.executor.instances": "2",
        "spark.executor.memory": "2g",
        "spark.executor.cores": "1"
    }
}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
2313,application_1732639283265_2273,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
2253,application_1732639283265_2213,pyspark,idle,Link,Link,,
2255,application_1732639283265_2215,pyspark,idle,Link,Link,,
2257,application_1732639283265_2217,pyspark,idle,Link,Link,,
2260,application_1732639283265_2220,pyspark,idle,Link,Link,,
2266,application_1732639283265_2226,pyspark,idle,Link,Link,,
2267,application_1732639283265_2227,pyspark,idle,Link,Link,,
2268,application_1732639283265_2228,pyspark,idle,Link,Link,,
2269,application_1732639283265_2229,pyspark,idle,Link,Link,,
2270,application_1732639283265_2230,pyspark,idle,Link,Link,,
2280,application_1732639283265_2240,pyspark,idle,Link,Link,,


In [9]:
%%configure -f
{
    "conf": {
        "spark.executor.instances": "2",
        "spark.executor.memory": "4g",
        "spark.executor.cores": "2"
    }
}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
2314,application_1732639283265_2274,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
2253,application_1732639283265_2213,pyspark,idle,Link,Link,,
2255,application_1732639283265_2215,pyspark,idle,Link,Link,,
2257,application_1732639283265_2217,pyspark,idle,Link,Link,,
2260,application_1732639283265_2220,pyspark,idle,Link,Link,,
2266,application_1732639283265_2226,pyspark,idle,Link,Link,,
2267,application_1732639283265_2227,pyspark,idle,Link,Link,,
2268,application_1732639283265_2228,pyspark,idle,Link,Link,,
2269,application_1732639283265_2229,pyspark,idle,Link,Link,,
2270,application_1732639283265_2230,pyspark,idle,Link,Link,,
2280,application_1732639283265_2240,pyspark,idle,Link,Link,,


In [15]:
%%configure -f
{
    "conf": {
        "spark.executor.instances": "2",
        "spark.executor.memory": "8g",
        "spark.executor.cores": "4"
    }
}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
2315,application_1732639283265_2275,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
2253,application_1732639283265_2213,pyspark,idle,Link,Link,,
2255,application_1732639283265_2215,pyspark,idle,Link,Link,,
2260,application_1732639283265_2220,pyspark,idle,Link,Link,,
2266,application_1732639283265_2226,pyspark,idle,Link,Link,,
2267,application_1732639283265_2227,pyspark,idle,Link,Link,,
2268,application_1732639283265_2228,pyspark,idle,Link,Link,,
2269,application_1732639283265_2229,pyspark,idle,Link,Link,,
2270,application_1732639283265_2230,pyspark,idle,Link,Link,,
2280,application_1732639283265_2240,pyspark,idle,Link,Link,,
2281,application_1732639283265_2241,pyspark,idle,Link,Link,,


## PySpark & Sedona imports, Read datasets, register functions

In [17]:
from sedona.spark import *
from pyspark.sql.functions import col, sum, avg, regexp_replace, row_number, count
from pyspark.sql.window import Window
from pyspark.sql import SparkSession

# Create spark Session
spark = SparkSession.builder \
    .appName("GeoJSON read") \
    .getOrCreate()

sedona = SedonaContext.create(spark)
# Read the file from s3
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"
blocks_df = sedona.read.format("geojson") \
            .option("multiLine", "true").load(geojson_path) \
            .selectExpr("explode(features) as features") \
            .select("features.*")

flat_census_data = blocks_df.select( \
                [col(f"properties.{col_name}").alias(col_name) for col_name in \
                blocks_df.schema["properties"].dataType.fieldNames()] + ["geometry"]) \
            .drop("properties") \
            .drop("type")
# Print schema
flat_census_data.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- BG10: string (nullable = true)
 |-- BG10FIP10: string (nullable = true)
 |-- BG12: string (nullable = true)
 |-- CB10: string (nullable = true)
 |-- CEN_FIP13: string (nullable = true)
 |-- CITY: string (nullable = true)
 |-- CITYCOM: string (nullable = true)
 |-- COMM: string (nullable = true)
 |-- CT10: string (nullable = true)
 |-- CT12: string (nullable = true)
 |-- CTCB10: string (nullable = true)
 |-- HD_2012: long (nullable = true)
 |-- HD_NAME: string (nullable = true)
 |-- HOUSING10: long (nullable = true)
 |-- LA_FIP10: string (nullable = true)
 |-- OBJECTID: long (nullable = true)
 |-- POP_2010: long (nullable = true)
 |-- PUMA10: string (nullable = true)
 |-- SPA_2012: long (nullable = true)
 |-- SPA_NAME: string (nullable = true)
 |-- SUP_DIST: string (nullable = true)
 |-- SUP_LABEL: string (nullable = true)
 |-- ShapeSTArea: double (nullable = true)
 |-- ShapeSTLength: double (nullable = true)
 |-- ZCTA10: string (nullable = true)
 |-- geometry: geometry (nulla

In [18]:
crimes_2010_19_df = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv", header=True)
crimes_2020_24_df = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv", header=True)
crimes_data = crimes_2010_19_df.union(crimes_2020_24_df)
census_data = spark.read.json("s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson")
income_data = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/LA_income_2015.csv", header=True, inferSchema=True)
RE_data = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/RE_codes.csv", header=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Query 4 - DataFrame API

In [19]:
@measure_time
def query4_dataframe(debug = False):
    '''Shows the two DFs that are the result of the query 4 using DataFrame API'''
    global flat_census_data, income_data, crimes_data, RE_data
    
    
    # Filter rows so that dataset refers only to Los Angeles communities
    LA_areas = flat_census_data.filter(col("CITY") == "Los Angeles")

    # Join datasets Census and Average Household Income on Zip Code
    joined_df = LA_areas.join(income_data, LA_areas["ZCTA10"] == income_data["Zip Code"], "inner")

    # Cast the "Estimated Median Income" string column as a double
    joined_df = joined_df.withColumn(
        "Estimated Median Income",
        regexp_replace(col("Estimated Median Income"), "[\\$,]", ""))
    joined_df = joined_df.withColumn(
        "Estimated Median Income",
        col("Estimated Median Income").cast("double")
    )

    # Group Dataframe by COMM (Community) -  sum of population, households, avg of average income per house, aggregate geometries
    weighted_avg = False
    if weighted_avg:
        LA_comms = joined_df.groupBy("COMM").agg(
            sum("POP_2010").alias("total_population"),
            sum("HOUSING10").alias("total_housing"),
            ( sum(col("Estimated Median Income")*col("HOUSING10"))/ sum("HOUSING10") ).alias("average_income_per_house"),
            ST_Union_Aggr("geometry").alias("geometry"))
    else:
        LA_comms = joined_df.groupBy("COMM").agg(
            sum("POP_2010").alias("total_population"),
            sum("HOUSING10").alias("total_housing"),
            avg("Estimated Median Income").alias("average_income_per_house"),
            ST_Union_Aggr("geometry").alias("geometry")) 

    # Extract median income per person from average income per house, total households and population
    result_df = LA_comms.withColumn(
        "median_income_per_person",
        (col("total_housing") * col("average_income_per_house")) / col("total_population")
    )

    # Order by descending median income per person
    sorted_df = result_df.orderBy(col("median_income_per_person").desc())

    # Show dataframe
    if debug: sorted_df.select("COMM", "median_income_per_person").show()

    
    ############################ Query 4 Starts here - The above was also done in previous query ##################################
    
    # select the top 3 and the last 3 communities after ranking areas by median_income_per_person
    df_ranked = sorted_df.withColumn("rank", row_number().over(Window.orderBy(col("median_income_per_person").desc())))
    top_3 = df_ranked.filter(df_ranked.rank <= 3).select("COMM", "median_income_per_person", "geometry")
    bottom_3 = df_ranked.filter(df_ranked.rank > (df_ranked.count() - 3)).select("COMM", "median_income_per_person", "geometry")
    if debug: top_3.show()
    if debug: bottom_3.show()

    # append geom column in crime data based on LON, LAT
    crime_geo = crimes_data.withColumn("geom", ST_Point("LON", "LAT"))

    # crime_geo_RE JOIN (the above top_3 and bottom_3 dataframes) on ST_Within(crime_geo.geom, result_df.geometry) ...
    crimes_top_3_comm = top_3 \
        .join(crime_geo, ST_Within(crime_geo.geom, top_3.geometry), "inner")
    crimes_bottom_3_comm = bottom_3 \
        .join(crime_geo, ST_Within(crime_geo.geom, bottom_3.geometry), "inner")

    # results above JOIN with RE codes on Vict Descent (same on both datasets)
    crimes_top_3_comm_RE = crimes_top_3_comm \
        .join(RE_data, RE_data["Vict Descent"] == crimes_top_3_comm["Vict Descent"], "inner") \
        .select( col("DR_NO"), col("AREA NAME"), col("LON"), col("LAT"), col("geom"), col("Vict Descent Full") \
        )
    crimes_bottom_3_comm_RE = crimes_bottom_3_comm \
        .join(RE_data, RE_data["Vict Descent"] == crimes_bottom_3_comm["Vict Descent"], "inner") \
        .select( col("DR_NO"), col("AREA NAME"), col("LON"), col("LAT"), col("geom"), col("Vict Descent Full") \
        )

    # group by Vict Descent select Vict Descent Full, count(*) as #
    crimes_top_3_comm_grouped = crimes_top_3_comm_RE \
        .groupBy("Vict Descent Full") \
        .agg(count("*").alias("#"))
    crimes_top_3_comm_grouped = crimes_top_3_comm_grouped.select("Vict Descent Full", "#")
    crimes_bottom_3_comm_grouped = crimes_bottom_3_comm_RE \
        .groupBy("Vict Descent Full") \
        .agg(count("*").alias("#")) 
    crimes_bottom_3_comm_grouped = crimes_bottom_3_comm_grouped.select("Vict Descent Full", "#")

    # order by # descending
    crimes_top_3_comm_grouped = crimes_top_3_comm_grouped.orderBy("#", ascending=False)
    crimes_bottom_3_comm_grouped = crimes_bottom_3_comm_grouped.orderBy("#", ascending=False)

    # show
    crimes_top_3_comm_grouped.show()
    crimes_bottom_3_comm_grouped.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Experiments

In [8]:
# Access configuration
conf = spark.sparkContext.getConf()

# Print relevant executor settings
print("Executor Instances:", conf.get("spark.executor.instances"))
print("Executor Memory:", conf.get("spark.executor.memory"))
print("Executor Cores:", conf.get("spark.executor.cores"))

query4_dataframe()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Executor Instances: 2
Executor Memory: 2g
Executor Cores: 1
+--------------------+-----+
|   Vict Descent Full|    #|
+--------------------+-----+
|               White|10704|
|               Other| 1758|
|Hispanic/Latin/Me...| 1119|
|             Unknown|  746|
|               Black|  646|
|         Other Asian|  423|
|             Chinese|   60|
|              Korean|   24|
|            Japanese|   23|
|            Filipino|   20|
|American Indian/A...|    7|
|         AsianIndian|    6|
|          Vietnamese|    5|
|            Hawaiian|    2|
|    Pacific Islander|    1|
|             Laotian|    1|
+--------------------+-----+

+--------------------+-----+
|   Vict Descent Full|    #|
+--------------------+-----+
|Hispanic/Latin/Me...|45288|
|               Black|13310|
|               White| 7091|
|               Other| 3280|
|             Unknown| 2626|
|         Other Asian| 1980|
|American Indian/A...|  299|
|             Chinese|  145|
|              Korean|  134|
|          

In [14]:
# Access configuration
conf = spark.sparkContext.getConf()

# Print relevant executor settings
print("Executor Instances:", conf.get("spark.executor.instances"))
print("Executor Memory:", conf.get("spark.executor.memory"))
print("Executor Cores:", conf.get("spark.executor.cores"))

query4_dataframe()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Executor Instances: 2
Executor Memory: 4g
Executor Cores: 2
+--------------------+-----+
|   Vict Descent Full|    #|
+--------------------+-----+
|               White|10704|
|               Other| 1758|
|Hispanic/Latin/Me...| 1119|
|             Unknown|  746|
|               Black|  646|
|         Other Asian|  423|
|             Chinese|   60|
|              Korean|   24|
|            Japanese|   23|
|            Filipino|   20|
|American Indian/A...|    7|
|         AsianIndian|    6|
|          Vietnamese|    5|
|            Hawaiian|    2|
|             Laotian|    1|
|    Pacific Islander|    1|
+--------------------+-----+

+--------------------+-----+
|   Vict Descent Full|    #|
+--------------------+-----+
|Hispanic/Latin/Me...|45288|
|               Black|13310|
|               White| 7091|
|               Other| 3280|
|             Unknown| 2626|
|         Other Asian| 1980|
|American Indian/A...|  299|
|             Chinese|  145|
|              Korean|  134|
|          

In [20]:
# Access configuration
conf = spark.sparkContext.getConf()

# Print relevant executor settings
print("Executor Instances:", conf.get("spark.executor.instances"))
print("Executor Memory:", conf.get("spark.executor.memory"))
print("Executor Cores:", conf.get("spark.executor.cores"))

query4_dataframe()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Executor Instances: 2
Executor Memory: 8g
Executor Cores: 4
+--------------------+-----+
|   Vict Descent Full|    #|
+--------------------+-----+
|               White|10704|
|               Other| 1758|
|Hispanic/Latin/Me...| 1119|
|             Unknown|  746|
|               Black|  646|
|         Other Asian|  423|
|             Chinese|   60|
|              Korean|   24|
|            Japanese|   23|
|            Filipino|   20|
|American Indian/A...|    7|
|         AsianIndian|    6|
|          Vietnamese|    5|
|            Hawaiian|    2|
|             Laotian|    1|
|    Pacific Islander|    1|
+--------------------+-----+

+--------------------+-----+
|   Vict Descent Full|    #|
+--------------------+-----+
|Hispanic/Latin/Me...|45288|
|               Black|13310|
|               White| 7091|
|               Other| 3280|
|             Unknown| 2626|
|         Other Asian| 1980|
|American Indian/A...|  299|
|             Chinese|  145|
|              Korean|  134|
|          

# Query 5

## Configurations

In [2]:
%%configure -f
{
    "conf": {
        "spark.executor.instances": "2",
        "spark.executor.memory": "8g",
        "spark.executor.cores": "4"
    }
}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1268,application_1732639283265_1231,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1208,application_1732639283265_1171,pyspark,idle,Link,Link,,
1211,application_1732639283265_1174,pyspark,idle,Link,Link,,
1212,application_1732639283265_1175,pyspark,idle,Link,Link,,
1213,application_1732639283265_1176,pyspark,idle,Link,Link,,
1233,application_1732639283265_1196,pyspark,idle,Link,Link,,
1236,application_1732639283265_1199,pyspark,idle,Link,Link,,
1238,application_1732639283265_1201,pyspark,idle,Link,Link,,
1240,application_1732639283265_1203,pyspark,idle,Link,Link,,
1243,application_1732639283265_1206,pyspark,idle,Link,Link,,
1244,application_1732639283265_1207,pyspark,idle,Link,Link,,


In [7]:
%%configure -f
{
    "conf": {
        "spark.executor.instances": "4",
        "spark.executor.memory": "4g",
        "spark.executor.cores": "2"
    }
}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1269,application_1732639283265_1232,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1208,application_1732639283265_1171,pyspark,idle,Link,Link,,
1211,application_1732639283265_1174,pyspark,idle,Link,Link,,
1212,application_1732639283265_1175,pyspark,idle,Link,Link,,
1213,application_1732639283265_1176,pyspark,idle,Link,Link,,
1233,application_1732639283265_1196,pyspark,idle,Link,Link,,
1236,application_1732639283265_1199,pyspark,idle,Link,Link,,
1238,application_1732639283265_1201,pyspark,idle,Link,Link,,
1240,application_1732639283265_1203,pyspark,idle,Link,Link,,
1243,application_1732639283265_1206,pyspark,idle,Link,Link,,
1244,application_1732639283265_1207,pyspark,idle,Link,Link,,


In [12]:
%%configure -f
{
    "conf": {
        "spark.executor.instances": "8",
        "spark.executor.memory": "2g",
        "spark.executor.cores": "1"
    }
}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1452,application_1732639283265_1413,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1451,application_1732639283265_1412,pyspark,idle,Link,Link,,
1452,application_1732639283265_1413,pyspark,idle,Link,Link,,✔


## PySpark imports, Read datasets, register functions

In [14]:
from pyspark.sql import SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, StringType, BooleanType
from pyspark.sql.functions import col, udf, sum, max, min, avg, count, mean, when, monotonically_increasing_id, dense_rank, window, row_number
from pyspark.sql.window import Window
from sedona.spark import *

spark = SparkSession.builder \
    .appName("Query 5") \
    .getOrCreate()

# Create sedona context
sedona = SedonaContext.create(spark)

# read datasets
crimes_2010_19_df = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv", header=True)
crimes_2020_24_df = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv", header=True)
crimes_df = crimes_2010_19_df.union(crimes_2020_24_df)
police_stations_df = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/LA_Police_Stations.csv", header=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Query 5 - DataFrame API

In [15]:
@measure_time
def query5_dataframe(df, debug = False):
    '''Shows the DF that is result of the query 5 using DataFrame API'''
    global police_stations_df
    
    joined_df = df.select('DR_NO', 'AREA NAME', 'LON', 'LAT') \
        .filter( (col('LON') != '0') & (col('LON') != 0) ) \
        .withColumn("crime_point", ST_Point("LON", "LAT")) \
        .join(police_stations_df) \
        .withColumn("police_point", ST_Point("X", "Y")) \
        .withColumn('distance', ST_DistanceSphere("crime_point", "police_point")/1000) # divide with 1000 to conver into km \

    crimes_in_null_island = df.filter( (col('LON') == '0') | (col('LON') == 0) ).count()
        
    if debug: joined_df.filter(col('DR_NO') == '001307355').show(30)
    # In this DF, for each DR_NO we have 21 rows for the distances between the crime location and police departments 

    # Define a window 
    windowSpec = Window.partitionBy("DR_NO")
    extended_df = joined_df.withColumn("min_distance", when(col("distance").isNotNull(), min("distance").over(windowSpec)).otherwise(None)) \
        .filter(col('distance') == col('min_distance'))
    if debug: extended_df.filter(col('DR_NO') == '001307355').show(30)
    if debug: extended_df.filter(col('DIVISION') == 'HOLLENBECK').orderBy('min_distance', ascending=True).show(30)
    if debug: extended_df.filter(col('DIVISION') == 'HOLLENBECK').agg(avg('min_distance').alias('avg')).show()
    # Now we have each DR_NO only once

    grouped_df = extended_df.groupBy("DIVISION") \
        .agg( \
             avg("min_distance").alias("average_distance"), \
             count("*").alias("#") \
        ) \
        .select('DIVISION', 'average_distance', '#') \
        .orderBy(["#"], ascending=[False])

    schema = StructType([
        StructField("DIVISION", StringType(), nullable=True), 
        StructField("average_distance", FloatType(), nullable=True), 
        StructField("#", IntegerType(), nullable=False) 
    ])
    # Create a DataFrame for the Null Island row
    null_island_row = spark.createDataFrame([
        ("Unknown", None, crimes_in_null_island) 
    ], schema)

    grouped_df = grouped_df.union(null_island_row)
    
    if debug: grouped_df.agg(sum('#').alias('incidents')).show()

    # This DF is grouped by police departments/divisions 
    # so we see the average distance of crimes that happened closer to that and the number of these incidents/crimes
    grouped_df.show(50)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Experiments

In [6]:
# Access configuration
conf = spark.sparkContext.getConf()

# Print relevant executor settings
print("Executor Instances:", conf.get("spark.executor.instances"))
print("Executor Memory:", conf.get("spark.executor.memory"))
print("Executor Cores:", conf.get("spark.executor.cores"))

query5_dataframe(crimes_df)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Executor Instances: 2
Executor Memory: 8g
Executor Cores: 4
+----------------+------------------+------+
|        DIVISION|  average_distance|     #|
+----------------+------------------+------+
|       HOLLYWOOD|2.0762639601787205|224340|
|        VAN NUYS| 2.953369742819787|210134|
|       SOUTHWEST|2.1913988057808838|188901|
|        WILSHIRE|2.5926655329787796|185996|
|     77TH STREET| 1.716544971970102|171827|
|         OLYMPIC|1.7236036971780937|170897|
| NORTH HOLLYWOOD|2.6430060941415676|167854|
|         PACIFIC|3.8500706553079027|161359|
|         CENTRAL|0.9924764374568903|153871|
|         RAMPART|1.5345341879190049|152736|
|       SOUTHEAST|2.4218662158881794|152176|
|     WEST VALLEY| 3.035671216314078|138643|
|         TOPANGA|3.2969548417555608|138217|
|        FOOTHILL| 4.250921708424991|134896|
|          HARBOR|3.7025615993565033|126747|
|      HOLLENBECK|2.6801812377068224|115837|
|WEST LOS ANGELES| 2.792457289034108|115781|
|          NEWTON|1.6346357397097435|111

In [11]:
# Access configuration
conf = spark.sparkContext.getConf()

# Print relevant executor settings
print("Executor Instances:", conf.get("spark.executor.instances"))
print("Executor Memory:", conf.get("spark.executor.memory"))
print("Executor Cores:", conf.get("spark.executor.cores"))

query5_dataframe(crimes_df)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Executor Instances: 4
Executor Memory: 4g
Executor Cores: 2
+----------------+------------------+------+
|        DIVISION|  average_distance|     #|
+----------------+------------------+------+
|       HOLLYWOOD|2.0762639601787196|224340|
|        VAN NUYS|2.9533697428197883|210134|
|       SOUTHWEST| 2.191398805780884|188901|
|        WILSHIRE|2.5926655329787796|185996|
|     77TH STREET|1.7165449719701025|171827|
|         OLYMPIC|1.7236036971780941|170897|
| NORTH HOLLYWOOD| 2.643006094141567|167854|
|         PACIFIC|3.8500706553079027|161359|
|         CENTRAL|0.9924764374568901|153871|
|         RAMPART|1.5345341879190044|152736|
|       SOUTHEAST| 2.421866215888179|152176|
|     WEST VALLEY|3.0356712163140793|138643|
|         TOPANGA|3.2969548417555603|138217|
|        FOOTHILL| 4.250921708424989|134896|
|          HARBOR| 3.702561599356503|126747|
|      HOLLENBECK|2.6801812377068237|115837|
|WEST LOS ANGELES|2.7924572890341084|115781|
|          NEWTON|1.6346357397097424|111

In [16]:
# Access configuration
conf = spark.sparkContext.getConf()

# Print relevant executor settings
print("Executor Instances:", conf.get("spark.executor.instances"))
print("Executor Memory:", conf.get("spark.executor.memory"))
print("Executor Cores:", conf.get("spark.executor.cores"))

query5_dataframe(crimes_df)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Executor Instances: 8
Executor Memory: 2g
Executor Cores: 1
+----------------+------------------+------+
|        DIVISION|  average_distance|     #|
+----------------+------------------+------+
|       HOLLYWOOD|2.0762639601787205|224340|
|        VAN NUYS|2.9533697428197865|210134|
|       SOUTHWEST|2.1913988057808846|188901|
|        WILSHIRE|2.5926655329787787|185996|
|     77TH STREET|1.7165449719701025|171827|
|         OLYMPIC|1.7236036971780935|170897|
| NORTH HOLLYWOOD|2.6430060941415676|167854|
|         PACIFIC|3.8500706553079027|161359|
|         CENTRAL|0.9924764374568898|153871|
|         RAMPART|1.5345341879190046|152736|
|       SOUTHEAST| 2.421866215888179|152176|
|     WEST VALLEY|3.0356712163140793|138643|
|         TOPANGA|3.2969548417555603|138217|
|        FOOTHILL| 4.250921708424989|134896|
|          HARBOR|3.7025615993565038|126747|
|      HOLLENBECK|2.6801812377068233|115837|
|WEST LOS ANGELES|2.7924572890341075|115781|
|          NEWTON| 1.634635739709743|111