# Query 4
## Reading files from S3
The first code cell reads the main csv and geojson file, as well as the rest of the csv datasets.  

In [1]:
from sedona.spark import *
from pyspark import SparkContext
from pyspark.sql.functions import col
from pyspark.sql import SparkSession
from pyspark.sql.functions import min, count, avg
import time
import logging

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
2309,application_1732639283265_2269,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [2]:
def init_spark(app_name, cores, memory):
    global spark
    spark = SparkSession.builder \
        .appName(app_name) \
        .config("spark.network.timeout", "600s") \
        .config("spark.master", "yarn") \
        .config("spark.executor.instances", 2) \
        .config("spark.executor.cores", cores) \
        .config("spark.executor.memory", memory) \
        .getOrCreate()

    sedona = SedonaContext.create(spark)
    
    return spark, sedona


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
def load_data(spark, sedona):
    # Load the crime data, LA income and RE codes csv 
    crime_data = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv", header=True, inferSchema=True)
    la_income = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/LA_income_2015.csv", header=True, inferSchema=True)
    re_codes = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/RE_codes.csv", header=True, inferSchema=True)
    police_stations = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/LA_Police_Stations.csv", header=True, inferSchema=True)

    geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"
    blocks_df = sedona.read.format("geojson") \
                .option("multiLine", "true").load(geojson_path) \
                .selectExpr("explode(features) as features") \
                .select("features.*")
    # Formatting magic
    flattened_blocks_df = blocks_df.select( \
                    [col(f"properties.{col_name}").alias(col_name) for col_name in \
                    blocks_df.schema["properties"].dataType.fieldNames()] + ["geometry"]) \
                .drop("properties") \
                .drop("type")
    return crime_data, la_income, re_codes, police_stations, flattened_blocks_df

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
def process_query4(crime_data, la_income, re_codes, flattened_blocks_df):
    # Add geometry column to crime_data
    crime_data = crime_data.withColumn("geom", ST_Point(col("LON"), col("LAT")))
    crime_data_location = crime_data.join(flattened_blocks_df, ST_Within(crime_data["geom"], flattened_blocks_df["geometry"]), "inner")

    # Filter crimes for 2015
    crime_data_2015 = crime_data_location.filter(col("DATE OCC").contains("2015"))

    # Sort income data
    la_income_sorted_up = la_income.orderBy(col("Estimated Median Income").desc())
    la_income_sorted_down = la_income.orderBy(col("Estimated Median Income").asc())

    # Select top and bottom income ZIP codes
    top_income_zip = la_income_sorted_up.limit(3).select("Zip Code")
    bottom_income_zip = la_income_sorted_down.limit(3).select("Zip Code")

    # Collect ZIP codes into lists
    top_zip_codes = [row["Zip Code"] for row in top_income_zip.collect()]
    bottom_zip_codes = [row["Zip Code"] for row in bottom_income_zip.collect()]

    # Join data and filter by ZIP codes
    crime_income_data = crime_data_2015.join(la_income, crime_data_2015["ZCTA10"] == la_income["Zip Code"])
    crime_info = crime_income_data.join(re_codes, crime_income_data["Vict Descent"] == re_codes["Vict Descent"])

    richest_crimes = crime_info.filter(col("Zip Code").isin(top_zip_codes))
    poorest_crimes = crime_info.filter(col("Zip Code").isin(bottom_zip_codes))

    # Group and count
    richest_counts = richest_crimes.groupBy("Vict Descent Full").count().orderBy("count", ascending=False)
    poorest_counts = poorest_crimes.groupBy("Vict Descent Full").count().orderBy("count", ascending=False)

    return richest_counts, poorest_counts


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
def run_query4(config_name, cores, memory):
    spark, sedona = init_spark(config_name, cores, memory)
    print(f"Running Query 4 with {config_name}...")
    
    start_time = time.time()    # Load data
    crime_data, la_income, re_codes, police_stations, flattened_blocks_df = load_data(spark, sedona)

    # Process Query 4
    richest_counts, poorest_counts = process_query4(crime_data, la_income, re_codes, flattened_blocks_df)

    # Show results
    print("Richest ZIP Codes Crime Counts:")
    richest_counts.select("Vict Descent Full", "count").show()    
    print("Poorest ZIP Codes Crime Counts:")
    poorest_counts.select("Vict Descent Full", "count").show()
    end_time = time.time()
    print(f"Execution Time: {end_time - start_time:.2f} seconds\n")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
configs = [
    {"name": "1 Core / 2 GB Memory", "cores": 1, "memory": "2g"},
    {"name": "2 Cores / 4 GB Memory", "cores": 2, "memory": "4g"},
    {"name": "4 Cores / 8 GB Memory", "cores": 4, "memory": "8g"}
]

for config in configs:
    run_query4(config["name"], config["cores"], config["memory"])


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Running Query 4 with 1 Core / 2 GB Memory...
Richest ZIP Codes Crime Counts:
+--------------------+-----+
|   Vict Descent Full|count|
+--------------------+-----+
|               White|   57|
|               Other|   15|
|Hispanic/Latin/Me...|    8|
|             Unknown|    8|
|               Black|    5|
|         Other Asian|    2|
+--------------------+-----+

Poorest ZIP Codes Crime Counts:
+--------------------+-----+
|   Vict Descent Full|count|
+--------------------+-----+
|               White|  470|
|               Other|  128|
|Hispanic/Latin/Me...|  127|
|             Unknown|   90|
|               Black|   56|
|         Other Asian|   24|
|            Filipino|    2|
|            Japanese|    1|
+--------------------+-----+

Execution Time: 103.12 seconds

Running Query 4 with 2 Cores / 4 GB Memory...
Richest ZIP Codes Crime Counts:
+--------------------+-----+
|   Vict Descent Full|count|
+--------------------+-----+
|               White|   57|
|               Other|   