# Query 4
## Reading files from S3
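The cells below import the required libraries, configure the Spark/Sedona session, and define `load_data`, which reads the main crime CSV and the census-block GeoJSON file, as well as the rest of the CSV datasets, from S3.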

In [1]:
from sedona.spark import *
from pyspark import SparkContext
from pyspark.sql.functions import col
from pyspark.sql import SparkSession
from pyspark.sql.functions import min, count, avg
import time
import logging

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
2478,application_1732639283265_2437,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [2]:
def init_spark(app_name, cores, memory):
    """Build (or fetch) the Spark session for one run and wrap it with Sedona.

    Args:
        app_name: Name passed to ``SparkSession.builder.appName``.
        cores: Value for ``spark.executor.cores``.
        memory: Value for ``spark.executor.memory`` (e.g. ``"4g"``).

    Returns:
        A ``(spark, sedona)`` tuple: the session returned by ``getOrCreate()``
        and the result of ``SedonaContext.create(spark)``.

    Side effects:
        Rebinds the module-level name ``spark`` to the returned session.

    NOTE(review): ``getOrCreate()`` hands back an already-active session when
    one exists, and the notebook output reports a pre-existing ``spark``
    session. In that case the executor settings below may not take effect --
    confirm in the Spark UI before comparing timings across configurations.
    """
    global spark

    # Insertion order is preserved, so options are applied in the listed order.
    settings = {
        "spark.network.timeout": "600s",
        "spark.master": "yarn",
        "spark.executor.instances": 2,
        "spark.executor.cores": cores,
        "spark.executor.memory": memory,
    }

    builder = SparkSession.builder.appName(app_name)
    for option, value in settings.items():
        builder = builder.config(option, value)
    spark = builder.getOrCreate()

    return spark, SedonaContext.create(spark)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
def load_data(spark, sedona):
    """Read every input dataset used by the notebook from S3.

    Args:
        spark: Session used to read the CSV files.
        sedona: Sedona context used to read the GeoJSON file.

    Returns:
        A 5-tuple ``(crime_data, la_income, re_codes, police_stations,
        flattened_blocks_df)``. The first four are the CSV DataFrames; the
        last holds one row per GeoJSON feature, with each field of the
        ``properties`` struct promoted to a top-level column next to
        ``geometry``.
    """
    def read_csv(path):
        # First row supplies the column names; Spark infers the column types.
        return spark.read.csv(path, header=True, inferSchema=True)

    # Crime data, LA income, race/ethnicity codes and police stations.
    crime_data = read_csv("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv")
    la_income = read_csv("s3://initial-notebook-data-bucket-dblab-905418150721/LA_income_2015.csv")
    re_codes = read_csv("s3://initial-notebook-data-bucket-dblab-905418150721/RE_codes.csv")
    police_stations = read_csv("s3://initial-notebook-data-bucket-dblab-905418150721/LA_Police_Stations.csv")

    # The GeoJSON is read as a single multi-line document; its `features`
    # array is exploded into one row per feature and the struct is expanded.
    geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"
    features = (
        sedona.read.format("geojson")
        .option("multiLine", "true")
        .load(geojson_path)
        .selectExpr("explode(features) as features")
        .select("features.*")
    )

    # Lift every field of `properties` to a top-level column of the same name.
    property_names = features.schema["properties"].dataType.fieldNames()
    promoted = [col(f"properties.{name}").alias(name) for name in property_names]
    flattened_blocks_df = (
        features.select(promoted + ["geometry"])
        .drop("properties")
        .drop("type")
    )

    return crime_data, la_income, re_codes, police_stations, flattened_blocks_df

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [39]:
def _select_income_communities(la_income, community_names):
    """Keep income rows whose Community contains any given name; drop the income column."""
    names = list(community_names)
    if not names:
        raise ValueError("community_names must contain at least one name")

    condition = col("Community").contains(names[0])
    for name in names[1:]:
        condition = condition | col("Community").contains(name)
    return la_income.filter(condition).drop("Estimated Median Income")


def _victim_descent_counts(located_crimes, income_subset, re_codes):
    """Count located crimes per "Vict Descent Full" within the given income communities."""
    # Join by ZIP code and community name. Both sides of the condition refer
    # to the frames actually being joined (not to the unfiltered income table).
    matched = located_crimes.join(
        income_subset,
        income_subset["Community"].contains(located_crimes["COMM"])
        & (located_crimes["ZCTA10"] == income_subset["Zip Code"]),
    ).drop("Community")

    # Add information about the race and ethnicity.
    described = matched.join(
        re_codes, matched["Vict Descent"] == re_codes["Vict Descent"]
    )

    # Group and count, most frequent first.
    return (
        described.groupBy("Vict Descent Full")
        .count()
        .orderBy("count", ascending=False)
    )


def process_query4(crime_data, la_income, re_codes, flattened_blocks_df,
                   top_communities=("Marina del Rey", "Century City", "Pacific Palisades"),
                   bottom_communities=("Vernon", "Watts", "Boyle Heights")):
    """Count 2015 crime victims by descent in the richest and poorest communities.

    Steps:
      1. Keep crimes whose "DATE OCC" contains "2015". This runs before the
         spatial join so the join only processes one year of data; for an
         inner join the result is the same as filtering afterwards.
      2. Build a point geometry from LON/LAT and inner-join it to the blocks
         whose ``geometry`` contains the point.
      3. For each community group, join on community name and ZIP code, join
         the descent codes, then group by "Vict Descent Full" and count.

    Args:
        crime_data: DataFrame with "LON", "LAT", "DATE OCC" and
            "Vict Descent" columns.
        la_income: DataFrame with "Community", "Zip Code" and
            "Estimated Median Income" columns.
        re_codes: DataFrame with a "Vict Descent" column; presumably also the
            source of "Vict Descent Full" -- verify against the CSV.
        flattened_blocks_df: DataFrame with a ``geometry`` column; presumably
            the source of "COMM" and "ZCTA10" -- verify against the GeoJSON.
        top_communities: Substrings of "Community" selecting the high-income
            group. Defaults to the communities found in the previous query.
        bottom_communities: Substrings of "Community" selecting the
            low-income group. Defaults as above.

    Returns:
        ``(richest_counts, poorest_counts)``: two DataFrames with columns
        "Vict Descent Full" and "count", ordered by descending count.

    Raises:
        ValueError: If ``top_communities`` or ``bottom_communities`` is empty.
    """
    # Filter crimes for 2015 first, then add the geometry column and locate
    # each crime inside a census block.
    crimes_2015 = crime_data.filter(col("DATE OCC").contains("2015"))
    crimes_2015 = crimes_2015.withColumn("geom", ST_Point(col("LON"), col("LAT")))
    located_crimes = crimes_2015.join(
        flattened_blocks_df,
        ST_Within(crimes_2015["geom"], flattened_blocks_df["geometry"]),
        "inner",
    )

    # Restrict the income table to the two community groups of interest.
    top_income = _select_income_communities(la_income, top_communities)
    bottom_income = _select_income_communities(la_income, bottom_communities)

    richest_counts = _victim_descent_counts(located_crimes, top_income, re_codes)
    poorest_counts = _victim_descent_counts(located_crimes, bottom_income, re_codes)

    return richest_counts, poorest_counts


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [41]:
def run_query4(config_name, cores, memory):
    """Run Query 4 once under the given executor configuration and print results.

    Args:
        config_name: Label for the run; also passed to ``init_spark`` as the
            application name.
        cores: Executor core count forwarded to ``init_spark``.
        memory: Executor memory setting forwarded to ``init_spark``.

    Returns:
        None. Prints both result tables and the elapsed time.

    The timer covers data loading, query construction and both ``show()``
    calls; it does not cover ``init_spark``.
    """
    spark, sedona = init_spark(config_name, cores, memory)
    print(f"Running Query 4 with {config_name}...")

    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # (or go negative) if the system clock is adjusted during the run.
    start_time = time.perf_counter()

    # Load data. The police stations frame is returned by load_data but is
    # not needed by this query, so it is explicitly discarded.
    crime_data, la_income, re_codes, _police_stations, flattened_blocks_df = load_data(spark, sedona)

    # Process Query 4
    richest_counts, poorest_counts = process_query4(crime_data, la_income, re_codes, flattened_blocks_df)

    # Show results
    print("Richest ZIP Codes Crime Counts:")
    richest_counts.select("Vict Descent Full", "count").show(truncate=False)
    print("Poorest ZIP Codes Crime Counts:")
    poorest_counts.select("Vict Descent Full", "count").show(truncate=False)

    elapsed = time.perf_counter() - start_time
    print(f"Execution Time: {elapsed:.2f} seconds\n")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [42]:
# Executor sizings to benchmark: a display name plus the per-executor core
# count and memory setting handed to run_query4.
configs = [
    dict(name="1 Core / 2 GB Memory", cores=1, memory="2g"),
    dict(name="2 Cores / 4 GB Memory", cores=2, memory="4g"),
    dict(name="4 Cores / 8 GB Memory", cores=4, memory="8g"),
]

# Run the query once per configuration, in the order listed above.
for config in configs:
    run_query4(*(config[key] for key in ("name", "cores", "memory")))


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Running Query 4 with 1 Core / 2 GB Memory...
Richest ZIP Codes Crime Counts:
+------------------------------+-----+
|Vict Descent Full             |count|
+------------------------------+-----+
|White                         |668  |
|Other                         |99   |
|Hispanic/Latin/Mexican        |77   |
|Black                         |50   |
|Unknown                       |48   |
|Other Asian                   |23   |
|Chinese                       |1    |
|American Indian/Alaskan Native|1    |
+------------------------------+-----+

Poorest ZIP Codes Crime Counts:
+----------------------+-----+
|Vict Descent Full     |count|
+----------------------+-----+
|Hispanic/Latin/Mexican|3157 |
|Black                 |1292 |
|White                 |195  |
|Unknown               |167  |
|Other                 |113  |
|Other Asian           |51   |
|Filipino              |3    |
|Chinese               |2    |
|Guamanian             |1    |
|Pacific Islander      |1    |
+------------------