# Query 4
## Reading files from S3
The first code cell reads the main csv and geojson file, as well as the rest of the csv datasets.  

In [40]:
from sedona.spark import *
from pyspark.sql.functions import col
from pyspark.sql import SparkSession
from pyspark.sql.functions import min, count, avg

# Create spark Session
# 2 executors x 1 core/2GB memory
spark = SparkSession.builder \
    .appName("Query 4 - 1 Core, 2GB Memory") \
    .config("spark.executor.instances", 2) \
    .config("spark.executor.cores", 1) \
    .config("spark.executor.memory", "2g") \
    .getOrCreate()
# 2 executors × 2 cores/4 GB memory
spark = SparkSession.builder \
    .appName("Query 4 - 2 Cores, 4GB Memory") \
    .config("spark.executor.instances", 2) \
    .config("spark.executor.cores", 2) \
    .config("spark.executor.memory", "4g") \
    .getOrCreate()

# 2 executors × 4 cores/8 GB memory
spark = SparkSession.builder \
    .appName("Query 4 - 4 Cores, 8GB Memory") \
    .config("spark.executor.instances", 2) \
    .config("spark.executor.cores", 4) \
    .config("spark.executor.memory", "8g") \
    .getOrCreate()


# Create sedona context
sedona = SedonaContext.create(spark)

# Load the crime data, LA income and RE codes csv 
crime_data = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv", header=True, inferSchema=True)
la_income = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/LA_income_2015.csv", header=True, inferSchema=True)
re_codes = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/RE_codes.csv", header=True, inferSchema=True)
police_stations = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/LA_Police_Stations.csv", header=True, inferSchema=True)

geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"
blocks_df = sedona.read.format("geojson") \
            .option("multiLine", "true").load(geojson_path) \
            .selectExpr("explode(features) as features") \
            .select("features.*")
# Formatting magic
flattened_blocks_df = blocks_df.select( \
                [col(f"properties.{col_name}").alias(col_name) for col_name in \
                blocks_df.schema["properties"].dataType.fieldNames()] + ["geometry"]) \
            .drop("properties") \
            .drop("type")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Crime data Geometry 
Add the geometry column in the crime_data

In [3]:
# Create geometry column for crime data
crime_data = crime_data.withColumn("geom", ST_Point(col("LON"), col("LAT")))
crime_data_location = crime_data.join(flattened_blocks_df, ST_Within(crime_data["geom"], flattened_blocks_df["geometry"]), "inner")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
crime_data_location.select("geom").show(3, truncate=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+
|                geom|
+--------------------+
|POINT (-118.2902 ...|
|POINT (-118.2902 ...|
|POINT (-118.2924 ...|
+--------------------+
only showing top 3 rows

## Filtering 
Only keep the crimes commited in 2015. 
Then, sort the zip codes on decreasing income. 

In [5]:
# Keep only the 2015 crimes
crime_data_2015 = crime_data_location.filter(col("DATE OCC").contains("2015"))

# Sort la_income zip codes by decreasing income. 
la_income_sorted_up = la_income.orderBy(col("Estimated Median Income").desc())
la_income_sorted_down = la_income.orderBy(col("Estimated Median Income").asc())

# Select the top three and bottom 3 income ZIP codes
top_income_zip = la_income_sorted_up.limit(3).select("Zip Code")
bottom_income_zip = la_income_sorted_down.limit(3).select("Zip Code")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
top_income_zip.show()
bottom_income_zip.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------+
|Zip Code|
+--------+
|   90290|
|   90067|
|   90808|
+--------+

+--------+
|Zip Code|
+--------+
|   91307|
|   90277|
|   91350|
+--------+

In [7]:
crime_data_2015.select("Vict Descent").show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------+
|Vict Descent|
+------------+
|        NULL|
|           O|
|        NULL|
|           W|
|           W|
|           H|
|           W|
|           W|
|           W|
|           W|
|           X|
|           W|
|           W|
|           H|
|        NULL|
|           W|
|        NULL|
|           W|
|           H|
|           B|
+------------+
only showing top 20 rows

## Join data 
Final join of data to have the full crime information dataframe


In [8]:
# Join the crime_data_2015 data frame with the income dataframe 
# to relate the crime information with the income data
crime_income_data= crime_data_2015.join(la_income, (crime_data_2015["ZCTA10"] == la_income["Zip Code"]))

#  Now join the joint crime-income information with the race and ethnicity codes 
#  on the Victim Descent column
crime_info = crime_income_data.join(re_codes, (crime_income_data["Vict Descent"] == re_codes["Vict Descent"]))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [9]:
crime_info.select("Vict Descent Full").show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+
|   Vict Descent Full|
+--------------------+
|               Other|
|               Black|
|Hispanic/Latin/Me...|
|               Black|
|               Black|
|               White|
|               Other|
|Hispanic/Latin/Me...|
|               White|
|               White|
|               Other|
|               White|
|Hispanic/Latin/Me...|
|Hispanic/Latin/Me...|
|             Unknown|
|               Other|
|               Black|
|               Black|
|               Other|
|Hispanic/Latin/Me...|
+--------------------+
only showing top 20 rows

## Group and Count


In [10]:
# Collect the values of the top income zip codes into a list
top_zip_codes = [row["Zip Code"] for row in top_income_zip.collect()]
bottom_zip_codes = [row["Zip Code"] for row in bottom_income_zip.collect()]

# Keep the info of the crimes made in the Zip Codes 
richest_crimes = crime_info.filter(col("Zip Code").isin(top_zip_codes))
poorest_crimes = crime_info.filter(col("Zip Code").isin(bottom_zip_codes))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [11]:
richest_crimes.select("Vict Descent Full").show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+
|   Vict Descent Full|
+--------------------+
|Hispanic/Latin/Me...|
|               White|
|               White|
|               White|
|               Other|
|               Black|
|               White|
|               White|
|               White|
|               White|
|               White|
|Hispanic/Latin/Me...|
|               Other|
|               Other|
|               Other|
|               White|
|               White|
|               White|
|               White|
|               White|
+--------------------+
only showing top 20 rows

In [12]:
# Group and count for richest crimes
richest_counts = richest_crimes.groupBy("Vict Descent Full").count().orderBy("count", ascending=False)

# Group and count for poorest crimes
poorest_counts = poorest_crimes.groupBy("Vict Descent Full").count().orderBy("count", ascending=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Results of Q4

In [13]:
richest_counts.select("Vict Descent Full", "count").show()
poorest_counts.select("Vict Descent Full", "count").show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+-----+
|   Vict Descent Full|count|
+--------------------+-----+
|               White|   57|
|               Other|   15|
|Hispanic/Latin/Me...|    8|
|             Unknown|    8|
|               Black|    5|
|         Other Asian|    2|
+--------------------+-----+

+--------------------+-----+
|   Vict Descent Full|count|
+--------------------+-----+
|               White|  470|
|               Other|  128|
|Hispanic/Latin/Me...|  127|
|             Unknown|   90|
|               Black|   56|
|         Other Asian|   24|
|            Filipino|    2|
|            Japanese|    1|
+--------------------+-----+