In [3]:
from pyspark.sql import SparkSession, functions as F
from sedona.register.geo_registrator import SedonaRegistrator
from sedona.spark import SedonaContext
from pyspark.sql.types import StringType

# Initialize Spark Session and Sedona Context
spark = SparkSession.builder \
    .appName("CrimeAnalysisSpatialJoinSQL") \
    .config("spark.sql.extensions", "org.apache.sedona.sql.SedonaSqlExtensions") \
    .config("sedona.global.charset", "utf8") \
    .getOrCreate()

sedona = SedonaContext.create(spark)

# Load datasets
census_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"
income_path = "s3://initial-notebook-data-bucket-dblab-905418150721/LA_income_2015.csv"
crime_path = "s3://groups-bucket-dblab-905418150721/group24/results/q2_parquet_maindata/"

# Load Census Data (GeoJSON format)
census_df = sedona.read.format("geojson") \
    .option("multiLine", "true").load(census_path) \
    .selectExpr("explode(features) as features") \
    .select("features.*")

# Flatten the GeoJSON structure and filter valid populations
census_df = census_df.select(
    F.col("properties.ZCTA10").alias("ZCTA10"),
    F.col("properties.POP_2010").alias("Population"),
    F.col("properties.COMM").alias("COMM"),
    F.col("geometry").alias("geometry"),
    F.col("properties.HOUSING10").alias("HOUSING10"),
).filter(F.col("Population") > 0)  # Exclude zero or negative population

# Load Income Data (CSV format)
income_df = spark.read.csv(income_path, header=True, inferSchema=True)
income_df = income_df.withColumn(
    "Income",
    F.regexp_replace(F.col("Estimated Median Income"), "[$,]", "").cast("double")
).withColumnRenamed("Zip Code","Zip").drop("Estimated Median Income")

# Calculate total income per community (sum of income contributions per zip)
income_total = census_df.join(income_df, census_df.ZCTA10 == income_df.Zip, "inner") \
    .groupBy("COMM").agg(
        F.sum(F.col("Income") * F.col("HOUSING10")).alias("Total_Income")
    )

# Aggregate census data by community
census_agg = census_df.groupBy("COMM").agg(
    F.sum("Population").alias("Total_Population"),
    F.sum("HOUSING10").alias("Total_Households")
)

# Join census and income data
census_income = census_agg.join(income_total, "COMM", "inner")

# Calculate Mean Income Per Person
census_income = census_income.withColumn(
    "Mean_Income_Per_Person",
    F.col("Total_Income") / F.col("Total_Population")
)

# Step 1: Get the top 3 and bottom 3 communities by Mean Income Per Person
top_communities = census_income.orderBy(F.col("Mean_Income_Per_Person"), ascending=False).limit(3)
bottom_communities = census_income.orderBy(F.col("Mean_Income_Per_Person"), ascending=True).limit(3)

# Step 2: Filter crime data for top and bottom communities
top_community_codes = [row['COMM'] for row in top_communities.collect()]
bottom_community_codes = [row['COMM'] for row in bottom_communities.collect()]

# Load Crime Data (Parquet format)
crime_df = spark.read.parquet(crime_path)

# Create geometry column using ST_Point
crime_df = crime_df.withColumn("geometry", F.expr("ST_Point(LON, LAT)"))

# Step 3: Join the crime data with census data to include 'COMM' (Community) info
crime_with_comm_df = crime_df.alias("cr").join(
    census_df.alias("c"),
    F.expr("ST_Within(cr.geometry, c.geometry)"),
    "inner"
)
# .select(
#     "census_df.COMM"
# )

# Filter crime data for top communities (those with highest income)
top_crime_df = crime_with_comm_df.filter(F.col("COMM").isin(top_community_codes))

# Filter crime data for bottom communities (those with lowest income)
bottom_crime_df = crime_with_comm_df.filter(F.col("COMM").isin(bottom_community_codes))

# Step 4: Load race codes dataset to translate 'Vict Descent'
race_codes_path = "s3://initial-notebook-data-bucket-dblab-905418150721/RE_codes.csv"
race_codes_df = spark.read.csv(race_codes_path, header=True, inferSchema=True)

# Join the crime data with the race codes to get the full description of 'Vict Descent'
top_crime_with_race_df = top_crime_df.join(race_codes_df, top_crime_df["Vict Descent"] == race_codes_df["Vict Descent"], "left_outer")

# Aggregate by 'Vict Descent Full' and count victims for top communities
top_crime_race_count = top_crime_with_race_df.groupBy("Vict Descent Full").agg(
    F.count("*").alias("Victim_Count")
).orderBy(F.col("Victim_Count"), ascending=False)

# Join the crime data with the race codes for bottom communities
bottom_crime_with_race_df = bottom_crime_df.join(race_codes_df, bottom_crime_df["Vict Descent"] == race_codes_df["Vict Descent"], "left_outer")

# Aggregate by 'Vict Descent Full' and count victims for bottom communities
bottom_crime_race_count = bottom_crime_with_race_df.groupBy("Vict Descent Full").agg(
    F.count("*").alias("Victim_Count")
).orderBy(F.col("Victim_Count"), ascending=False)

# Step 5: Export the results to new files
# top_crime_race_count.write.option("header", "true", "overwrite").csv("top_communities_race_breakdown.csv")
# bottom_crime_race_count.write.option("header", "true", "overwrite").csv("bottom_communities_race_breakdown.csv")

# print("Top and Bottom Communities Crime Race Breakdown exported to CSV files.")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
top_crime_race_count.show()
bottom_crime_race_count.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+------------+
|   Vict Descent Full|Victim_Count|
+--------------------+------------+
|               White|        5726|
|               Other|         690|
|Hispanic/Latin/Me...|         558|
|                NULL|         477|
|             Unknown|         359|
|               Black|         260|
|         Other Asian|         204|
|             Chinese|          29|
|            Japanese|          20|
|            Filipino|          16|
|              Korean|          13|
|         AsianIndian|           5|
|American Indian/A...|           4|
|          Vietnamese|           3|
|            Hawaiian|           1|
+--------------------+------------+

+--------------------+------------+
|   Vict Descent Full|Victim_Count|
+--------------------+------------+
|Hispanic/Latin/Me...|       19703|
|               Black|        6316|
|                NULL|        4715|
|               White|        1469|
|               Other|         807|
|             Unknown|     