In [7]:
from pyspark.sql import SparkSession, functions as F
from sedona.register.geo_registrator import SedonaRegistrator
from sedona.spark import SedonaContext
from pyspark.sql.types import StringType

# Build (or reuse) the Spark session with Sedona's SQL extension configured,
# then wrap it in a SedonaContext; the resulting `sedona` handle is what the
# GeoJSON reader below is called on.
spark = (
    SparkSession.builder
    .appName("CrimeAnalysisSpatialJoinSQL")
    .config("spark.sql.extensions", "org.apache.sedona.sql.SedonaSqlExtensions")
    .config("sedona.global.charset", "utf8")
    .getOrCreate()
)

sedona = SedonaContext.create(spark)

# S3 locations of the three inputs used in this cell.
census_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"
income_path = "s3://initial-notebook-data-bucket-dblab-905418150721/LA_income_2015.csv"
crime_path = "s3://groups-bucket-dblab-905418150721/group24/results/q2_parquet_maindata/"

# Census blocks: read the multi-line GeoJSON document, turn the `features`
# array into one row per feature, and expose each feature's fields as columns.
census_df = (
    sedona.read.format("geojson")
    .option("multiLine", "true")
    .load(census_path)
    .selectExpr("explode(features) as features")
    .select("features.*")
)

# Pull the needed attributes out of the nested `properties` struct and keep
# the geometry; the projection order matches the downstream column order.
census_columns = [
    F.col("properties.ZCTA10").alias("ZCTA10"),
    F.col("properties.POP_2010").alias("Population"),
    F.col("properties.COMM").alias("COMM"),
    F.col("geometry").alias("geometry"),
    F.col("properties.HOUSING10").alias("HOUSING10"),
]

# Blocks with zero or negative population are dropped so they contribute
# nothing to the per-community totals or to the spatial join.
census_df = census_df.select(*census_columns).where(F.col("Population") > 0)

# Crime records (Parquet). Only the location is used afterwards, so project
# directly to a single point geometry built from the LON / LAT columns.
crime_df = spark.read.parquet(crime_path).select(
    F.expr("ST_Point(LON, LAT)").alias("geometry")
)

# Income table (CSV with a header row; column types are inferred).
income_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(income_path)
)

# "Estimated Median Income" is text containing "$" and "," characters:
# strip those and cast to double so it can be used in arithmetic.
income_value = F.regexp_replace(
    F.col("Estimated Median Income"), "[$,]", ""
).cast("double")

income_df = (
    income_df
    .withColumn("Income", income_value)
    .withColumnRenamed("Zip Code", "Zip")
    .drop("Estimated Median Income")
)

# Per-community population and household totals over all retained blocks.
census_agg = (
    census_df.groupBy("COMM")
    .agg(
        F.sum("Population").alias("Total_Population"),
        F.sum("HOUSING10").alias("Total_Households"),
    )
)

# Attach the ZIP-level income to every census block whose ZCTA10 matches a
# ZIP in the income table, then weight that income by the block's household
# count and sum it per community.
blocks_with_income = census_df.join(
    income_df, census_df["ZCTA10"] == income_df["Zip"], "inner"
)
household_income = F.col("Income") * F.col("HOUSING10")
income_total = (
    blocks_with_income.groupBy("COMM")
    .agg(F.sum(household_income).alias("Total_Income"))
)

# Combine both aggregates (communities without any income match drop out of
# the inner join) and derive the mean income per person.
# NOTE(review): Total_Population counts every retained block, including ones
# whose ZIP has no income row - confirm that this denominator is intended.
census_income = census_agg.join(income_total, "COMM", "inner").withColumn(
    "Mean_Income_Per_Person",
    F.col("Total_Income") / F.col("Total_Population"),
)

# Count crimes per community: spatially match each crime point to the census
# block that contains it, then count the matched rows per block community.
crime_points = crime_df.alias("cr")
census_blocks = census_df.alias("c")
point_in_block = F.expr("ST_Within(cr.geometry, c.geometry)")

crime_agg = (
    crime_points.join(census_blocks, on=point_in_block, how="inner")
    .groupBy("c.COMM")
    .agg(F.count("*").alias("Total_Crimes"))
)

# Bring everything together. The left outer join keeps communities that had
# no matched crimes; their Total_Crimes (and therefore the ratio) come out
# NULL and are replaced with 0 afterwards.
crime_rate = F.col("Total_Crimes") / F.col("Total_Population")

final_result = (
    census_income.join(crime_agg, on="COMM", how="left_outer")
    .withColumn("Crime_Per_Person_Ratio", crime_rate)
    .fillna({"Total_Crimes": 0, "Crime_Per_Person_Ratio": 0})
)

# Columns shown in the report, in display order.
report_columns = [
    "COMM",
    "Total_Population",
    "Mean_Income_Per_Person",
    "Total_Crimes",
    "Crime_Per_Person_Ratio",
]
final_result.select(*report_columns).show(truncate=False)


# Compare the physical plans Spark produces for the census/crime join under
# each join-strategy hint.
def _explain_hinted_join(strategy, label):
    """Join census_income to crime_agg with a strategy hint and print the plan.

    The hint is attached to ``crime_agg`` (the right side) rather than to
    ``census_income``. In a left outer join the left side is the preserved
    side, and Spark's planner cannot use the preserved side as the build side
    of a broadcast hash join (nor, on older Spark versions, of a shuffle hash
    join). A hint placed on the left side is therefore dropped and the
    printed plan does not reflect the requested strategy.

    Args:
        strategy: Spark join hint name, e.g. "broadcast" or "merge".
        label: Human-readable strategy name used in the printed heading.

    Returns:
        The hinted (still lazy) joined DataFrame.
    """
    hinted = census_income.join(crime_agg.hint(strategy), "COMM", "left_outer")
    print(f"{label} Join Plan:")
    hinted.explain()
    return hinted


broadcast_result = _explain_hinted_join("broadcast", "Broadcast")
merge_result = _explain_hinted_join("merge", "Merge")
shuffle_hash_result = _explain_hinted_join("shuffle_hash", "Shuffle Hash")

# NOTE: shuffle-and-replicate nested loop (cartesian product) is only planned
# for inner-like joins, so for this left outer join Spark is expected to
# ignore the hint and fall back to its default strategy. The plan is still
# printed so the fallback is visible.
shuffle_replicate_result = _explain_hinted_join(
    "shuffle_replicate_nl", "Shuffle Replicate NL"
)

# Number of communities in the final result (this triggers a Spark action).
total_communities = final_result.count()
print("Total communities:", total_communities)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------------------+----------------+----------------------+------------+----------------------+
|COMM                 |Total_Population|Mean_Income_Per_Person|Total_Crimes|Crime_Per_Person_Ratio|
+---------------------+----------------+----------------------+------------+----------------------+
|Culver City          |38883           |33644.97549057429     |345         |0.00887277216264177   |
|North Lancaster      |1151            |19097.289313640315    |0           |0.0                   |
|Rosewood/East Gardena|1164            |16165.823024054982    |101         |0.08676975945017182   |
|East Rancho Dominguez|15135           |8830.036339610175     |0           |0.0                   |
|Toluca Terrace       |1301            |20167.531898539586    |289         |0.22213681783243658   |
|Elysian Park         |5267            |13865.32770077843     |3191        |0.6058477311562559    |
|Longwood             |4210            |13420.052256532066    |3062        |0.7273159144893112    |
