In [77]:
from pyspark.sql import SparkSession, functions as F
from sedona.register.geo_registrator import SedonaRegistrator
from pyspark.sql.types import StringType
import re

# Initialize Spark Session
spark = SparkSession.builder \
    .appName("CrimeAnalysisSpatialJoinSQL") \
    .config("spark.sql.extensions", "org.apache.sedona.sql.SedonaSqlExtensions") \
    .config("sedona.global.charset", "utf8") \
    .getOrCreate()

# Register Sedona functions
SedonaRegistrator.registerAll(spark)

# Load datasets
census_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"
income_path = "s3://initial-notebook-data-bucket-dblab-905418150721/LA_income_2015.csv"
crime_path = "s3://groups-bucket-dblab-905418150721/group24/results/q2_parquet_maindata/"

# Load Census Data (GeoJSON format)
census_df = spark.read.format("geojson").option("multiLine", "true").load(census_path)
census_df = census_df.selectExpr("explode(features) as features") \
    .select(
        F.col("features.*"),
        F.col("features.properties.ZCTA10").alias("ZCTA10"),
        F.col("features.properties.POP_2010").alias("Population"),
        F.col("features.properties.COMM").alias("COMM"),
        F.expr("CAST(features.geometry AS STRING)").alias("geometry_str")  # Cast geometry as string
    ).drop("type", "properties")

# Convert geometry from GeoJSON string to geometry object
census_df = census_df.withColumn("geometry", F.expr("ST_GeomFromWKT(geometry_str)")).drop("geometry_str")

# Load Crime Data (Parquet format)
crime_df = spark.read.parquet(crime_path)
# Use ST_Point to create geometry column from LAT and LON
crime_df = crime_df.withColumn(
    "geometry",
    F.expr("ST_Point(LON, LAT)")
).select('geometry', 'DR_NO')

# crime_df.printSchema()
# crime_df.show(5)
# census_df.printSchema()
# census_df.show(5)
# Load Income Data (CSV format)
income_df = spark.read.csv(income_path, header=True, inferSchema=True)
income_df = income_df.withColumn(
    "Total_Income",
    F.regexp_replace(F.col("Estimated Median Income"), "[$,]", "").cast("double")
).drop("Estimated Median Income")

income_df.printSchema()

# Perform the spatial join using DataFrame API

# Perform the spatial join
spatial_join_df = (
    census_df.alias("c")
    .join(
        crime_df.alias("cr"),
        F.expr("ST_Contains(c.geometry, cr.geometry)"),  # Spatial join condition
        "inner"  # Join type (can also be "left", "right", etc.)
    )
    .select(
        F.col("c.geometry").alias("census_geometry"),
        F.col("cr.geometry").alias("crime_geometry"),
        F.col("c.ZCTA10"),
        F.col("c.Population"),
        F.col("c.COMM"),
        F.col("cr.DR_NO"),
    )
)
# Show the result
# Perform join based on ZCTA10

final_df = spatial_join_df.join(
    income_df,
    spatial_join_df["ZCTA10"] == income_df["Zip Code"],
    "left"
).groupBy(
    "COMM", "ZCTA10", "Population", "Total_Income"
).agg(
    F.count("DR_NO").alias("Total_Crimes"),
    (F.col("Total_Income") / F.col("Population")).alias("Avg_Income_Per_Person"),
    (F.count("DR_NO") / F.col("Population")).alias("Crime_Per_Person_Ratio")
)

# Replace null values if necessary
final_df = final_df.fillna({"Total_Income": 0, "Avg_Income_Per_Person": 0})

# Show the results
final_df.show(truncate=False)










# # # # Optional: Save the results to a file
# # # final_df.write.mode("overwrite").parquet("s3://group-bucket/q3_results.parquet")
# # # print("Results saved to s3://group-bucket/q3_results.parquet")



FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- Zip Code: integer (nullable = true)
 |-- Community: string (nullable = true)
 |-- Total_Income: double (nullable = true)

+--------------+------+----------+------------+------------+---------------------+----------------------+
|COMM          |ZCTA10|Population|Total_Income|Total_Crimes|Avg_Income_Per_Person|Crime_Per_Person_Ratio|
+--------------+------+----------+------------+------------+---------------------+----------------------+
|Downtown      |90014 |93        |23642.0     |1342        |254.21505376344086   |14.43010752688172     |
|East Hollywood|90029 |544       |33912.0     |200         |62.338235294117645   |0.36764705882352944   |
|Alsace        |90016 |457       |38330.0     |286         |83.87308533916848    |0.6258205689277899    |
|San Pedro     |90731 |2         |50879.0     |185         |25439.5              |92.5                  |
|Hollywood     |90038 |109       |33800.0     |98          |310.091743119266     |0.8990825688073395    |
|Mid-city      |9001