In [None]:
from pyspark.sql import SparkSession, functions as F
from sedona.register.geo_registrator import SedonaRegistrator
from sedona.spark import SedonaContext
from pyspark.sql.types import StringType

# Build (or reuse) the Spark session with the Sedona SQL extension configured,
# then hand it to SedonaContext.create; the returned `sedona` handle is the
# entry point used for the GeoJSON reader further down.
spark = (
    SparkSession.builder
    .appName("CrimeAnalysisSpatialJoinSQL")
    .config("spark.sql.extensions", "org.apache.sedona.sql.SedonaSqlExtensions")
    .config("sedona.global.charset", "utf8")
    .getOrCreate()
)

sedona = SedonaContext.create(spark)

# Input locations on S3 (nothing is read here; the loads happen below).
# The bucket roots are named once so that pointing the notebook at a different
# bucket is a single-line change instead of an edit to every path.
SHARED_DATA_BUCKET = "s3://initial-notebook-data-bucket-dblab-905418150721"
GROUP_BUCKET = "s3://groups-bucket-dblab-905418150721"

census_path = f"{SHARED_DATA_BUCKET}/2010_Census_Blocks.geojson"
income_path = f"{SHARED_DATA_BUCKET}/LA_income_2015.csv"
crime_path = f"{GROUP_BUCKET}/group24/results/q2_parquet_maindata/"

# Read the census GeoJSON through Sedona and turn the `features` array into
# one row per feature, exposing each feature's fields as top-level columns.
census_features = (
    sedona.read.format("geojson")
    .option("multiLine", "true")
    .load(census_path)
    .selectExpr("explode(features) as features")
    .select("features.*")
)

# Pull the needed attributes out of the nested `properties` struct and keep
# only rows whose population is strictly positive (rows with a NULL population
# fail the comparison and are dropped as well).
# NOTE(review): the spatial join further down uses this filtered frame, so
# crimes located inside blocks removed here are not counted.
census_df = census_features.select(
    F.col("properties.ZCTA10").alias("ZCTA10"),
    F.col("properties.POP_2010").alias("Population"),
    F.col("properties.COMM").alias("COMM"),
    F.col("geometry"),
    F.col("properties.HOUSING10").alias("Total_Households"),
).where(F.col("Population") > 0)

# Read the crime records (Parquet) and reduce them to a single point geometry
# per record. ST_Point receives LON first and LAT second.
crime_df = spark.read.parquet(crime_path).select(
    F.expr("ST_Point(LON, LAT)").alias("geometry")
)

# Read the income CSV (header row, inferred column types).
income_raw = spark.read.csv(income_path, header=True, inferSchema=True)

# "Estimated Median Income" is formatted text; strip every '$' and ','
# character before casting the remainder to a double.
income_as_double = F.regexp_replace(
    F.col("Estimated Median Income"), "[$,]", ""
).cast("double")

# Final shape: numeric `Income` added, "Zip Code" renamed to `Zip`, and the
# original text column removed.
income_df = (
    income_raw
    .withColumn("Income", income_as_double)
    .withColumnRenamed("Zip Code", "Zip")
    .drop("Estimated Median Income")
)

# Attach to every census block the median household income of its ZIP code.
# Inner join: blocks whose ZCTA10 has no row in the income table are dropped.
census_income_joined = census_df.join(
    income_df,
    census_df["ZCTA10"] == income_df["Zip"],
    "inner"
).select(
    F.col("ZCTA10"),
    F.col("COMM").alias("Community"),
    F.col("Population"),
    F.col("Total_Households"),
    F.col("Income").alias("Median_Household_Income")
)

# Per-community totals.
# Total_Income is accumulated block by block as
#     median household income of the block's ZIP * households in the block
# so every ZIP contributes in proportion to the households it actually covers.
# (Multiplying the plain per-block average of the medians by the community's
# total households would weight each ZIP by its number of blocks instead.)
# Average_Median_Household_Income is kept for reference only; it is no longer
# used to derive Total_Income.
community_aggregates = census_income_joined.groupBy("Community").agg(
    F.sum("Population").alias("Total_Population"),
    F.sum("Total_Households").alias("Total_Households"),
    F.avg("Median_Household_Income").alias("Average_Median_Household_Income"),
    F.sum(
        F.col("Median_Household_Income") * F.col("Total_Households")
    ).alias("Total_Income")
)

# Estimated income per resident of each community.
community_income_per_person = community_aggregates.withColumn(
    "Mean_Income_Per_Person",
    F.col("Total_Income") / F.col("Total_Population")
)
# Spatial join: pair every crime point with the census block whose geometry
# satisfies ST_Within(point, block); points matching no block are dropped by
# the inner join.
crimes_in_blocks = crime_df.alias("cr").join(
    census_df.alias("c"),
    on=F.expr("ST_Within(cr.geometry, c.geometry)"),
    how="inner",
)

# Number of matched crime points per community (column `COMM`).
crime_aggregates = crimes_in_blocks.groupBy("c.COMM").agg(
    F.count("*").alias("Total_Crimes")
)
# Combine income and crime figures per community.
# Left join keeps communities without any matched crime; their Total_Crimes is
# NULL after the join and is replaced by 0 before the ratio is derived.
# The ratio is computed exactly once, after the fill (previously it was also
# computed inside the select and then immediately overwritten).
final_results = (
    community_income_per_person.join(
        crime_aggregates,
        community_income_per_person["Community"] == crime_aggregates["COMM"],
        "left"
    )
    .select(
        F.col("Community"),
        F.col("Total_Population"),
        F.col("Mean_Income_Per_Person"),
        F.col("Total_Crimes")
    )
    .fillna({"Total_Crimes": 0})
    .withColumn(
        "Crime_Per_Person_Ratio",
        # Guard against a non-positive population; such rows get a ratio of 0.
        F.when(
            F.col("Total_Population") > 0,
            F.col("Total_Crimes") / F.col("Total_Population")
        ).otherwise(0)
    )
)

# show() and count() are separate Spark actions; without caching each one
# re-executes the whole pipeline, including the spatial join. Cache the result
# and run count() first so the cache is fully populated before show() reads it.
final_results = final_results.cache()
final_results_row_count = final_results.count()

final_results.show(150)
final_results_row_count

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [43]:
# NOTE(review): `final_df` is not defined in any cell visible in this notebook
# (the cell above builds `final_results`). Presumably it comes from earlier
# kernel state — confirm, otherwise this cell fails after Restart & Run All.
final_df.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

225