In [1]:
from sedona.spark import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, expr, count, regexp_replace, collect_list, first, trim
import time

# Initialize a Spark session with Sedona support for geospatial operations
spark = SparkSession.builder \
    .appName("Query_3") \
    .getOrCreate()

# Initialize Sedona context and register geospatial functions
sedona = SedonaContext.create(spark)
SedonaRegistrator.registerAll(spark)

# Load GeoJSON data for population and census blocks
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"
population_df_1 = sedona.read.format("geojson") \
    .option("multiLine", "true").load(geojson_path) \
    .selectExpr("explode(features) as features") \
    .select("features.*")

income_df_1 = spark.read.format("csv").option("header", "true").load("s3://initial-notebook-data-bucket-dblab-905418150721/LA_income_2015.csv")

# Start a timer to measure execution time
start_time_df = time.time()

# Extract relevant columns from population GeoJSON and cast them to the appropriate types
population_df = population_df_1.select(
    col("properties.COMM").alias("area"),
    col("properties.POP_2010").cast("float").alias("population"),
    col("properties.ZCTA10").cast("float").alias("zip"),
    col("geometry").alias("geometry1"),
    col("properties.HOUSING10").alias("households")
)


# Aggregate population and household data by area and zip
population_aggregated_df = population_df.groupBy("area", "zip").agg(
    sum("population").cast("float").alias("total_population"),
    sum("households").cast("float").alias("total_housholds"),
    collect_list("geometry1").alias("geometry_array")
)

# Perform a geometric union of all geometries within each area and zip
population_geometry_combined = population_aggregated_df.select(
    "area",
    "zip",
    "total_population",
    ST_Union("geometry_array").alias("combined_geometry")
)

# Process income data: clean up income field and cast it to float
income_df = income_df_1.select(
    col("Community").alias("area"),
    col("Zip Code").alias("zip"),
    regexp_replace(col("Estimated Median Income"), r'[\$,]', '').cast("float").alias("household_income")
)

# Join population data with income data based on area and zip
joined1_df = population_aggregated_df.join(
    income_df,
    (income_df["area"].contains(population_aggregated_df["area"])) &
    (population_aggregated_df["zip"] == income_df["zip"]),
    how="inner").drop(income_df["area"])

# Ensure columns are properly cast for further calculations
joined1_df = joined1_df.withColumn(
    "total_population", col("total_population").cast("float")
).withColumn(
    "household_income", col("household_income").cast("float")
).withColumn(
    "total_housholds", col("total_housholds").cast("float")
)

# Calculate total household income and total population for each area
area_aggregated_df = joined1_df.groupBy("area").agg(
    sum(col("total_housholds") * col("household_income")).cast("float").alias("total_housholds_income"),
    sum("total_population").cast("float").alias("total_population_sum")
)

# Compute average income per person in each area
result1_df = area_aggregated_df.withColumn(
    "average_income_per_person",
    (col("total_housholds_income")) / col("total_population_sum")
)

# Load crime data and filter invalid geographical entries
crime_df = spark.read.format("csv").option("header", "true").load("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/")
crime_df = crime_df.filter((col("LAT") != 0) & (col("LON") != 0))
crime_df = crime_df.withColumn("geography", ST_Point("LON", "LAT"))
crime_df = crime_df.select(col("geography"))

# Spatial join between crimes and combined population geometries
joined2_df = crime_df.join(
    population_geometry_combined, ST_Within(crime_df.geography, population_geometry_combined.combined_geometry)
)

# Aggregate crime counts and total population for each area and zip
joined2_df = joined2_df.groupBy("area", "zip").agg(
    count("*").cast("float").alias("total_crimes"),
    first("total_population").cast("float").alias("total_population")
)

# Aggregate crime data by area
joined2_df = joined2_df.groupBy("area").agg(
    sum(col("total_crimes")).cast("float").alias("total_population_crime"),
    sum("total_population").cast("float").alias("total_population_sum")
)

# Calculate crime ratio for each area
result2_df = joined2_df.withColumn(
    "crime_ratio",
    (col("total_population_crime")) / col("total_population_sum")
)

# Combine crime data with average income data
final_df = result2_df.join(
    result1_df.select("area", "average_income_per_person"),
    on="area",
    how="inner"
).select("area", "crime_ratio", "average_income_per_person")

# Filter out rows with null or empty area values
final_df = final_df.filter(final_df["area"].isNotNull() & (trim(final_df["area"]) != ""))

# Display final results
final_df.orderBy(col("crime_ratio").desc()).show()

# Measure execution time for the entire DataFrame API process
execution_time_df = time.time() - start_time_df
print(f"Execution time for DataFrame API: {execution_time_df} seconds")


Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
3141,application_1732639283265_3097,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------+------------------+-------------------------+
|         area|       crime_ratio|average_income_per_person|
+-------------+------------------+-------------------------+
|       Vernon| 6.339285714285714|        4406.446428571428|
|     Downtown| 4.208144796380091|       19107.378746594004|
| Little Tokyo| 4.020673360897814|       23815.344594594593|
|    Hollywood|1.5511440107671601|        25648.05024674742|
|    Chinatown|1.2563501367721766|        14058.45616777387|
|  Rancho Park|1.2552819698173154|        38740.06417791898|
|       Venice|1.2549272030651342|         45826.1724004093|
| Leimert Park|1.2148193014334747|       17064.604636118598|
|Baldwin Hills|1.1940496560393896|       16899.742774277427|
|  Westchester|1.1896619947102067|       30705.685088633993|
|    Hyde Park| 1.083026160397953|       14103.120570456438|
| Park La Brea| 1.025207944321847|        36619.89882872178|
|     Westlake|1.0028551557524068|         10639.3003783415|
|        Watts|0.9950576