In [1]:
from pyspark.sql.functions import col, count, sum as _sum, collect_list, coalesce, lit, expr
from sedona.spark import *
from pyspark.sql import SparkSession
import time

# Function to create Spark session with specific configuration
def create_spark_session(executor_cores, executor_memory):
    return SparkSession.builder \
        .appName(f"Query4 - {executor_cores} cores, {executor_memory} memory") \
        .config("spark.executor.instances", 2) \
        .config("spark.executor.cores", executor_cores) \
        .config("spark.executor.memory", executor_memory) \
        .config("spark.driver.memory", "4g") \
        .getOrCreate()

# Function to execute the main processing logic
def main_processing(spark, config):
    # Start timing
    start_time = time.time()

    # Create Sedona Context
    sedona = SedonaContext.create(spark)

    # Load the census GeoJSON dataset
    geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"
    blocks_df = sedona.read.format("geojson") \
        .option("multiLine", "true").load(geojson_path) \
        .selectExpr("explode(features) as features") \
        .select("features.*")

    # Flatten GeoJSON properties
    flattened_df = blocks_df.select(
        [col(f"properties.{col_name}").alias(col_name) for col_name in
         blocks_df.schema["properties"].dataType.fieldNames()] + ["geometry"]
    ).drop("properties").drop("type")

    # Filter for Los Angeles City
    flattened_df = flattened_df.filter(col("CITY") == "Los Angeles")

    # Load crime datasets
    crime_data_path_1 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
    crime_data_path_2 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv"

    data_1 = spark.read.csv(crime_data_path_1, header=True, inferSchema=True)
    data_2 = spark.read.csv(crime_data_path_2, header=True, inferSchema=True)

    # Standardize column names (trim spaces)
    data_1 = data_1.select([col(c).alias(c.strip()) for c in data_1.columns])
    data_2 = data_2.select([col(c).alias(c.strip()) for c in data_2.columns])

    # Combine datasets
    combined_crime_df = data_1.unionByName(data_2)

    # Remove rows with empty Vict Descent
    combined_crime_df = combined_crime_df.filter((col("Vict Descent").isNotNull()) & (col("Vict Descent") != ""))

    # Create geometry points for crimes
    combined_crime_df = combined_crime_df.withColumn(
        "geom", ST_Point(col("LON"), col("LAT"))
    ).dropna(subset=["LON", "LAT"])

    # Perform spatial join to assign COMM to each crime
    crimes_with_comm = combined_crime_df.join(
        flattened_df, ST_Within(combined_crime_df.geom, flattened_df.geometry), "inner"
    ).select("COMM", "Vict Descent", "DR_NO")

    # Load income dataset
    income_dataset_path = "s3://groups-bucket-dblab-905418150721/group7/q3_results/median_income_per_person/part-00000-f13c9655-4e09-47ad-b0e7-080964d8ab97-c000.csv"
    income_df = spark.read.csv(income_dataset_path, header=True, inferSchema=True)

    # Get 3 COMM with highest and lowest income
    top_3_comm = income_df.orderBy(col("Median Income Per Person").desc()).limit(3).select("COMM")
    bottom_3_comm = income_df.orderBy(col("Median Income Per Person").asc()).limit(3).select("COMM")

    # Filter crimes for top and bottom 3 COMM
    top_3_crimes = crimes_with_comm.join(top_3_comm, "COMM", "inner")
    bottom_3_crimes = crimes_with_comm.join(bottom_3_comm, "COMM", "inner")

    # Load ethnicity codes
    ethnicity_codes_path = "s3://initial-notebook-data-bucket-dblab-905418150721/RE_codes.csv"
    ethnicity_df = spark.read.csv(ethnicity_codes_path, header=True, inferSchema=True)

    # Add ethnicity descriptions
    top_3_crimes = top_3_crimes.join(ethnicity_df, "Vict Descent", "left")
    bottom_3_crimes = bottom_3_crimes.join(ethnicity_df, "Vict Descent", "left")

    # Aggregate racial profile for top 3 COMM
    top_3_racial_profile = top_3_crimes.groupBy("COMM", "Vict Descent Full").agg(count("DR_NO").alias("victim_count"))

    # Aggregate racial profile for bottom 3 COMM
    bottom_3_racial_profile = bottom_3_crimes.groupBy("COMM", "Vict Descent Full").agg(count("DR_NO").alias("victim_count"))

    # Aggregate total victims per race/ethnicity and gather contributing COMM areas for top 3 COMM
    final_top_3_racial_profile = top_3_racial_profile.groupBy("Vict Descent Full") \
        .agg(
            _sum("victim_count").alias("total_victims"),
            collect_list("COMM").alias("comm_contributors")
        ).withColumnRenamed("Vict Descent Full", "vict_descent")

    # Extract individual COMM counts for top 3
    final_top_3_racial_profile = final_top_3_racial_profile.select(
        col("vict_descent"),
        col("total_victims"),
        col("comm_contributors")[0].alias("comm1"),
        coalesce(col("comm_contributors")[1], lit("N/A")).alias("comm2"),
        coalesce(col("comm_contributors")[2], lit("N/A")).alias("comm3")
    )
    final_top_3_racial_profile = final_top_3_racial_profile.orderBy(col("total_victims").desc())


    # Repeat for bottom 3 COMM
    final_bottom_3_racial_profile = bottom_3_racial_profile.groupBy("Vict Descent Full") \
        .agg(
            _sum("victim_count").alias("total_victims"),
            collect_list("COMM").alias("comm_contributors")
        ).withColumnRenamed("Vict Descent Full", "vict_descent")

    # Extract individual COMM counts for bottom 3
    final_bottom_3_racial_profile = final_bottom_3_racial_profile.select(
        col("vict_descent"),
        col("total_victims"),
        col("comm_contributors")[0].alias("comm1"),
        coalesce(col("comm_contributors")[1], lit("N/A")).alias("comm2"),
        coalesce(col("comm_contributors")[2], lit("N/A")).alias("comm3")
    )
    final_bottom_3_racial_profile = final_bottom_3_racial_profile.orderBy(col("total_victims").desc())

    # Save execution time
    end_time = time.time()
    execution_time = end_time - start_time
    print(f"Execution time: {execution_time} seconds")

    # Save results to separate paths
    output_path_final_top = "s3://groups-bucket-dblab-905418150721/group7/q4_results/final_top_3_racial_profile"
    output_path_final_bottom = "s3://groups-bucket-dblab-905418150721/group7/q4_results/final_bottom_3_racial_profile"

    final_top_3_racial_profile.write.option("header", True).mode("overwrite").csv(f"{output_path_final_top}/{config}")
    final_bottom_3_racial_profile.write.option("header", True).mode("overwrite").csv(f"{output_path_final_bottom}/{config}")

    # Show results
    print("Final Top 3 COMM Racial Profile")
    final_top_3_racial_profile.show(truncate=False)

    print("Final Bottom 3 COMM Racial Profile")
    final_bottom_3_racial_profile.show(truncate=False)
    
    return execution_time

# Test configurations
config =  {"cores": 4, "memory": "8g"}
    #{"cores": 1, "memory": "2g"}
    #{"cores": 2, "memory": "4g"}

                                                                                     
# Run for each configuration
print(f"Testing configuration: {config['cores']} cores, {config['memory']} memory")
spark = create_spark_session(config['cores'], config['memory'])
execution_time = main_processing(spark,f"{config['cores']}")
print(f"Configuration {config['cores']} cores, {config['memory']} memory executed in {execution_time} seconds")
spark.stop()    


Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
2562,application_1732639283265_2521,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Testing configuration: 4 cores, 8g memory
Execution time: 38.301408767700195 seconds
Final Top 3 COMM Racial Profile
+------------------------------+-------------+-------------------+-------------------+----------------+
|vict_descent                  |total_victims|comm1              |comm2              |comm3           |
+------------------------------+-------------+-------------------+-------------------+----------------+
|White                         |8429         |Pacific Palisades  |Palisades Highlands|Marina Peninsula|
|Other                         |1125         |Palisades Highlands|Pacific Palisades  |Marina Peninsula|
|Hispanic/Latin/Mexican        |868          |Palisades Highlands|Pacific Palisades  |Marina Peninsula|
|Unknown                       |651          |Pacific Palisades  |Palisades Highlands|Marina Peninsula|
|Black                         |462          |Pacific Palisades  |Palisades Highlands|Marina Peninsula|
|Other Asian                   |314          |Palis

Testing configuration: 1 cores, 2g memory
Execution time: 86.27664875984192 seconds

Testing configuration: 2 cores, 4g memory
Execution time: 25.129631757736206 seconds

Testing configuration: 4 cores, 8g memory
Execution time: 38.301408767700195 seconds