In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from sedona.register import SedonaRegistrator
from sedona.sql.types import GeometryType
from sedona.spark import *
from pyspark.sql.functions import regexp_replace,regexp_extract, col, split, explode, trim, avg
from pyspark.sql.types import DoubleType
import time

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
2615,application_1732639283265_2574,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [2]:
def run_query_4(executor_cores, executor_memory):
    """Report victim descent of 2015 crimes in the richest and poorest LA communities.

    Pipeline:
      1. Load 2015 median income per zip code and the 2010 census blocks
         (GeoJSON), join them on zip code and aggregate per community.
      2. Rank communities by estimated income per person and keep the
         geometries of the top 3 and the bottom 3.
      3. Load the 2010-2019 crime data, keep 2015, and spatially join the crime
         points with those geometries.
      4. Count crimes per victim descent code, translate the codes through the
         race/ethnicity lookup file, and print both tables.

    Args:
        executor_cores: Cores per executor; passed through ``str()`` into
            ``spark.executor.cores``.
        executor_memory: Memory per executor as a Spark size string
            (e.g. ``"2g"``), used for ``spark.executor.memory``.

    Returns:
        float: Wall-clock seconds from the start of data loading until both
        result tables have been materialised and shown.
    """
    bucket = "s3://initial-notebook-data-bucket-dblab-905418150721"

    # NOTE(review): getOrCreate() returns the already-running session when one
    # exists (the notebook reports "SparkSession available as 'spark'"), in
    # which case executor sizing set here may not take effect - confirm in the
    # Spark UI that each run really uses the requested resources.
    spark = (
        SparkSession.builder
        .appName("IncomeCriminalityPerCommunity")
        .master("yarn")
        .config("spark.executor.instances", "2")
        .config("spark.executor.cores", str(executor_cores))
        .config("spark.executor.memory", executor_memory)
        .config("spark.sql.extensions", "org.apache.sedona.sql.SedonaSqlExtensions")
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .config("spark.kryo.registrator", "org.apache.sedona.core.serde.SedonaKryoRegistrator")
        .getOrCreate()
    )

    # SedonaContext.create registers Sedona's SQL functions and types on the
    # session; the deprecated SedonaRegistrator.registerAll call is not needed.
    sedona = SedonaContext.create(spark)

    start_time_df = time.time()

    # Median income per zip code, restricted to Los Angeles communities.
    # "$12,345" style strings are stripped of "$" and "," and cast to double.
    la_income_2015_df = (
        spark.read.csv(f"{bucket}/LA_income_2015.csv", header=True, inferSchema=True)
        .filter(col("Community").startswith("Los Angeles"))
        .withColumn(
            "Estimated Median Income",
            regexp_replace(col("Estimated Median Income"), "[$,]", "").cast(DoubleType()),
        )
        .select(
            col("Zip Code").alias("zipcode"),
            col("Estimated Median Income"),
        )
    )

    # Census blocks: one row per GeoJSON feature.
    census_raw = (
        sedona.read.format("geojson")
        .option("multiLine", "true")
        .load(f"{bucket}/2010_Census_Blocks.geojson")
        .selectExpr("explode(features) as features")
        .select("features.*")
    )
    # Promote every field of the "properties" struct to a top-level column and
    # keep the geometry next to them.
    property_names = census_raw.schema["properties"].dataType.fieldNames()
    census_data = (
        census_raw
        .select([col(f"properties.{name}").alias(name) for name in property_names] + ["geometry"])
        .drop("properties")
        .drop("type")
    )
    census_transformed = census_data.filter(col("CITY") == "Los Angeles").select(
        col("ZCTA10").alias("zipcode"),
        col("geometry"),
        col("POP_2010"),
        col("COMM").alias("community"),
        col("HOUSING10"),
    )

    # Per community: total population, estimated total income
    # (housing units * median income of the block's zip code), merged geometry,
    # and the resulting income per person.
    census_income = (
        census_transformed
        .join(la_income_2015_df, on="zipcode", how="inner")
        .groupBy("community")
        .agg(
            sum("POP_2010").alias("population"),
            sum(col("HOUSING10") * col("Estimated Median Income")).alias("total income"),
            ST_Union_Aggr("geometry").alias("geometry"),
        )
        .withColumn("income per person", col("total income") / col("population"))
        # Division by a zero/NULL population yields NULL, and ascending sorts
        # put NULLs first - without this filter such communities would be
        # picked as the "lowest income" areas.
        .filter(col("income per person").isNotNull())
    )
    top_3_census = census_income.orderBy(desc("income per person")).select("geometry").limit(3)
    bottom_3_census = census_income.orderBy(asc("income per person")).select("geometry").limit(3)

    # Crimes that occurred in 2015, reduced to victim descent plus a Sedona
    # point built from longitude/latitude.
    crimes = (
        spark.read.csv(
            f"{bucket}/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv",
            header=True,
            inferSchema=True,
        )
        .withColumn("year", year(to_timestamp(col("DATE OCC"), "MM/dd/yyyy hh:mm:ss a")))
        .where(col("year") == 2015)
    )
    crimes_with_geom = crimes.select(
        col("Vict Descent").alias("vict_descent"),
        col("LAT").alias("lat"),
        col("LON").alias("lon"),
    ).withColumn("crime_point", ST_Point(col("lon"), col("lat")))

    # Race/ethnicity lookup; read without a header, so the code is in _c0 and
    # the readable name in _c1.
    recodes_df = spark.read.csv(f"{bucket}/RE_codes.csv")

    def _victim_descent_counts(areas_df):
        """Count crimes per victim descent that fall inside ``areas_df`` geometries."""
        crimes_in_areas = areas_df.alias("a").join(
            crimes_with_geom.alias("c"),
            on=ST_Contains(col("a.geometry"), col("c.crime_point")),
            how="inner",
        )
        counts = crimes_in_areas.groupBy("vict_descent").agg(count("*").alias("#"))
        # Each call joins on its *own* counts frame; the original bottom-3 join
        # referenced the top-3 frame's column by mistake.
        return (
            counts
            .join(recodes_df, on=(counts["vict_descent"] == recodes_df["_c0"]), how="inner")
            .orderBy(col("#").desc())
            .select(col("_c1").alias("Victim Descent"), col("#"))
        )

    top_3_grouped_full = _victim_descent_counts(top_3_census)
    bottom_3_grouped_full = _victim_descent_counts(bottom_3_census)

    # Everything above only builds lazy query plans; show() is the action that
    # actually runs them, so the timer must stop after the results are shown.
    print("The victim descent of the crimes commited in 2015 in the 3 areas with the highest income in los angeles:")
    top_3_grouped_full.show(truncate=False)
    print("The victim descent of the crimes commited in 2015 in the 3 areas with the lowest income in los angeles:")
    bottom_3_grouped_full.show(truncate=False)

    execution_time_df = time.time() - start_time_df
    print(f"Total execution time: {execution_time_df:.2f} seconds")

    # Callers assign the result (exec_time = run_query_4(...)), so return it.
    return execution_time_df


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# Benchmark configuration: 1 core and 2 GB of memory per executor.
cores, memory = 1, "2g"
print(f"\nRunning query with {cores} core(s) and {memory} memory per executor...")
exec_time = run_query_4(executor_cores=cores, executor_memory=memory)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…


Running query with 1 core(s) and 2g memory per executor...
The victim descent of the crimes commited in 2015 in the 3 areas with the highest income in los angeles:
+------------------------------+---+
|Victim Descent                |#  |
+------------------------------+---+
|White                         |642|
|Other                         |83 |
|Hispanic/Latin/Mexican        |71 |
|Unknown                       |47 |
|Black                         |41 |
|Other Asian                   |22 |
|Chinese                       |1  |
|American Indian/Alaskan Native|1  |
+------------------------------+---+

The victim descent of the crimes commited in 2015 in the 3 areas with the lowest income in los angeles:
+------------------------------+----+
|Victim Descent                |#   |
+------------------------------+----+
|Hispanic/Latin/Mexican        |3342|
|Black                         |1127|
|White                         |428 |
|Other                         |252 |
|Other Asian        

In [3]:
# Benchmark configuration: 2 cores and 4 GB of memory per executor.
cores, memory = 2, "4g"
print(f"\nRunning query with {cores} core(s) and {memory} memory per executor...")
exec_time = run_query_4(executor_cores=cores, executor_memory=memory)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…


Running query with 2 core(s) and 4g memory per executor...
The victim descent of the crimes commited in 2015 in the 3 areas with the highest income in los angeles:
+------------------------------+---+
|Victim Descent                |#  |
+------------------------------+---+
|White                         |642|
|Other                         |83 |
|Hispanic/Latin/Mexican        |71 |
|Unknown                       |47 |
|Black                         |41 |
|Other Asian                   |22 |
|American Indian/Alaskan Native|1  |
|Chinese                       |1  |
+------------------------------+---+

The victim descent of the crimes commited in 2015 in the 3 areas with the lowest income in los angeles:
+------------------------------+----+
|Victim Descent                |#   |
+------------------------------+----+
|Hispanic/Latin/Mexican        |3342|
|Black                         |1127|
|White                         |428 |
|Other                         |252 |
|Other Asian        

In [3]:
# Benchmark configuration: 4 cores and 8 GB of memory per executor.
cores, memory = 4, "8g"
print(f"\nRunning query with {cores} core(s) and {memory} memory per executor...")
exec_time = run_query_4(executor_cores=cores, executor_memory=memory)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…


Running query with 4 core(s) and 8g memory per executor...
The victim descent of the crimes commited in 2015 in the 3 areas with the highest income in los angeles:
+------------------------------+---+
|Victim Descent                |#  |
+------------------------------+---+
|White                         |642|
|Other                         |83 |
|Hispanic/Latin/Mexican        |71 |
|Unknown                       |47 |
|Black                         |41 |
|Other Asian                   |22 |
|American Indian/Alaskan Native|1  |
|Chinese                       |1  |
+------------------------------+---+

The victim descent of the crimes commited in 2015 in the 3 areas with the lowest income in los angeles:
+------------------------------+----+
|Victim Descent                |#   |
+------------------------------+----+
|Hispanic/Latin/Mexican        |3342|
|Black                         |1127|
|White                         |428 |
|Other                         |252 |
|Other Asian        