In [1]:
import time
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, count, desc, rank, sum as _sum
from pyspark.sql.types import *
from pyspark.sql.window import Window
from sedona.spark import *

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
2779,application_1732639283265_2738,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Case No1 

2 executors
2g memory
1 core

In [2]:
# Start Spark Session
spark = SparkSession.builder \
    .appName("Query 4") \
    .config("spark.executor.instances", "2") \
    .config("spark.executor.memory", "2g") \
    .config("spark.executor.cores", "1") \
    .getOrCreate()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
# paths
crime_data_2010_2019 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
crime_data_2020 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv"
income = "s3://initial-notebook-data-bucket-dblab-905418150721/LA_income_2015.csv"
census_blocks = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"
race_ethnicity_codes = "s3://initial-notebook-data-bucket-dblab-905418150721/RE_codes.csv"

# Load datasets
crime_data = spark.read.csv(crime_data_2010_2019, header=True, inferSchema=True).union(
    spark.read.csv(crime_data_2020, header=True, inferSchema=True))
income_data = spark.read.csv(income, header=True, inferSchema=True, quote='"', escape='"')
census_data = spark.read.json(census_blocks)
race_codes_data = spark.read.csv(race_ethnicity_codes, header=True, inferSchema=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
start_time_1 = time.time()

# Create sedona context
sedona = SedonaContext.create(spark)

blocks_data = sedona.read.format("geojson") \
            .option("multiLine", "true").load(census_blocks) \
            .selectExpr("explode(features) as features") \
            .select("features.*")

# Formatting magic
flattened_data = blocks_data.select( \
                [col(f"properties.{col_name}").alias(col_name) for col_name in \
                blocks_data.schema["properties"].dataType.fieldNames()] + ["geometry"]) \
            .drop("properties") \
            .drop("type")

# 1. Normalize and Aggregate Population Data
# Aggregate population data by ZCTA10
population_data = flattened_data.select(
    col("ZCTA10").alias("Zip_Code"),
    col("POP_2010").alias("Population")
).groupBy("Zip_Code").agg(_sum("Population").alias("Total_Population"))
# population_df.show(5)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
from pyspark.sql.functions import regexp_replace, col

income_data = income_data.withColumnRenamed("Estimated Median Income", "Median_Income") \
                     .withColumnRenamed("Zip Code", "Zip_Code") \
                     .withColumn("Median_Income", regexp_replace(col("Median_Income"), "[$,]", "").cast("double"))

#filter only for communities in the los angeles municipality
income_data = income_data.filter(col("Community").contains("Los Angeles"))
# income_df.show(5)

# Join population and income data on Zip_Code
pop_income_data = population_data.join(income_data, on="Zip_Code", how="inner")
# pop_income_df.show(5)

# Calculate per capita income
income_population_data = pop_income_data.withColumn(
    "Per_Capita_Income",
    (col("Median_Income") / col("Total_Population"))
)

# Show the result
income_population_data.show(5)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------+----------------+--------------------+-------------+------------------+
|Zip_Code|Total_Population|           Community|Median_Income| Per_Capita_Income|
+--------+----------------+--------------------+-------------+------------------+
|   91326|           33708|Los Angeles (Nort...|     101073.0|2.9984870060519757|
|   90094|            5464|Los Angeles (Play...|     104367.0| 19.10084187408492|
|   90026|           67869|Los Angeles (Echo...|      47993.0|0.7071416994504118|
|   90029|           38617|Los Angeles (East...|      33912.0|0.8781624673071445|
|   90067|            2424|Los Angeles (Cent...|      97891.0|40.384075907590756|
+--------+----------------+--------------------+-------------+------------------+
only showing top 5 rows

In [6]:
# Get the top 3 areas with the highest per capita income
top_areas_data = income_population_data.orderBy(col("Per_Capita_Income").desc()).limit(3)

# Show the result
top_areas_data.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------+----------------+--------------------+-------------+------------------+
|Zip_Code|Total_Population|           Community|Median_Income| Per_Capita_Income|
+--------+----------------+--------------------+-------------+------------------+
|   90067|            2424|Los Angeles (Cent...|      97891.0|40.384075907590756|
|   90094|            5464|Los Angeles (Play...|     104367.0| 19.10084187408492|
|   90077|            9377|Los Angeles (Bel ...|     164281.0|17.519569158579504|
+--------+----------------+--------------------+-------------+------------------+

In [7]:
# Get the bottom 3 areas with the lowest per capita income
bottom_areas_data = income_population_data.orderBy(col("Per_Capita_Income").asc()).limit(3)

# Show the result
bottom_areas_data.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------+----------------+--------------------+-------------+-------------------+
|Zip_Code|Total_Population|           Community|Median_Income|  Per_Capita_Income|
+--------+----------------+--------------------+-------------+-------------------+
|   90011|          103892|Los Angeles (Sout...|      30251.0|  0.291177376506372|
|   90044|           89779|Athens, Los Angel...|      29206.0| 0.3253099277113802|
|   90037|           62276|Los Angeles (Sout...|      27179.0|0.43642815852013617|
+--------+----------------+--------------------+-------------+-------------------+

In [8]:
# Aggregate geometries by ZIP code
zip_geometries = flattened_data.groupBy("ZCTA10").agg(
    ST_Union_Aggr("geometry").alias("geometry")
).withColumnRenamed("ZCTA10", "Zip_Code")

# Show the result
#zip_geometries.show(5)

# Create geometry column for crime_df
crime_data = crime_data.withColumn("geom", ST_Point("LON", "LAT"))

# Perform geospatial join
crime_data = crime_data.join(
    zip_geometries, 
    ST_Within(crime_data["geom"], zip_geometries["geometry"]),
    "inner"
)

#crime_df.show()

from pyspark.sql.functions import col

# Extract the year from the `Date Rptd` field and filter for 2015
crime_2015_data = crime_data.filter(col("Date Rptd").substr(7, 4) == "2015")

# Show the filtered data
#crime_2015_df.show(5)

# Join crime data with race codes
crime_race_data = crime_2015_data.join(race_codes_data, on="Vict Descent", how="inner")

# Show the joined result
crime_race_data.show(5)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------+---------+--------------------+--------------------+--------+-----+---------+-----------+--------+------+--------------------+---------+--------+--------+---------+--------------------+--------------+-----------+------+-----------+--------+--------+--------+--------+--------------------+--------------------+-------+---------+--------------------+--------+--------------------+--------------------+
|Vict Descent|    DR_NO|           Date Rptd|            DATE OCC|TIME OCC|AREA |AREA NAME|Rpt Dist No|Part 1-2|Crm Cd|         Crm Cd Desc|  Mocodes|Vict Age|Vict Sex|Premis Cd|         Premis Desc|Weapon Used Cd|Weapon Desc|Status|Status Desc|Crm Cd 1|Crm Cd 2|Crm Cd 3|Crm Cd 4|            LOCATION|        Cross Street|    LAT|      LON|                geom|Zip_Code|            geometry|   Vict Descent Full|
+------------+---------+--------------------+--------------------+--------+-----+---------+-----------+--------+------+--------------------+---------+--------+--------+----

The substr function extracts characters from a string column:
substr(7, 4) means "start at the 7th character and take the next 4 characters" (which corresponds to the year in MM/DD/YYYY format).
The filter then checks if the extracted year equals 2015.

In [9]:
# Top areas racial profile
top_racial_profile_data = crime_race_data.join(
    top_areas_data, on="Zip_Code", how="inner"
).groupBy("Vict Descent Full").agg(
    count("*").alias("Total_Victims")
).orderBy(col("Total_Victims").desc())

# Bottom areas racial profile
bottom_racial_profile_data = crime_race_data.join(
    bottom_areas_data, on="Zip_Code", how="inner"
).groupBy("Vict Descent Full").agg(
    count("*").alias("Total_Victims")
).orderBy(col("Total_Victims").desc())

# Show results
print(f"Top areas racial profile")
top_racial_profile_data.show()
print(f"Bottom areas racial profile")
bottom_racial_profile_data.show()

end_time_1 = time.time()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Top areas racial profile
+--------------------+-------------+
|   Vict Descent Full|Total_Victims|
+--------------------+-------------+
|               White|          272|
|               Other|           97|
|Hispanic/Latin/Me...|           45|
|               Black|           29|
|             Unknown|           20|
|         Other Asian|           17|
|            Filipino|            1|
+--------------------+-------------+

Bottom areas racial profile
+--------------------+-------------+
|   Vict Descent Full|Total_Victims|
+--------------------+-------------+
|Hispanic/Latin/Me...|         7239|
|               Black|         4572|
|               Other|          412|
|               White|          363|
|         Other Asian|           69|
|             Unknown|           54|
|            Filipino|            3|
|            Japanese|            2|
|             Chinese|            2|
|              Korean|            2|
|           Guamanian|            1|
|American Indian/A...

In [10]:
elapsed_time_2_2_1 = end_time_1 - start_time_1
print(f"2 executors 2g memory 1 core: {elapsed_time_2_2_1:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

2 executors 2g memory 1 core: 174.58 seconds

## Case No2 

2 executors
4g memory
2 core

In [11]:
# Start Spark Session
spark = SparkSession.builder \
    .appName("Query 4") \
    .config("spark.executor.instances", "2") \
    .config("spark.executor.memory", "4g") \
    .config("spark.executor.cores", "2") \
    .getOrCreate()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [12]:
# paths
crime_data_2010_2019 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
crime_data_2020 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv"
income = "s3://initial-notebook-data-bucket-dblab-905418150721/LA_income_2015.csv"
race_ethnicity_codes = "s3://initial-notebook-data-bucket-dblab-905418150721/RE_codes.csv"

# Load datasets
crime_data = spark.read.csv(crime_data_2010_2019, header=True, inferSchema=True).union(
    spark.read.csv(crime_data_2020, header=True, inferSchema=True))
income_data = spark.read.csv(income, header=True, inferSchema=True, quote='"', escape='"')
race_codes_data = spark.read.csv(race_ethnicity_codes, header=True, inferSchema=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [13]:
start_time_2 = time.time()

# Create sedona context
sedona = SedonaContext.create(spark)

blocks_data = sedona.read.format("geojson") \
            .option("multiLine", "true").load(census_blocks) \
            .selectExpr("explode(features) as features") \
            .select("features.*")

# Formatting magic
flattened_data = blocks_data.select( \
                [col(f"properties.{col_name}").alias(col_name) for col_name in \
                blocks_data.schema["properties"].dataType.fieldNames()] + ["geometry"]) \
            .drop("properties") \
            .drop("type")

# 1. Normalize and Aggregate Population Data
# Aggregate population data by ZCTA10
population_data = flattened_data.select(
    col("ZCTA10").alias("Zip_Code"),
    col("POP_2010").alias("Population")
).groupBy("Zip_Code").agg(_sum("Population").alias("Total_Population"))
# population_df.show(5)

from pyspark.sql.functions import regexp_replace, col

income_data = income_data.withColumnRenamed("Estimated Median Income", "Median_Income") \
                     .withColumnRenamed("Zip Code", "Zip_Code") \
                     .withColumn("Median_Income", regexp_replace(col("Median_Income"), "[$,]", "").cast("double"))

#filter only for communities in the los angeles municipality
income_data = income_data.filter(col("Community").contains("Los Angeles"))
# income_df.show(5)

# Join population and income data on Zip_Code
pop_income_data = population_data.join(income_data, on="Zip_Code", how="inner")
# pop_income_df.show(5)

# Calculate per capita income
income_population_data = pop_income_data.withColumn(
    "Per_Capita_Income",
    (col("Median_Income") / col("Total_Population"))
)

# Show the result
income_population_data.show(5)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------+----------------+--------------------+-------------+------------------+
|Zip_Code|Total_Population|           Community|Median_Income| Per_Capita_Income|
+--------+----------------+--------------------+-------------+------------------+
|   91326|           33708|Los Angeles (Nort...|     101073.0|2.9984870060519757|
|   90094|            5464|Los Angeles (Play...|     104367.0| 19.10084187408492|
|   90026|           67869|Los Angeles (Echo...|      47993.0|0.7071416994504118|
|   90029|           38617|Los Angeles (East...|      33912.0|0.8781624673071445|
|   90067|            2424|Los Angeles (Cent...|      97891.0|40.384075907590756|
+--------+----------------+--------------------+-------------+------------------+
only showing top 5 rows

In [14]:
# Get the top 3 areas with the highest per capita income
top_areas_data = income_population_data.orderBy(col("Per_Capita_Income").desc()).limit(3)

# Show the result
top_areas_data.show()

# Get the bottom 3 areas with the lowest per capita income
bottom_areas_data = income_population_data.orderBy(col("Per_Capita_Income").asc()).limit(3)

# Show the result
bottom_areas_data.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------+----------------+--------------------+-------------+------------------+
|Zip_Code|Total_Population|           Community|Median_Income| Per_Capita_Income|
+--------+----------------+--------------------+-------------+------------------+
|   90067|            2424|Los Angeles (Cent...|      97891.0|40.384075907590756|
|   90094|            5464|Los Angeles (Play...|     104367.0| 19.10084187408492|
|   90077|            9377|Los Angeles (Bel ...|     164281.0|17.519569158579504|
+--------+----------------+--------------------+-------------+------------------+

+--------+----------------+--------------------+-------------+-------------------+
|Zip_Code|Total_Population|           Community|Median_Income|  Per_Capita_Income|
+--------+----------------+--------------------+-------------+-------------------+
|   90011|          103892|Los Angeles (Sout...|      30251.0|  0.291177376506372|
|   90044|           89779|Athens, Los Angel...|      29206.0| 0.3253099277113802|
|   90037|

In [15]:
# Aggregate geometries by ZIP code
zip_geometries = flattened_data.groupBy("ZCTA10").agg(
    ST_Union_Aggr("geometry").alias("geometry")
).withColumnRenamed("ZCTA10", "Zip_Code")

# Show the result
#zip_geometries.show(5)

# Create geometry column for crime_df
crime_data = crime_data.withColumn("geom", ST_Point("LON", "LAT"))

# Perform geospatial join
crime_data = crime_data.join(
    zip_geometries, 
    ST_Within(crime_data["geom"], zip_geometries["geometry"]),
    "inner"
)

#crime_df.show()

from pyspark.sql.functions import col

# Extract the year from the `Date Rptd` field and filter for 2015
crime_2015_data = crime_data.filter(col("Date Rptd").substr(7, 4) == "2015")

# Show the filtered data
#crime_2015_df.show(5)

# Join crime data with race codes
crime_race_data = crime_2015_data.join(race_codes_data, on="Vict Descent", how="inner")

# Show the joined result
crime_race_data.show(5)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------+---------+--------------------+--------------------+--------+-----+---------+-----------+--------+------+--------------------+--------------+--------+--------+---------+--------------------+--------------+-----------+------+------------+--------+--------+--------+--------+--------------------+--------------------+-------+---------+--------------------+--------+--------------------+-----------------+
|Vict Descent|    DR_NO|           Date Rptd|            DATE OCC|TIME OCC|AREA |AREA NAME|Rpt Dist No|Part 1-2|Crm Cd|         Crm Cd Desc|       Mocodes|Vict Age|Vict Sex|Premis Cd|         Premis Desc|Weapon Used Cd|Weapon Desc|Status| Status Desc|Crm Cd 1|Crm Cd 2|Crm Cd 3|Crm Cd 4|            LOCATION|        Cross Street|    LAT|      LON|                geom|Zip_Code|            geometry|Vict Descent Full|
+------------+---------+--------------------+--------------------+--------+-----+---------+-----------+--------+------+--------------------+--------------+--------+--

In [16]:
# Top areas racial profile
top_racial_profile_data = crime_race_data.join(
    top_areas_data, on="Zip_Code", how="inner"
).groupBy("Vict Descent Full").agg(
    count("*").alias("Total_Victims")
).orderBy(col("Total_Victims").desc())

# Bottom areas racial profile
bottom_racial_profile_data = crime_race_data.join(
    bottom_areas_data, on="Zip_Code", how="inner"
).groupBy("Vict Descent Full").agg(
    count("*").alias("Total_Victims")
).orderBy(col("Total_Victims").desc())

# Show results
print(f"Top areas racial profile")
top_racial_profile_data.show()
print(f"Bottom areas racial profile")
bottom_racial_profile_data.show()

end_time_2 = time.time()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Top areas racial profile
+--------------------+-------------+
|   Vict Descent Full|Total_Victims|
+--------------------+-------------+
|               White|          272|
|               Other|           97|
|Hispanic/Latin/Me...|           45|
|               Black|           29|
|             Unknown|           20|
|         Other Asian|           17|
|            Filipino|            1|
+--------------------+-------------+

Bottom areas racial profile
+--------------------+-------------+
|   Vict Descent Full|Total_Victims|
+--------------------+-------------+
|Hispanic/Latin/Me...|         7239|
|               Black|         4572|
|               Other|          412|
|               White|          363|
|         Other Asian|           69|
|             Unknown|           54|
|            Filipino|            3|
|             Chinese|            2|
|            Japanese|            2|
|              Korean|            2|
|           Guamanian|            1|
|American Indian/A...

In [17]:
elapsed_time_2_4_2 = end_time_2 - start_time_2
print(f"2 executors 4g memory 2 core: {elapsed_time_2_4_2:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

2 executors 4g memory 2 core: 119.00 seconds

## Case No3 

2 executors
8g memory
4 core

In [18]:
# Start Spark Session
spark = SparkSession.builder \
    .appName("Query 4") \
    .config("spark.executor.instances", "2") \
    .config("spark.executor.memory", "8g") \
    .config("spark.executor.cores", "4") \
    .getOrCreate()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [19]:
# paths
crime_data_2010_2019 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
crime_data_2020 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv"
income = "s3://initial-notebook-data-bucket-dblab-905418150721/LA_income_2015.csv"
race_ethnicity_codes = "s3://initial-notebook-data-bucket-dblab-905418150721/RE_codes.csv"

# Load datasets
crime_data = spark.read.csv(crime_data_2010_2019, header=True, inferSchema=True).union(
    spark.read.csv(crime_data_2020, header=True, inferSchema=True))
income_data = spark.read.csv(income, header=True, inferSchema=True, quote='"', escape='"')
race_codes_data = spark.read.csv(race_ethnicity_codes, header=True, inferSchema=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [20]:
start_time_3 = time.time()

# Create sedona context
sedona = SedonaContext.create(spark)

blocks_data = sedona.read.format("geojson") \
            .option("multiLine", "true").load(census_blocks) \
            .selectExpr("explode(features) as features") \
            .select("features.*")

# Formatting magic
flattened_data = blocks_data.select( \
                [col(f"properties.{col_name}").alias(col_name) for col_name in \
                blocks_data.schema["properties"].dataType.fieldNames()] + ["geometry"]) \
            .drop("properties") \
            .drop("type")

# 1. Normalize and Aggregate Population Data
# Aggregate population data by ZCTA10
population_data = flattened_data.select(
    col("ZCTA10").alias("Zip_Code"),
    col("POP_2010").alias("Population")
).groupBy("Zip_Code").agg(_sum("Population").alias("Total_Population"))
# population_df.show(5)

from pyspark.sql.functions import regexp_replace, col

income_data = income_data.withColumnRenamed("Estimated Median Income", "Median_Income") \
                     .withColumnRenamed("Zip Code", "Zip_Code") \
                     .withColumn("Median_Income", regexp_replace(col("Median_Income"), "[$,]", "").cast("double"))

#filter only for communities in the los angeles municipality
income_data = income_data.filter(col("Community").contains("Los Angeles"))
# income_df.show(5)

# Join population and income data on Zip_Code
pop_income_data = population_data.join(income_data, on="Zip_Code", how="inner")
# pop_income_df.show(5)

# Calculate per capita income
income_population_data = pop_income_data.withColumn(
    "Per_Capita_Income",
    (col("Median_Income") / col("Total_Population"))
)

# Show the result
income_population_data.show(5)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------+----------------+--------------------+-------------+------------------+
|Zip_Code|Total_Population|           Community|Median_Income| Per_Capita_Income|
+--------+----------------+--------------------+-------------+------------------+
|   91326|           33708|Los Angeles (Nort...|     101073.0|2.9984870060519757|
|   90094|            5464|Los Angeles (Play...|     104367.0| 19.10084187408492|
|   90026|           67869|Los Angeles (Echo...|      47993.0|0.7071416994504118|
|   90029|           38617|Los Angeles (East...|      33912.0|0.8781624673071445|
|   90067|            2424|Los Angeles (Cent...|      97891.0|40.384075907590756|
+--------+----------------+--------------------+-------------+------------------+
only showing top 5 rows

In [21]:
# Get the top 3 areas with the highest per capita income
top_areas_data = income_population_data.orderBy(col("Per_Capita_Income").desc()).limit(3)

# Show the result
top_areas_data.show()

# Get the bottom 3 areas with the lowest per capita income
bottom_areas_data = income_population_data.orderBy(col("Per_Capita_Income").asc()).limit(3)

# Show the result
bottom_areas_data.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------+----------------+--------------------+-------------+------------------+
|Zip_Code|Total_Population|           Community|Median_Income| Per_Capita_Income|
+--------+----------------+--------------------+-------------+------------------+
|   90067|            2424|Los Angeles (Cent...|      97891.0|40.384075907590756|
|   90094|            5464|Los Angeles (Play...|     104367.0| 19.10084187408492|
|   90077|            9377|Los Angeles (Bel ...|     164281.0|17.519569158579504|
+--------+----------------+--------------------+-------------+------------------+

+--------+----------------+--------------------+-------------+-------------------+
|Zip_Code|Total_Population|           Community|Median_Income|  Per_Capita_Income|
+--------+----------------+--------------------+-------------+-------------------+
|   90011|          103892|Los Angeles (Sout...|      30251.0|  0.291177376506372|
|   90044|           89779|Athens, Los Angel...|      29206.0| 0.3253099277113802|
|   90037|

In [22]:
# Aggregate geometries by ZIP code
zip_geometries = flattened_data.groupBy("ZCTA10").agg(
    ST_Union_Aggr("geometry").alias("geometry")
).withColumnRenamed("ZCTA10", "Zip_Code")

# Show the result
#zip_geometries.show(5)

# Create geometry column for crime_df
crime_data = crime_data.withColumn("geom", ST_Point("LON", "LAT"))

# Perform geospatial join
crime_data = crime_data.join(
    zip_geometries, 
    ST_Within(crime_data["geom"], zip_geometries["geometry"]),
    "inner"
)

#crime_df.show()

from pyspark.sql.functions import col

# Extract the year from the `Date Rptd` field and filter for 2015
crime_2015_data = crime_data.filter(col("Date Rptd").substr(7, 4) == "2015")

# Show the filtered data
#crime_2015_df.show(5)

# Join crime data with race codes
crime_race_data = crime_2015_data.join(race_codes_data, on="Vict Descent", how="inner")

# Show the joined result
crime_race_data.show(5)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------+---------+--------------------+--------------------+--------+-----+---------+-----------+--------+------+--------------------+--------------------+--------+--------+---------+--------------------+--------------+-----------+------+-----------+--------+--------+--------+--------+--------------------+------------+-------+---------+--------------------+--------+--------------------+-----------------+
|Vict Descent|    DR_NO|           Date Rptd|            DATE OCC|TIME OCC|AREA |AREA NAME|Rpt Dist No|Part 1-2|Crm Cd|         Crm Cd Desc|             Mocodes|Vict Age|Vict Sex|Premis Cd|         Premis Desc|Weapon Used Cd|Weapon Desc|Status|Status Desc|Crm Cd 1|Crm Cd 2|Crm Cd 3|Crm Cd 4|            LOCATION|Cross Street|    LAT|      LON|                geom|Zip_Code|            geometry|Vict Descent Full|
+------------+---------+--------------------+--------------------+--------+-----+---------+-----------+--------+------+--------------------+--------------------+--------+--

In [23]:
# Top areas racial profile
top_racial_profile_data = crime_race_data.join(
    top_areas_data, on="Zip_Code", how="inner"
).groupBy("Vict Descent Full").agg(
    count("*").alias("Total_Victims")
).orderBy(col("Total_Victims").desc())

# Bottom areas racial profile
bottom_racial_profile_data = crime_race_data.join(
    bottom_areas_data, on="Zip_Code", how="inner"
).groupBy("Vict Descent Full").agg(
    count("*").alias("Total_Victims")
).orderBy(col("Total_Victims").desc())

# Show results
print(f"Top areas racial profile")
top_racial_profile_data.show()
print(f"Bottom areas racial profile")
bottom_racial_profile_data.show()

end_time_3 = time.time()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Top areas racial profile
+--------------------+-------------+
|   Vict Descent Full|Total_Victims|
+--------------------+-------------+
|               White|          272|
|               Other|           97|
|Hispanic/Latin/Me...|           45|
|               Black|           29|
|             Unknown|           20|
|         Other Asian|           17|
|            Filipino|            1|
+--------------------+-------------+

Bottom areas racial profile
+--------------------+-------------+
|   Vict Descent Full|Total_Victims|
+--------------------+-------------+
|Hispanic/Latin/Me...|         7239|
|               Black|         4572|
|               Other|          412|
|               White|          363|
|         Other Asian|           69|
|             Unknown|           54|
|            Filipino|            3|
|              Korean|            2|
|            Japanese|            2|
|             Chinese|            2|
|American Indian/A...|            1|
|           Guamanian

In [24]:
elapsed_time_2_8_4 = end_time_3 - start_time_3
print(f"2 executors 8g memory 4 core: {elapsed_time_2_8_4:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

2 executors 8g memory 4 core: 117.77 seconds

## Comparison 


In [25]:
print(f"2 executors 2g memory 1 core: {elapsed_time_2_2_1:.2f} seconds")
print(f"2 executors 4g memory 2 core: {elapsed_time_2_4_2:.2f} seconds")
print(f"2 executors 8g memory 4 core: {elapsed_time_2_8_4:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

2 executors 2g memory 1 core: 174.58 seconds
2 executors 4g memory 2 core: 119.00 seconds
2 executors 8g memory 4 core: 117.77 seconds

In [26]:
# Find the minimum elapsed time
min_elapsed_time = min(elapsed_time_2_2_1, elapsed_time_2_4_2, elapsed_time_2_8_4)

# Print the minimum elapsed time
print(f"The minimum elapsed time is:  {min_elapsed_time}")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

The minimum elapsed time is:  117.77460598945618