In [1]:
from sedona.spark import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, count, desc, regexp_replace, expr
import time

# Query 4: victim-descent profile of the 2015 crimes located inside the three
# ZIP codes with the highest and the three with the lowest estimated per-capita
# income, timed end to end for the executor settings requested below.

# Initialize Spark session
spark = SparkSession \
    .builder \
    .appName("Query 4 - Racial Profile Analysis") \
    .config("spark.executor.instances", "2") \
    .config("spark.executor.memory", "2g") \
    .config("spark.executor.cores", "1") \
    .getOrCreate()

# NOTE(review): getOrCreate() returns an already-running session when there is
# one, and spark.executor.* are static settings fixed when the application is
# launched. The recorded output of this cell reports "SparkSession available as
# 'spark'", so the kernel likely provides the session before this code runs and
# the values requested above may not be in effect. Print what the running
# SparkContext really uses so the timing is attributed to the right resources.
# With a sparkmagic/Livy kernel, request resources in a separate `%%configure -f`
# cell instead -- TODO confirm for this cluster.
effective_conf = spark.sparkContext.getConf()
for conf_key in ("spark.executor.instances", "spark.executor.memory", "spark.executor.cores"):
    print(f"{conf_key} = {effective_conf.get(conf_key, 'not set')}")

# Create Sedona context. SedonaContext.create() already registers Sedona's SQL
# functions and types, so the deprecated SedonaRegistrator.registerAll() call
# that used to follow it was redundant and has been dropped.
sedona = SedonaContext.create(spark)

# Start timing (session start-up is deliberately excluded from the measurement)
start_time = time.time()

# Load GeoJSON data for Census Blocks: explode the `features` array so that each
# feature becomes one row, then promote the feature's fields to columns.
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"
blocks_df = spark.read.format("geojson") \
            .option("multiLine", "true").load(geojson_path) \
            .selectExpr("explode(features) as features") \
            .select("features.*")

# Flatten the `properties` struct: every property becomes a top-level column
# next to `geometry`. The two drops only matter if such columns survive the select.
property_names = blocks_df.schema["properties"].dataType.fieldNames()
flattened_df = blocks_df.select(
                [col(f"properties.{col_name}").alias(col_name) for col_name in property_names]
                + ["geometry"]) \
            .drop("properties") \
            .drop("type")

# Load datasets
crime_data = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv", header=True, inferSchema=True)
income_data = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/LA_income_2015.csv", header=True, inferSchema=True)
race_codes = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/RE_codes.csv", header=True, inferSchema=True) \
            .withColumnRenamed("Vict Descent", "Race_Code") \
            .withColumnRenamed("Vict Descent Full", "Race_Description")

# Load population data from GeoJSON (one row per census block)
population_df = blocks_df.select(
    col("properties.ZCTA10").cast("string").alias("Zip Code"),
    col("properties.POP_2010").cast("float").alias("population"),
    col("properties.HOUSING10").cast("float").alias("households")
)

# Aggregate population and household data by ZIP code
population_aggregated_df = population_df.groupBy("Zip Code").agg(
    sum("population").alias("total_population"),
    sum("households").alias("total_households")
)

# Process income data: strip "$" and "," so the income text can be cast to a number
income_data = income_data.select(
    col("Zip Code").cast("string"),
    regexp_replace(col("Estimated Median Income"), r'[\$,]', '').cast("float").alias("household_income")
)

# Join population and income data by ZIP code and estimate per capita income as
# households * household income / population
joined_df = population_aggregated_df \
    .join(income_data, "Zip Code", "inner") \
    .withColumn(
        "per_capita_income",
        (col("total_households") * col("household_income")) / col("total_population")
    )

# Rank only ZIP codes whose estimate is defined. An ascending sort places NULLs
# first, so without this filter a ZIP code with a missing income (or, with ANSI
# mode off, a zero population) would be reported as one of the "lowest income"
# areas. joined_df itself is left untouched.
ranked_income_df = joined_df.filter(col("per_capita_income").isNotNull())

# Identify the 3 ZIP codes with highest and lowest per capita income
high_income_zips = ranked_income_df.orderBy(desc("per_capita_income")).limit(3)
low_income_zips = ranked_income_df.orderBy("per_capita_income").limit(3)

# Collect the selected ZIP codes to the driver as plain Python lists
# (each collect() is a separate Spark action)
high_income_zip_list = [row["Zip Code"] for row in high_income_zips.collect()]
low_income_zip_list = [row["Zip Code"] for row in low_income_zips.collect()]

# Filter census blocks for high- and low-income areas
high_income_blocks = flattened_df.filter(flattened_df["ZCTA10"].isin(high_income_zip_list))
low_income_blocks = flattened_df.filter(flattened_df["ZCTA10"].isin(low_income_zip_list))

# Keep the crimes whose "DATE OCC" text contains 2015, drop records without
# usable coordinates (NULL or 0, i.e. "Null Island"), and build a point geometry.
# NOTE(review): the substring match assumes "2015" can only appear as the year
# in "DATE OCC" -- confirm against the column's actual format.
crime_2015 = crime_data \
    .filter(col("DATE OCC").contains("2015")) \
    .filter((col("LAT").isNotNull()) & (col("LON").isNotNull()) & (col("LAT") != 0) & (col("LON") != 0)) \
    .withColumn("geometry", ST_Point(col("LON"), col("LAT")))

# Join crime data with high- and low-income census blocks using spatial join:
# a crime is kept once for every selected block whose geometry contains its point
high_income_crimes = crime_2015.join(
    high_income_blocks,
    ST_Within(crime_2015.geometry, high_income_blocks.geometry),
    "inner"
)

low_income_crimes = crime_2015.join(
    low_income_blocks,
    ST_Within(crime_2015.geometry, low_income_blocks.geometry),
    "inner"
)

# Map racial profile codes to full descriptions and count victims per description.
# An inner join followed by the not-null filter keeps exactly the rows the former
# left join + filter kept: crimes whose code has a non-null description.
high_income_racial_profile = high_income_crimes \
    .join(race_codes, high_income_crimes["Vict Descent"] == race_codes["Race_Code"], "inner") \
    .filter(col("Race_Description").isNotNull()) \
    .groupBy("Race_Description") \
    .agg(count("Vict Descent").alias("Victim_Count")) \
    .orderBy(desc("Victim_Count")) \
    .withColumnRenamed("Race_Description", "Victim Descent") \
    .withColumnRenamed("Victim_Count", "#")

low_income_racial_profile = low_income_crimes \
    .join(race_codes, low_income_crimes["Vict Descent"] == race_codes["Race_Code"], "inner") \
    .filter(col("Race_Description").isNotNull()) \
    .groupBy("Race_Description") \
    .agg(count("Vict Descent").alias("Victim_Count")) \
    .orderBy(desc("Victim_Count")) \
    .withColumnRenamed("Race_Description", "Victim Descent") \
    .withColumnRenamed("Victim_Count", "#")

# Show results (show() is the action that actually runs each pipeline)
print("High-Income Areas:")
high_income_racial_profile.show()

print("Low-Income Areas:")
low_income_racial_profile.show()

# Stop timing and print out the execution duration
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")


Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
21,application_1738075734771_0022,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

High-Income Areas:
+--------------------+---+
|      Victim Descent|  #|
+--------------------+---+
|               White|580|
|               Other| 85|
|Hispanic/Latin/Me...| 60|
|             Unknown| 51|
|               Black| 38|
|         Other Asian| 19|
|             Chinese|  1|
|American Indian/A...|  1|
+--------------------+---+

Low-Income Areas:
+--------------------+----+
|      Victim Descent|   #|
+--------------------+----+
|Hispanic/Latin/Me...|3941|
|               Black|1170|
|               White| 367|
|               Other| 325|
|         Other Asian|  45|
|             Unknown|  14|
|              Korean|   6|
|American Indian/A...|   2|
|            Filipino|   1|
|           Guamanian|   1|
|    Pacific Islander|   1|
+--------------------+----+

Time taken: 70.19 seconds

In [2]:
from sedona.spark import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, count, desc, regexp_replace, expr
import time

# Query 4 (second run): victim-descent profile of the 2015 crimes located inside
# the three ZIP codes with the highest and the three with the lowest estimated
# per-capita income, timed end to end for the executor settings requested below.

# Initialize Spark session
spark = SparkSession \
    .builder \
    .appName("Query 4 - Racial Profile Analysis") \
    .config("spark.executor.instances", "2") \
    .config("spark.executor.memory", "4g") \
    .config("spark.executor.cores", "2") \
    .getOrCreate()

# NOTE(review): a session already exists when this cell runs (the first cell of
# the notebook builds one with the same app name), so getOrCreate() returns that
# session. spark.executor.* are static settings fixed when the application is
# launched, so the 4g / 2-core request above is probably not in effect and a
# faster time here may only reflect a warmed-up cluster. Print what the running
# SparkContext really uses so the timing is attributed to the right resources.
# With a sparkmagic/Livy kernel, request resources in a separate `%%configure -f`
# cell instead -- TODO confirm for this cluster.
effective_conf = spark.sparkContext.getConf()
for conf_key in ("spark.executor.instances", "spark.executor.memory", "spark.executor.cores"):
    print(f"{conf_key} = {effective_conf.get(conf_key, 'not set')}")

# Create Sedona context. SedonaContext.create() already registers Sedona's SQL
# functions and types, so the deprecated SedonaRegistrator.registerAll() call
# that used to follow it was redundant and has been dropped.
sedona = SedonaContext.create(spark)

# Start timing (session start-up is deliberately excluded from the measurement)
start_time = time.time()

# Load GeoJSON data for Census Blocks: explode the `features` array so that each
# feature becomes one row, then promote the feature's fields to columns.
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"
blocks_df = spark.read.format("geojson") \
            .option("multiLine", "true").load(geojson_path) \
            .selectExpr("explode(features) as features") \
            .select("features.*")

# Flatten the `properties` struct: every property becomes a top-level column
# next to `geometry`. The two drops only matter if such columns survive the select.
property_names = blocks_df.schema["properties"].dataType.fieldNames()
flattened_df = blocks_df.select(
                [col(f"properties.{col_name}").alias(col_name) for col_name in property_names]
                + ["geometry"]) \
            .drop("properties") \
            .drop("type")

# Load datasets
crime_data = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv", header=True, inferSchema=True)
income_data = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/LA_income_2015.csv", header=True, inferSchema=True)
race_codes = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/RE_codes.csv", header=True, inferSchema=True) \
            .withColumnRenamed("Vict Descent", "Race_Code") \
            .withColumnRenamed("Vict Descent Full", "Race_Description")

# Load population data from GeoJSON (one row per census block)
population_df = blocks_df.select(
    col("properties.ZCTA10").cast("string").alias("Zip Code"),
    col("properties.POP_2010").cast("float").alias("population"),
    col("properties.HOUSING10").cast("float").alias("households")
)

# Aggregate population and household data by ZIP code
population_aggregated_df = population_df.groupBy("Zip Code").agg(
    sum("population").alias("total_population"),
    sum("households").alias("total_households")
)

# Process income data: strip "$" and "," so the income text can be cast to a number
income_data = income_data.select(
    col("Zip Code").cast("string"),
    regexp_replace(col("Estimated Median Income"), r'[\$,]', '').cast("float").alias("household_income")
)

# Join population and income data by ZIP code and estimate per capita income as
# households * household income / population
joined_df = population_aggregated_df \
    .join(income_data, "Zip Code", "inner") \
    .withColumn(
        "per_capita_income",
        (col("total_households") * col("household_income")) / col("total_population")
    )

# Rank only ZIP codes whose estimate is defined. An ascending sort places NULLs
# first, so without this filter a ZIP code with a missing income (or, with ANSI
# mode off, a zero population) would be reported as one of the "lowest income"
# areas. joined_df itself is left untouched.
ranked_income_df = joined_df.filter(col("per_capita_income").isNotNull())

# Identify the 3 ZIP codes with highest and lowest per capita income
high_income_zips = ranked_income_df.orderBy(desc("per_capita_income")).limit(3)
low_income_zips = ranked_income_df.orderBy("per_capita_income").limit(3)

# Collect the selected ZIP codes to the driver as plain Python lists
# (each collect() is a separate Spark action)
high_income_zip_list = [row["Zip Code"] for row in high_income_zips.collect()]
low_income_zip_list = [row["Zip Code"] for row in low_income_zips.collect()]

# Filter census blocks for high- and low-income areas
high_income_blocks = flattened_df.filter(flattened_df["ZCTA10"].isin(high_income_zip_list))
low_income_blocks = flattened_df.filter(flattened_df["ZCTA10"].isin(low_income_zip_list))

# Keep the crimes whose "DATE OCC" text contains 2015, drop records without
# usable coordinates (NULL or 0, i.e. "Null Island"), and build a point geometry.
# NOTE(review): the substring match assumes "2015" can only appear as the year
# in "DATE OCC" -- confirm against the column's actual format.
crime_2015 = crime_data \
    .filter(col("DATE OCC").contains("2015")) \
    .filter((col("LAT").isNotNull()) & (col("LON").isNotNull()) & (col("LAT") != 0) & (col("LON") != 0)) \
    .withColumn("geometry", ST_Point(col("LON"), col("LAT")))

# Join crime data with high- and low-income census blocks using spatial join:
# a crime is kept once for every selected block whose geometry contains its point
high_income_crimes = crime_2015.join(
    high_income_blocks,
    ST_Within(crime_2015.geometry, high_income_blocks.geometry),
    "inner"
)

low_income_crimes = crime_2015.join(
    low_income_blocks,
    ST_Within(crime_2015.geometry, low_income_blocks.geometry),
    "inner"
)

# Map racial profile codes to full descriptions and count victims per description.
# An inner join followed by the not-null filter keeps exactly the rows the former
# left join + filter kept: crimes whose code has a non-null description.
high_income_racial_profile = high_income_crimes \
    .join(race_codes, high_income_crimes["Vict Descent"] == race_codes["Race_Code"], "inner") \
    .filter(col("Race_Description").isNotNull()) \
    .groupBy("Race_Description") \
    .agg(count("Vict Descent").alias("Victim_Count")) \
    .orderBy(desc("Victim_Count")) \
    .withColumnRenamed("Race_Description", "Victim Descent") \
    .withColumnRenamed("Victim_Count", "#")

low_income_racial_profile = low_income_crimes \
    .join(race_codes, low_income_crimes["Vict Descent"] == race_codes["Race_Code"], "inner") \
    .filter(col("Race_Description").isNotNull()) \
    .groupBy("Race_Description") \
    .agg(count("Vict Descent").alias("Victim_Count")) \
    .orderBy(desc("Victim_Count")) \
    .withColumnRenamed("Race_Description", "Victim Descent") \
    .withColumnRenamed("Victim_Count", "#")

# Show results (show() is the action that actually runs each pipeline)
print("High-Income Areas:")
high_income_racial_profile.show()

print("Low-Income Areas:")
low_income_racial_profile.show()

# Stop timing and print out the execution duration
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

High-Income Areas:
+--------------------+---+
|      Victim Descent|  #|
+--------------------+---+
|               White|580|
|               Other| 85|
|Hispanic/Latin/Me...| 60|
|             Unknown| 51|
|               Black| 38|
|         Other Asian| 19|
|             Chinese|  1|
|American Indian/A...|  1|
+--------------------+---+

Low-Income Areas:
+--------------------+----+
|      Victim Descent|   #|
+--------------------+----+
|Hispanic/Latin/Me...|3941|
|               Black|1170|
|               White| 367|
|               Other| 325|
|         Other Asian|  45|
|             Unknown|  14|
|              Korean|   6|
|American Indian/A...|   2|
|    Pacific Islander|   1|
|            Filipino|   1|
|           Guamanian|   1|
+--------------------+----+

Time taken: 54.35 seconds

In [3]:
from sedona.spark import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, count, desc, regexp_replace, expr
import time

# Query 4 (third run): victim-descent profile of the 2015 crimes located inside
# the three ZIP codes with the highest and the three with the lowest estimated
# per-capita income, timed end to end for the executor settings requested below.

# Initialize Spark session
spark = SparkSession \
    .builder \
    .appName("Query 4 - Racial Profile Analysis") \
    .config("spark.executor.instances", "2") \
    .config("spark.executor.memory", "8g") \
    .config("spark.executor.cores", "4") \
    .getOrCreate()

# NOTE(review): a session already exists when this cell runs (the earlier cells
# of the notebook build one with the same app name), so getOrCreate() returns
# that session. spark.executor.* are static settings fixed when the application
# is launched, so the 8g / 4-core request above is probably not in effect and a
# faster time here may only reflect a warmed-up cluster. Print what the running
# SparkContext really uses so the timing is attributed to the right resources.
# With a sparkmagic/Livy kernel, request resources in a separate `%%configure -f`
# cell instead -- TODO confirm for this cluster.
effective_conf = spark.sparkContext.getConf()
for conf_key in ("spark.executor.instances", "spark.executor.memory", "spark.executor.cores"):
    print(f"{conf_key} = {effective_conf.get(conf_key, 'not set')}")

# Create Sedona context. SedonaContext.create() already registers Sedona's SQL
# functions and types, so the deprecated SedonaRegistrator.registerAll() call
# that used to follow it was redundant and has been dropped.
sedona = SedonaContext.create(spark)

# Start timing (session start-up is deliberately excluded from the measurement)
start_time = time.time()

# Load GeoJSON data for Census Blocks: explode the `features` array so that each
# feature becomes one row, then promote the feature's fields to columns.
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"
blocks_df = spark.read.format("geojson") \
            .option("multiLine", "true").load(geojson_path) \
            .selectExpr("explode(features) as features") \
            .select("features.*")

# Flatten the `properties` struct: every property becomes a top-level column
# next to `geometry`. The two drops only matter if such columns survive the select.
property_names = blocks_df.schema["properties"].dataType.fieldNames()
flattened_df = blocks_df.select(
                [col(f"properties.{col_name}").alias(col_name) for col_name in property_names]
                + ["geometry"]) \
            .drop("properties") \
            .drop("type")

# Load datasets
crime_data = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv", header=True, inferSchema=True)
income_data = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/LA_income_2015.csv", header=True, inferSchema=True)
race_codes = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/RE_codes.csv", header=True, inferSchema=True) \
            .withColumnRenamed("Vict Descent", "Race_Code") \
            .withColumnRenamed("Vict Descent Full", "Race_Description")

# Load population data from GeoJSON (one row per census block)
population_df = blocks_df.select(
    col("properties.ZCTA10").cast("string").alias("Zip Code"),
    col("properties.POP_2010").cast("float").alias("population"),
    col("properties.HOUSING10").cast("float").alias("households")
)

# Aggregate population and household data by ZIP code
population_aggregated_df = population_df.groupBy("Zip Code").agg(
    sum("population").alias("total_population"),
    sum("households").alias("total_households")
)

# Process income data: strip "$" and "," so the income text can be cast to a number
income_data = income_data.select(
    col("Zip Code").cast("string"),
    regexp_replace(col("Estimated Median Income"), r'[\$,]', '').cast("float").alias("household_income")
)

# Join population and income data by ZIP code and estimate per capita income as
# households * household income / population
joined_df = population_aggregated_df \
    .join(income_data, "Zip Code", "inner") \
    .withColumn(
        "per_capita_income",
        (col("total_households") * col("household_income")) / col("total_population")
    )

# Rank only ZIP codes whose estimate is defined. An ascending sort places NULLs
# first, so without this filter a ZIP code with a missing income (or, with ANSI
# mode off, a zero population) would be reported as one of the "lowest income"
# areas. joined_df itself is left untouched.
ranked_income_df = joined_df.filter(col("per_capita_income").isNotNull())

# Identify the 3 ZIP codes with highest and lowest per capita income
high_income_zips = ranked_income_df.orderBy(desc("per_capita_income")).limit(3)
low_income_zips = ranked_income_df.orderBy("per_capita_income").limit(3)

# Collect the selected ZIP codes to the driver as plain Python lists
# (each collect() is a separate Spark action)
high_income_zip_list = [row["Zip Code"] for row in high_income_zips.collect()]
low_income_zip_list = [row["Zip Code"] for row in low_income_zips.collect()]

# Filter census blocks for high- and low-income areas
high_income_blocks = flattened_df.filter(flattened_df["ZCTA10"].isin(high_income_zip_list))
low_income_blocks = flattened_df.filter(flattened_df["ZCTA10"].isin(low_income_zip_list))

# Keep the crimes whose "DATE OCC" text contains 2015, drop records without
# usable coordinates (NULL or 0, i.e. "Null Island"), and build a point geometry.
# NOTE(review): the substring match assumes "2015" can only appear as the year
# in "DATE OCC" -- confirm against the column's actual format.
crime_2015 = crime_data \
    .filter(col("DATE OCC").contains("2015")) \
    .filter((col("LAT").isNotNull()) & (col("LON").isNotNull()) & (col("LAT") != 0) & (col("LON") != 0)) \
    .withColumn("geometry", ST_Point(col("LON"), col("LAT")))

# Join crime data with high- and low-income census blocks using spatial join:
# a crime is kept once for every selected block whose geometry contains its point
high_income_crimes = crime_2015.join(
    high_income_blocks,
    ST_Within(crime_2015.geometry, high_income_blocks.geometry),
    "inner"
)

low_income_crimes = crime_2015.join(
    low_income_blocks,
    ST_Within(crime_2015.geometry, low_income_blocks.geometry),
    "inner"
)

# Map racial profile codes to full descriptions and count victims per description.
# An inner join followed by the not-null filter keeps exactly the rows the former
# left join + filter kept: crimes whose code has a non-null description.
high_income_racial_profile = high_income_crimes \
    .join(race_codes, high_income_crimes["Vict Descent"] == race_codes["Race_Code"], "inner") \
    .filter(col("Race_Description").isNotNull()) \
    .groupBy("Race_Description") \
    .agg(count("Vict Descent").alias("Victim_Count")) \
    .orderBy(desc("Victim_Count")) \
    .withColumnRenamed("Race_Description", "Victim Descent") \
    .withColumnRenamed("Victim_Count", "#")

low_income_racial_profile = low_income_crimes \
    .join(race_codes, low_income_crimes["Vict Descent"] == race_codes["Race_Code"], "inner") \
    .filter(col("Race_Description").isNotNull()) \
    .groupBy("Race_Description") \
    .agg(count("Vict Descent").alias("Victim_Count")) \
    .orderBy(desc("Victim_Count")) \
    .withColumnRenamed("Race_Description", "Victim Descent") \
    .withColumnRenamed("Victim_Count", "#")

# Show results (show() is the action that actually runs each pipeline)
print("High-Income Areas:")
high_income_racial_profile.show()

print("Low-Income Areas:")
low_income_racial_profile.show()

# Stop timing and print out the execution duration
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

High-Income Areas:
+--------------------+---+
|      Victim Descent|  #|
+--------------------+---+
|               White|580|
|               Other| 85|
|Hispanic/Latin/Me...| 60|
|             Unknown| 51|
|               Black| 38|
|         Other Asian| 19|
|             Chinese|  1|
|American Indian/A...|  1|
+--------------------+---+

Low-Income Areas:
+--------------------+----+
|      Victim Descent|   #|
+--------------------+----+
|Hispanic/Latin/Me...|3941|
|               Black|1170|
|               White| 367|
|               Other| 325|
|         Other Asian|  45|
|             Unknown|  14|
|              Korean|   6|
|American Indian/A...|   2|
|    Pacific Islander|   1|
|            Filipino|   1|
|           Guamanian|   1|
+--------------------+----+

Time taken: 49.37 seconds