In [5]:
%%configure -f
{
    "conf": {
        "spark.executor.instances": "2",
        "spark.executor.memory": "8g",
        "spark.executor.cores": "4"
    }
}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
4128,application_1732639283265_4068,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
4080,application_1732639283265_4020,pyspark,idle,Link,Link,,
4084,application_1732639283265_4024,pyspark,idle,Link,Link,,
4085,application_1732639283265_4025,pyspark,idle,Link,Link,,
4086,application_1732639283265_4026,pyspark,idle,Link,Link,,
4087,application_1732639283265_4027,pyspark,idle,Link,Link,,
4093,application_1732639283265_4033,pyspark,idle,Link,Link,,
4094,application_1732639283265_4034,pyspark,idle,Link,Link,,
4096,application_1732639283265_4036,pyspark,busy,Link,Link,,
4104,application_1732639283265_4044,pyspark,idle,Link,Link,,
4105,application_1732639283265_4045,pyspark,idle,Link,Link,,


In [6]:
from sedona.spark import *
from pyspark.sql.functions import (
    col, sum, regexp_replace, substring, udf, count, to_timestamp, year
)
from pyspark.sql import SparkSession
import time
from pyspark.sql.types import StringType


# Obtain the Spark session under the "Q4" application name and grab its
# configuration so the executor sizing can be reported and logged.
spark = SparkSession.builder.appName("Q4").getOrCreate()
conf = spark.sparkContext.getConf()

# Echo the executor sizing of this run so the timing printed at the end can
# be tied to a concrete configuration.
for setting_label, setting_key in (
    ("Executor Instances:", "spark.executor.instances"),
    ("Executor Memory:", "spark.executor.memory"),
    ("Executor Cores:", "spark.executor.cores"),
):
    print(setting_label, conf.get(setting_key))

# Start from an empty catalog cache.
spark.catalog.clearCache()

# Sedona context built on top of the session; used below to read GeoJSON.
sedona = SedonaContext.create(spark)

# Wall-clock start of the timed section (everything after session setup).
start_time = time.time()

# 2010 census blocks, read from S3 as multi-line GeoJSON.
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"
raw_geojson_df = (
    sedona.read.format("geojson")
    .option("multiLine", "true")
    .load(geojson_path)
)

# One row per entry of the `features` array, with the feature's own fields
# promoted to top-level columns.
blocks_df = (
    raw_geojson_df
    .selectExpr("explode(features) as features")
    .select("features.*")
)

# Promote every field of the `properties` struct to a top-level column and
# keep the geometry next to them.
property_names = blocks_df.schema["properties"].dataType.fieldNames()
property_columns = [
    col(f"properties.{name}").alias(name) for name in property_names
]
flattened_df = (
    blocks_df
    .select(*property_columns, "geometry")
    # Only has an effect if a property field itself is named "properties"
    # or "type"; kept so the resulting schema matches the original exactly.
    .drop("properties", "type")
)

# Keep only Los Angeles blocks that have a positive ZIP code area, housing
# count and population, and a non-empty community name.
is_usable_la_block = (
    (col("CITY") == "Los Angeles")
    & (col("ZCTA10") > 0)
    & (col("HOUSING10") > 0)
    & (col("POP_2010") > 0)
    & (col("COMM") != "")
)
la_blocks = flattened_df.select(
    "COMM", "POP_2010", "ZCTA10", "CITY", "HOUSING10", "geometry"
).filter(is_usable_la_block)

# Roll the blocks up per (community, ZIP code area): total population, total
# housing units and the union of the block geometries.
# NOTE: `sum` here is pyspark.sql.functions.sum (imported above), not the
# Python builtin.
group_flattened = la_blocks.groupBy("COMM", "ZCTA10").agg(
    sum("POP_2010").alias("Total_POP"),
    sum("HOUSING10").alias("Total_Housing"),
    ST_Union_Aggr("geometry").alias("geometry"),
)

# ZIP-level income table for 2015.
income_path = "s3://initial-notebook-data-bucket-dblab-905418150721/LA_income_2015.csv"
median_housing_income = spark.read.csv(income_path, header=True, inferSchema=True)

# Attach each (community, ZIP) group to the income row of its ZIP code.
zip_matches = group_flattened["ZCTA10"] == median_housing_income["Zip Code"]
with_income = group_flattened.join(median_housing_income, zip_matches)

# Reduce the income text to its digits, then estimate the income of the
# group as (median income) x (housing units in the group).
# The product relies on Spark's implicit string-to-number coercion.
digits_only_income = regexp_replace(col("Estimated Median Income"), "[^0-9]", "")
zip_income = (
    with_income
    .withColumn("Estimated Median Income", digits_only_income)
    .withColumn(
        "ZIP_Total_Income",
        col("Estimated Median Income") * col("Total_Housing"),
    )
)

# Collapse the ZIP-level groups into one row per community.
per_community = zip_income.groupBy("COMM").agg(
    sum("Total_POP").alias("Total_COMM_Pop"),
    sum("ZIP_Total_Income").alias("COMM_Total_Income"),
    ST_Union_Aggr("geometry").alias("geometry"),
)

# Income per inhabitant of each community, plus the community outline.
joined = (
    per_community
    .withColumn("GDP_Per_Capita", col("COMM_Total_Income") / col("Total_COMM_Pop"))
    .select("COMM", "GDP_Per_Capita", "geometry")
)

# Get top 3 and bottom 3 communities by GDP per capita.
# A community whose income could not be computed ends up with a NULL
# GDP_Per_Capita. Spark sorts NULLs first in ascending order, so without this
# filter such rows would be reported as the "bottom 3" instead of the
# communities with the lowest known value.
ranked = joined.filter(col("GDP_Per_Capita").isNotNull())
top_3 = ranked.orderBy(col("GDP_Per_Capita").desc()).limit(3)
bottom_3 = ranked.orderBy(col("GDP_Per_Capita").asc()).limit(3)

# Crime records for 2010-2019, read from S3 with an inferred schema.
crime_data_path = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
crime_data_df = spark.read.csv(crime_data_path, header=True, inferSchema=True)

# Parse the occurrence date so the year can be extracted from it.
occurred_at = to_timestamp(col("DATE OCC"), "MM/dd/yyyy hh:mm:ss a")
crimes_with_timestamp = crime_data_df.withColumn("DATE_OCC_TIMESTAMP", occurred_at)

# Keep only crimes that occurred in 2015 and have a non-empty victim descent.
crimes_2015_known_descent = (
    crimes_with_timestamp
    .filter(year(col("DATE_OCC_TIMESTAMP")) == 2015)
    .filter(col("Vict Descent") != "")
)

# Replace the LAT/LON pair by a point geometry (x = LON, y = LAT) for the
# spatial join against the community outlines.
crime_data_df_2015 = (
    crimes_2015_known_descent
    .select("DATE OCC", "LAT", "LON", "Vict Descent")
    .withColumn("geom", ST_Point("LON", "LAT"))
    .drop("LAT", "LON")
)

# Lookup table that maps a victim-descent code to its full description.
race_path = "s3://initial-notebook-data-bucket-dblab-905418150721/RE_codes.csv"
race = spark.read.csv(race_path, header=True, inferSchema=True)

# For the top 3 and the bottom 3 communities, count the 2015 crimes that
# fall inside the community outlines, broken down by victim descent.
for label, community_df in [("Top 3", top_3), ("Bottom 3", bottom_3)]:
    # Spatial join: keep a crime when its point lies within a community
    # geometry, then count the matches per descent code.
    crimes_df = (
        crime_data_df_2015
        .join(community_df, ST_Within(crime_data_df_2015.geom, community_df.geometry), "inner")
        .groupBy("Vict Descent")
        .agg(count("*").alias("Count"))
    )
    # Translate the descent codes to their full names; the inner join drops
    # codes that have no row in the lookup table.
    final_crimes_df = (
        crimes_df
        .join(race, crimes_df["Vict Descent"] == race["Vict Descent"], "inner")
        .select(race["Vict Descent Full"].alias("Victim Descent"), "Count")
        .orderBy(col("Count").desc())
    )
    print(f"{label}:")
    # truncate=False: the default show() cuts cells at 20 characters, which
    # made the longer descent names unreadable ("Hispanic/Latin/Me...").
    final_crimes_df.show(truncate=False)

# Measure the wall-clock duration of the timed section and report it.
end_time = time.time()
execution_time = end_time - start_time
print("Execution Time: ", execution_time)

# One log record per run: the executor sizing that was in effect plus the
# measured duration.
logged_settings = (
    ("spark_executor_instances", "spark.executor.instances"),
    ("spark_executor_memory", "spark.executor.memory"),
    ("spark_executor_cores", "spark.executor.cores"),
)
log_record = {field: conf.get(key) for field, key in logged_settings}
log_record["execution_time"] = execution_time
log_data = [log_record]
log_df = spark.createDataFrame(log_data)

# Append the record as JSON to the shared log location, so the runs made
# with different configurations accumulate there.
log_path = "s3://groups-bucket-dblab-905418150721/group12/logfile.txt"
log_df.write.mode("append").json(log_path)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Executor Instances: 2
Executor Memory: 8g
Executor Cores: 4
Top 3:
+--------------------+-----+
|      Victim Descent|Count|
+--------------------+-----+
|               White|  649|
|               Other|   72|
|Hispanic/Latin/Me...|   66|
|             Unknown|   38|
|               Black|   37|
|         Other Asian|   21|
|American Indian/A...|    1|
|             Chinese|    1|
+--------------------+-----+

Bottom 3:
+--------------------+-----+
|      Victim Descent|Count|
+--------------------+-----+
|Hispanic/Latin/Me...| 2815|
|               Black|  761|
|               White|  330|
|               Other|  187|
|         Other Asian|  113|
|             Unknown|   22|
|American Indian/A...|   21|
|              Korean|    5|
|             Chinese|    3|
|         AsianIndian|    1|
|            Filipino|    1|
+--------------------+-----+

Execution Time:  58.45950675010681

In [7]:
# Read back every logged run and list them from fastest to slowest.
log_path = "s3://groups-bucket-dblab-905418150721/group12/logfile.txt"
logs = spark.read.json(log_path)
timing_columns = ["spark_executor_cores", "spark_executor_memory", "execution_time"]
logs.select(*timing_columns).orderBy("execution_time").show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+---------------------+-----------------+
|spark_executor_cores|spark_executor_memory|   execution_time|
+--------------------+---------------------+-----------------+
|                   4|                   8g|58.45950675010681|
|                   2|                   4g|64.49426293373108|
|                   1|                   2g|82.67877650260925|
+--------------------+---------------------+-----------------+