In [6]:
%%configure -f
{
  "conf": {
    "spark.executor.cores": "4",
    "spark.executor.memory": "8g",
    "spark.executor.instances": "2"
  }
}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
3077,application_1732639283265_3033,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
2944,application_1732639283265_2903,pyspark,idle,Link,Link,,
2966,application_1732639283265_2924,pyspark,idle,Link,Link,,
2971,application_1732639283265_2929,pyspark,idle,Link,Link,,
2975,application_1732639283265_2933,pyspark,idle,Link,Link,,
2976,application_1732639283265_2934,pyspark,idle,Link,Link,,
3000,application_1732639283265_2958,pyspark,idle,Link,Link,,
3005,application_1732639283265_2963,pyspark,idle,Link,Link,,
3010,application_1732639283265_2968,pyspark,idle,Link,Link,,
3017,application_1732639283265_2975,pyspark,idle,Link,Link,,
3019,application_1732639283265_2977,pyspark,idle,Link,Link,,


In [10]:
from sedona.spark import *
from pyspark.sql.functions import col, sum, regexp_replace, substring, udf, count
from pyspark.sql import SparkSession
import time
from pyspark.sql.types import StringType


# Obtain the Spark session for this notebook under the application name "Q4".
spark = SparkSession.builder.appName("Q4").getOrCreate()

# Echo the executor sizing that the session reports, so each timed run
# records which configuration it was measured under.
conf = spark.sparkContext.getConf()
for label, key in (
    ("Executor Instances:", "spark.executor.instances"),
    ("Executor Memory:", "spark.executor.memory"),
    ("Executor Cores:", "spark.executor.cores"),
):
    print(label, conf.get(key))

# Remove every cached table/DataFrame from the session's in-memory cache
# (presumably so that earlier runs do not influence the timing below).
spark.catalog.clearCache()

# Wrap the session in a Sedona context so the ST_* spatial functions and the
# "geojson" reader are available.
sedona = SedonaContext.create(spark)

# Wall-clock timer for the whole query; it is read again after the results
# have been displayed.
start_time = time.time()

# 2010 census blocks, stored on S3 as one multi-line GeoJSON document.
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"

# Load the document and emit one row per entry of its `features` array,
# exposing each feature's fields as columns.
blocks_df = (
    sedona.read.format("geojson")
    .option("multiLine", "true")
    .load(geojson_path)
    .selectExpr("explode(features) as features")
    .select("features.*")
)

# Promote every field of the nested `properties` struct to a top-level column
# (same name) and keep `geometry` next to them.
property_names = blocks_df.schema["properties"].dataType.fieldNames()
property_columns = [col(f"properties.{name}").alias(name) for name in property_names]
flattened_df = (
    blocks_df
    .select(*property_columns, "geometry")
    .drop("properties")
    .drop("type")
)

#flattened_df.printSchema()

# A census block is kept only when it belongs to the city of Los Angeles, has
# a positive ZCTA10 value, positive housing and population counts, and a
# non-empty community name.
is_usable_la_block = (
    (col("CITY") == "Los Angeles")
    & (col("ZCTA10") > 0)
    & (col("HOUSING10") > 0)
    & (col("POP_2010") > 0)
    & (col("COMM") != "")
)

la_blocks = (
    flattened_df
    .select("COMM", "POP_2010", "ZCTA10", "CITY", "HOUSING10", "geometry")
    .filter(is_usable_la_block)
)

# One row per (community, ZCTA10) pair: total population, total housing units
# and the union of the block geometries in that pair.
group_flattened = la_blocks.groupBy("COMM", "ZCTA10").agg(
    sum("POP_2010").alias("Total_POP"),
    sum("HOUSING10").alias("Total_Housing"),
    ST_Union_Aggr("geometry").alias("geometry"),
)

#group_flattened.show()

# 2015 estimated median household income per ZIP code.
income_csv = "s3://initial-notebook-data-bucket-dblab-905418150721/LA_income_2015.csv"
median_housing_income = spark.read.csv(income_csv, header=True, inferSchema=True)
#median_housing_income.show()

# Join the census aggregates with the income table on ZIP code and derive an
# income-per-person figure ("GDP_Per_Capita") for every community:
#   1. strip every non-digit character from the income text and cast it to a
#      number explicitly, instead of relying on implicit string coercion in
#      the multiplication below;
#      NOTE(review): the pattern also removes a decimal point, so this assumes
#      incomes are whole amounts - confirm against the CSV.
#   2. estimate the income of each (community, ZIP) pair as
#      median income x number of housing units;
#   3. sum income and population per community, union the geometries,
#      and divide income by population.
joined = (
    group_flattened
    .join(
        median_housing_income,
        group_flattened["ZCTA10"] == median_housing_income["Zip Code"],
    )
    .withColumn(
        "Estimated Median Income",
        regexp_replace(col("Estimated Median Income"), "[^0-9]", "").cast("double"),
    )
    .withColumn(
        "ZIP_Total_Income",
        col("Estimated Median Income") * col("Total_Housing"),
    )
    .groupBy("COMM")
    .agg(
        sum("Total_POP").alias("Total_COMM_Pop"),
        sum("ZIP_Total_Income").alias("COMM_Total_Income"),
        ST_Union_Aggr("geometry").alias("geometry"),
    )
    .withColumn(
        "GDP_Per_Capita",
        col("COMM_Total_Income") / col("Total_COMM_Pop"),
    )
    .select("COMM", "GDP_Per_Capita", "geometry")
)

#joined.show()

# Communities whose income could not be computed have a NULL GDP_Per_Capita.
# Spark places NULLs first in an ascending sort, so without this filter such a
# community could be selected as one of the "bottom 3". Rank only communities
# that actually have a value.
ranked_communities = joined.filter(col("GDP_Per_Capita").isNotNull())

# The three communities with the highest income per person.
top_3 = ranked_communities.orderBy(col("GDP_Per_Capita").desc()).limit(3)

# The three communities with the lowest income per person.
bottom_3 = ranked_communities.orderBy(col("GDP_Per_Capita").asc()).limit(3)

#print("Top 3 by GDP_Per_Capita:")
#top_3.show()

#print("Bottom 3 by GDP_Per_Capita:")
#bottom_3.show()

crime_data_csv = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
crime_data_df = spark.read.csv(crime_data_csv, header=True, inferSchema=True)

# Characters 7-10 of "DATE OCC" are compared with "2015"
# (presumably the year of an MM/DD/YYYY date - confirm against the CSV).
occurred_in_2015 = substring(col("DATE OCC"), 7, 4) == "2015"
# Only records that state a victim descent code are of interest.
has_victim_descent = col("Vict Descent") != ""

# 2015 crimes with a known victim descent; the longitude/latitude pair is
# replaced by a single point geometry column named "geom".
crime_data_df_2015 = (
    crime_data_df
    .filter(occurred_in_2015 & has_victim_descent)
    .select(
        "DATE OCC",
        "Vict Descent",
        ST_Point("LON", "LAT").alias("geom"),
    )
)

#crime_data_df_2015.show()

# Lookup table that maps a victim descent code to its full description.
race_csv = "s3://initial-notebook-data-bucket-dblab-905418150721/RE_codes.csv"
race = spark.read.csv(race_csv, header=True, inferSchema=True)


def _count_crimes_by_descent(crimes_df, areas_df):
    """Count the crimes that fall inside any of the given areas, per descent code.

    Args:
        crimes_df: DataFrame with a point column "geom" and a "Vict Descent" column.
        areas_df: DataFrame with a "geometry" column describing the areas.

    Returns:
        DataFrame with columns "Vict Descent" and "#" (number of crimes).
    """
    return (
        crimes_df
        # Spatial join: keep a crime when its point lies within an area geometry.
        .join(areas_df, ST_Within(crimes_df["geom"], areas_df["geometry"]), "inner")
        .groupBy("Vict Descent")
        .agg(count("*").alias("#"))
    )


def _label_descent_counts(counts_df, descent_codes_df):
    """Replace descent codes with their full description, most frequent first.

    Args:
        counts_df: Output of `_count_crimes_by_descent`.
        descent_codes_df: Lookup with columns "Vict Descent" and "Vict Descent Full".

    Returns:
        DataFrame with columns "Victim Descent" and "#", ordered by "#" descending.
        Codes missing from the lookup are dropped by the inner join.
    """
    return (
        counts_df
        .join(
            descent_codes_df,
            counts_df["`Vict Descent`"] == descent_codes_df["`Vict Descent`"],
            "inner",
        )
        .select(
            descent_codes_df["`Vict Descent Full`"].alias("Victim Descent"),
            "#",
        )
        .orderBy(col("#").desc())
    )


# Victim descent profile of 2015 crimes inside the three highest-income communities.
top3_crimes_df = _count_crimes_by_descent(crime_data_df_2015, top_3)
final_top_3_df = _label_descent_counts(top3_crimes_df, race)

# Same profile for the three lowest-income communities.
bottom3_crimes_df = _count_crimes_by_descent(crime_data_df_2015, bottom_3)
final_bottom_3_df = _label_descent_counts(bottom3_crimes_df, race)


# Display both result tables. The timer is stopped only after both tables
# have been shown, so the measured window includes computing them.
print("Top 3:")
final_top_3_df.show()

print("Bottom 3:")
final_bottom_3_df.show()

end_time = time.time()

execution_time = end_time - start_time
print("Execution Time: ", execution_time)

# Record the executor configuration together with the measured time so that
# runs under different configurations can be compared later.
conf = spark.sparkContext.getConf()

log_data = {
    "spark_executor_instances": conf.get("spark.executor.instances"),
    "spark_executor_memory": conf.get("spark.executor.memory"),
    "spark_executor_cores": conf.get("spark.executor.cores"),
    "execution_time": execution_time
}

log_df = spark.createDataFrame([log_data])

# Despite the ".txt" suffix this path is a directory: the JSON writer stores
# part files under it, and "append" mode adds one record per run.
# (Plain string: the original f-string had no placeholders.)
s3_path = "s3://groups-bucket-dblab-905418150721/group45/q4/logfile.txt"

log_df.write.mode("append").json(s3_path)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Executor Instances: 2
Executor Memory: 8g
Executor Cores: 4
Top 3:
+--------------------+---+
|      Victim Descent|  #|
+--------------------+---+
|               White|649|
|               Other| 72|
|Hispanic/Latin/Me...| 66|
|             Unknown| 38|
|               Black| 37|
|         Other Asian| 21|
|American Indian/A...|  1|
|             Chinese|  1|
+--------------------+---+

Bottom 3:
+--------------------+----+
|      Victim Descent|   #|
+--------------------+----+
|Hispanic/Latin/Me...|2815|
|               Black| 761|
|               White| 330|
|               Other| 187|
|         Other Asian| 113|
|             Unknown|  22|
|American Indian/A...|  21|
|              Korean|   5|
|             Chinese|   3|
|         AsianIndian|   1|
|            Filipino|   1|
+--------------------+----+

Execution Time:  60.674782514572144

In [16]:
# Read back every run record appended to the log location and compare the
# measured execution time across executor configurations.
# (Plain string: the original f-string had no placeholders.)
log_path = "s3://groups-bucket-dblab-905418150721/group45/q4/logfile.txt"
logs = spark.read.json(log_path)
logs.select("spark_executor_cores", "spark_executor_memory", "execution_time").show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+---------------------+------------------+
|spark_executor_cores|spark_executor_memory|    execution_time|
+--------------------+---------------------+------------------+
|                   4|                   8g|60.674782514572144|
|                   2|                   4g| 70.12353825569153|
|                   1|                   2g|  79.3983838558197|
+--------------------+---------------------+------------------+