# Query 5
## DataFrame API

In [1]:
%%configure -f
{
    "conf":{
        "spark.executor.instances": "8",
        "spark.executor.memory": "2g",
        "spark.executor.cores": "1"
    }
}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1805,application_1765289937462_1789,pyspark,idle,Link,Link,,
1807,application_1765289937462_1791,pyspark,idle,Link,Link,,
1811,application_1765289937462_1795,pyspark,idle,Link,Link,,
1813,application_1765289937462_1797,pyspark,idle,Link,Link,,
1815,application_1765289937462_1799,pyspark,idle,Link,Link,,


In [2]:
# We initialized a spark session with specific configurations, now we import
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType
from pyspark.sql.functions import col, regexp_replace, to_timestamp, year, lit, count, sum, corr, desc, asc
from sedona.spark import *
import time

sedona = SedonaContext.create(spark)

#Beginning of timing
start_time = time.time()

# Import census blocks dataset (geojson)
census_raw_df = sedona.read.format("geojson") \
            .option("multiLine", "true").load("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Census_Blocks_2020.geojson") \
            .selectExpr("explode(features) as features") \
            .select("features.*")

census_df = census_raw_df.select( \
                [col(f"properties.{col_name}").alias(col_name) for col_name in \
                census_raw_df.schema["properties"].dataType.fieldNames()] + ["geometry"]) \
            .drop("properties") \
            .drop("type")

census_df = census_df.select(
    col("POP20").cast(DoubleType()).alias("population_2020"),
    col("ZCTA20").alias("zip_code_key"), 
    col("geometry").alias("census_geom"),
    col("HOUSING20").alias("housing_2020"),
    "COMM"
).filter(col("zip_code_key").isNotNull() & (col("population_2020") > 0))

# For income data
income_schema = StructType([
    StructField("Zip Code", StringType(), True),
    StructField("Community", StringType(), True),
    StructField("Estimated Median Income", StringType(), True)
])

income_df = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_income_2021.csv", \
    header=True, 
    sep=";",              
    schema=income_schema) 

income_df = income_df.select(
    col("Zip Code").cast(StringType()).alias("zip_code_key"), 
    regexp_replace(col("Estimated Median Income"), "[$,]", "").cast(DoubleType()).alias("income_2021") 
)

# Crimes dataset
crime_data_full_schema = StructType([
    StructField("DR_NO", IntegerType()),
    StructField("Date Rptd", StringType()),
    StructField("DATE OCC", StringType()),
    StructField("TIME OCC", IntegerType()),
    StructField("AREA", IntegerType()),
    StructField("AREA NAME", StringType()),
    StructField("Rpt Dist No", IntegerType()),
    StructField("Part 1-2", IntegerType()),
    StructField("Crm Cd", IntegerType()),
    StructField("Crm Cd Desc", StringType()),
    StructField("Mocodes", StringType()),
    StructField("Vict Age", IntegerType()),
    StructField("Vict Sex", StringType()),
    StructField("Vict Descent", StringType()),
    StructField("Premis Cd", IntegerType()),
    StructField("Premis Desc", StringType()),
    StructField("Weapon Used Cd", IntegerType()),
    StructField("Weapon Desc", StringType()),
    StructField("Status", StringType()),
    StructField("Status Desc", StringType()),
    StructField("Crm Cd 1", IntegerType()),
    StructField("Crm Cd 2", IntegerType()),
    StructField("Crm Cd 3", IntegerType()),
    StructField("Crm Cd 4", IntegerType()),
    StructField("LOCATION", StringType()),
    StructField("Cross Street", StringType()),
    StructField("LAT", DoubleType()),
    StructField("LON", DoubleType())
])

# Create DataFrame from main dataset
crime_data_full_df = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/", \
    header=True, \
    schema=crime_data_full_schema) 

# Filter out nulls and Null Island, shorten the Dataframe and add coordinate points
crime_geo_df = crime_data_full_df.select("DR_NO", "Date Rptd", "LAT", "LON") \
    .filter(col("LAT").isNotNull() & col("LON").isNotNull()) \
    .filter(~((col("LAT") == 0) & (col("LON") == 0))) \
    .withColumn("crime_geom", ST_Point("LON", "LAT"))

# Find the year
crime_geo_df = crime_geo_df.withColumn(
    'Timestamp', to_timestamp(col('Date Rptd'), 'yyyy MMM dd hh:mm:ss a') ) \
    .withColumn('year', year(col('Timestamp')))

crime_geo_df = crime_geo_df.select("DR_NO", "year", "crime_geom") \
    .filter(col("year").isNotNull()) \
    .filter((col("year") == lit(2020)) | (col("year") == lit(2021)))

# House income per block
census_income_df = census_df.join(income_df, on="zip_code_key", how="inner") \
    .withColumn("income_per_block",col("income_2021") * col("housing_2020").cast(DoubleType()))

# Income per person for each community
census_income_df = census_income_df.groupBy("COMM").agg(sum(col("population_2020")).alias("population"), sum(col("income_per_block")).alias("sum_income"))

census_income_df = census_income_df.withColumn("avg income/person", col("sum_income") / col("population")) \
    .drop("sum_income")

# Crimes per Community
zip_crimes_df = crime_geo_df.join(census_df, ST_Contains(col("census_geom"), col("crime_geom")), "inner") \
 .groupBy("COMM",).agg(count("DR_NO").alias("crime_count"))

# Final result
final_df = census_income_df.join(
    zip_crimes_df, 
    on="COMM", 
    how="inner") \
    .withColumn("avg crime/person", col("crime_count")/ (2*col("population")) )\
    .drop("population", "crime_count")

# Show results & correlation
final_df.collect()
print("\n--- Correlation ---")
final_df.select(
    corr("avg crime/person", "avg income/person").alias("Correlation")
).show()

# Top 10
print("Correlation Top 10:")
final_df.orderBy(desc("avg income/person")).limit(10) \
    .select(corr("avg crime/person", "avg income/person").alias("Correlation_Top_10")).show()

# Bottom 10
print("Correlation Bottom 10:")
final_df.orderBy(asc("avg income/person")).limit(10) \
    .select(corr("avg crime/person", "avg income/person").alias("Correlation_Bottom_10")).show()

# End of timing
execution_time = time.time() - start_time
print(f"\nExecution time: {execution_time} seconds")

# Show plan
final_df.explain()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1816,application_1765289937462_1800,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…


--- Correlation ---
+--------------------+
|         Correlation|
+--------------------+
|-0.18052205098073226|
+--------------------+

Correlation Top 10:
+-------------------+
| Correlation_Top_10|
+-------------------+
|-0.5235595963528522|
+-------------------+

Correlation Bottom 10:
+---------------------+
|Correlation_Bottom_10|
+---------------------+
|  0.23946733929772732|
+---------------------+


Execution time: 96.10150384902954 seconds
== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=true
+- == Final Plan ==
   *(6) Project [COMM#50, avg income/person#323, (cast(crime_count#356L as double) / (population#317 * 2.0)) AS avg crime/person#397]
   +- *(6) BroadcastHashJoin [COMM#50], [COMM#371], Inner, BuildLeft, false
      :- BroadcastQueryStage 3
      :  +- BroadcastExchange HashedRelationBroadcastMode(List(input[0, string, true]),false), [plan_id=527]
      :     +- *(5) Project [COMM#50, population#317, (sum_income#319 / population#317) AS avg income/person#323]
      

## Other Configurations

In [None]:
%%configure -f
{
    "conf":{
        "spark.executor.instances": "4",
        "spark.executor.memory": "4g",
        "spark.executor.cores": "2"
    }
}

In [None]:
%%configure -f
{
    "conf":{
        "spark.executor.instances": "2",
        "spark.executor.memory": "8g",
        "spark.executor.cores": "4"
    }
}