In [17]:
#Query 3 Sedona for LA
from sedona.spark import *
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, StringType, TimestampType
from pyspark.sql.functions import col, broadcast, date_format, to_timestamp, round, format_number, concat, lit, when, count, sum, regexp_replace, trim
import time
import csv

# Start (or reuse) the Spark session and clear everything cached in it.
spark = SparkSession.builder.appName("Initial Code").getOrCreate()
spark.catalog.clearCache()

# Wrap the session in a Sedona context; the GeoJSON reader below goes through it.
sedona = SedonaContext.create(spark)

# Read the 2010 census blocks GeoJSON from S3 and emit one row per entry of `features`.
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"
blocks_df = (
    sedona.read.format("geojson")
    .option("multiLine", "true")
    .load(geojson_path)
    .selectExpr("explode(features) as features")
    .select("features.*")
)

# Lift every field of the nested `properties` struct to a top-level column,
# keep the geometry next to them, and retain only the rows whose CITY is "Los Angeles".
property_names = blocks_df.schema["properties"].dataType.fieldNames()
property_columns = [col(f"properties.{name}").alias(name) for name in property_names]
flattened_df = (
    blocks_df
    .select(*property_columns, "geometry")
    .drop("properties")
    .drop("type")
    .filter(col("CITY") == "Los Angeles")
)

# Query 3 (DataFrame API): start the wall-clock timer before the inputs are defined.
start_time = time.time()

# Crime records, read from Parquet. A row is kept unless both LON and LAT are 0;
# each remaining record is reduced to a single Sedona point column.
has_coordinates = (col("LON") != 0) | (col("LAT") != 0)
crime_df = (
    spark.read.parquet("s3://groups-bucket-dblab-905418150721/group12/main_dataset_parquet")
    .filter(has_coordinates)
    .withColumn("point", ST_Point("LON", "LAT"))
    .select("point")
)

# Income CSV (header row present, column types inferred by Spark).
income_df = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/LA_income_2015.csv",
    inferSchema=True,
    header=True,
)

# Turn the textual income into an integer: trim, strip every non-digit character, cast.
income_as_int = regexp_replace(
    trim(col("Estimated Median Income")), "[^0-9]", ""
).cast(IntegerType())
res_income = (
    income_df
    .withColumn("Estimated_Median_Income", income_as_int)
    .drop("Community", "Estimated Median Income")
)

# Aggregate census blocks per (ZIP code, community): total population, total housing
# units and the union of the block geometries. Blocks with a non-positive ZIP,
# population or housing count, or with a blank community name, are excluded.
is_usable_block = (
    (col("ZCTA10") > 0)
    & (col("POP_2010") > 0)
    & (col("HOUSING10") > 0)
    & (trim(col("COMM")) != "")
)
zip_comm_population = (
    flattened_df
    .select("ZCTA10", "COMM", "POP_2010", "HOUSING10", "geometry")
    .filter(is_usable_block)
    .groupBy("ZCTA10", "COMM")
    .agg(
        sum("POP_2010").alias("Total_Population_Zip_COMM"),
        sum("HOUSING10").alias("Total_Housing"),
        ST_Union_Aggr("geometry").alias("geometry"),
    )
)

# Attach the income figure to each (ZIP, community) row by ZIP code, weight it by the
# number of housing units, then roll everything up to one row per community.
# No join hint is set here, so Spark picks the join strategy.
zip_matches = zip_comm_population["ZCTA10"] == res_income["Zip Code"]
joined_income_df = (
    zip_comm_population
    .join(res_income, zip_matches, "inner")
    .drop("ZCTA10", "Zip Code")
    .withColumn("Total income", col("Total_Housing") * col("Estimated_Median_Income"))
    .drop("Estimated_Median_Income")
    .groupBy("COMM")
    .agg(
        sum("Total_Population_Zip_COMM").alias("Total_Population"),
        sum("Total income").alias("Total Income"),
        ST_Union_Aggr("geometry").alias("geometry"),
    )
)


# Spatially join every crime point to the community polygon that contains it and
# count the crimes per community. No join hint is applied in this cell.
crimes_per_community = (
    joined_income_df
    .join(crime_df, ST_Within(crime_df.point, joined_income_df.geometry), "inner")
    .groupBy("COMM", "Total_Population", "Total Income")
    .agg(count(col("*")).alias("Total crimes"))
)

# Build the report: crimes per person and income per person for each community.
# The rows are ordered on the NUMERIC crime rate before it is formatted:
# format_number() returns a string, and ordering by that string is lexicographic
# (e.g. "2.9024" would rank above "10.5000"), which is not the intended ranking.
results = (
    crimes_per_community
    .withColumn("crime_rate_value", col("Total crimes") / col("Total_Population"))
    .orderBy(col("crime_rate_value").desc())
    .withColumn("crime_rate", format_number(col("crime_rate_value"), 4))
    .withColumn(
        "estimated_median_income",
        concat(
            lit("$"),
            format_number(round(col("Total Income") / col("Total_Population")), 0),
        ),
    )
    # Drop the helper columns so the output is Location / crime_rate / estimated_median_income.
    .drop("Total Income", "Total_Population", "Total crimes", "crime_rate_value")
    .withColumnRenamed("COMM", "Location")
)

# show() is the action that actually executes the query, so it is inside the timed region.
results.show()
end_time = time.time()
# End timing
print(f"Execution Time : {end_time - start_time} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------------+----------+-----------------------+
|          Location|crime_rate|estimated_median_income|
+------------------+----------+-----------------------+
|      Little Tokyo|    2.9024|                $16,505|
|          Downtown|    1.9612|                $19,908|
|Wholesale District|    1.5441|                 $9,848|
|         Hollywood|    1.3846|                $25,729|
|     Vermont Vista|    1.2551|                 $8,367|
|Century Palms/Cove|    1.1446|                 $8,610|
|   Sycamore Square|    1.1276|                $30,117|
| Manchester Square|    1.0804|                $14,590|
|    Vermont Knolls|    1.0672|                $10,494|
|    Gramercy Place|    1.0648|                $14,937|
|Florence-Firestone|    1.0589|                 $8,079|
|      Leimert Park|    1.0561|                $16,104|
|            Venice|    1.0404|                $47,625|
|       West Vernon|    1.0377|                 $8,723|
|         Hyde Park|    1.0300|                $

In [11]:
# Print the formatted physical plan that the Catalyst optimizer produced for `results`
results.explain(mode="formatted")

# We observe that the plan uses a Broadcast join

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

== Physical Plan ==
AdaptiveSparkPlan (29)
+- Sort (28)
   +- Exchange (27)
      +- HashAggregate (26)
         +- Exchange (25)
            +- HashAggregate (24)
               +- Project (23)
                  +- RangeJoin (22)
                     :- Filter (18)
                     :  +- ObjectHashAggregate (17)
                     :     +- Exchange (16)
                     :        +- ObjectHashAggregate (15)
                     :           +- Project (14)
                     :              +- BroadcastHashJoin Inner BuildRight (13)
                     :                 :- ObjectHashAggregate (8)
                     :                 :  +- Exchange (7)
                     :                 :     +- ObjectHashAggregate (6)
                     :                 :        +- Project (5)
                     :                 :           +- Filter (4)
                     :                 :              +- Generate (3)
                     :                 :                 

In [18]:
from pyspark.sql.functions import col, format_number
# Load the recorded execution time per join strategy and print them in ascending time order.
results_df = spark.read.json("s3://groups-bucket-dblab-905418150721/group12/Q3/hint_res.json")
(
    results_df
    # Order on the raw Execution_Time BEFORE formatting it: format_number() yields a
    # string, and sorting that string is lexicographic ("10.00" would precede "2.64").
    # Assumes Execution_Time is read from the JSON as a numeric column - TODO confirm.
    .orderBy(col("Execution_Time").asc())
    .select(
        col("Join_Strategy"),
        format_number(col("Execution_Time"), 2).alias("Execution_Time"),
    )
    .show(truncate=False)
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+--------------+
|Join_Strategy       |Execution_Time|
+--------------------+--------------+
|BROADCAST           |2.64          |
|SHUFFLE_HASH        |4.53          |
|SHUFFLE_REPLICATE_NL|6.16          |
|MERGE               |7.86          |
+--------------------+--------------+