# Experimentation with BROADCAST, MERGE, SHUFFLE_HASH, SHUFFLE_REPLICATE_NL

In [1]:
#Load data
from sedona.spark import *  # wildcard kept first so the explicit imports below win on any name clash
import time

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col
from pyspark.sql.functions import sum as _sum

# Create (or reuse) the Spark session used by every experiment in this notebook.
spark = (
    SparkSession.builder
    .appName("Join Strategies")
    .getOrCreate()
)

# Wrap the session in a Sedona context; the GeoJSON reader below goes through it.
sedona = SedonaContext.create(spark)

# Census blocks: read the multi-line GeoJSON document from S3 and
# turn its `features` array into one row per feature.
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"
blocks_df = (
    sedona.read.format("geojson")
    .option("multiLine", "true")
    .load(geojson_path)
    .selectExpr("explode(features) as features")
    .select("features.*")
)

# Flatten the nested `properties` struct: every property field becomes a
# top-level column of the same name, and `geometry` is carried along.
property_columns = [
    col(f"properties.{col_name}").alias(col_name)
    for col_name in blocks_df.schema["properties"].dataType.fieldNames()
]
flattened_df = (
    blocks_df
    .select(property_columns + ["geometry"])
    .drop("properties")
    .drop("type")
)

# Crime data: two CSV exports (2010-2019 and 2020-present), both read with a
# header row and schema inference.
csv_options = dict(header=True, inferSchema=True)

data_path = 's3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv'
df = spark.read.csv(data_path, **csv_options)

data2_path = 's3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv'
df2 = spark.read.csv(data2_path, **csv_options)

# union() matches columns by position, not by name.
df_combined = df.union(df2)

# LA income data (2015).
data3_path = 's3://initial-notebook-data-bucket-dblab-905418150721/LA_income_2015.csv'
df3 = spark.read.csv(data3_path, **csv_options)



Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1863,application_1732639283265_1824,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Broadcast

In [2]:
print("Task_1")
# perf_counter() is a monotonic clock, so the measured interval cannot be
# skewed by wall-clock adjustments the way time.time() can.
start_time = time.perf_counter()

# Inner-join the census blocks to the income table on community name,
# requesting a broadcast join through the hint on the census-block side.
# NOTE(review): the recorded plan shows BroadcastHashJoin ... BuildLeft, i.e. the
# census blocks are the broadcast side; confirm that is the side you want to ship.
joined_df = flattened_df.hint("broadcast").join(
    df3, flattened_df["COMM"] == df3["Community"], "inner"
)

# Strip "$" and "," from "Estimated Median Income" so the text can be cast to a number.
joined_df = joined_df.withColumn(
    "Cleaned Estimated Median Income",
    F.regexp_replace(F.col("Estimated Median Income"), "[$,]", "").cast("double"),
)

# Per community: total population, total households and mean cleaned income.
# (A former bare `joined_df.select(...)` here discarded its result and was removed.)
grouped_df = joined_df.groupBy("Community").agg(
    F.sum("POP_2010").alias("Total Population"),
    F.sum("HOUSING10").alias("Total Households"),
    F.avg("Cleaned Estimated Median Income").alias("Avg Estimated Median Income"),
)

# Income per individual = households * average household income / population.
grouped_df = grouped_df.withColumn(
    "Income per Individual",
    (F.col("Total Households") * F.col("Avg Estimated Median Income")) / F.col("Total Population"),
)

# Render the average income as a "$"-prefixed string with two decimals.
grouped_df = grouped_df.withColumn(
    "Estimated Median Income per Household",
    F.concat(F.lit("$"), F.format_number(F.col("Avg Estimated Median Income"), 2)),
)

# Same formatting for the per-individual figure; this replaces the numeric
# column computed above with its string rendering.
grouped_df = grouped_df.withColumn(
    "Income per Individual",
    F.concat(F.lit("$"), F.format_number(F.col("Income per Individual"), 2)),
)

# Columns to display (result_df is also reused by the Task_2 cell that follows).
result_df = grouped_df.select(
    "Community",
    "Total Population",
    "Total Households",
    "Estimated Median Income per Household",
    "Income per Individual",
)

# show() is the action that triggers execution; it prints the first 10 rows.
result_df.show(truncate=False, n=10)

end_time = time.perf_counter()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Task_1
+----------------+----------------+----------------+-------------------------------------+---------------------+
|Community       |Total Population|Total Households|Estimated Median Income per Household|Income per Individual|
+----------------+----------------+----------------+-------------------------------------+---------------------+
|Culver City     |77766           |34982           |$75,913.50                           |$34,148.68           |
|Pico Rivera     |62942           |17109           |$55,758.00                           |$15,156.23           |
|Malibu          |12645           |6864            |$123,681.00                          |$67,136.92           |
|Hacienda Heights|53594           |16524           |$78,000.00                           |$24,048.81           |
|Montebello      |62500           |19768           |$45,898.00                           |$14,516.99           |
|Hawaiian Gardens|14254           |3703            |$37,543.00                           

In [3]:
print("Task_2")
# perf_counter() is a monotonic clock, so the measured interval cannot be
# skewed by wall-clock adjustments the way time.time() can.
start_time = time.perf_counter()

# Step 1: add a point geometry built from each crime's LON/LAT columns.
# withColumn replaces an existing column of the same name, so re-running is harmless.
df_combined = df_combined.withColumn("geom", ST_Point("LON", "LAT"))

# Step 2: keep (block, crime) pairs where the block polygon contains the crime point.
# This cell previously carried no hint at all, unlike the Task_2 cells of the other
# strategy sections, so it was timing the optimizer's default plan. The broadcast
# hint is placed on the census-block side.
# NOTE(review): the join condition is a spatial predicate; confirm in the plan
# that the broadcast hint is honoured for it.
df_joined = flattened_df.hint("broadcast").join(
    df_combined,
    ST_Contains(flattened_df["geometry"], df_combined["geom"]),
    how="inner",
)

# Attach the income table's community name to every matched crime. The hint goes on
# df3 (presumably the small lookup side -- verify) rather than on the joined
# block/crime rows.
df_zip = df_joined.join(
    df3.hint("broadcast"), df_joined["COMM"] == df3["Community"], "inner"
)

# Step 3: crimes per community; every joined row with a non-null geometry counts as one.
# (A former bare `df_zip.select(...)` here discarded its result and was removed.)
df_aggregated = df_zip.groupBy("Community").agg(
    _sum(col("geometry").isNotNull().cast("int")).alias("Sum_of_Crimes")
)

# Left join on the shared column name: keeps every community of result_df (from the
# preceding Task_1 cell) and yields a single, unambiguous "Community" column.
# Communities without matched crimes get NULL. A left outer hash join can only
# broadcast its right-hand side, so the hint is placed on df_aggregated.
df_final = result_df.join(df_aggregated.hint("broadcast"), on="Community", how="left")

# Step 4: crimes per person.
df_final = df_final.withColumn(
    "Ratio_of_Crimes_Per_Person",
    col("Sum_of_Crimes") / col("Total Population"),
)

# Step 5: keep only the columns to display.
df_final = df_final.select(
    "Community", "Total Population", "Sum_of_Crimes", "Ratio_of_Crimes_Per_Person"
)

# show() is the action that triggers execution; it prints the first 10 rows.
df_final.show(truncate=False, n=10)

end_time = time.perf_counter()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Task_2
+----------------+----------------+-------------+--------------------------+
|Community       |Total Population|Sum_of_Crimes|Ratio_of_Crimes_Per_Person|
+----------------+----------------+-------------+--------------------------+
|Culver City     |77766           |2780         |0.03574827045238279       |
|Pico Rivera     |62942           |2            |3.177528518318452E-5      |
|Malibu          |12645           |1            |7.908264136022143E-5      |
|Hacienda Heights|53594           |NULL         |NULL                      |
|Montebello      |62500           |6            |9.6E-5                    |
|Hawaiian Gardens|14254           |NULL         |NULL                      |
|Westlake Village|16540           |2            |1.2091898428053205E-4     |
|Carson          |183428          |862          |0.004699391586889679      |
|Glendale        |1150314         |816          |7.093715281218867E-4      |
|Signal Hill     |11016           |2            |1.8155410312273057E-

In [4]:
# Print the formatted physical plan of the Task_1 result, then of the Task_2 result.
for plan_df in (result_df, df_final):
    plan_df.explain(mode="formatted")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

== Physical Plan ==
AdaptiveSparkPlan (15)
+- Project (14)
   +- HashAggregate (13)
      +- Exchange (12)
         +- HashAggregate (11)
            +- Project (10)
               +- BroadcastHashJoin Inner BuildLeft (9)
                  :- BroadcastExchange (6)
                  :  +- Project (5)
                  :     +- Filter (4)
                  :        +- Generate (3)
                  :           +- Filter (2)
                  :              +- Scan geojson  (1)
                  +- Filter (8)
                     +- Scan csv  (7)


(1) Scan geojson 
Output [1]: [features#25]
Batched: false
Location: InMemoryFileIndex [s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson]
PushedFilters: [IsNotNull(features)]
ReadSchema: struct<features:array<struct<geometry:binary,properties:struct<BG10:string,BG10FIP10:string,BG12:string,CB10:string,CEN_FIP13:string,CITY:string,CITYCOM:string,COMM:string,CT10:string,CT12:string,CTCB10:string,HD_2012:bigint,HD_NA

## Merge

In [5]:
print("Task_1")
# perf_counter() is a monotonic clock, so the measured interval cannot be
# skewed by wall-clock adjustments the way time.time() can.
start_time = time.perf_counter()

# Inner-join the census blocks to the income table on community name,
# requesting a sort-merge join through the "merge" hint.
joined_df = flattened_df.hint("merge").join(
    df3, flattened_df["COMM"] == df3["Community"], "inner"
)

# Strip "$" and "," from "Estimated Median Income" so the text can be cast to a number.
joined_df = joined_df.withColumn(
    "Cleaned Estimated Median Income",
    F.regexp_replace(F.col("Estimated Median Income"), "[$,]", "").cast("double"),
)

# Per community: total population, total households and mean cleaned income.
# (A former bare `joined_df.select(...)` here discarded its result and was removed.)
grouped_df = joined_df.groupBy("Community").agg(
    F.sum("POP_2010").alias("Total Population"),
    F.sum("HOUSING10").alias("Total Households"),
    F.avg("Cleaned Estimated Median Income").alias("Avg Estimated Median Income"),
)

# Income per individual = households * average household income / population.
grouped_df = grouped_df.withColumn(
    "Income per Individual",
    (F.col("Total Households") * F.col("Avg Estimated Median Income")) / F.col("Total Population"),
)

# Render the average income as a "$"-prefixed string with two decimals.
grouped_df = grouped_df.withColumn(
    "Estimated Median Income per Household",
    F.concat(F.lit("$"), F.format_number(F.col("Avg Estimated Median Income"), 2)),
)

# Same formatting for the per-individual figure; this replaces the numeric
# column computed above with its string rendering.
grouped_df = grouped_df.withColumn(
    "Income per Individual",
    F.concat(F.lit("$"), F.format_number(F.col("Income per Individual"), 2)),
)

# Columns to display (result_df is also reused by the Task_2 cell that follows).
result_df = grouped_df.select(
    "Community",
    "Total Population",
    "Total Households",
    "Estimated Median Income per Household",
    "Income per Individual",
)

# show() is the action that triggers execution; it prints the first 10 rows.
result_df.show(truncate=False, n=10)

end_time = time.perf_counter()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Task_1
+----------------+----------------+----------------+-------------------------------------+---------------------+
|Community       |Total Population|Total Households|Estimated Median Income per Household|Income per Individual|
+----------------+----------------+----------------+-------------------------------------+---------------------+
|Carson          |183428          |52452           |$74,351.50                           |$21,261.12           |
|Claremont       |35348           |12306           |$89,161.00                           |$31,040.38           |
|Culver City     |77766           |34982           |$75,913.50                           |$34,148.68           |
|Gardena         |176487          |64416           |$49,137.67                           |$17,934.76           |
|Glendale        |1150314         |457614          |$66,520.33                           |$26,462.89           |
|Glendora        |101388          |35964           |$76,010.00                           

In [6]:
print("Task_2")
# perf_counter() is a monotonic clock, so the measured interval cannot be
# skewed by wall-clock adjustments the way time.time() can.
start_time = time.perf_counter()

# Step 1: add a point geometry built from each crime's LON/LAT columns.
# withColumn replaces an existing column of the same name, so re-running is harmless.
df_combined = df_combined.withColumn("geom", ST_Point("LON", "LAT"))

# Step 2: keep (block, crime) pairs where the block polygon contains the crime point.
# NOTE(review): the join condition is a spatial predicate rather than an equality;
# confirm in the plan whether the "merge" hint has any effect on this join.
df_joined = flattened_df.hint("merge").join(
    df_combined,
    ST_Contains(flattened_df["geometry"], df_combined["geom"]),
    how="inner",
)

# Attach the income table's community name to every matched crime.
df_zip = df_joined.hint("merge").join(
    df3, df_joined["COMM"] == df3["Community"], "inner"
)

# Step 3: crimes per community; every joined row with a non-null geometry counts as one.
# (A former bare `df_zip.select(...)` here discarded its result and was removed.)
df_aggregated = df_zip.groupBy("Community").agg(
    _sum(col("geometry").isNotNull().cast("int")).alias("Sum_of_Crimes")
)

# Left join on the shared column name: keeps every community of result_df (from the
# preceding Task_1 cell) and yields a single, unambiguous "Community" column.
# Communities without matched crimes get NULL.
df_final = result_df.hint("merge").join(df_aggregated, on="Community", how="left")

# Step 4: crimes per person.
df_final = df_final.withColumn(
    "Ratio_of_Crimes_Per_Person",
    col("Sum_of_Crimes") / col("Total Population"),
)

# Step 5: keep only the columns to display.
df_final = df_final.select(
    "Community", "Total Population", "Sum_of_Crimes", "Ratio_of_Crimes_Per_Person"
)

# show() is the action that triggers execution; it prints the first 10 rows.
df_final.show(truncate=False, n=10)

end_time = time.perf_counter()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Task_2
+----------------+----------------+-------------+--------------------------+
|Community       |Total Population|Sum_of_Crimes|Ratio_of_Crimes_Per_Person|
+----------------+----------------+-------------+--------------------------+
|Culver City     |77766           |2780         |0.03574827045238279       |
|Hacienda Heights|53594           |NULL         |NULL                      |
|Hawaiian Gardens|14254           |NULL         |NULL                      |
|Malibu          |12645           |1            |7.908264136022143E-5      |
|Montebello      |62500           |6            |9.6E-5                    |
|Pico Rivera     |62942           |2            |3.177528518318452E-5      |
|Westlake Village|16540           |2            |1.2091898428053205E-4     |
|Carson          |183428          |862          |0.004699391586889679      |
|Glendale        |1150314         |816          |7.093715281218867E-4      |
|Claremont       |35348           |6            |1.697408622835804E-4

In [7]:
# Print the formatted physical plan of the Task_1 result, then of the Task_2 result.
for plan_df in (result_df, df_final):
    plan_df.explain(mode="formatted")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

== Physical Plan ==
AdaptiveSparkPlan (17)
+- Project (16)
   +- HashAggregate (15)
      +- HashAggregate (14)
         +- Project (13)
            +- SortMergeJoin Inner (12)
               :- Sort (7)
               :  +- Exchange (6)
               :     +- Project (5)
               :        +- Filter (4)
               :           +- Generate (3)
               :              +- Filter (2)
               :                 +- Scan geojson  (1)
               +- Sort (11)
                  +- Exchange (10)
                     +- Filter (9)
                        +- Scan csv  (8)


(1) Scan geojson 
Output [1]: [features#25]
Batched: false
Location: InMemoryFileIndex [s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson]
PushedFilters: [IsNotNull(features)]
ReadSchema: struct<features:array<struct<geometry:binary,properties:struct<BG10:string,BG10FIP10:string,BG12:string,CB10:string,CEN_FIP13:string,CITY:string,CITYCOM:string,COMM:string,CT10:string,CT12

## Shuffle_Hash

In [8]:
print("Task_1")
# perf_counter() is a monotonic clock, so the measured interval cannot be
# skewed by wall-clock adjustments the way time.time() can.
start_time = time.perf_counter()

# Inner-join the census blocks to the income table on community name,
# requesting a shuffled hash join through the "shuffle_hash" hint.
joined_df = flattened_df.hint("shuffle_hash").join(
    df3, flattened_df["COMM"] == df3["Community"], "inner"
)

# Strip "$" and "," from "Estimated Median Income" so the text can be cast to a number.
joined_df = joined_df.withColumn(
    "Cleaned Estimated Median Income",
    F.regexp_replace(F.col("Estimated Median Income"), "[$,]", "").cast("double"),
)

# Per community: total population, total households and mean cleaned income.
# (A former bare `joined_df.select(...)` here discarded its result and was removed.)
grouped_df = joined_df.groupBy("Community").agg(
    F.sum("POP_2010").alias("Total Population"),
    F.sum("HOUSING10").alias("Total Households"),
    F.avg("Cleaned Estimated Median Income").alias("Avg Estimated Median Income"),
)

# Income per individual = households * average household income / population.
grouped_df = grouped_df.withColumn(
    "Income per Individual",
    (F.col("Total Households") * F.col("Avg Estimated Median Income")) / F.col("Total Population"),
)

# Render the average income as a "$"-prefixed string with two decimals.
grouped_df = grouped_df.withColumn(
    "Estimated Median Income per Household",
    F.concat(F.lit("$"), F.format_number(F.col("Avg Estimated Median Income"), 2)),
)

# Same formatting for the per-individual figure; this replaces the numeric
# column computed above with its string rendering.
grouped_df = grouped_df.withColumn(
    "Income per Individual",
    F.concat(F.lit("$"), F.format_number(F.col("Income per Individual"), 2)),
)

# Columns to display (result_df is also reused by the Task_2 cell that follows).
result_df = grouped_df.select(
    "Community",
    "Total Population",
    "Total Households",
    "Estimated Median Income per Household",
    "Income per Individual",
)

# show() is the action that triggers execution; it prints the first 10 rows.
result_df.show(truncate=False, n=10)

end_time = time.perf_counter()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Task_1
+----------------+----------------+----------------+-------------------------------------+---------------------+
|Community       |Total Population|Total Households|Estimated Median Income per Household|Income per Individual|
+----------------+----------------+----------------+-------------------------------------+---------------------+
|Culver City     |77766           |34982           |$75,913.50                           |$34,148.68           |
|Pico Rivera     |62942           |17109           |$55,758.00                           |$15,156.23           |
|Malibu          |12645           |6864            |$123,681.00                          |$67,136.92           |
|Hacienda Heights|53594           |16524           |$78,000.00                           |$24,048.81           |
|Montebello      |62500           |19768           |$45,898.00                           |$14,516.99           |
|Hawaiian Gardens|14254           |3703            |$37,543.00                           

In [9]:
print("Task_2")
# perf_counter() is a monotonic clock, so the measured interval cannot be
# skewed by wall-clock adjustments the way time.time() can.
start_time = time.perf_counter()

# Step 1: add a point geometry built from each crime's LON/LAT columns.
# withColumn replaces an existing column of the same name, so re-running is harmless.
df_combined = df_combined.withColumn("geom", ST_Point("LON", "LAT"))

# Step 2: keep (block, crime) pairs where the block polygon contains the crime point.
# NOTE(review): the join condition is a spatial predicate rather than an equality;
# confirm in the plan whether the "shuffle_hash" hint has any effect on this join.
df_joined = flattened_df.hint("shuffle_hash").join(
    df_combined,
    ST_Contains(flattened_df["geometry"], df_combined["geom"]),
    how="inner",
)

# Attach the income table's community name to every matched crime.
df_zip = df_joined.hint("shuffle_hash").join(
    df3, df_joined["COMM"] == df3["Community"], "inner"
)

# Step 3: crimes per community; every joined row with a non-null geometry counts as one.
# (A former bare `df_zip.select(...)` here discarded its result and was removed.)
df_aggregated = df_zip.groupBy("Community").agg(
    _sum(col("geometry").isNotNull().cast("int")).alias("Sum_of_Crimes")
)

# Left join on the shared column name: keeps every community of result_df (from the
# preceding Task_1 cell) and yields a single, unambiguous "Community" column.
# Communities without matched crimes get NULL.
df_final = result_df.hint("shuffle_hash").join(df_aggregated, on="Community", how="left")

# Step 4: crimes per person.
df_final = df_final.withColumn(
    "Ratio_of_Crimes_Per_Person",
    col("Sum_of_Crimes") / col("Total Population"),
)

# Step 5: keep only the columns to display.
df_final = df_final.select(
    "Community", "Total Population", "Sum_of_Crimes", "Ratio_of_Crimes_Per_Person"
)

# show() is the action that triggers execution; it prints the first 10 rows.
df_final.show(truncate=False, n=10)

end_time = time.perf_counter()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Task_2
+----------------+----------------+-------------+--------------------------+
|Community       |Total Population|Sum_of_Crimes|Ratio_of_Crimes_Per_Person|
+----------------+----------------+-------------+--------------------------+
|Culver City     |77766           |2780         |0.03574827045238279       |
|Pico Rivera     |62942           |2            |3.177528518318452E-5      |
|Malibu          |12645           |1            |7.908264136022143E-5      |
|Montebello      |62500           |6            |9.6E-5                    |
|Westlake Village|16540           |2            |1.2091898428053205E-4     |
|Hacienda Heights|53594           |NULL         |NULL                      |
|Hawaiian Gardens|14254           |NULL         |NULL                      |
|Carson          |183428          |862          |0.004699391586889679      |
|Glendale        |1150314         |816          |7.093715281218867E-4      |
|Signal Hill     |11016           |2            |1.8155410312273057E-

In [10]:
# Print the formatted physical plan of the Task_1 result, then of the Task_2 result.
for plan_df in (result_df, df_final):
    plan_df.explain(mode="formatted")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

== Physical Plan ==
AdaptiveSparkPlan (15)
+- Project (14)
   +- HashAggregate (13)
      +- HashAggregate (12)
         +- Project (11)
            +- ShuffledHashJoin Inner BuildLeft (10)
               :- Exchange (6)
               :  +- Project (5)
               :     +- Filter (4)
               :        +- Generate (3)
               :           +- Filter (2)
               :              +- Scan geojson  (1)
               +- Exchange (9)
                  +- Filter (8)
                     +- Scan csv  (7)


(1) Scan geojson 
Output [1]: [features#25]
Batched: false
Location: InMemoryFileIndex [s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson]
PushedFilters: [IsNotNull(features)]
ReadSchema: struct<features:array<struct<geometry:binary,properties:struct<BG10:string,BG10FIP10:string,BG12:string,CB10:string,CEN_FIP13:string,CITY:string,CITYCOM:string,COMM:string,CT10:string,CT12:string,CTCB10:string,HD_2012:bigint,HD_NAME:string,HOUSING10:bigint,L

## Shuffle_Replicate_NL

In [11]:
print("Task_1")
# perf_counter() is a monotonic clock, so the measured interval cannot be
# skewed by wall-clock adjustments the way time.time() can.
start_time = time.perf_counter()

# Inner-join the census blocks to the income table on community name with the
# "shuffle_replicate_nl" hint (the recorded plan shows a CartesianProduct for it).
joined_df = flattened_df.hint("shuffle_replicate_nl").join(
    df3, flattened_df["COMM"] == df3["Community"], "inner"
)

# Strip "$" and "," from "Estimated Median Income" so the text can be cast to a number.
joined_df = joined_df.withColumn(
    "Cleaned Estimated Median Income",
    F.regexp_replace(F.col("Estimated Median Income"), "[$,]", "").cast("double"),
)

# Per community: total population, total households and mean cleaned income.
# (A former bare `joined_df.select(...)` here discarded its result and was removed.)
grouped_df = joined_df.groupBy("Community").agg(
    F.sum("POP_2010").alias("Total Population"),
    F.sum("HOUSING10").alias("Total Households"),
    F.avg("Cleaned Estimated Median Income").alias("Avg Estimated Median Income"),
)

# Income per individual = households * average household income / population.
grouped_df = grouped_df.withColumn(
    "Income per Individual",
    (F.col("Total Households") * F.col("Avg Estimated Median Income")) / F.col("Total Population"),
)

# Render the average income as a "$"-prefixed string with two decimals.
grouped_df = grouped_df.withColumn(
    "Estimated Median Income per Household",
    F.concat(F.lit("$"), F.format_number(F.col("Avg Estimated Median Income"), 2)),
)

# Same formatting for the per-individual figure; this replaces the numeric
# column computed above with its string rendering.
grouped_df = grouped_df.withColumn(
    "Income per Individual",
    F.concat(F.lit("$"), F.format_number(F.col("Income per Individual"), 2)),
)

# Columns to display (result_df is also reused by the Task_2 cell that follows).
result_df = grouped_df.select(
    "Community",
    "Total Population",
    "Total Households",
    "Estimated Median Income per Household",
    "Income per Individual",
)

# show() is the action that triggers execution; it prints the first 10 rows.
result_df.show(truncate=False, n=10)

end_time = time.perf_counter()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Task_1
+----------------+----------------+----------------+-------------------------------------+---------------------+
|Community       |Total Population|Total Households|Estimated Median Income per Household|Income per Individual|
+----------------+----------------+----------------+-------------------------------------+---------------------+
|Culver City     |77766           |34982           |$75,913.50                           |$34,148.68           |
|Pico Rivera     |62942           |17109           |$55,758.00                           |$15,156.23           |
|Malibu          |12645           |6864            |$123,681.00                          |$67,136.92           |
|Hacienda Heights|53594           |16524           |$78,000.00                           |$24,048.81           |
|Montebello      |62500           |19768           |$45,898.00                           |$14,516.99           |
|Hawaiian Gardens|14254           |3703            |$37,543.00                           

In [12]:
print("Task_2")
# perf_counter() is a monotonic clock, so the measured interval cannot be
# skewed by wall-clock adjustments the way time.time() can.
start_time = time.perf_counter()

# Step 1: add a point geometry built from each crime's LON/LAT columns.
# withColumn replaces an existing column of the same name, so re-running is harmless.
df_combined = df_combined.withColumn("geom", ST_Point("LON", "LAT"))

# Step 2: keep (block, crime) pairs where the block polygon contains the crime point.
# NOTE(review): the join condition is a spatial predicate rather than an equality;
# confirm in the plan how the "shuffle_replicate_nl" hint is applied to this join.
df_joined = flattened_df.hint("shuffle_replicate_nl").join(
    df_combined,
    ST_Contains(flattened_df["geometry"], df_combined["geom"]),
    how="inner",
)

# Attach the income table's community name to every matched crime.
df_zip = df_joined.hint("shuffle_replicate_nl").join(
    df3, df_joined["COMM"] == df3["Community"], "inner"
)

# Step 3: crimes per community; every joined row with a non-null geometry counts as one.
# (A former bare `df_zip.select(...)` here discarded its result and was removed.)
df_aggregated = df_zip.groupBy("Community").agg(
    _sum(col("geometry").isNotNull().cast("int")).alias("Sum_of_Crimes")
)

# Left join on the shared column name: keeps every community of result_df (from the
# preceding Task_1 cell) and yields a single, unambiguous "Community" column.
# Communities without matched crimes get NULL.
df_final = result_df.hint("shuffle_replicate_nl").join(
    df_aggregated, on="Community", how="left"
)

# Step 4: crimes per person.
df_final = df_final.withColumn(
    "Ratio_of_Crimes_Per_Person",
    col("Sum_of_Crimes") / col("Total Population"),
)

# Step 5: keep only the columns to display.
df_final = df_final.select(
    "Community", "Total Population", "Sum_of_Crimes", "Ratio_of_Crimes_Per_Person"
)

# show() is the action that triggers execution; it prints the first 10 rows.
df_final.show(truncate=False, n=10)

end_time = time.perf_counter()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Task_2
+----------------+----------------+-------------+--------------------------+
|Community       |Total Population|Sum_of_Crimes|Ratio_of_Crimes_Per_Person|
+----------------+----------------+-------------+--------------------------+
|Culver City     |77766           |2780         |0.03574827045238279       |
|Pico Rivera     |62942           |2            |3.177528518318452E-5      |
|Malibu          |12645           |1            |7.908264136022143E-5      |
|Hacienda Heights|53594           |NULL         |NULL                      |
|Montebello      |62500           |6            |9.6E-5                    |
|Hawaiian Gardens|14254           |NULL         |NULL                      |
|Westlake Village|16540           |2            |1.2091898428053205E-4     |
|Carson          |183428          |862          |0.004699391586889679      |
|Glendale        |1150314         |816          |7.093715281218867E-4      |
|Signal Hill     |11016           |2            |1.8155410312273057E-

In [13]:
# Print the formatted physical plan of the Task_1 result, then of the Task_2 result.
for plan_df in (result_df, df_final):
    plan_df.explain(mode="formatted")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

== Physical Plan ==
AdaptiveSparkPlan (14)
+- Project (13)
   +- HashAggregate (12)
      +- Exchange (11)
         +- HashAggregate (10)
            +- Project (9)
               +- CartesianProduct Inner (8)
                  :- Project (5)
                  :  +- Filter (4)
                  :     +- Generate (3)
                  :        +- Filter (2)
                  :           +- Scan geojson  (1)
                  +- Filter (7)
                     +- Scan csv  (6)


(1) Scan geojson 
Output [1]: [features#25]
Batched: false
Location: InMemoryFileIndex [s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson]
PushedFilters: [IsNotNull(features)]
ReadSchema: struct<features:array<struct<geometry:binary,properties:struct<BG10:string,BG10FIP10:string,BG12:string,CB10:string,CEN_FIP13:string,CITY:string,CITYCOM:string,COMM:string,CT10:string,CT12:string,CTCB10:string,HD_2012:bigint,HD_NAME:string,HOUSING10:bigint,LA_FIP10:string,OBJECTID:bigint,POP_2010:bi