In [14]:
from sedona.spark import *
from pyspark.sql.functions import col
from pyspark.sql import SparkSession
import time
# Create Spark Session with configurations
spark = SparkSession.builder \
    .appName("GeoJSON read") \
    .config("spark.executor.instances", "2") \
    .config("spark.executor.memory", "1g") \
    .config("spark.executor.cores", "1") \
    .config("spark.driver.memory", "2g") \
    .getOrCreate()

# Create Sedona context
sedona = SedonaContext.create(spark)

# Read the file from s3
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"
blocks_df = sedona.read.format("geojson") \
            .option("multiLine", "true").load(geojson_path) \
            .selectExpr("explode(features) as features") \
            .select("features.*")

# Formatting magic
flattened_df = blocks_df.select(
    [col(f"properties.{col_name}").alias(col_name) for col_name in
     blocks_df.schema["properties"].dataType.fieldNames()] + ["geometry"]) \
    .drop("properties") \
    .drop("type")

# Print schema
flattened_df.printSchema()


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- BG10: string (nullable = true)
 |-- BG10FIP10: string (nullable = true)
 |-- BG12: string (nullable = true)
 |-- CB10: string (nullable = true)
 |-- CEN_FIP13: string (nullable = true)
 |-- CITY: string (nullable = true)
 |-- CITYCOM: string (nullable = true)
 |-- COMM: string (nullable = true)
 |-- CT10: string (nullable = true)
 |-- CT12: string (nullable = true)
 |-- CTCB10: string (nullable = true)
 |-- HD_2012: long (nullable = true)
 |-- HD_NAME: string (nullable = true)
 |-- HOUSING10: long (nullable = true)
 |-- LA_FIP10: string (nullable = true)
 |-- OBJECTID: long (nullable = true)
 |-- POP_2010: long (nullable = true)
 |-- PUMA10: string (nullable = true)
 |-- SPA_2012: long (nullable = true)
 |-- SPA_NAME: string (nullable = true)
 |-- SUP_DIST: string (nullable = true)
 |-- SUP_LABEL: string (nullable = true)
 |-- ShapeSTArea: double (nullable = true)
 |-- ShapeSTLength: double (nullable = true)
 |-- ZCTA10: string (nullable = true)
 |-- geometry: geometry (nulla

In [15]:
race_path = 's3://initial-notebook-data-bucket-dblab-905418150721/RE_codes.csv'
data1_path = 's3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv'
data2_path = 's3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv'
area_path = 's3://initial-notebook-data-bucket-dblab-905418150721/LA_income_2015.csv'
df_area = spark.read.csv(area_path, header = True, inferSchema=True)
df_race = spark.read.csv(race_path, header=True, inferSchema=True)
df1 = spark.read.csv(data1_path, header=True, inferSchema=True)
df2 = spark.read.csv(data2_path, header=True, inferSchema=True)
df_combined = df1.union(df2)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [16]:
from pyspark.sql.functions import col, year, to_timestamp
#from sedona.sql.types import ST_Point
#from sedona.sql.functions import ST_Contains
start_time = time.time()
# Step 1: Filter `df_area` by `Estimated Median Income` in ascending order and get the top 3
result_asc = df_area.orderBy(col("Estimated Median Income").asc())
res1 = result_asc.limit(3).collect()
end_time = time.time()
print(f"Time it took: {end_time-start_time}")
for row in res1:
    print(row["Community"])

# Step 2: Filter `df_area` by `Estimated Median Income` in descending order and get the top 3
start_time = time.time()
result_desc = df_area.orderBy(col("Estimated Median Income").desc())
res2 = result_desc.limit(3).collect()
end_time = time.time()
print(f"Time it took:{end_time - start_time}")
for row in res2:
    print(row["Community"])

# Step 3: Collect the zip codes of the lowest income areas
codes1 = [row["Zip Code"] for row in res1]
for code in codes1:
    print(code)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Time it took: 0.3488943576812744
Los Angeles (West Hills), Bell Canyon
Redondo Beach, Torrance
Bouquet Canyon, Green Valley, Mint Canyon, Santa Clarita (Canyon County)
Time it took:0.16186857223510742
Topanga
Los Angeles (Century City)
Long Beach
91307
90277
91350

In [17]:

# Step 4: Filter `flattened_df` for records matching the collected zip codes
start_time = time.time()
flattened_df = flattened_df.filter(col("ZCTA10").isin(codes1))
#res.show(10)

# Step 5: Filter `df_combined` for crimes occurring in 2015
crime_2015 = df_combined.filter(
    year(to_timestamp(col("DATE OCC"), "MM/dd/yyyy hh:mm:ss a")) == 2015
)
end_time = time.time()
print(f"Time it took:{end_time-start_time}")
# Show the filtered crimes
#crime_2015.show()


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Time it took:0.013997793197631836

In [18]:
start_time = time.time()
# Step 6: Add a geometry column (`geom`) to `crime_2015` using ST_Point
crime_2015 = crime_2015.withColumn("geom", ST_Point(col("LON"), col("LAT")))
end_time = time.time()
crime_2015.select("Crm Cd", "Crm Cd Desc", "LAT", "LON", "geom").show(20)
print(f"Time it took:{end_time-start_time}")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------+--------------------+-------+---------+--------------------+
|Crm Cd|         Crm Cd Desc|    LAT|      LON|                geom|
+------+--------------------+-------+---------+--------------------+
|   946|OTHER MISCELLANEO...|34.0448|-118.2474|POINT (-118.2474 ...|
|   510|    VEHICLE - STOLEN|34.0618|-118.2954|POINT (-118.2954 ...|
|   510|    VEHICLE - STOLEN|33.8202|-118.3037|POINT (-118.3037 ...|
|   510|    VEHICLE - STOLEN|34.0691|-118.2979|POINT (-118.2979 ...|
|   330|BURGLARY FROM VEH...|34.0461|-118.2476|POINT (-118.2476 ...|
|   510|    VEHICLE - STOLEN|33.7469|-118.2835|POINT (-118.2835 ...|
|   510|    VEHICLE - STOLEN|33.9113|-118.2872|POINT (-118.2872 ...|
|   510|    VEHICLE - STOLEN|33.7842|-118.2779|POINT (-118.2779 ...|
|   510|    VEHICLE - STOLEN|34.1974|-118.5994|POINT (-118.5994 ...|
|   330|BURGLARY FROM VEH...|33.9815|-118.2739|POINT (-118.2739 ...|
|   510|    VEHICLE - STOLEN|34.2216|-118.4488|POINT (-118.4488 ...|
|   310|            BURGLARY| 33.9

In [19]:
start_time = time.time()
# Step 7: Perform the spatial join using ST_Contains
result_df = flattened_df.join(
    crime_2015,
    ST_Contains(flattened_df["geometry"], crime_2015["geom"]),
    "inner",  # Keep only the rows where the points are inside the polygons
)
end_time = time.time()
print(f"Time it took{end_time-start_time}")
# Show the spatial join results
#result_df.show(10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Time it took0.014943122863769531

In [20]:
crime_2015.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

215923

In [21]:
result_df.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

932

In [22]:
result_df.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- BG10: string (nullable = true)
 |-- BG10FIP10: string (nullable = true)
 |-- BG12: string (nullable = true)
 |-- CB10: string (nullable = true)
 |-- CEN_FIP13: string (nullable = true)
 |-- CITY: string (nullable = true)
 |-- CITYCOM: string (nullable = true)
 |-- COMM: string (nullable = true)
 |-- CT10: string (nullable = true)
 |-- CT12: string (nullable = true)
 |-- CTCB10: string (nullable = true)
 |-- HD_2012: long (nullable = true)
 |-- HD_NAME: string (nullable = true)
 |-- HOUSING10: long (nullable = true)
 |-- LA_FIP10: string (nullable = true)
 |-- OBJECTID: long (nullable = true)
 |-- POP_2010: long (nullable = true)
 |-- PUMA10: string (nullable = true)
 |-- SPA_2012: long (nullable = true)
 |-- SPA_NAME: string (nullable = true)
 |-- SUP_DIST: string (nullable = true)
 |-- SUP_LABEL: string (nullable = true)
 |-- ShapeSTArea: double (nullable = true)
 |-- ShapeSTLength: double (nullable = true)
 |-- ZCTA10: string (nullable = true)
 |-- geometry: geometry (nulla

In [23]:
start_time = time.time()
grouped_df = result_df.groupBy('Vict Descent').count()
end_time = time.time()
print(f"Time it took:{end_time-start_time}")
grouped_df.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Time it took:0.029976844787597656
+------------+-----+
|Vict Descent|count|
+------------+-----+
|        NULL|   34|
|           B|   56|
|           J|    1|
|           H|  127|
|           F|    2|
|           O|  128|
|           X|   90|
|           W|  470|
|           A|   24|
+------------+-----+

Τα στατιστικά για τις 3 πιο πλούσιες περιοχές.

In [24]:
start_time = time.time()
final_df = grouped_df.join(
    df_race,
    grouped_df['Vict Descent'] == df_race['Vict Descent'],
    "inner")
end_time = time.time()
print(f"Time it took:{end_time-start_time}")
final_df.select('Vict Descent Full',grouped_df['count'].alias('#')).show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Time it took:0.004296302795410156
+--------------------+---+
|   Vict Descent Full|  #|
+--------------------+---+
|               Black| 56|
|            Japanese|  1|
|Hispanic/Latin/Me...|127|
|            Filipino|  2|
|               Other|128|
|             Unknown| 90|
|               White|470|
|         Other Asian| 24|
+--------------------+---+

In [25]:
flattened_df = blocks_df.select( \
                [col(f"properties.{col_name}").alias(col_name) for col_name in \
                blocks_df.schema["properties"].dataType.fieldNames()] + ["geometry"]) \
            .drop("properties") \
            .drop("type")
# Print schema

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [26]:
codes2 = [row["Zip Code"] for row in res2]
for code in codes2:
    print(code)
flattened_df = flattened_df.filter(col("ZCTA10").isin(codes2))
#res.show(10)

start_time = time.time()
result_df = flattened_df.join(
    crime_2015,
    ST_Contains(flattened_df["geometry"], crime_2015["geom"]),
    "inner",  # Keep only the rows where the points are inside the polygons
)
end_time = time.time()
print(f"Time it took:{end_time-start_time}")
#result_df.show()
print(result_df.count())
start_time = time.time()
grouped_df = result_df.groupBy('Vict Descent').count()
end_time = time.time()
print(f"Time it took:{end_time-start_time}")
grouped_df.show()


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

90290
90067
90808
Time it took:0.013457775115966797
97
Time it took:0.008965730667114258
+------------+-----+
|Vict Descent|count|
+------------+-----+
|        NULL|    2|
|           B|    5|
|           H|    8|
|           O|   15|
|           X|    8|
|           W|   57|
|           A|    2|
+------------+-----+

Στατιστικά για τις 3 πιο φτωχές περιοχές

In [27]:
start_time = time.time()
final_df = grouped_df.join(
    df_race,
    grouped_df['Vict Descent'] == df_race['Vict Descent'],
    "inner")
end_time = time.time()
print(f"Time it took:{end_time - start_time}")
final_df.select('Vict Descent Full',grouped_df['count'].alias('#')).show()
#result_df.show(10)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Time it took:0.0041294097900390625
+--------------------+---+
|   Vict Descent Full|  #|
+--------------------+---+
|               Black|  5|
|Hispanic/Latin/Me...|  8|
|               Other| 15|
|             Unknown|  8|
|               White| 57|
|         Other Asian|  2|
+--------------------+---+