In [2]:
from pyspark.sql import SparkSession
from sedona.spark import SedonaContext
from pyspark.sql.functions import col, sum as spark_sum, regexp_replace, round

# Build (or reuse) the Spark session for this query; the executor-instances
# setting is passed as the string "4".
spark = (
    SparkSession.builder
    .appName("Query3")
    .config("spark.executor.instances", "4")
    .getOrCreate()
)

# Sedona context created from the session; used below to read the GeoJSON file.
sedona = SedonaContext.create(spark)

# Load the 2010 Census Blocks GeoJSON file. The file is read as one multi-line
# document, its `features` array is exploded to one row per feature, and the
# fields of each feature are promoted to top-level columns.
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"
feature_collection_df = (
    sedona.read.format("geojson")
    .option("multiLine", "true")
    .load(geojson_path)
)
blocks_df = (
    feature_collection_df
    .selectExpr("explode(features) as features")
    .select("features.*")
)

# Print the column types of the exploded features.
print("Schema of GeoJSON data:")
blocks_df.printSchema()

# Flatten the GeoJSON rows: every field of the `properties` struct becomes a
# top-level column of the same name, and `geometry` is kept alongside them.
property_names = blocks_df.schema["properties"].dataType.fieldNames()
property_columns = [col(f"properties.{name}").alias(name) for name in property_names]
flattened_df = (
    blocks_df
    .select(property_columns + ["geometry"])
    .drop("properties", "type")
)

# Keep only rows whose CITY is "Los Angeles", and only the four columns the
# income analysis needs.
is_los_angeles = col("CITY") == "Los Angeles"
la_census_df = flattened_df.where(is_los_angeles).select(
    "ZCTA10", "HOUSING10", "POP_2010", "COMM"
)

# Discard rows whose ZCTA10 is a single space character.
la_census_cleaned = la_census_df.where(col("ZCTA10") != " ")

# Print the column types of the cleaned census data.
print("Schema of Census Data:")
la_census_cleaned.printSchema()

# Sum housing units and population for every (ZCTA10, COMM) pair.
housing_total = spark_sum("HOUSING10").alias("Total Housing")
population_total = spark_sum("POP_2010").alias("Total Population")
merged_df = (
    la_census_cleaned
    .groupBy("ZCTA10", "COMM")
    .agg(housing_total, population_total)
)

# Print the column types of the aggregated data.
print("Schema of Merged Data (Housing & Population Summed):")
merged_df.printSchema()

# Load the second dataset (median income), reading the header row and letting
# Spark infer the column types.
income_data_path = "s3://initial-notebook-data-bucket-dblab-905418150721/LA_income_2015.csv"
income_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(income_data_path)
)

# Print the column types of the raw income data.
print("Schema of Income Data:")
income_df.printSchema()

# Only the "Zip Code" and "Estimated Median Income" columns are needed downstream.
income_df_cleaned = income_df.select("Zip Code", "Estimated Median Income")

# Print the column types of the trimmed income data.
print("Schema of Cleaned Income Data:")
income_df_cleaned.printSchema()

# Attach the income record of each ZIP code to the census aggregates.
# This is a left join, so (ZCTA10, COMM) rows with no matching "Zip Code" are
# kept and end up with a NULL income. The join key from the income side is
# dropped afterwards.
zip_code_matches = merged_df["ZCTA10"] == income_df_cleaned["Zip Code"]

# Turn the income text into a number: strip "$" and "," and cast to double.
income_as_double = regexp_replace(
    col("Estimated Median Income"), r"[$,]", ""
).cast("double")

merged_df_with_income = (
    merged_df
    .join(income_df_cleaned, on=zip_code_matches, how="left")
    .drop("Zip Code")
    .withColumn("Estimated Median Income", income_as_double)
)

# Print the column types after the join and the income clean-up.
print("Schema of Merged Data with Income:")
merged_df_with_income.printSchema()

# Per-row total income: the median income multiplied by the number of housing
# units in that (ZCTA10, COMM) pair.
row_total_income = col("Estimated Median Income") * col("Total Housing")
merged_df_with_income = merged_df_with_income.withColumn("Total Income", row_total_income)

# Aggregate income and population per COMM and derive income per person.
#
# The income data was attached with a LEFT join, so rows without an income
# record carry a NULL "Total Income". sum() skips those NULLs in the numerator,
# but dividing by the full "Total Population" would still count the residents
# of those rows and bias the per-person figure downward. The denominator
# therefore only includes population from rows that contribute income.
# "Total Population" itself is still reported in full.
has_income = col("Total Income").isNotNull().cast("long")  # 1 if income known, else 0
final_result = merged_df_with_income.groupBy("COMM").agg(
    spark_sum("Total Income").alias("Total Income"),
    spark_sum("Total Population").alias("Total Population"),
    spark_sum(col("Total Population") * has_income).alias("_income_population"),
)

# Income per person for each COMM; the helper column is dropped so the result
# keeps the columns COMM, Total Income, Total Population, Median Income Per Person.
final_result = final_result.withColumn(
    "Median Income Per Person",
    col("Total Income") / col("_income_population"),
).drop("_income_population")

# Round both income columns to 5 decimal places.
# (`round` here is the Spark SQL function imported at the top of the cell.)
for income_column in ("Total Income", "Median Income Per Person"):
    final_result = final_result.withColumn(
        income_column, round(col(income_column), 5)
    )

# Display the results without truncating cell contents.
final_result.show(truncate=False)

# Print the column types of the result.
print("Schema of Final Result:")
final_result.printSchema()

# Save the results as CSV (with a header row), overwriting any previous output.
output_path = "s3://groups-bucket-dblab-905418150721/group7/q3_results"  # Replace with your desired path
(
    final_result.write
    .option("header", True)
    .mode("overwrite")
    .csv(f"{output_path}/median_income_per_person")
)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Schema of GeoJSON data:
root
 |-- geometry: geometry (nullable = true)
 |-- properties: struct (nullable = true)
 |    |-- BG10: string (nullable = true)
 |    |-- BG10FIP10: string (nullable = true)
 |    |-- BG12: string (nullable = true)
 |    |-- CB10: string (nullable = true)
 |    |-- CEN_FIP13: string (nullable = true)
 |    |-- CITY: string (nullable = true)
 |    |-- CITYCOM: string (nullable = true)
 |    |-- COMM: string (nullable = true)
 |    |-- CT10: string (nullable = true)
 |    |-- CT12: string (nullable = true)
 |    |-- CTCB10: string (nullable = true)
 |    |-- HD_2012: long (nullable = true)
 |    |-- HD_NAME: string (nullable = true)
 |    |-- HOUSING10: long (nullable = true)
 |    |-- LA_FIP10: string (nullable = true)
 |    |-- OBJECTID: long (nullable = true)
 |    |-- POP_2010: long (nullable = true)
 |    |-- PUMA10: string (nullable = true)
 |    |-- SPA_2012: long (nullable = true)
 |    |-- SPA_NAME: string (nullable = true)
 |    |-- SUP_DIST: string (n

In [3]:
from sedona.spark import *
from pyspark.sql.functions import col, count, sum
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session for the crime analysis.
spark = (
    SparkSession.builder
    .appName("Crime Data Analysis with GeoJSON")
    .getOrCreate()
)

# Sedona context created from the session; used below to read the GeoJSON file.
sedona = SedonaContext.create(spark)

# Load the census-blocks GeoJSON as one multi-line document, explode its
# `features` array to one row per feature, and promote each feature's fields
# to top-level columns.
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"
blocks_df = (
    sedona.read.format("geojson")
    .option("multiLine", "true")
    .load(geojson_path)
    .selectExpr("explode(features) as features")
    .select("features.*")
)

# Flatten the `properties` struct into top-level columns (keeping `geometry`),
# then restrict the blocks to those whose CITY is "Los Angeles".
property_names = blocks_df.schema["properties"].dataType.fieldNames()
property_columns = [col(f"properties.{name}").alias(name) for name in property_names]
flattened_df = (
    blocks_df
    .select(property_columns + ["geometry"])
    .drop("properties", "type")
    .where(col("CITY") == "Los Angeles")
)

# Collapse the blocks to one row per COMM: total population plus the union of
# all block geometries belonging to that COMM.
# (`sum` here is the Spark SQL function imported at the top of the cell.)
comm_population = sum("POP_2010").alias("total_population")
comm_geometry = ST_Union_Aggr("geometry").alias("geometry")
agg_population_df = flattened_df.groupBy("COMM").agg(comm_population, comm_geometry)

# Locations of the two crime CSV exports (2010-2019 and 2020-present).
crime_data_path_1 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
crime_data_path_2 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv"


def _read_crime_csv(path):
    """Read one crime CSV (header row, inferred schema) and strip leading and
    trailing whitespace from every column name."""
    raw_frame = spark.read.csv(path, header=True, inferSchema=True)
    return raw_frame.select(
        [col(name).alias(name.strip()) for name in raw_frame.columns]
    )


# Both files get the same treatment so their column names line up.
data_1 = _read_crime_csv(crime_data_path_1)
data_2 = _read_crime_csv(crime_data_path_2)

# Stack the two periods into one DataFrame, matching columns by name.
combined_crime_df = data_1.unionByName(data_2)

# Build a point geometry for every crime from its LON/LAT columns, then drop
# the rows whose coordinates are missing.
crime_location = ST_Point(col("LON"), col("LAT"))
combined_crime_df = (
    combined_crime_df
    .withColumn("geom", crime_location)
    .dropna(subset=["LON", "LAT"])
)

# Spatial join: pair each crime with the COMM whose merged geometry contains
# its point. The inner join discards crimes that fall inside no COMM.
crime_inside_comm = ST_Within(combined_crime_df.geom, agg_population_df.geometry)
joined_df = combined_crime_df.join(agg_population_df, on=crime_inside_comm, how="inner")

# Number of crimes per COMM (counting the DR_NO column).
crime_counts = joined_df.groupBy("COMM").agg(count("DR_NO").alias("crime_count"))

# Bring the population back in and compute crimes per person for each COMM.
crime_rate = col("crime_count") / col("total_population")
pop_with_crime = (
    crime_counts
    .join(agg_population_df, on="COMM", how="inner")
    .withColumn("crimes_per_person", crime_rate)
)

# Expose the DataFrame (geometry column included) as a temp view.
# NOTE(review): nothing in the visible notebook queries this view.
pop_with_crime.createOrReplaceTempView("pop_with_crime")

# Final table: only COMMs with a positive population, without the geometry
# column so the result can be written as plain CSV.
final_result2 = pop_with_crime.where(col("total_population") > 0).select(
    "COMM", "crime_count", "total_population", "crimes_per_person"
)

# Write the geometry-free crime table as CSV (with a header row), replacing
# any previous output at this location.
output_path = "s3://groups-bucket-dblab-905418150721/group7/q3_results"
(
    final_result2.write
    .option("header", True)
    .mode("overwrite")
    .csv(f"{output_path}/crimes_per_person")
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# Combine the per-COMM income table with the per-COMM crime table.
# The inner join keeps only communities present in both results.
#
# The projection below produces the final columns in one pass. The earlier
# version rounded "Crimes Per Person Ratio" twice and used a separate
# withColumn pass per column; each value is now rounded to 5 decimals exactly
# once, which yields the same numbers.
#
# Notes:
# - Rounding the two income columns is idempotent. It is kept so this cell's
#   output does not depend on whether the upstream cell already rounded them.
# - The ratio is computed from final_result2's `total_population`, while the
#   displayed "Total Population" column comes from final_result.
# - `round` is the Spark SQL function imported in the first cell.
final_combined_result = final_result.join(
    final_result2,
    on="COMM",
    how="inner",
).select(
    "COMM",
    round(col("Total Income"), 5).alias("Total Income"),
    "Total Population",
    round(col("Median Income Per Person"), 5).alias("Median Income Per Person"),
    "crime_count",
    round(col("crime_count") / col("total_population"), 5).alias("Crimes Per Person Ratio"),
)

# Display the combined table without truncating cell contents.
final_combined_result.show(truncate=False)

# Optional: persist the combined table as CSV (with a header row) for further
# analysis, replacing any previous output at this location.
output_path = "s3://groups-bucket-dblab-905418150721/group7/q3_results"
(
    final_combined_result.write
    .option("header", True)
    .mode("overwrite")
    .csv(f"{output_path}/final_combined_results")
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------------+-------------+----------------+------------------------+-----------+-----------------------+
|COMM              |Total Income |Total Population|Median Income Per Person|crime_count|Crimes Per Person Ratio|
+------------------+-------------+----------------+------------------------+-----------+-----------------------+
|Elysian Park      |7.3060257E7  |5267            |13871.32276             |5714       |1.08487                |
|Longwood          |5.649842E7   |4210            |13420.05226             |3074       |0.73017                |
|Cadillac-Corning  |1.3045261E8  |6665            |19572.7847              |4445       |0.66692                |
|Mid-city          |3.11653132E8 |14339           |21734.649               |11574      |0.80717                |
|Lincoln Heights   |3.39536086E8 |31144           |10902.13479             |19391      |0.62262                |
|Van Nuys          |1.246259577E9|86019           |14488.18955             |78883      |0.91704 