In [0]:
%run "./00 - Setup"

In [0]:
from pyspark.sql.functions import expr,col

# Load the "Colleges_and_Universities" feature layer through the "arcgis" data
# source (presumably registered by the setup notebook — confirm).
# NOTE(review): despite the variable name, this layer holds colleges and
# universities rather than cities.
cities_df = (
    spark.read.format("arcgis")
    .option("url", "https://services1.arcgis.com/Hp6G80Pky0om7QvQ/arcgis/rest/services/Colleges_and_Universities/FeatureServer/0")
    .load()
)

# Derive the helper columns. Order matters: `native` has to be parsed while
# `geometry` still holds the GeoJSON delivered by the source. Previously
# `geometry` was overwritten with WKT first, so ST_GeomFromGeoJSON was handed
# WKT text instead of GeoJSON. The resulting column order (..., name, native)
# is the same as before.
cities_df = (
    cities_df
    .withColumn("name", col("properties").getItem("NAME"))
    .withColumn("native", expr("ST_GeomFromGeoJSON(geometry)"))
    # Replace the GeoJSON geometry with its WKT text form (done last, see above).
    .withColumn("geometry", expr("st_aswkt(ST_GeomFromGeoJSON(geometry))"))
)

# Preview the result without the "type" and "properties" columns.
cities_df.drop("type", "properties").display()

In [0]:
from pyspark.sql.functions import col, expr, row_number
from pyspark.sql.window import Window

ufo_sightings_df = spark.read.table(f"{catalog}.{database}.ufo_sightings")

# Search radius in kilometres. st_distancesphere reports its result in metres,
# so the radius is converted before it is used in the join predicate
# (comparing the raw km value would silently turn 250 km into 250 m).
search_radius_km = 250
search_radius_m = search_radius_km * 1000

# Tag both sides with the resolution-5 H3 cell of their point geometry and give
# each DataFrame an alias so the identically named `geometry` columns can be
# told apart in SQL expressions.
ufo_sightings_df = ufo_sightings_df.withColumn("h3_ufo", expr("H3_POINTASH3(geometry, 5)")).alias("ufo")
cities_df = (
    cities_df.withColumnRenamed("id", "airport_id")
    .withColumn("h3_airport", expr("H3_POINTASH3(geometry, 5)"))
    .alias("airports")
)

# Join candidates that share an H3 cell and lie within the search radius, then
# attach the distance (metres) and a WKT line connecting the two points.
# NOTE(review): the equality on H3 cells only pairs points that fall in the
# *same* resolution-5 cell (edge length on the order of 10 km — see the H3
# resolution table), so matches in neighbouring cells are never considered and
# the effective radius is much smaller than `search_radius_km`. A k-ring
# expansion or a coarser resolution would be needed to honour the full radius.
joined_df = (
    ufo_sightings_df
    .join(
        cities_df,
        expr(f"h3_ufo == h3_airport AND st_distancesphere(ufo.geometry, airports.geometry) <= {search_radius_m}"),
        "left",
    )
    .withColumn("distance_airport", expr("st_distancesphere(ufo.geometry, airports.geometry)"))
    .withColumn("line_string", expr("st_astext(st_makeline(array(st_setsrid(st_geomfromtext(ufo.geometry), 4326), st_setsrid(st_geomfromtext(airports.geometry), 4326))))"))
)

# Drop the sightings for which the left join found no match.
joined_df = joined_df.where(col("airport_id").isNotNull())

In [0]:
from pyspark.sql.functions import col, expr, row_number, when
from pyspark.sql.window import Window

# Rank the candidate matches of every sighting (partition key "id") by
# ascending distance, so that rank 1 is the nearest one.
window_spec = Window.partitionBy("id").orderBy("distance_airport")
ranked_df = joined_df.withColumn("row_number", row_number().over(window_spec))

# Count the sightings that have a nearest match: keep rank 1 only and discard
# the helper columns before counting.
(
    ranked_df
    .where(col("row_number") == 1)
    .drop("row_number", "properties")
    .count()
)


In [0]:
# Total number of rows in ufo_sightings_df, shown as the cell output.
ufo_sightings_df.count()