In [0]:
# Country names grouped by continent. Keys are continent names; each value is
# a list of country/territory name strings.
continents = {
    "Africa": [
        "South Africa", "Morocco", "Egypt", "Kenya", "Nigeria", "Ghana",
        "Senegal", "Tunisia", "Algeria", "Ethiopia", "Uganda", "Tanzania",
        "Rwanda", "Zimbabwe", "Cameroon", "Namibia", "Botswana", "Zambia",
        "Malawi", "Lesotho", "Liberia", "Sierra Leone", "Gambia", "Sudan",
        "South Sudan", "Niger", "Chad", "Congo",
        "Democratic Republic of the Congo", "Burkina Faso", "Benin", "Togo",
        "Guinea", "Guinea-Bissau", "Gabon", "Mali", "Central African Republic",
        "Libya", "Somalia", "Djibouti", "Equatorial Guinea", "Mauritius",
        "Seychelles", "Cabo Verde", "São Tomé & Príncipe", "Mayotte",
    ],
    "Asia": [
        "India", "Thailand", "South Korea", "Japan", "Turkey", "Vietnam",
        "Malaysia", "Philippines", "Sri Lanka", "Pakistan", "Nepal", "Bangladesh",
        "Indonesia", "Cambodia", "Laos", "Myanmar", "Afghanistan", "Hong Kong",
        "Taiwan", "Singapore", "Mongolia", "Kazakhstan", "Uzbekistan", "Kyrgyzstan",
        "Tajikistan", "Armenia", "Azerbaijan", "Georgia", "Russia", "Israel",
        "Saudi Arabia", "United Arab Emirates", "Jordan", "Lebanon", "Iraq",
        "Kuwait", "Qatar", "Bahrain", "Oman", "Palestinian Territories",
        "Timor-Leste",
    ],
    "Europe": [
        "France", "Italy", "Spain", "United Kingdom", "Germany", "Greece",
        "Croatia", "Portugal", "Poland", "Norway", "Sweden", "Denmark",
        "Netherlands", "Switzerland", "Austria", "Belgium", "Ireland", "Romania",
        "Czechia", "Finland", "Hungary", "Slovakia", "Slovenia", "Bulgaria",
        "Serbia", "Ukraine", "Latvia", "Lithuania", "Estonia", "Montenegro",
        "Albania", "Bosnia & Herzegovina", "North Macedonia", "Malta", "Luxembourg",
        "Iceland", "Andorra", "San Marino", "Monaco", "Kosovo", "Belarus",
        "Moldova", "Liechtenstein", "Cyprus",
    ],
    "North America": [
        "United States", "Canada", "Mexico", "Costa Rica",
        "Dominican Republic", "Guatemala", "Panama", "El Salvador", "Honduras",
        "Nicaragua", "Cuba", "Jamaica", "Haiti", "Bahamas", "Barbados",
        "Trinidad & Tobago", "Belize", "Grenada", "Dominica", "St Lucia",
        "St Vincent & Grenadines", "St Kitts & Nevis", "Antigua & Barbuda",
        "Puerto Rico", "US Virgin Islands", "British Virgin Islands",
        "Cayman Islands", "Turks & Caicos Islands", "Bermuda", "Greenland",
    ],
    "South America": [
        "Brazil", "Colombia", "Argentina", "Chile", "Peru",
        "Ecuador", "Uruguay", "Bolivia", "Venezuela", "Paraguay",
        "Suriname", "Guyana", "French Guiana",
    ],
    "Oceania": [
        "Australia", "New Zealand", "Fiji", "Samoa", "Tonga", "Vanuatu",
        "Solomon Islands", "Micronesia", "Kiribati", "Tuvalu", "Niue", "Palau",
        "Cook Islands", "Norfolk Island", "New Caledonia", "French Polynesia",
        "Wallis & Futuna", "Christmas Island",
    ],
}


In [0]:
# Load the enriched South America OSM points-of-interest dataset from DBFS.
osm_df = (
    spark.read
    .parquet("dbfs:/FileStore/osm/south_america_pois_enriched.parquet")
)


In [0]:
# Preview the loaded OSM POI DataFrame.
display(osm_df)

In [0]:
from pyspark.sql.functions import col, regexp_replace
from pyspark.sql.functions import col, split, trim, lower, size

def airbnb_select(airbnb):
    """Project the raw Airbnb DataFrame onto the columns used downstream.

    Args:
        airbnb: DataFrame-like object exposing ``select(*column_names)``.

    Returns:
        Whatever ``airbnb.select`` returns for the fixed column list below,
        in the order listed.
    """
    wanted_columns = (
        # identity
        "property_id", "listing_name", "listing_title",
        # geography
        "lat", "long", "location",
        # listing reviews
        "ratings", "reviews", "property_number_of_reviews",
        # host attributes
        "host_rating", "host_number_of_reviews", "host_response_rate",
        "hosts_year", "is_supperhost", "is_guest_favorite",
        # capacity
        "guests",
        # category
        "category", "category_rating",
        # free-form / nested content
        "amenities", "description", "description_items", "details",
        "arrangement_details",
        # pricing
        "pricing_details", "total_price", "currency", "discount",
        # availability and link
        "availability", "final_url",
    )
    return airbnb.select(*wanted_columns)

def airbnb_clean(airbnb_sel):
    """Cast, filter and de-duplicate the selected Airbnb columns.

    Steps, in order:
      1. Cast ``lat``/``long`` to double and drop rows where either is null.
      2. Cast the numeric rating/review/price/guest columns.
      3. Turn the ``"true"``-valued flag columns into 0/1 integers
         (``is_available`` is derived from ``availability``).
      4. Derive ``city`` (first comma-separated part of ``location``) and
         ``country`` (last comma-separated part), both trimmed.
      5. Keep one row per ``property_id``.

    Args:
        airbnb_sel: DataFrame produced by ``airbnb_select``.

    Returns:
        The cleaned DataFrame.
    """
    # 1. Coordinates: cast, then discard rows without a usable position.
    cleaned = (
        airbnb_sel
        .withColumn("lat", col("lat").cast("double"))
        .withColumn("long", col("long").cast("double"))
        .filter(col("lat").isNotNull() & col("long").isNotNull())
    )

    # 2. Numeric columns and their target types.
    numeric_casts = (
        ("ratings", "double"),
        ("property_number_of_reviews", "int"),
        ("host_rating", "double"),
        ("host_number_of_reviews", "int"),
        ("host_response_rate", "double"),
        ("hosts_year", "int"),
        ("total_price", "double"),
        ("guests", "int"),
    )
    for column_name, target_type in numeric_casts:
        cleaned = cleaned.withColumn(column_name, col(column_name).cast(target_type))

    # 3. Flags: (output column, source column) compared against the string "true".
    flag_columns = (
        ("is_supperhost", "is_supperhost"),
        ("is_guest_favorite", "is_guest_favorite"),
        ("is_available", "availability"),
    )
    for output_name, source_name in flag_columns:
        cleaned = cleaned.withColumn(output_name, (col(source_name) == "true").cast("int"))

    # 4. City / country from the comma-separated location string.
    location_parts = split(col("location"), ",")
    cleaned = (
        cleaned
        .withColumn("city", trim(location_parts.getItem(0)))
        .withColumn("country", trim(location_parts.getItem(size(location_parts) - 1)))
    )

    # 5. One row per listing.
    return cleaned.dropDuplicates(["property_id"])

    
def airbnb_load(sas_token=None, storage_account="lab94290", container="airbnb"):
    """Read the Airbnb parquet dataset from Azure storage and return it cleaned.

    The SAS token is a credential and must not live in the notebook source.
    It is taken from the ``sas_token`` argument or, when that is omitted, from
    the ``AIRBNB_SAS_TOKEN`` environment variable. On Databricks a caller can
    pass the value of ``dbutils.secrets.get(...)`` as ``sas_token``.

    Args:
        sas_token: SAS token string; a leading ``?`` is stripped. Defaults to
            the ``AIRBNB_SAS_TOKEN`` environment variable.
        storage_account: Azure storage account name.
        container: Container holding the ``airbnb_1_12_parquet`` dataset.

    Returns:
        The DataFrame produced by ``airbnb_clean(airbnb_select(raw))``.

    Raises:
        ValueError: If no SAS token is supplied and the environment variable
            is unset or empty.
    """
    import os

    # Resolve the credential before touching the Spark session so that a
    # missing token fails fast with an actionable message.
    if sas_token is None:
        sas_token = os.environ.get("AIRBNB_SAS_TOKEN")
    if not sas_token:
        raise ValueError(
            "No SAS token available: pass sas_token=... or set the "
            "AIRBNB_SAS_TOKEN environment variable."
        )
    sas_token = sas_token.lstrip('?')

    # Configure fixed-SAS authentication for this storage account.
    account_host = f"{storage_account}.dfs.core.windows.net"
    spark.conf.set(f"fs.azure.account.auth.type.{account_host}", "SAS")
    spark.conf.set(
        f"fs.azure.sas.token.provider.type.{account_host}",
        "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider",
    )
    spark.conf.set(f"fs.azure.sas.fixed.token.{account_host}", sas_token)

    path = f"abfss://{container}@{account_host}/airbnb_1_12_parquet"
    airbnb = spark.read.parquet(path)
    airbnb_sel = airbnb_select(airbnb)
    airbnb_clean_df = airbnb_clean(airbnb_sel)
    return airbnb_clean_df

# Load and clean the Airbnb listings once; `airbnb` is reused by later cells.
airbnb=airbnb_load()
# Preview the cleaned listings.
display(airbnb)




In [0]:
# Compare the number of distinct osm_id values with the total row count;
# the two differ when osm_id is repeated across rows.
distinct_osm_ids = osm_df.select("osm_id").distinct().count()
total_osm_rows = osm_df.count()
distinct_osm_ids, total_osm_rows

In [0]:
# Keep only the coordinates and the POI group, then print summary statistics
# for the coordinate columns.
osm_sel = osm_df.select("lat", "lon", "poi_group")
(
    osm_sel
    .select("lat", "lon")
    .summary()
    .show()
)

In [0]:
from pyspark.sql import functions as F

# Search radius around each listing, in metres.
R_M = 500.0
# Earth radius in metres used by the haversine formula below.
EARTH_R = 6371000.0
# Half-height of the candidate bounding box in degrees of latitude
# (uses ~111 km per degree).
DELTA_LAT = R_M / 111000.0

# -----------------------
# Listings in scope: South American countries with usable coordinates
# -----------------------
# The continent lookup defined in this notebook is `continents`; the previous
# reference to `countries` raised NameError on a fresh run.
airbnb_scope = (
    airbnb
    .filter(F.col("country").isin(continents["South America"]))
    .select(
        "property_id",
        F.col("lat").cast("double").alias("lat"),
        F.col("long").cast("double").alias("lon"),
        "country"
    )
    .filter(F.col("lat").isNotNull() & F.col("lon").isNotNull())
)

# -----------------------
# OSM POIs: coordinates plus a normalised (trimmed, lower-cased) group label
# -----------------------
osm_geo = (
    osm_df
    .select(
        F.col("lat").cast("double").alias("p_lat"),
        F.col("lon").cast("double").alias("p_lon"),
        F.lower(F.trim(F.col("poi_group"))).alias("group")   # single string
    )
    .filter(
        F.col("p_lat").isNotNull() &
        F.col("p_lon").isNotNull() &
        F.col("group").isNotNull() &
        (F.col("group") != "")
    )
)

a = airbnb_scope.alias("a")
p = osm_geo.alias("p")

# Half-width of the bounding box in degrees of longitude. It is divided by
# cos(latitude), so the window widens as the listing moves away from the equator.
delta_lon_expr = (R_M / (111000.0 * F.cos(F.radians(F.col("a.lat")))))

# -----------------------
# Candidate bounding-box join (cheap pre-filter before the exact distance)
# -----------------------
cand = a.join(
    p,
    (F.col("p.p_lat").between(F.col("a.lat") - DELTA_LAT, F.col("a.lat") + DELTA_LAT)) &
    (F.col("p.p_lon").between(F.col("a.lon") - delta_lon_expr, F.col("a.lon") + delta_lon_expr)),
    how="inner"
)

# -----------------------
# Exact haversine distance + radius filter
# -----------------------
dist_m = 2 * EARTH_R * F.asin(F.sqrt(
    F.pow(F.sin((F.radians(F.col("p.p_lat") - F.col("a.lat"))) / 2), 2) +
    F.cos(F.radians(F.col("a.lat"))) * F.cos(F.radians(F.col("p.p_lat"))) *
    F.pow(F.sin((F.radians(F.col("p.p_lon") - F.col("a.lon"))) / 2), 2)
))

cand = (
    cand
    .withColumn("distance_m", dist_m)
    .filter(F.col("distance_m") <= R_M)
)

# -----------------------
# Aggregate counts per listing + group
# -----------------------
per_group = (
    cand
    .groupBy(F.col("a.property_id").alias("property_id"), F.col("p.group").alias("group"))
    .agg(F.count("*").cast("int").alias("n_places"))
)

# Build JSON map: {"cafes":{"n_places":12}, ...}
# Listings with no POI inside the radius do not appear in this result.
env_json = (
    per_group
    .withColumn("val", F.struct(F.col("n_places").alias("n_places")))
    .groupBy("property_id")
    .agg(F.map_from_entries(F.collect_list(F.struct(F.col("group"), F.col("val")))).alias("env_group_map"))
    .withColumn("env_group_json", F.to_json(F.col("env_group_map")))
    .select("property_id", "env_group_json")
)
display(env_json)
# # -----------------------
# # Join back (adds column; doesn't filter airbnb_clean)
# # -----------------------
# airbnb_enriched_osm = (
#     airbnb_clean
#     .join(env_json, on="property_id", how="left")
#     .withColumn("env_group_json", F.coalesce(F.col("env_group_json"), F.lit("{}")))
# )

# display(airbnb_enriched_osm.select("property_id", "country", "env_group_json").limit(30))
