In [0]:

# Load the enriched OSM points-of-interest table from DBFS.
# `spark` is not defined in this notebook; presumably it is the session injected by the Databricks runtime.
# Later cells read the columns `osm_id`, `lat`, `lon` and `poi_group` from this frame.
osm_df = spark.read.parquet("dbfs:/FileStore/osm/south_america_pois_enriched.parquet")


In [0]:
# Render the POI table with the notebook's display() helper for a visual check.
display(osm_df)

In [0]:
# Load the Airbnb listings through the project helper `airbnb_load`.
# NOTE(review): the helper is defined outside this notebook; later cells treat its result as a
# Spark DataFrame with `property_id`, `lat`, `long` and `country` columns — confirm against the helper.
from airbnb_load import airbnb_load
airbnb=airbnb_load()
display(airbnb)




In [0]:
# Compare the number of distinct `osm_id` values with the total row count;
# the two numbers are equal only when `osm_id` is unique per row.
osm_distinct_ids = osm_df.select("osm_id").distinct().count()
osm_total_rows = osm_df.count()
osm_distinct_ids, osm_total_rows

In [0]:
# Keep only the coordinates and the POI category column.
osm_sel=osm_df.select("lat","lon","poi_group")
# Print summary statistics of the coordinates as a quick sanity check of their range.
osm_sel.select("lat","lon").summary().show()

### Example: spatial join of Airbnb listings with nearby OSM POIs

In [0]:
# Country names used to restrict the Airbnb listings to South America.
# They are matched against the `country` column with `isin`, so the spelling must match exactly.
south_america = [
    "Brazil",
    "Colombia",
    "Argentina",
    "Chile",
    "Peru",
    "Ecuador",
    "Uruguay",
    "Bolivia",
    "Venezuela",
    "Paraguay",
    "Suriname",
    "Guyana",
    "French Guiana",
]

In [0]:
from pyspark.sql import functions as F


# Search radius around each listing, in metres.
R_M = 1_000.0
# Earth radius in metres, used by the haversine distance below.
EARTH_R = 6_371_000.0
# The search radius expressed in degrees of latitude, taking one degree as roughly 111 km.
DELTA_LAT = R_M / 111_000.0


# Airbnb listings restricted to South America and reduced to id, coordinates and country.
airbnb_scope = (
    airbnb
    .where(F.col("country").isin(south_america))
    .select(
        "property_id",
        F.col("lat").cast("double").alias("lat"),
        # the source column is called `long`; it is exposed as `lon` from here on
        F.col("long").cast("double").alias("lon"),
        "country",
    )
    # a listing whose coordinates are null after the cast cannot be placed on the map
    .where(F.col("lat").isNotNull())
    .where(F.col("lon").isNotNull())
)

# OSM POIs: coordinates cast to double plus a normalised category label.
osm_geo = (
    osm_df
    .select(
        F.col("lat").cast("double").alias("p_lat"),
        F.col("lon").cast("double").alias("p_lon"),
        # trimmed and lower-cased, so every downstream key is lower-case (single string per POI)
        F.lower(F.trim(F.col("poi_group"))).alias("group"),
    )
    # drop POIs without usable coordinates or without a category
    .where(F.col("p_lat").isNotNull())
    .where(F.col("p_lon").isNotNull())
    .where(F.col("group").isNotNull())
    .where(F.col("group") != "")
)

# Aliases so the join condition and later expressions can address each side as "a.*" / "p.*".
a = airbnb_scope.alias("a")
p = osm_geo.alias("p")

# Half-width of the search box in degrees of longitude. A degree of longitude shrinks with
# cos(latitude), so the same distance in metres spans more degrees away from the equator.
delta_lon_expr = R_M / (111000.0 * F.cos(F.radians(F.col("a.lat"))))

# Candidate join: pair every listing with the POIs that fall inside its lat/lon bounding box.
# This is only a prefilter; the exact distance is checked afterwards.
# NOTE(review): the condition has no equality key, so this is presumably executed as a
# nested-loop / cartesian join — check the physical plan if the cell is slow.
cand = a.join(
    p,
    on=(
        (
            (F.col("p.p_lat") >= (F.col("a.lat") - DELTA_LAT))
            & (F.col("p.p_lat") <= (F.col("a.lat") + DELTA_LAT))
        )
        & (
            (F.col("p.p_lon") >= (F.col("a.lon") - delta_lon_expr))
            & (F.col("p.p_lon") <= (F.col("a.lon") + delta_lon_expr))
        )
    ),
    how="inner",
)



# Haversine distance in metres between the listing (a) and the POI (p).
sin2_half_dlat = F.pow(F.sin(F.radians(F.col("p.p_lat") - F.col("a.lat")) / 2), 2)
sin2_half_dlon = F.pow(F.sin(F.radians(F.col("p.p_lon") - F.col("a.lon")) / 2), 2)
cos_lat_product = F.cos(F.radians(F.col("a.lat"))) * F.cos(F.radians(F.col("p.p_lat")))

dist_m = 2 * EARTH_R * F.asin(F.sqrt(sin2_half_dlat + cos_lat_product * sin2_half_dlon))

# The bounding box over-selects (its corners lie beyond the radius); keep only pairs within R_M.
cand = cand.withColumn("distance_m", dist_m).where(F.col("distance_m") <= R_M)


# Number of POIs within the radius, per listing and per POI category.
per_group = (
    cand
    .select(
        F.col("a.property_id").alias("property_id"),
        F.col("p.group").alias("group"),
    )
    .groupBy("property_id", "group")
    .agg(F.count("*").cast("int").alias("n_places"))
)

# One row per listing with a JSON string of the form {"<group>": {"n_places": <int>}, ...}.
env_json = (
    per_group
    .groupBy("property_id")
    .agg(
        F.map_from_entries(
            F.collect_list(
                F.struct(
                    F.col("group"),
                    # the map value is a one-field struct so the JSON nests the count under "n_places"
                    F.struct(F.col("n_places").alias("n_places")).alias("val"),
                )
            )
        ).alias("env_group_map")
    )
    .select(
        "property_id",
        F.to_json(F.col("env_group_map")).alias("env_group_json"),
    )
)
display(env_json)
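# NOTE: the commented-out block below is an earlier draft of the enrichment join.
# It references `airbnb_clean`, which is not defined in the cells above; the live
# version of this join is built from `airbnb` in a later cell (`airbnb_enriched_osm`).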
# airbnb_enriched_osm = (
#     airbnb_clean
#     .join(env_json, on="property_id", how="left")
#     .withColumn("env_group_json", F.coalesce(F.col("env_group_json"), F.lit("{}")))
# )

# display(airbnb_enriched_osm.select("property_id", "country", "env_group_json").limit(30))


In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import MapType, StructType, StructField, IntegerType, StringType

# Schema of each value in the JSON map: a struct holding the integer count.
val_schema = StructType([StructField("n_places", IntegerType(), True)])

# Parse the JSON back into a map and add up the counts of all groups for each listing.
env_totals = (
    env_json
    .withColumn(
        "env_map",
        F.from_json(F.col("env_group_json"), MapType(StringType(), val_schema)),
    )
    .withColumn(
        "total_places",
        # fold over the map values; a missing count contributes 0
        F.expr("aggregate(map_values(env_map), 0, (acc, x) -> acc + coalesce(x.n_places, 0))"),
    )
    .select("property_id", "total_places")
)
display(env_totals)

Databricks visualization. Run in Databricks to view.

In [0]:
# Print summary statistics of the per-listing POI totals.
env_totals.select("total_places").summary().show()

In [0]:
# Attach the POI JSON to every South American listing.
# The left join keeps listings with no POI in range; they get an empty JSON object.
airbnb_enriched_osm = (
    airbnb
    .where(F.col("country").isin(south_america))
    .join(env_json, on="property_id", how="left")
    .withColumn(
        "env_group_json",
        F.coalesce(F.col("env_group_json"), F.lit("{}")),
    )
)

In [0]:
# POI categories that each get a pair of feature columns (env_<group>, env_<group>_log).
ENV_GROUPS = [
    "Sightseeing",
    "Culture",
    "Family",
    "Nightlife",
    "Food",
    "Nature",
    "Transport",
    "Leisure",
    "Shopping",
    "Supplies",
    "Services",
    "Health",
]

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import MapType, StructType, StructField, IntegerType, StringType

# Schema of env_group_json: a map from group label to a struct holding the integer count,
# i.e. {"<group>": {"n_places": <int>}, ...}.
val_schema = StructType([StructField("n_places", IntegerType(), True)])
env_schema = MapType(StringType(), val_schema)

# Parse the JSON string back into a map column so individual groups can be looked up.
# NOTE(review): this starts from `env_json`, which only contains listings with at least one
# POI in range; use `airbnb_enriched_osm` instead if all-zero rows are wanted — confirm intent.
df = env_json.withColumn("env_map", F.from_json(F.col("env_group_json"), env_schema))

# For every category add the raw count (env_<group>) and its log1p transform (env_<group>_log).
for g in ENV_GROUPS:
    # The map keys were produced with lower(trim(poi_group)), so they are lower-case.
    # Map lookups are case-sensitive: looking up the capitalised label never matches and
    # every feature silently becomes 0. Normalise the label the same way before the lookup;
    # the output column names keep the original capitalisation.
    map_key = g.strip().lower()
    df = df.withColumn(
        f"env_{g}",
        # a group that is absent for this listing counts as 0 places
        F.coalesce(F.col("env_map")[map_key]["n_places"], F.lit(0)).cast("double")
    ).withColumn(
        f"env_{g}_log",
        F.log1p(F.col(f"env_{g}"))
    )



## Embedding