## Airbnb data: load and explore

In [0]:
# Azure storage location of the Airbnb dataset; both values are interpolated
# into the abfss:// path and the fs.azure.* Spark settings further down.
storage_account = "lab94290"  # storage account name (host prefix of <account>.dfs.core.windows.net)
container = "airbnb"  # container that holds the parquet dataset

In [0]:
import os

# SECURITY: the SAS token must never be pasted into the notebook, because the
# notebook (and its history) is shared and exported. Provide it through the
# AIRBNB_SAS_TOKEN environment variable instead (on Databricks this can be a
# cluster environment variable backed by a secret scope).
# NOTE(review): the token that was previously committed in this cell should be
# treated as leaked and revoked/rotated.
sas_token = os.environ.get("AIRBNB_SAS_TOKEN", "").strip().lstrip('?')
if not sas_token:
    raise RuntimeError(
        "AIRBNB_SAS_TOKEN is not set; export the container SAS token before running this cell."
    )

# Configure ABFS to authenticate against the storage account with the fixed SAS token.
account_host = f"{storage_account}.dfs.core.windows.net"
spark.conf.set(f"fs.azure.account.auth.type.{account_host}", "SAS")
spark.conf.set(
    f"fs.azure.sas.token.provider.type.{account_host}",
    "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider",
)
spark.conf.set(f"fs.azure.sas.fixed.token.{account_host}", sas_token)

In [0]:
# Location of the Airbnb parquet dataset inside the configured container.
path = f"abfss://{container}@{storage_account}.dfs.core.windows.net/airbnb_1_12_parquet"

# Load the listings and preview the first few rows.
airbnb = spark.read.format("parquet").load(path)
airbnb_preview = airbnb.limit(5)
display(airbnb_preview)

In [0]:
# Print the column names and types of the raw Airbnb DataFrame.
airbnb.printSchema()

In [0]:
# Total number of rows in the raw Airbnb dataset.
airbnb.count()


In [0]:
from pyspark.sql.functions import col, count, when

# For each coordinate column, count the rows whose value is NULL or an empty string.
coordinate_columns = ["lat", "long"]

missing_counters = []
for coord_name in coordinate_columns:
    is_missing = col(coord_name).isNull() | (col(coord_name) == "")
    missing_counters.append(count(when(is_missing, coord_name)).alias(coord_name))

missing_stats = airbnb.select(missing_counters)

missing_stats.show(truncate=False)


In [0]:
# Row count next to the number of distinct property_id values;
# a difference between the two means some listings appear more than once.
airbnb.count(), airbnb.select("property_id").distinct().count()


#### Select the columns we need and cast them

In [0]:
from pyspark.sql.functions import col, regexp_replace

# Columns kept for the rest of the analysis, grouped by theme.
selected_columns = [
    # listing identity
    "property_id", "listing_name", "listing_title",
    # location
    "lat", "long", "location",
    # listing ratings / reviews
    "ratings", "reviews", "property_number_of_reviews",
    # host
    "host_rating", "host_number_of_reviews", "host_response_rate",
    "hosts_year", "is_supperhost", "is_guest_favorite",
    # capacity
    "guests",
    # category
    "category", "category_rating",
    # free-text / structured descriptions
    "amenities", "description", "description_items", "details", "arrangement_details",
    # pricing
    "pricing_details", "total_price", "currency", "discount",
    # availability and source link
    "availability", "final_url",
]

airbnb_sel = airbnb.select(*selected_columns)
display(airbnb_sel.limit(10))

In [0]:
from pyspark.sql.functions import col

# Number of listings per availability flag
# (1 when `availability` equals "true", 0 otherwise, NULL when it is NULL).
availability_flag = (col("availability") == "true").cast("int")

availability_counts = (
    airbnb_sel
    .withColumn("is_available", availability_flag)
    .groupBy("is_available")
    .count()
    .orderBy("is_available")
)
availability_counts.show()


In [0]:
from pyspark.sql.functions import col, split, trim, lower, size

# `location` is a comma-separated string; the first piece is used as the city
# and the last piece as the country.
location_parts = split(col("location"), ",")

# 1) Coordinates as doubles; listings without usable coordinates are dropped.
listings_with_coords = (
    airbnb_sel
    .withColumn("lat", col("lat").cast("double"))
    .withColumn("long", col("long").cast("double"))
    .filter(col("lat").isNotNull() & col("long").isNotNull())
)

# 2) Numeric columns cast to their target types (applied in this order).
numeric_casts = [
    ("ratings", "double"),
    ("property_number_of_reviews", "int"),
    ("host_rating", "double"),
    ("host_number_of_reviews", "int"),
    ("host_response_rate", "double"),
    ("hosts_year", "int"),
    ("total_price", "double"),
    ("guests", "int"),
]
listings_typed = listings_with_coords
for column_name, target_type in numeric_casts:
    listings_typed = listings_typed.withColumn(column_name, col(column_name).cast(target_type))

# 3) "true"/other flags turned into 1/0 integers.
listings_flagged = (
    listings_typed
    .withColumn("is_supperhost", (col("is_supperhost") == "true").cast("int"))
    .withColumn("is_guest_favorite", (col("is_guest_favorite") == "true").cast("int"))
    .withColumn("is_available", (col("availability") == "true").cast("int"))
)

# 4) City / country derived from `location`, then one row per property_id.
airbnb_clean = (
    listings_flagged
    .withColumn("city", trim(location_parts.getItem(0)))
    .withColumn("country", trim(location_parts.getItem(size(location_parts) - 1)))
    .dropDuplicates(["property_id"])
)

In [0]:
# Summary statistics for the main numeric columns of the cleaned listings.
airbnb_clean.select("lat", "long","ratings", "host_rating","host_response_rate","total_price","guests").summary().show()


In [0]:
# Check the column types after the casts applied when building airbnb_clean.
airbnb_clean.printSchema()

In [0]:
# Preview the first rows of the cleaned listings.
display(airbnb_clean.limit(5))

In [0]:
from pyspark.sql.functions import col, count

# Listings per country (rows with a NULL or empty country are ignored), largest first.
has_country = col("country").isNotNull() & (col("country") != "")

listings_per_country = (
    airbnb_clean
    .where(has_country)
    .groupBy("country")
    .agg(count("*").alias("n_listings"))
)
country_counts = listings_per_country.orderBy(col("n_listings").desc())

display(country_counts)


Databricks visualization. Run in Databricks to view.

In [0]:
from pyspark.sql.functions import col, count

# Listings per city (rows with a NULL or empty city are ignored), largest first.
has_city = col("city").isNotNull() & (col("city") != "")

listings_per_city = (
    airbnb_clean
    .where(has_city)
    .groupBy("city")
    .agg(count("*").alias("n_listings"))
)
city_counts = listings_per_city.orderBy(col("n_listings").desc())

display(city_counts)


Imputation of missing values

In [0]:
# NULL count for every column of the cleaned listings
# (the result is a single row with one count per column).
null_counters = [
    count(when(col(column_name).isNull(), column_name)).alias(column_name)
    for column_name in airbnb_clean.columns
]
missing_df = airbnb_clean.select(null_counters)

display(missing_df)


In [0]:
from pyspark.sql import functions as F

feature_columns = airbnb_clean.columns

# Step 1: NULL count per column (single-row result).
null_count_exprs = [
    F.count(F.when(F.col(name).isNull(), name)).alias(name)
    for name in feature_columns
]
missing_summary = airbnb_clean.select(null_count_exprs)

# Step 2: divide every count by the number of listings to get a fraction.
total_count = airbnb_clean.count()
fraction_exprs = [
    (F.col(name).cast('float') / total_count).alias(name)
    for name in feature_columns
]
missing_pct = missing_summary.select(fraction_exprs)

display(missing_pct)


In [0]:
from pyspark.sql import functions as F

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Per-listing missingness indicators: 1 where the value is NULL, 0 otherwise.
# The heatmap needs one row per listing. The aggregated `missing_df` holds a
# single row of NULL counts, so sampling it at 0.2% almost always produced an
# empty frame and there were no listings to draw.
missing_flags = airbnb_clean.select([
    F.col(c).isNull().cast("int").alias(c)
    for c in airbnb_clean.columns
])

# Small reproducible sample so the matrix fits comfortably on the driver.
miss_sample = missing_flags.sample(fraction=0.002, seed=42).limit(3000)
miss_pd = miss_sample.toPandas()

if miss_pd.empty:
    # Nothing was sampled (very small dataset); skip the plot instead of
    # handing seaborn an empty matrix.
    print("No listings were sampled; increase `fraction` to draw the missingness heatmap.")
else:
    plt.figure(figsize=(14, 6))
    sns.heatmap(
        miss_pd.T,          # rows = features, columns = sampled listings
        cmap="viridis",
        cbar=False,
        vmin=0,             # fixed colour scale: 0 = present, 1 = missing
        vmax=1,
    )
    plt.xlabel("Sampled listings")
    plt.ylabel("Features")
    plt.title("Missingness Heatmap (Airbnb Features)")
    plt.tight_layout()
    plt.show()


In [0]:
import matplotlib.pyplot as plt
from pyspark.sql import functions as F

# Fraction of NULL values per column, collected to pandas as a one-column
# frame indexed by feature name and sorted from most to least missing.
missing_pct = (
    airbnb_clean.select([
        (F.count(F.when(F.col(c).isNull(), c)) / F.count(F.lit(1))).alias(c)
        for c in airbnb_clean.columns
    ]).toPandas().T.rename(columns={0: "missing_pct"}).sort_values("missing_pct", ascending=False))

# DataFrame.plot opens its own figure unless it is given an axes, which left
# the explicitly sized figure empty and ignored figsize. Draw on our own axes.
fig, ax = plt.subplots(figsize=(8, 6))
missing_pct.plot(kind="barh", legend=False, ax=ax)
ax.set_xlabel("Missing fraction")
ax.set_title("Fraction of Missing Values per Feature")
fig.tight_layout()
plt.show()


## Yelp Data Load and explore

In [0]:

# Load the Yelp business table from DBFS and print its schema.
yelp_df  = spark.read.parquet("dbfs:/yelp/yelp_business.parquet")
yelp_df.printSchema()

In [0]:
# Render the Yelp business table (no explicit row limit is applied here).
display(yelp_df)

In [0]:
from pyspark.sql.functions import col

# Descriptive statistics of the business coordinates, followed by the row count.
yelp_coordinates = yelp_df.select(col("latitude"), col("longitude"))
yelp_coordinates.describe().show()

yelp_row_count = yelp_df.count()
print(yelp_row_count)

In [0]:
# Number of businesses for each value of the is_open flag.
yelp_df.groupBy("is_open").count().orderBy("is_open").show()


In [0]:
# Row count next to the number of distinct business_id values;
# a difference between the two means some businesses appear more than once.
yelp_df.count(), yelp_df.select("business_id").distinct().count()


In [0]:
from pyspark.sql.functions import col, lower

# Yelp columns kept for the analysis.
yelp_columns = [
    "business_id",
    "name",
    "categories",
    "city",
    "state",
    "latitude",
    "longitude",
    "stars",
    "review_count",
    "is_open",
    "attributes",
]
yelp_clean = yelp_df.select(*yelp_columns)


In [0]:
# Summary statistics for the star rating and the review count.
yelp_clean.select("stars","review_count",).summary().show()

In [0]:
from pyspark.sql.functions import col

# Number of businesses per state (rows without a state are ignored).
businesses_with_state = yelp_clean.where(col("state").isNotNull())
state_counts = businesses_with_state.groupBy("state").count()
yelp_by_state = state_counts.withColumnRenamed("count", "n_businesses")

display(yelp_by_state)

Databricks visualization. Run in Databricks to view.

In [0]:
from pyspark.sql import functions as F

# `categories` is a comma-separated string: strip the whitespace around the
# commas (treating NULL as empty), then emit one trimmed category per row.
categories_csv = F.regexp_replace(
    F.coalesce(F.col("categories"), F.lit("")), r"\s*,\s*", ","
)

yelp_cats = (
    yelp_clean
    .select("business_id", "categories")
    .withColumn("categories_norm", categories_csv)
    .withColumn("cat", F.explode(F.split(F.col("categories_norm"), ",")))
    .withColumn("cat", F.trim(F.col("cat")))
    .where(F.col("cat") != "")
)

# Frequency of each raw category: exploded rows and distinct businesses.
cat_counts = (
    yelp_cats
    .groupBy("cat")
    .agg(
        F.count("*").alias("n_rows"),
        F.countDistinct("business_id").alias("n_businesses"),
    )
    .orderBy(F.desc("n_businesses"))
)

display(cat_counts)


In [0]:
from pyspark.sql import functions as F

# Base Yelp
yelp0 = yelp_clean

# 1) Explode the comma-separated `categories` string into one lower-cased
#    category per row (`cat_l`).
yelp_exploded = (yelp0
    .withColumn(
        "categories_norm",
        F.regexp_replace(F.coalesce(F.col("categories"), F.lit("")), r"\s*,\s*", ",")
    )
    .withColumn("cat", F.explode(F.split(F.col("categories_norm"), ",")))
    .withColumn("cat", F.trim(F.col("cat")))
    .filter(F.col("cat") != "")
    .withColumn("cat_l", F.lower(F.col("cat")))
)

# 2) Ordered (group, regex) rules. The FIRST matching rule wins, so the order
#    of this list is significant.
#
#    `rlike` searches for the pattern anywhere in the string, so short tokens
#    need word boundaries. Without them "tea" matched "steakhouses", "bar"
#    matched "barbers"/"barbeque", "pub" matched "public ...", "deli" matched
#    "delivery", "pet" matched "carpet cleaning", "spa" matched
#    "spanish"/"spaces" and "parks?" matched "parking", which also made the
#    explicit later alternatives (barbeque, barber, carpet cleaning,
#    delivery services, ...) unreachable. "juice bars" is excluded from the
#    nightlife bar token so that the cafes rule, which lists it, can claim it.
#    Stem-style tokens (optometr, dermatolog, ...) are intentionally left
#    unanchored.
GROUP_RULES = [
    ("nightlife",
     r"(nightlife|(?<!juice )\bbars?\b|pubs?\b|lounges?|cocktail|sports bar|beer|brewery|wine|winery|dance club|karaoke|speakeasy|hookah)"),
    ("cafes",
     r"(coffee|\btea\b|cafe|cafes|bubble tea|juice bars?|smoothie|tea room|coffee roaster)"),
    ("food",
     r"(restaurant|restaurants|food|sandwich|pizza|burger|breakfast|brunch|fast food|\bdelis?\b|delicatessen|bbq|barbeque|steak|seafood|sushi|ramen|noodle|taco|mexican|italian|chinese|thai|vietnamese|indian|mediterranean|middle eastern|greek|korean|cajun|tex-mex|latin american|asian fusion|american \(|comfort food|vegetarian|vegan|halal|kosher|bagel|bakery|dessert|ice cream|cupcake|donut|hot dog|food truck|caterers?|food delivery|salad|soup|gluten-free)"),
    ("family_kids",
     r"(\bparks?\b|playgrounds?|kids activities|child care|day care|children's museum|baby gear|toy store|amusement park|water park|zoo|aquarium|\bpets?\b)"),
    ("shopping",
     r"(shopping|grocery|market|farmers market|convenience|drugstore|department store|outlet|fashion|clothing|shoe|jewelry|electronics|sporting goods|books?|bookstore|mag|music & video|gift|flowers|florist|antiques|thrift|vintage|cosmetics & beauty supply)"),
    ("health",
     r"(health|medical|doctor|dentist|dentistry|hospital|urgent care|optometr|ophthalm|chiropract|physical therapy|dermatolog|nutritionist|diagnostic|therapy|counseling|mental health)"),
    ("beauty",
     r"(beauty|\bspas?\b|hair|nail|skin care|wax|lash|makeup|tanning|massage|barber|piercing|tattoo)"),
    ("fitness",
     r"(gyms?|fitness|active life|yoga|pilates|trainer|boot camp|martial arts|sports clubs?)"),
    ("travel",
     r"(hotel|hotels|travel|tours|resort|bed & breakfast|hostel|vacation rentals)"),
    ("automotive",
     r"(automotive|auto repair|tires|oil change|gas station|car wash|car rental|towing|auto parts|car dealer|transmission|auto glass|detailing|smog check)"),
    ("study",
     r"(library|libraries|bookstore|bookstores|internet cafe|coworking|shared office|study|tutoring|educational services|college|university)"),
    ("activities",
     r"(museum|museums|art gallery|art galleries|performing arts|music venue|cinema|theater|theatre|festival|festivals|tour|tours|amusement park|arcade|escape game|escape room|bowling|zoo|aquarium|beach|beaches|hiking|outdoor|recreation center|kids activities)"),
    ("services",
     r"(local services|laundry|laundromat|dry cleaning|shipping|mailbox|notary|printing|"
     r"it services|computer repair|mobile phone repair|phone repair|electronics repair|"
     r"telecommunications|internet service|courier|delivery services|"
     r"financial services|insurance|bank|credit union|accountant|accountants|"
     r"legal services|lawyer|lawyers|"
     r"storage|self storage|moving|movers|locksmith|photographers|"
     r"office cleaning|office equipment|shared office)"),
    ("home_services",
     r"(home services|contractor|plumbing|electrician|hvac|heating|air conditioning|landscaping|home cleaning|office cleaning|window washing|carpet cleaning|roofing|flooring|pest control|handyman|movers|junk removal|painting|security systems|home inspector|kitchen & bath|cabinetry|tiling)"),
]

# Fold the rules into a single CASE WHEN expression, preserving their order;
# categories that match no rule are labelled "other".
group_expr = None
for group_name, group_pattern in GROUP_RULES:
    is_hit = F.col("cat_l").rlike(group_pattern)
    if group_expr is None:
        group_expr = F.when(is_hit, F.lit(group_name))
    else:
        group_expr = group_expr.when(is_hit, F.lit(group_name))

mapped = yelp_exploded.withColumn("group", group_expr.otherwise(F.lit("other")))

# 3) collect multi-groups per business (drop "other" unless you want it)
groups_per_business = (mapped
    .filter(F.col("group") != "other")
    .select("business_id", "group")
    .dropDuplicates(["business_id", "group"])
    .groupBy("business_id")
    .agg(F.collect_set("group").alias("group_categories"))
)

# 4) join back to Yelp
yelp_with_groups = (yelp0
    .join(groups_per_business, "business_id", "left")
    .withColumn(
        "group_categories",
        F.coalesce(F.col("group_categories"), F.array())  # empty array instead of null
    )
)

display(yelp_with_groups.select("business_id","name","categories","group_categories"))


In [0]:
# How many businesses fall into each group (a business counts once per group it belongs to).
exploded_groups = yelp_with_groups.select(F.explode("group_categories").alias("group"))
group_counts = exploded_groups.groupBy("group").count()
group_dist = group_counts.orderBy(F.desc("count"))

display(group_dist)


Databricks visualization. Run in Databricks to view.

## Combine datasets

In [0]:
from pyspark.sql import functions as F

# Search radius and sphere radius in metres; DELTA_LAT is the matching
# half-height of the latitude bounding box, using ~111 km per degree.
R_M = 500.0
EARTH_R = 6371000.0
DELTA_LAT = R_M / 111000.0  # ~ degrees

# --- Inputs: US listings and Yelp businesses, both with non-null coordinates ---
us_listings = airbnb_clean.where(F.col("country") == "United States")
airbnb_us = (
    us_listings
    .select(
        "property_id",
        F.col("lat").cast("double").alias("lat"),
        F.col("long").cast("double").alias("lon"),
    )
    .where(F.col("lat").isNotNull() & F.col("lon").isNotNull())
)

yelp_geo = (
    yelp_with_groups
    .select(
        "business_id",
        F.col("latitude").cast("double").alias("y_lat"),
        F.col("longitude").cast("double").alias("y_lon"),
        F.col("stars").cast("double").alias("stars"),
        F.col("review_count").cast("double").alias("review_count"),
        F.coalesce(F.col("group_categories"), F.array()).alias("group_categories"),
    )
    .where(F.col("y_lat").isNotNull() & F.col("y_lon").isNotNull())
)

a = airbnb_us.alias("a")
y = yelp_geo.alias("y")

# --- Candidate pairs: businesses inside the listing's lat/lon bounding box ---
# The longitude half-width grows with 1 / cos(latitude); the cosine is floored
# to keep the division finite.
delta_lon_expr = R_M / (111000.0 * F.greatest(F.cos(F.radians(F.col("a.lat"))), F.lit(1e-6)))

within_lat_band = F.col("y.y_lat").between(
    F.col("a.lat") - DELTA_LAT, F.col("a.lat") + DELTA_LAT
)
within_lon_band = F.col("y.y_lon").between(
    F.col("a.lon") - delta_lon_expr, F.col("a.lon") + delta_lon_expr
)
cand = a.join(y, within_lat_band & within_lon_band, how="inner")

# --- Exact filter: haversine distance in metres ---
half_dlat = (F.radians(F.col("y.y_lat") - F.col("a.lat"))) / 2
half_dlon = (F.radians(F.col("y.y_lon") - F.col("a.lon"))) / 2
haversine_term = (
    F.pow(F.sin(half_dlat), 2)
    + F.cos(F.radians(F.col("a.lat")))
    * F.cos(F.radians(F.col("y.y_lat")))
    * F.pow(F.sin(half_dlon), 2)
)
dist_m = 2 * EARTH_R * F.asin(F.sqrt(haversine_term))

cand = (
    cand
    .withColumn("distance_m", dist_m)
    .where(F.col("distance_m") <= R_M)
    .select(
        F.col("a.property_id").alias("property_id"),
        F.col("y.group_categories").alias("group_categories"),
        F.col("y.stars").alias("stars"),
        F.col("y.review_count").alias("review_count"),
    )
)

# --- One row per (listing, business group) ---
cand_groups = (
    cand
    .withColumn("group", F.explode("group_categories"))
    .where(F.col("group").isNotNull() & (F.col("group") != ""))
    .select("property_id", "group", "stars", "review_count")
)

# --- Aggregate nearby businesses per listing and group ---
per_group = cand_groups.groupBy("property_id", "group").agg(
    F.count("*").alias("n_places"),
    F.avg("stars").alias("avg_rating"),
    F.avg("review_count").alias("avg_reviews"),
)

group_stats = F.struct(
    F.col("n_places").cast("int").alias("n_places"),
    F.round(F.col("avg_rating"), 3).alias("avg_rating"),
    F.round(F.col("avg_reviews"), 3).alias("avg_reviews"),
)
per_group_struct = (
    per_group
    .withColumn("val", group_stats)
    .select("property_id", "group", "val")
)

# --- group -> stats map per listing, plus its JSON rendering ---
group_entries = F.collect_list(F.struct(F.col("group"), F.col("val")))
env_map = per_group_struct.groupBy("property_id").agg(
    F.map_from_entries(group_entries).alias("env_group_map")
)

env_json = (
    env_map
    .withColumn("env_group_json", F.to_json("env_group_map"))
    .select("property_id", "env_group_json")
)
display(env_json)

# To attach the result to the listings, left-join env_json onto airbnb_clean on
# property_id and coalesce a missing env_group_json to "{}".


In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

R_M = 1000            # search radius in metres
N   = 30              # keep at most N nearest businesses per listing
EARTH_R = 6371000.0   # sphere radius (metres) used by the haversine formula

# Half-height of the latitude bounding box, using ~111 km per degree.
DELTA_LAT = R_M / 111000.0

airbnb_all = airbnb_clean
yelp_sel   = yelp_clean

airbnb_us_for_join = (
    airbnb_all
    .filter(F.col("country") == "United States")
    .select(
        "property_id",
        F.col("lat").cast("double").alias("lat"),
        F.col("long").cast("double").alias("lon")
    )
)

yelp_sel = (
    yelp_sel
    .select(
        "business_id","name","categories","stars","review_count",
        F.col("latitude").cast("double").alias("latitude"),
        F.col("longitude").cast("double").alias("longitude")
    )
)

CELL_DEG = DELTA_LAT / 2.0   # grid cell edge: half the latitude radius, in degrees


def _cell_index(coord):
    """Return the grid cell index (long) of a coordinate column given in degrees."""
    return F.floor(coord / F.lit(CELL_DEG)).cast("long")


def _delta_lon(lat_col):
    """Return the longitude half-width (degrees) of the R_M bounding box at `lat_col`.

    A degree of longitude shrinks with cos(latitude), so the box gets wider
    in degrees away from the equator; the cosine is floored at 0.01 to keep the
    value finite.
    """
    return R_M / (111000.0 * F.greatest(F.cos(F.radians(lat_col)), F.lit(0.01)))


airbnb_keyed = (
    airbnb_us_for_join
    .withColumn("cell_x", _cell_index(F.col("lon")))
    .withColumn("cell_y", _cell_index(F.col("lat")))
)

yelp_keyed = (
    yelp_sel
    .withColumn("cell_x", _cell_index(F.col("longitude")))
    .withColumn("cell_y", _cell_index(F.col("latitude")))
)

# Replicate every listing into each grid cell that its bounding box overlaps.
# A fixed +/-2 cell window is only wide enough along latitude: along longitude
# the box is DELTA_LAT / cos(lat) wide (about 2.6 cells at 40 degrees north),
# so businesses up to R_M away to the east or west were silently dropped.
# Enumerating the exact cell range per listing covers the whole box.
airbnb_expanded = (
    airbnb_keyed
    .withColumn(
        "jx",
        F.explode(F.sequence(
            _cell_index(F.col("lon") - _delta_lon(F.col("lat"))),
            _cell_index(F.col("lon") + _delta_lon(F.col("lat"))),
        )),
    )
    .withColumn(
        "jy",
        F.explode(F.sequence(
            _cell_index(F.col("lat") - F.lit(DELTA_LAT)),
            _cell_index(F.col("lat") + F.lit(DELTA_LAT)),
        )),
    )
    .select("property_id", "lat", "lon", "jx", "jy")
)

# Equi-join on the cell key: each business lives in exactly one cell, so a
# (listing, business) pair is produced at most once.
cand0 = (
    airbnb_expanded.alias("a")
    .join(
        yelp_keyed.alias("y"),
        (F.col("a.jx") == F.col("y.cell_x")) & (F.col("a.jy") == F.col("y.cell_y")),
        "inner"
    )
)

# Cheap bounding-box prefilter before the exact distance computation.
delta_lon_expr = _delta_lon(F.col("a.lat"))

cand = cand0.filter(
    (F.col("y.latitude").between(F.col("a.lat") - F.lit(DELTA_LAT), F.col("a.lat") + F.lit(DELTA_LAT))) &
    (F.col("y.longitude").between(F.col("a.lon") - delta_lon_expr, F.col("a.lon") + delta_lon_expr))
)

# Haversine great-circle distance in metres.
dist_m = 2 * F.lit(EARTH_R) * F.asin(F.sqrt(
    F.pow(F.sin(F.radians(F.col("y.latitude") - F.col("a.lat")) / 2), 2) +
    F.cos(F.radians(F.col("a.lat"))) * F.cos(F.radians(F.col("y.latitude"))) *
    F.pow(F.sin(F.radians(F.col("y.longitude") - F.col("a.lon")) / 2), 2)
))

cand = (cand
    .withColumn("distance_m", dist_m)
    .filter(F.col("distance_m") <= F.lit(R_M))
    .select(
        F.col("a.property_id").alias("property_id"),
        F.col("y.business_id").alias("business_id"),
        F.col("y.name").alias("name"),
        F.col("y.categories").alias("categories"),
        F.col("y.stars").alias("stars"),
        F.col("y.review_count").alias("review_count"),
        F.col("distance_m").alias("distance_m")
    )
)

# Keep the N closest businesses per listing; business_id breaks distance ties
# so the selection is the same on every run.
w = Window.partitionBy("property_id").orderBy(F.col("distance_m").asc(), F.col("business_id").asc())
cand_top = cand.withColumn("rn", F.row_number().over(w)).filter(F.col("rn") <= F.lit(N))

nearby_us = cand_top.groupBy("property_id").agg(
    F.collect_list(
        F.struct("business_id","name","categories","stars","review_count","distance_m")
    ).alias("nearby_businesses")
)

display(nearby_us)


In [0]:
# Show the per-listing map of business group -> (n_places, avg_rating, avg_reviews).
display(env_map)

In [0]:
# Attach the nearby-business list to every listing; listings without a match
# keep a NULL `nearby_businesses` because of the left join.
airbnb_with_businesses = airbnb_all.join(
    nearby_us,
    on="property_id",
    how="left",
)
display(airbnb_with_businesses)

In [0]:
from pyspark.sql import functions as F

# `cand` must be the listing/business pair table that carries a `categories`
# column (the one built together with `nearby_us`).
#
# Bucket each nearby business by its raw category string; the first matching
# rule wins. "Tea" is wrapped in word boundaries because the unanchored,
# case-insensitive token also matched inside "Steakhouses", which put every
# steakhouse in the "study" bucket.
cand2 = cand.withColumn(
    "bucket",
    F.when(F.col("categories").rlike(r"(?i)Coffee|Cafes|\bTea\b|Libraries"), F.lit("study"))
     .when(F.col("categories").rlike("(?i)Bars|Nightlife|Clubs"), F.lit("nightlife"))
     .when(F.col("categories").rlike("(?i)Parks|Playgrounds|Kids"), F.lit("family"))
     .otherwise(F.lit("other"))
)

# Count, mean star rating and total reviews per (listing, bucket).
env_bucket = (cand2
  .groupBy("property_id", "bucket")
  .agg(
      F.count("*").alias("cnt"),
      F.avg("stars").alias("avg_stars"),
      F.sum("review_count").alias("sum_reviews")
  )
)

# One row per listing with one set of columns per bucket. The bucket values are
# listed explicitly so the output columns are fixed and Spark does not need an
# extra pass to discover them.
env_wide = (env_bucket
  .groupBy("property_id")
  .pivot("bucket", ["study","nightlife","family","other"])
  .agg(
      F.first("cnt").alias("cnt"),
      F.first("avg_stars").alias("avg_stars"),
      F.first("sum_reviews").alias("sum_reviews")
  )
)
display(env_wide)