![logo](/Workspace/Users/nagham.omar@campus.technion.ac.il/VibeBnB/static/logo.png)

Please run all the cells on the cluster before starting: some widgets call helpers that are defined in later cells.



In [0]:
import json, time
import ipywidgets as widgets
from IPython.display import display, clear_output

from pyspark.sql import functions as F
from pyspark.storagelevel import StorageLevel
from pyspark.ml.feature import BucketedRandomProjectionLSHModel
from IPython.display import display, HTML

import time
import ipywidgets as widgets
import ipywidgets as W
from IPython.display import display as ipy_display, clear_output
from pyspark.sql import functions as F
from retrieve_rank import retrieve, order
# Storage locations for the embedded listings, the scored listings, and the saved LSH model.
EMBEDDING_PATH = "dbfs:/vibebnb/data/europe_countries_embedded"
DATA_PATH      = "dbfs:/vibebnb/data/europe_countries_scored.parquet"
LSH_MODEL_PATH = "dbfs:/vibebnb/models/lsh_global"

# Columns selected for the filtered-listings table and the listing dropdown labels.
UI_COLS = [
    "property_id", "addr_cc", "listing_title", "room_type_text",
    "addr_name", "price_per_night", "ratings"
]
# Load once: one row per property_id, persisted so repeated widget callbacks reuse it.
# NOTE(review): `spark` is not defined in this notebook; presumably it is the
# session injected by the notebook runtime — confirm.
df_all = (
    spark.read.parquet(DATA_PATH)
    .dropDuplicates(["property_id"])
    .persist(StorageLevel.MEMORY_AND_DISK)
)

# Embedding table reduced to the id, the country code and the `features_norm` column,
# again de-duplicated on property_id and persisted.
df_emb = (
    spark.read.parquet(EMBEDDING_PATH)
    .select("property_id", "addr_cc", "features_norm")
    .dropDuplicates(["property_id"])
    .persist(StorageLevel.MEMORY_AND_DISK)
)

# Previously fitted LSH model loaded from storage; it is handed to retrieve() later on.
lsh_model = BucketedRandomProjectionLSHModel.load(LSH_MODEL_PATH)

# Countries for dropdown: distinct, upper-cased, trimmed, non-empty country codes in
# alphabetical order, collected to the driver as a plain Python list.
countries = [r["addr_cc"] for r in (
    df_all.select(F.upper(F.trim("addr_cc")).alias("addr_cc"))
         .where(F.col("addr_cc").isNotNull() & (F.col("addr_cc") != ""))
         .dropDuplicates(["addr_cc"])
         .orderBy("addr_cc")
         .collect()
)]

# --- Infer env columns (same idea as infer_env_embedding_cols) ---
def infer_env_cols_from_columns(cols: list[str]) -> list[str]:
    """Pick the environment-score columns out of a list of column names.

    Columns named ``env_*_norm`` take precedence: if at least one exists, only
    those are returned. Otherwise every remaining ``env_*`` column is returned,
    except the ones ending in ``_max``.

    Args:
        cols: Column names to inspect.

    Returns:
        The selected column names in ascending (sorted) order; an empty list
        when no ``env_*`` column qualifies.
    """
    env_like = [name for name in cols if name.startswith("env_")]

    normalised = sorted(name for name in env_like if name.endswith("_norm"))
    if normalised:
        return normalised

    # Fallback: raw env columns, skipping the "_max" (and "_norm") variants.
    return sorted(name for name in env_like if not name.endswith(("_max", "_norm")))

# Environment-score column names found in the listings table (env_*_norm if present, raw env_* otherwise).
ENV_COLS = infer_env_cols_from_columns(df_all.columns)




# One shared output area for HTML tables (the `out` widget itself is created in the last cell)




## Filter the data to find the reference property for which you want to find similar properties


In [0]:

# Country selector, filled with the `countries` list.
w_country = widgets.Dropdown(
    options=countries,
    description="Country",
    layout=widgets.Layout(width="360px")
)

# City selector; starts empty and is refilled by on_country_change.
w_city = widgets.Dropdown(
    options=[],
    description="City",
    layout=widgets.Layout(width="360px")
)

# Rating range (min, max) between 0.0 and 5.0 in steps of 0.1.
# continuous_update=False: the value is only updated once the user releases the slider.
w_rating = widgets.FloatRangeSlider(
    value=[0.0, 5.0],
    min=0.0, max=5.0, step=0.1,
    description="Rating",
    continuous_update=False,
    readout_format=".1f",
    layout=widgets.Layout(width="520px")
)

# Upper bound of the price slider; the filter only keeps prices inside the
# slider range, so listings priced above this value can never be selected.
PRICE_MAX_UI = 1300
w_price = widgets.IntRangeSlider(
    value=[0, PRICE_MAX_UI],
    min=0, max=PRICE_MAX_UI, step=10,
    description="Price",
    continuous_update=False,
    layout=widgets.Layout(width="520px")
)

# Maximum number of filtered rows to keep (50-1000, default 300).
w_limit = widgets.IntSlider(
    value=300, min=50, max=1000, step=50,
    description="Limit",
    continuous_update=False,
    layout=widgets.Layout(width="360px")
)

# Button that triggers the filtering and the output area for its messages.
btn_filter = widgets.Button(description="Filter & Show", button_style="primary")
out_filter = widgets.Output()

# Shared state between cells: the filtered DataFrame, its collected rows,
# and the final top-K recommendation DataFrame.
STATE = {"filtered_df": None, "filtered_rows": None, "final_topk": None}

def refresh_cities_for_country(cc: str):
    """Return the city names available for one country.

    Args:
        cc: Country code; it is compared against the upper-cased, trimmed
            ``addr_cc`` column, so it should already be in that form.

    Returns:
        A list of distinct, trimmed, non-empty ``addr_name`` values in
        ascending order (at most 4000 entries); an empty list when ``cc``
        is falsy.
    """
    if not cc:
        return []

    in_country = F.upper(F.trim(F.col("addr_cc"))) == F.lit(cc)
    # Evaluated after the select below, i.e. against the trimmed city name.
    has_name = F.col("addr_name").isNotNull() & (F.col("addr_name") != "")

    city_names_df = (
        df_all.where(in_country)
        .select(F.trim(F.col("addr_name")).alias("addr_name"))
        .where(has_name)
        .dropDuplicates(["addr_name"])
        .orderBy("addr_name")
        .limit(4000)
    )

    collected = city_names_df.collect()
    return [row["addr_name"] for row in collected if row and row["addr_name"]]

def on_country_change(change):
    """Refill the city dropdown after the selected country changed.

    Args:
        change: A change mapping as delivered by ``observe``; only events
            whose ``"name"`` is ``"value"`` are handled, and the new country
            code is read from ``"new"``.
    """
    if change.get("name") != "value":
        return

    city_names = refresh_cities_for_country(change.get("new"))
    w_city.options = city_names
    # Preselect the first city, or clear the selection when there is none.
    w_city.value = city_names[0] if city_names else None

# Refill the city dropdown on country changes (the handler itself ignores
# every event whose name is not "value").
w_country.observe(on_country_change)

# Initialize city list by calling the handler once with a hand-built change
# mapping for the currently selected country.
on_country_change({"name": "value", "new": w_country.value})

def apply_filters_df():
    """Build the filtered listings DataFrame from the current widget values.

    Keeps rows of ``df_all`` that match the selected country and city and
    whose non-null rating and nightly price lie inside the selected
    (inclusive) ranges. The result is restricted to ``UI_COLS``, ordered by
    rating (descending), price (ascending) and property id, and capped at
    the selected row limit.

    Returns:
        The resulting Spark DataFrame.

    Raises:
        ValueError: If no country or no city is selected.
    """
    country_code = w_country.value
    city_name = w_city.value
    if not country_code:
        raise ValueError("Country is required.")
    if not city_name:
        raise ValueError("City is required.")

    rating_lo, rating_hi = (float(v) for v in w_rating.value)
    price_lo, price_hi = (float(v) for v in w_price.value)
    max_rows = int(w_limit.value)

    rating_col = F.col("ratings")
    price_col = F.col("price_per_night")

    same_country = F.upper(F.trim(F.col("addr_cc"))) == F.lit(country_code)
    same_city = F.trim(F.col("addr_name")) == F.lit(city_name)
    rating_ok = (
        rating_col.isNotNull()
        & (rating_col >= F.lit(rating_lo))
        & (rating_col <= F.lit(rating_hi))
    )
    price_ok = (
        price_col.isNotNull()
        & (price_col >= F.lit(price_lo))
        & (price_col <= F.lit(price_hi))
    )

    matching = (
        df_all.where(same_country)
        .where(same_city)
        .where(rating_ok)
        .where(price_ok)
    )
    ordered = matching.select(*UI_COLS).orderBy(
        F.desc_nulls_last("ratings"),
        F.asc_nulls_last("price_per_night"),
        F.asc("property_id"),
    )
    return ordered.limit(max_rows)

def on_filter_click(_):
    """Handle a click on the filter button.

    Runs the filter, stores the DataFrame and its collected rows in
    ``STATE``, prints a one-line summary into ``out_filter`` and renders up
    to 50 rows through ``show_spark_df``. Any exception is reported as an
    ``Error:`` line instead of being raised.
    """
    table_cols = [
        "property_id", "listing_title", "addr_cc", "addr_name",
        "room_type_text", "price_per_night", "ratings",
    ]

    with out_filter:
        clear_output(wait=True)
        try:
            filtered = apply_filters_df()
            collected = filtered.collect()
            STATE.update(filtered_df=filtered, filtered_rows=collected)

            print(f"Filtered rows: {len(collected)}  | Country={w_country.value} | City={w_city.value}")

            # Show at most 50 rows; with no rows the default of 50 is kept.
            preview_n = min(50, len(collected)) if collected else 50
            show_spark_df(
                filtered,
                n=preview_n,
                title="Filtered Listings",
                cols=table_cols,
            )
        except Exception as e:
            print("Error:", str(e))

# Run the filter when the button is clicked.
btn_filter.on_click(on_filter_click)

# Lay out the filter controls: country/city side by side, the two range
# sliders, then the limit slider next to the button, then the message area.
ipy_display(widgets.VBox([
    widgets.HBox([w_country, w_city]),
    w_rating,
    w_price,
    widgets.HBox([w_limit, btn_filter]),
    out_filter,
    # table output is shown in the global `out` widget
]))
# NOTE(review): `outf` is created and displayed here, but nothing in the
# visible notebook writes to it — confirm whether the filtered table was
# meant to be rendered here instead of in `out`.
outf = W.Output()
ipy_display(outf)

## After filtering, click "Build listing choices", then choose your reference property, the target country, and K (the number of similar listings to return)

In [0]:
# Output area for the messages printed by on_build.
out_choose = widgets.Output()

# Reference listing selector; its options are filled by on_build.
w_ref = widgets.Dropdown(options=[], description="Listing", layout=widgets.Layout(width="900px"))
# Country code passed to retrieve() as the country to search in.
w_target_country = widgets.Dropdown(options=countries, description="Target CC", layout=widgets.Layout(width="360px"))

# Number of candidates requested from retrieve() (its `n` argument).
w_n_candidates = widgets.Dropdown(options=[25, 50, 100, 200], value=50, description="Candidates", layout=widgets.Layout(width="220px"))
# Number of recommendations to rank and display (the `k` passed to order()).
w_k_show = widgets.Dropdown(options=[5, 10, 15, 20], value=10, description="Top-K", layout=widgets.Layout(width="180px"))

# Button that fills the listing dropdown from the last filter result.
btn_build_choices = widgets.Button(description="Build listing choices", button_style="info")

def build_listing_options():
    """Turn the rows stored in ``STATE["filtered_rows"]`` into dropdown options.

    Returns:
        A list of ``(label, value)`` tuples, one per filtered row, where the
        label summarises title, city, country, price, rating and id, and the
        value is the property id as a string. Empty when nothing has been
        filtered yet.
    """
    def as_option(row):
        # Missing text fields become empty strings; price/rating are shown as-is.
        fields = row.asDict()
        listing_id = fields.get("property_id")
        name = fields.get("listing_title") or ""
        town = fields.get("addr_name") or ""
        country = fields.get("addr_cc") or ""
        nightly = fields.get("price_per_night")
        stars = fields.get("ratings")
        text = f"{name} — {town} ({country}) | €{nightly} | ⭐ {stars} | id={listing_id}"
        return (text, str(listing_id))

    stored_rows = STATE.get("filtered_rows") or []
    return [as_option(row) for row in stored_rows]

def on_build(_):
    """Handle a click on the "Build listing choices" button.

    Fills the reference-listing dropdown from the last filter result and
    preselects its first entry; prints a hint into ``out_choose`` when there
    are no filtered rows.
    """
    with out_choose:
        clear_output()
        choices = build_listing_options()
        if choices:
            w_ref.options = choices
            # Each option is (label, value); select the first value.
            w_ref.value = choices[0][1]
            print("Listing options ready.")
        else:
            print("No filtered rows yet. Run the filter block first.")

# Rebuild the listing dropdown when the button is clicked.
btn_build_choices.on_click(on_build)

# Lay out the selection controls: button and target country, the listing
# dropdown, the candidate/top-K selectors, and the message area.
display(widgets.VBox([
    widgets.HBox([btn_build_choices, w_target_country]),
    w_ref,
    widgets.HBox([w_n_candidates, w_k_show]),
    out_choose
]))


## Choose your preferences, then click "Run Recommend" and wait

In [0]:
# Core weights (0-100 in steps of 5, default 25); passed to order() as
# price_w, property_w and host_w.
w_price_w    = widgets.IntSlider(value=25, min=0, max=100, step=5, description="w_price",    continuous_update=False)
w_property_w = widgets.IntSlider(value=25, min=0, max=100, step=5, description="w_property", continuous_update=False)
w_host_w     = widgets.IntSlider(value=25, min=0, max=100, step=5, description="w_host",     continuous_update=False)

# Optional context weights (default 0); passed to order() as temp_w and budget_w.
w_temp_w   = widgets.IntSlider(value=0, min=0, max=100, step=5, description="w_temp",   continuous_update=False)
w_budget_w = widgets.IntSlider(value=0, min=0, max=100, step=5, description="w_budget", continuous_update=False)

# Context values forwarded to order() as temp_pref, travel_month and budget_pref.
# NOTE(review): the unit of temp_pref is not visible here (presumably °C) — confirm.
w_temp_pref = widgets.FloatText(value=22.0, description="temp_pref")
w_month     = widgets.IntSlider(value=7, min=1, max=12, step=1, description="month", continuous_update=False)
# The empty option means "no budget preference" (it is converted to None before the call).
w_budget_pref = widgets.Dropdown(options=["", "low", "mid", "high"], value="", description="budget_pref")

# ENV sliders (use ENV_COLS as keys!)
def pretty_env_name(col: str) -> str:
    """Turn an env column name into a human-readable slider label.

    Only a leading ``env_`` and a trailing ``_norm`` are stripped; the same
    substrings inside the name are kept (``env_non_normal_norm`` becomes
    ``"Non Normal"``, not ``"Nonal"``). Remaining underscores become spaces
    and the result is title-cased.

    Args:
        col: Column name such as ``env_parks_norm`` or ``env_night_life``.

    Returns:
        The display label, e.g. ``"Parks"`` or ``"Night Life"``.
    """
    # removeprefix/removesuffix touch only the ends of the string, unlike
    # str.replace, which would also delete matches in the middle of the name.
    stem = col.removeprefix("env_").removesuffix("_norm")
    return stem.replace("_", " ").title()

# One 0-100 weight slider per environment column, keyed by the column name.
env_sliders = {
    env_col: widgets.IntSlider(
        value=0, min=0, max=100, step=5,
        description=pretty_env_name(env_col),
        continuous_update=False,
        layout=widgets.Layout(width="520px"),
        style={"description_width": "170px"}
    )
    for env_col in ENV_COLS
}

# Stack the sliders vertically, or show a notice when there are none.
if env_sliders:
    env_box = widgets.VBox(list(env_sliders.values()))
else:
    env_box = widgets.HTML("<i>No env_* columns found.</i>")

# Preferences panel: core weights, context weights, context values, env sliders.
preference_rows = [
    widgets.HTML("<h3>Preferences</h3>"),
    widgets.HBox([w_price_w, w_property_w, w_host_w]),
    widgets.HBox([w_temp_w, w_budget_w]),
    widgets.HBox([w_temp_pref, w_month, w_budget_pref]),
    widgets.HTML("<h4>Neighborhood vibe (env)</h4>"),
    env_box,
]
display(widgets.VBox(preference_rows))


In [0]:
# Button that starts the recommendation run and the output area for its log messages.
btn_run = widgets.Button(description="Run Recommend", button_style="success")
out_run = widgets.Output()


def show_spark_df(df, n=50, title=None, cols=None):
    """Render the first rows of a Spark DataFrame as a styled HTML table.

    The table is written into the module-level ``out`` Output widget, whose
    previous content is cleared first; ``out`` must exist when this function
    is called.

    Args:
        df: Spark DataFrame to display.
        n: Maximum number of rows to collect and show.
        title: Optional heading rendered above the table.
        cols: Optional list of column names to select and show (useful to
            avoid huge struct/vector columns); all columns when omitted.
    """
    def esc(value):
        # Minimal HTML escaping for text content; None is shown as empty.
        if value is None:
            return ""
        text = str(value)
        for raw, entity in (("&", "&amp;"), ("<", "&lt;"), (">", "&gt;")):
            text = text.replace(raw, entity)
        return text

    if cols:
        df = df.select(*cols)
    colnames = cols if cols else df.columns
    rows = df.limit(n).collect()

    th_style = (
        "text-align:left;padding:8px;border-bottom:1px solid rgba(255,255,255,.12);"
        "color:#e9f1ff;background:#0b2340;position:sticky;top:0;"
    )
    td_style = "padding:8px;border-bottom:1px solid rgba(255,255,255,.08);color:#e9f1ff;"

    header = "".join(f"<th style='{th_style}'>{esc(name)}</th>" for name in colnames)
    body = "".join(
        "<tr>"
        + "".join(f"<td style='{td_style}'>{esc(row[name])}</td>" for name in colnames)
        + "</tr>"
        for row in rows
    )

    table_html = f"""
    <div style="overflow:auto; max-height:520px; border:1px solid rgba(255,255,255,.14);
                border-radius:14px; background:rgba(7,26,43,.65); box-shadow:0 10px 22px rgba(0,0,0,.35);">
      <table style="border-collapse:collapse; width:100%; font-family:system-ui; font-size:13px;">
        <thead><tr>{header}</tr></thead>
        <tbody>{body}</tbody>
      </table>
    </div>
    """

    with out:
        clear_output(wait=True)
        if title:
            ipy_display(W.HTML(f"<h3 style='margin:8px 0;color:#e9f1ff'>{esc(title)}</h3>"))
        ipy_display(W.HTML(table_html))
def on_run(_):
    """Handle a click on the "Run Recommend" button.

    Reads the reference listing, target country and all preference widgets,
    retrieves candidate listings with ``retrieve``, joins them with the
    listing attributes in ``df_all``, ranks them with ``order`` and renders
    the top-K result through ``show_spark_df``. Progress messages are
    printed into ``out_run``; the result DataFrame is also stored in
    ``STATE["final_topk"]``.
    """
    with out_run:
        clear_output(wait=True)

        # w_ref holds the property id as a string (see the dropdown option values).
        target_id = w_ref.value
        target_country = w_target_country.value
        n_candidates = int(w_n_candidates.value)
        k_show = int(w_k_show.value)

        # Both selections are required; bail out with a hint otherwise.
        if not target_id:
            print("Pick a reference listing first.")
            return
        if not target_country:
            print("Pick a target country.")
            return

        # Build env weights dict EXACTLY as expected (keys should be env_* or env_*_norm);
        # sliders left at 0 are omitted.
        env_weights = {col: int(slider.value) for col, slider in env_sliders.items() if int(slider.value) != 0}

        # Params: widget values converted to the types handed to order();
        # empty temp/month/budget values become None.
        params = dict(
            w_price=float(w_price_w.value),
            w_property=float(w_property_w.value),
            w_host=float(w_host_w.value),
            w_temp=float(w_temp_w.value),
            w_budget=float(w_budget_w.value),
            temp_pref=float(w_temp_pref.value) if w_temp_pref.value not in (None, "") else None,
            travel_month=int(w_month.value) if w_month.value not in (None, "") else None,
            budget_pref=(w_budget_pref.value or None),
            env_weights=env_weights
        )

        print("Running recommend with:")
        print(" target_id:", target_id)
        print(" target_country:", target_country)
        print(" n_candidates:", n_candidates, "k_show:", k_show)
        print(" env keys (sample):", list(env_weights.keys())[:6])

        t0 = time.perf_counter()

        # Retrieve candidates from the embedding table using the loaded LSH model.
        cand_df = retrieve(
            target_id=target_id,
            country=target_country,
            df=df_emb,
            lsh_model=lsh_model,
            n=n_candidates
        )
        if cand_df is None:
            print("retrieve() returned None (target missing embedding?)")
            return

        # Never recommend the reference listing itself.
        cand_df = cand_df.filter(F.col("property_id") != F.lit(target_id))

        # Join with df_all (avoid dup cols): keep property_id as the join key and
        # otherwise only the df_all columns the candidate frame does not already have.
        cand_cols = set(cand_df.columns)
        df_all_to_join = df_all.select(*[c for c in df_all.columns if (c == "property_id") or (c not in cand_cols)])
        cand_df = cand_df.join(df_all_to_join, on="property_id", how="inner")

        # Rank the candidates with the user's weights; the score is written to "final_score".
        ranked = order(
            df=cand_df,
            k=k_show,
            price_w=params["w_price"],
            property_w=params["w_property"],
            host_w=params["w_host"],
            env_weights=params["env_weights"],
            temp_pref=params["temp_pref"],
            temp_w=params["w_temp"],
            travel_month=params["travel_month"],
            budget_pref=params["budget_pref"],
            budget_w=params["w_budget"],
            normalize_all_weights=True,
            score_col="final_score"
        )

        # Keep only the display columns that actually exist in the ranked frame.
        cols_show = [c for c in [
            "property_id","listing_title","addr_cc","addr_name","room_type_text",
            "price_per_night","ratings","l2_dist","final_score","final_url"
        ] if c in ranked.columns]

        # Highest score first, ties broken by the smaller l2_dist; capped at k_show rows.
        final_topk = ranked.select(*cols_show).orderBy(F.desc("final_score"), F.asc_nulls_last("l2_dist")).limit(k_show)
        STATE["final_topk"] = final_topk

        # NOTE(review): final_topk is still a lazy DataFrame at this point; unless
        # retrieve()/order() trigger Spark actions themselves, the time printed
        # here may not include the computation that runs when show_spark_df
        # collects the rows — confirm.
        t1 = time.perf_counter()
        print(f"Done in {(t1 - t0):.2f}s")

        # Show in HTML (a subset of cols_show: ids, distance and score are not displayed).
        show_spark_df(
            final_topk,
            n=k_show,
            title="Top-K Recommendations",
            cols=[c for c in ["listing_title","addr_cc","addr_name","room_type_text","price_per_night","ratings","final_url"] if c in cols_show]
        )

# Start the recommendation run when the button is clicked.
btn_run.on_click(on_run)

# Show the run button with its log area, followed by the shared `out` widget
# that show_spark_df renders every HTML table into.
ipy_display(widgets.VBox([btn_run, out_run]))
out = W.Output()
ipy_display(out)