In [4]:
from bs4 import BeautifulSoup
import pandas as pd
import re, json

html_path = "source_madrid.html"  # your saved file

with open(html_path, "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "html.parser")

cards = soup.select("[data-testid='property-card']")
rows = []

def clean_text(x):
    return re.sub(r"\s+", " ", x).strip() if x else None

def parse_price_block(txt):
    if not txt: return (None, None, None)
    t = txt.replace(",", "")
    m = re.search(r"([€$£]|[A-Z]{3})\s*([\d]+(?:\.\d+)?)", t)
    currency = m.group(1) if m else None
    price = float(m.group(2)) if m else None
    incl = "includes" in t.lower() or "taxes included" in t.lower()
    excl = ("+ " in t) or ("excludes" in t.lower()) or ("plus" in t.lower())
    tax_note = "included" if incl else ("excluded/plus" if excl else None)
    return (currency, price, tax_note)

for c in cards:
    # title / link
    title_el = c.select_one("[data-testid='title']")
    name = clean_text(title_el.text) if title_el else None
    link_el = c.select_one("a[data-testid='title-link'], a[data-testid='availability-cta']") or c.select_one("a")
    url = link_el.get("href") if link_el and link_el.has_attr("href") else None

    # price
    price_el = c.select_one("[data-testid='price-and-discounted-price'], [data-testid='price']")
    price_raw = clean_text(price_el.text) if price_el else None
    currency, price, tax_note = parse_price_block(price_raw)

    # rating & review count
    score_el = c.select_one("[data-testid='review-score']")
    score_txt = clean_text(score_el.text) if score_el else None
    score_word = None; score_num=None; review_count=None
    if score_txt:
        mnum = re.search(r"(\d+(?:\.\d)?)", score_txt)
        score_num = float(mnum.group(1)) if mnum else None
        mword = re.search(r"^(Excellent|Wonderful|Very good|Good|Pleasant|Fair|Poor)", score_txt, re.I)
        score_word = mword.group(1) if mword else None
        mrev = re.search(r"([\d,\.]+)\s+reviews", score_txt, re.I)
        if mrev:
            review_count = int(re.sub(r"[^\d]", "", mrev.group(1)))

    # stars
    star_el = c.select_one("[data-testid='rating-stars'] [aria-label*='stars'], [aria-label*='stars']")
    stars = None
    if star_el and star_el.get("aria-label"):
        mstars = re.search(r"(\d+)\s*stars?", star_el.get("aria-label"), re.I)
        stars = int(mstars.group(1)) if mstars else None

    # distance / location blurb
    dist_el = c.select_one("[data-testid='distance'], [data-testid='location']")
    distance_blurb = clean_text(dist_el.text) if dist_el else None

    # ✅ address (new)
    addr_el = c.select_one("[data-testid='address']")
    address = clean_text(addr_el.text) if addr_el else None
    # fallback: sometimes address text appears under the location block
    if not address:
        addr_fallback = c.select_one("[data-testid='location']")
        address = clean_text(addr_fallback.text) if addr_fallback else None

    # badges/tags
    tag_els = c.select("[data-testid='facility-badge'], [data-testid='property-highlights'] span")
    tags = [clean_text(t.text) for t in tag_els if clean_text(t.text)]

    rows.append({
        "name": name,
        "address": address,              # <- added
        "detail_url": url,
        "currency": currency,
        "display_price": price,
        "tax_note": tax_note,
        "score_word": score_word,
        "score_numeric": score_num,      # this is your numeric rating
        "review_count": review_count,
        "stars": stars,
        "distance_blurb": distance_blurb,
        "breakfast_included": any("breakfast" in t.lower() for t in (tags or [])),
        "free_cancellation": any("free cancellation" in t.lower() for t in (tags or [])),
        "pay_at_property": any(("pay at the property" in t.lower()) or ("pay later" in t.lower()) for t in (tags or [])),
        "sustainability_badge": any("sustainable" in t.lower() for t in (tags or [])),
        "raw_tags": ", ".join(tags) if tags else None,
        "raw_price_text": price_raw
    })

df_snapshot = pd.DataFrame(rows)
df_snapshot.to_csv("snapshot_enriched.csv", index=False)

In [5]:
df_snapshot.head()

Unnamed: 0,name,address,detail_url,currency,display_price,tax_note,score_word,score_numeric,review_count,stars,distance_blurb,breakfast_included,free_cancellation,pay_at_property,sustainability_badge,raw_tags,raw_price_text
0,Hotel 4C Puerta Europa,"Tetuan, Madrid",https://www.booking.com/hotel/es/4c-puerta-eur...,$,194.0,,,8.0,2529,,3.4 miles from downtown,False,False,False,False,,$194
1,Hostal Esparteros,"Madrid City Center, Madrid",https://www.booking.com/hotel/es/hostal-espart...,$,190.0,,,8.3,3230,,0.1 miles from downtown,False,False,False,False,,$190
2,Erase un Hotel,"Tetuan, Madrid",https://www.booking.com/hotel/es/eraseunhotel....,$,181.0,,,8.5,6308,,3.2 miles from downtown,False,False,False,False,,$181
3,Live It Madrid Chamberi,"Chamberi, Madrid",https://www.booking.com/hotel/es/live-it-chamb...,$,71.0,,,6.4,583,,1.1 miles from downtown,False,False,False,False,,$71
4,Ilunion Atrium,"Ciudad Lineal, Madrid",https://www.booking.com/hotel/es/confortel-atr...,$,167.0,,,8.5,7244,,3.3 miles from downtown,False,False,False,False,,$167


In [3]:
# # View all accessible data
# print(cards[0].get_text(separator="\n", strip=True))