In [4]:
# 1) Crawl ONLY the detail-page href links from each listing page (/pX) and save one CSV per page



from pathlib import Path
import os
import sys
import importlib
import concurrent.futures
import pandas as pd

# Resolve the project root: the current directory when it contains "src",
# otherwise its parent (the notebook may be opened from a sub-folder).
project_root = Path.cwd() if (Path.cwd() / "src").exists() else Path.cwd().parent

# Fail fast when neither candidate holds the "src" package.
if not (project_root / "src").exists():
    raise FileNotFoundError("Khong tim thay thu muc src; hay mo notebook tu project root.")
sys.path.append(str(project_root))  # makes `src.*` importable from this kernel

import src.scrapers.batdongsan_scraper as bds
importlib.reload(bds)  # re-execute the module so edits made since the last import are used

def _collect_hrefs_one_page_sync(listing_url: str, *, cookies=None, headless=True, timeout=45, max_links=None):
    """Open one listing page in Chromium and return the detail-page links found on it.

    Args:
        listing_url: URL of the listing page to open.
        cookies: Raw ``Cookie`` header value; when falsy the ``BDS_COOKIE``
            environment variable is used instead (if set).
        headless: Passed through to ``chromium.launch``.
        timeout: Seconds allowed for navigation and for the link selector to appear.
        max_links: Keep at most this many links; a falsy value means no limit.

    Returns:
        De-duplicated absolute URLs in page order. The list is empty when the
        link selector never appears or the DOM extraction fails.

    Raises:
        Whatever Playwright raises from launch/navigation (e.g. a navigation
        timeout). The context and browser are closed before it propagates.
    """
    # NOTE(review): presumably sets the asyncio event-loop policy Playwright needs
    # on Windows — confirm in the bds module.
    bds._ensure_windows_proactor_policy()
    from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError

    link_selector = "a.js__product-link-for-product-id"

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=headless)
        try:
            context = browser.new_context(user_agent=bds.DEFAULT_HEADERS["User-Agent"])
            try:
                extra_headers = {"Referer": bds.BASE_URL}
                cookie_header = cookies or os.getenv("BDS_COOKIE")
                if cookie_header:
                    extra_headers["Cookie"] = cookie_header
                context.set_extra_http_headers(extra_headers)
                page = context.new_page()

                page.goto(listing_url, wait_until="domcontentloaded", timeout=timeout * 1000)
                try:
                    page.wait_for_selector(link_selector, timeout=timeout * 1000)
                except PlaywrightTimeoutError:
                    # Best effort: fall through and extract whatever is in the DOM.
                    pass

                # Extract hrefs directly from the DOM; prefer the raw attribute,
                # fall back to the resolved `href` property.
                try:
                    hrefs = page.eval_on_selector_all(
                        link_selector,
                        "els => els.map(e => e.getAttribute('href') || e.href).filter(Boolean)",
                    )
                except Exception:
                    hrefs = []

                links = []
                for href in hrefs or []:
                    href = str(href)
                    if href.startswith("/"):
                        # Site-relative link: prefix the site base URL.
                        href = bds.BASE_URL.rstrip("/") + href
                    if href.startswith("http"):
                        links.append(href)

                # Unique (order-preserving) + optional limit.
                links = list(dict.fromkeys(links))
                if max_links:
                    links = links[:max_links]
                return links
            finally:
                context.close()
        finally:
            # Runs on the error path too, so a failed navigation does not skip cleanup.
            browser.close()

def collect_hrefs_one_page_threaded(listing_url: str, *, cookies=None, headless=True, timeout=45, max_links=None):
    """Run `_collect_hrefs_one_page_sync` on a single worker thread and return its result.

    The Playwright Sync API is driven from a background thread so it can be
    used from a notebook cell. All arguments are forwarded unchanged; any
    exception raised by the worker is re-raised here by `Future.result()`.
    """
    forwarded = dict(cookies=cookies, headless=headless, timeout=timeout, max_links=max_links)
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
        pending = pool.submit(_collect_hrefs_one_page_sync, listing_url, **forwarded)
        return pending.result()

# === Config: every page in `pages` is crawled in one run; narrow the range to re-crawl specific /pX ===
pages = list(range(2, 501))
headless = True  # set False if you need manual verification
max_links = None  # e.g. 50
cookie = os.getenv("BDS_COOKIE")

# Create the output folder up front so the first to_csv does not fail on a fresh checkout.
links_dir = project_root / "data" / "processed"
links_dir.mkdir(parents=True, exist_ok=True)

for page in pages:
    listing_url = f"https://batdongsan.com.vn/nha-dat-ban-tp-hcm/p{page}"

    detail_urls = collect_hrefs_one_page_threaded(
        listing_url,
        cookies=cookie,
        headless=headless,
        timeout=45,
        max_links=max_links,
    )

    # One CSV per listing page; utf-8-sig writes a BOM so Excel detects the encoding.
    links_df = pd.DataFrame({"detail_url": detail_urls})
    out_links = links_dir / f"links_p{page}.csv"
    links_df.to_csv(out_links, index=False, encoding="utf-8-sig")
    print(f"Saved links: {out_links} ({len(links_df)} links)")

Saved links: d:\Vscode\Self\Data Science\Book Notes\Data Visualization\Lab\Lab1\Lab1---Data-Visualization-\data\processed\links_p2.csv (30 links)
Saved links: d:\Vscode\Self\Data Science\Book Notes\Data Visualization\Lab\Lab1\Lab1---Data-Visualization-\data\processed\links_p3.csv (30 links)
Saved links: d:\Vscode\Self\Data Science\Book Notes\Data Visualization\Lab\Lab1\Lab1---Data-Visualization-\data\processed\links_p4.csv (30 links)
Saved links: d:\Vscode\Self\Data Science\Book Notes\Data Visualization\Lab\Lab1\Lab1---Data-Visualization-\data\processed\links_p5.csv (30 links)
Saved links: d:\Vscode\Self\Data Science\Book Notes\Data Visualization\Lab\Lab1\Lab1---Data-Visualization-\data\processed\links_p6.csv (30 links)
Saved links: d:\Vscode\Self\Data Science\Book Notes\Data Visualization\Lab\Lab1\Lab1---Data-Visualization-\data\processed\links_p7.csv (30 links)
Saved links: d:\Vscode\Self\Data Science\Book Notes\Data Visualization\Lab\Lab1\Lab1---Data-Visualization-\data\processed\li

In [7]:
# 2) Read each links_pX.csv, fetch and parse every detail page it lists, and write the rows to scraped_results_pX.csv

from pathlib import Path
import os
import sys
import re
import concurrent.futures
import unicodedata
import pandas as pd
from bs4 import BeautifulSoup

# Resolve the project root: the current directory when it contains "src",
# otherwise its parent (the notebook may be opened from a sub-folder).
project_root = Path.cwd() if (Path.cwd() / "src").exists() else Path.cwd().parent

# Fail fast when neither candidate holds the "src" package.
if not (project_root / "src").exists():
    raise FileNotFoundError("Khong tim thay thu muc src; hay mo notebook tu project root.")
sys.path.append(str(project_root))  # makes `src.*` importable from this kernel

import src.scrapers.batdongsan_scraper as bds




# ----------------- Text / numeric / date normalizers -----------------

def _clean_text(s):
    if s is None:
        return None
    s = str(s).replace("\xa0", " ")
    s = re.sub(r"\s+", " ", s).strip()
    return s or None


def _norm_key(s: str) -> str:
    """Build a comparison key: cleaned, accent-free, lower-case, single-spaced text.

    None or blank input yields an empty string. Combining marks are removed
    after NFKD decomposition; "đ" does not decompose, so it is mapped to "d"
    explicitly once the text has been lower-cased.
    """
    base = _clean_text(s) or ""
    decomposed = unicodedata.normalize("NFKD", base)
    without_marks = "".join(filter(lambda ch: not unicodedata.combining(ch), decomposed))
    lowered = without_marks.lower().replace("đ", "d")
    return re.sub(r"\s+", " ", lowered).strip()


def _to_float_number(s):
    if s is None:
        return None
    s = str(s).strip().lower()
    if not s:
        return None
    m = re.search(r"(\d+(?:[\.,]\d+)?)", s)
    if not m:
        return None
    try:
        return float(m.group(1).replace(",", "."))
    except Exception:
        return None


def _to_int(s: str | None):
    if not s:
        return None
    m = re.search(r"(\d+)", str(s))
    return int(m.group(1)) if m else None


def _parse_percent(s: str | None):
    """Return the first number found in a percentage text as a float, or None.

    A decimal comma is accepted ("12,5%" -> 12.5). Only digits and one
    decimal separator are matched, so a leading sign is not part of the result.
    """
    cleaned = None if s is None else _clean_text(s)
    if not cleaned:
        return None
    found = re.search(r"(\d+(?:[\.,]\d+)?)", cleaned)
    if found is None:
        return None
    try:
        value = float(found.group(1).replace(",", "."))
    except Exception:
        value = None
    return value


def _parse_price_to_billion(price_text: str | None):
    """Convert a Vietnamese price text to a float expressed in "tỷ" (billions).

    Examples:
    - "4,5 tỷ"           -> 4.5
    - "450 triệu"        -> 0.45
    - "10 tỷ 500 triệu"  -> 10.5

    Returns None for empty input, for negotiable prices ("thỏa thuận" /
    "thương lượng"), and when no number is paired with a known unit.
    The sum is rounded to 6 decimal places.
    """
    if not price_text:
        return None

    raw = str(price_text)
    normalized = _norm_key(raw)
    if not normalized:
        return None
    for phrase in ("thoa thuan", "thuong luong"):
        if phrase in normalized:
            return None

    # Per unit: markers looked up with their accents, markers looked up in the
    # accent-free forms, and the divisor that converts the amount to billions.
    unit_table = (
        (("tỷ",), ("ty",), 1.0),
        (("triệu",), ("trieu",), 1000.0),
        ((), ("nghin", "ngan"), 1_000_000.0),
    )

    total = 0.0
    matched_any = False

    # Walk whitespace-separated tokens so each number stays with its own unit,
    # which is either glued to the number or is the next token.
    pieces = re.split(r"\s+", raw)
    for idx, piece in enumerate(pieces):
        piece_norm = _norm_key(piece)
        amount = _to_float_number(piece_norm)
        if amount is None:
            continue

        following = _norm_key(pieces[idx + 1]) if idx + 1 < len(pieces) else ""

        for accented, plain, divisor in unit_table:
            hit = any(mark in piece or mark in following for mark in accented) or any(
                mark in piece_norm or mark in following for mark in plain
            )
            if hit:
                total += amount / divisor
                matched_any = True
                break  # first matching unit wins: tỷ, then triệu, then nghìn/ngàn

    return round(total, 6) if matched_any else None


def _flag_has_furnishing(furnishing_text: str | None) -> int:
    """Return 1 when the furnishing text indicates furniture is included, else 0.

    Blank or missing text gives 0. Otherwise the result is 1 unless the
    accent-free text contains one of the negative markers below.
    """
    normalized = _norm_key(furnishing_text or "")
    if not normalized:
        return 0
    # NOTE(review): markers are matched as plain substrings of the accent-stripped text.
    negative_markers = ("khong", "chua", "trong", "ban giao tho", "khong noi that")
    for marker in negative_markers:
        if marker in normalized:
            return 0
    return 1


def _flag_has_red_book(legal_text: str | None) -> int:
    """Return 1 when the legal-status text mentions "sổ hồng" or "sổ đỏ", else 0.

    Matching is done on the accent-free, lower-cased text; blank input gives 0.
    """
    normalized = _norm_key(legal_text or "")
    if not normalized:
        return 0
    mentions_book = "so hong" in normalized or "so do" in normalized
    return int(mentions_book)


def _as_none_if_empty(v: str | None):
    """Return the cleaned text, or None when it is blank or a "no value" placeholder.

    Placeholders are compared on the accent-free, lower-cased form of the text.
    """
    placeholders = {"-", "--", "khong", "kxd", "khong co", "na"}
    cleaned = _clean_text(v)
    if not cleaned or _norm_key(cleaned) in placeholders:
        return None
    return cleaned


def _pick_first(*values):
    """Return the first argument that is non-empty after `_as_none_if_empty`, else None.

    Candidates are checked lazily, left to right, stopping at the first usable one.
    """
    usable = (_as_none_if_empty(candidate) for candidate in values)
    return next((item for item in usable if item is not None), None)


# ----------------- DOM extractors -----------------

def _extract_kv_items(soup: BeautifulSoup):
    """Read the property-spec section ("Đặc điểm bất động sản") into a dict.

    Keys are the spec titles passed through `_norm_key`; values are the cleaned
    spec texts. Items missing a title or a value are skipped, and a repeated
    title keeps the last value seen.
    """

    def _text_of(node):
        # Cleaned text of an optional element; None when the element is absent.
        return _clean_text(node.get_text(" ", strip=True) if node else None)

    pairs: dict[str, str] = {}
    for spec in soup.select(".re__pr-specs-content-item") or []:
        label = _text_of(spec.select_one(".re__pr-specs-content-item-title"))
        value = _text_of(spec.select_one(".re__pr-specs-content-item-value"))
        if label and value:
            pairs[_norm_key(label)] = value
    return pairs


def _extract_short_info(soup: BeautifulSoup):
    """Read the short-info strip into a dict of normalized label -> value.

    This strip usually shows: Ngày đăng / Ngày hết hạn / Loại tin / Mã tin.
    For each item the first text fragment is the label and the second is the
    value; items with fewer than two fragments are ignored.
    """
    info: dict[str, str] = {}
    for entry in soup.select(".re__pr-short-info-item") or []:
        fragments = [text for text in (s.strip() for s in entry.stripped_strings if s) if text]
        if len(fragments) < 2:
            continue
        label, value = fragments[0], fragments[1]
        info[_norm_key(label)] = value
    return info


def _parse_address_parts(address: str | None):
    """Split a comma-separated Vietnamese address into (street, ward, district, city).

    Example: "Đường Lê Tự Tài, Phường 4, Quận Phú Nhuận, Hồ Chí Minh"
    -> ("Đường Lê Tự Tài", "Phường 4", "Quận Phú Nhuận", "Hồ Chí Minh").

    Each part is classified by the administrative keyword it STARTS with (on
    the accent-free, lower-cased form). Anchoring at the start keeps street
    names that merely contain a keyword out of the wrong slot — e.g.
    "Đường Quang Trung" ("quan"), "Đường Tỉnh Lộ 10" ("tinh"),
    "Xa lộ Hà Nội" ("ha noi"). Thủ Đức city is tested before the generic
    city prefixes so it is recorded as the district-level unit.

    For ward, district and city a later matching part overwrites an earlier
    one; the street is the first part that matches none of the keywords.
    Any slot still empty is filled from `bds.parse_location_vn(address)`.

    Returns:
        A 4-tuple of cleaned strings or None values; all None for empty input.
    """
    if not address:
        return None, None, None, None

    # All patterns run against `_norm_key` output: lower-case, no accents, single spaces.
    thu_duc_city = re.compile(r"^(?:tp\.?|thanh pho) ?thu duc\b")
    district_prefix = re.compile(r"^(?:quan|huyen|thi xa)\b")
    ward_prefix = re.compile(r"^(?:phuong|p)\b")
    city_prefix = re.compile(r"^(?:thanh pho|tinh|tp|tphcm)\b")
    bare_city_names = {"ho chi minh", "ha noi", "da nang"}

    street = None
    ward = None
    district = None
    city = None

    for part in (piece.strip() for piece in str(address).split(",")):
        if not part:
            continue
        pn = _norm_key(part)
        if thu_duc_city.match(pn) or district_prefix.match(pn):
            district = part
        elif ward_prefix.match(pn):
            ward = part
        elif city_prefix.match(pn) or pn in bare_city_names:
            city = part
        elif street is None:
            # First non-admin part is street-ish.
            street = part

    # Fall back to the module heuristic for any piece still missing.
    if any(v is None for v in [street, ward, district, city]):
        st2, w2, d2, c2 = bds.parse_location_vn(address)
        street = street or st2
        ward = ward or w2
        district = district or d2
        city = city or c2

    return _clean_text(street), _clean_text(ward), _clean_text(district), _clean_text(city)


def _has_verified_badge(html: str, full_text_norm: str) -> int:
    """Return 1 when the page carries the "Batdongsan.com.vn đã xác thực" badge, else 0.

    The phrase may sit in attributes (alt/title/aria-label) rather than in the
    visible text, so both the normalized page text and the normalized raw HTML
    are searched.
    """
    html_norm = _norm_key(html or "")
    haystacks = (full_text_norm, html_norm)

    # Flexible match: the domain, up to 80 characters, then "da xac thuc".
    badge_pattern = r"batdongsan\.com\.vn.{0,80}da xac thuc"
    if any(re.search(badge_pattern, text) for text in haystacks):
        return 1

    # Fallback: the domain written without dots, and the phrase anywhere in either source.
    has_domain = any("batdongsan com vn" in text for text in haystacks)
    has_phrase = any("da xac thuc" in text for text in haystacks)
    return 1 if (has_domain and has_phrase) else 0


def parse_detail_html(html: str, url: str):
    """Parse one listing detail page into a flat row dict.

    Args:
        html: Page HTML; None or empty is parsed as an empty document.
        url: The page URL, stored unchanged under "source_url".

    Returns:
        A dict holding the requested output columns (ID, price in tỷ, area,
        room counts, flags, address parts, dates, ...) followed by debug
        columns (blocked, source_url, raw_price, raw_area, legal, address,
        page_title). Fields that cannot be found are None; the three
        "Có ..." flags are always 0 or 1.
    """
    soup = BeautifulSoup(html or "", "html.parser")

    page_title = soup.title.get_text(strip=True) if soup.title else ""
    full_text = " ".join(soup.stripped_strings)
    full_text_norm = _norm_key(full_text)

    # Anti-bot interstitial detection, exposed in the "blocked" debug column.
    challenge_title = "just a moment" in (page_title or "").lower()
    mentions_cloudflare = "cloudflare" in (full_text or "").lower()
    blocked = challenge_title or mentions_cloudflare

    # Address
    address_node = soup.select_one(".re__pr-short-description.js__pr-address")
    address = _clean_text(address_node.get_text(" ", strip=True) if address_node else None)
    street, ward, district, city = _parse_address_parts(address)

    specs = _extract_kv_items(soup)
    short_info = _extract_short_info(soup)

    def from_specs(*labels):
        # First usable value among several label variants in the specs table.
        return _pick_first(*(specs.get(_norm_key(label)) for label in labels))

    def from_info_then_specs(label):
        # Prefer the short-info strip; fall back to the specs table.
        key = _norm_key(label)
        return _pick_first(short_info.get(key), specs.get(key))

    # Values (several label variants are tried where the site is inconsistent)
    id_value = from_info_then_specs("Mã tin")

    price_text = from_specs("Mức giá", "Khoảng giá", "Giá")
    area_text = from_specs("Diện tích")

    bedrooms_text = from_specs("Số phòng ngủ", "Phòng ngủ")
    bathrooms_text = from_specs("Số phòng tắm, vệ sinh", "Số phòng vệ sinh", "Số phòng tắm")
    floors_text = from_specs("Số tầng")

    furnishing_text = from_specs("Nội thất")
    facade_text = from_specs("Mặt tiền")
    access_road_text = from_specs("Đường vào")
    direction_text = from_specs("Hướng nhà")

    legal_text = from_specs("Pháp lý")

    posted_date_text = from_info_then_specs("Ngày đăng")
    expiry_date_text = from_info_then_specs("Ngày hết hạn")
    listing_type_text = from_info_then_specs("Loại tin")

    verified = _has_verified_badge(html, full_text_norm)

    # Price increase over the last year (percent)
    increase_node = soup.select_one(".cta-number")
    price_increase_pct = _parse_percent(increase_node.get_text(" ", strip=True) if increase_node else None)

    # Final row: requested columns first, in the requested order
    row = {
        "ID": _clean_text(id_value),
        "Giá (tỷ đồng)": _parse_price_to_billion(price_text),
        "Giá đã tăng 1 năm qua (%)": price_increase_pct,
        "Diện tích": _to_float_number(area_text),
        "Số phòng tắm": _to_int(bathrooms_text),
        "Số phòng ngủ": _to_int(bedrooms_text),
        "Số tầng": _to_int(floors_text),
        "Có nội thất": int(_flag_has_furnishing(furnishing_text)),
        "Mặt tiền(m)": _to_float_number(facade_text),
        "Đường vào(m)": _to_float_number(access_road_text),
        "Hướng nhà": _clean_text(direction_text),
        "Đường": _clean_text(street),
        "Phường": _clean_text(ward),
        "Quận": _clean_text(district),
        "Thành Phố": _clean_text(city),
        "Có sổ hồng": int(_flag_has_red_book(legal_text)),
        "Có xác thực": int(verified),
        "Ngày đăng": pd.to_datetime(posted_date_text, dayfirst=True, errors="coerce"),
        "Ngày hết hạn": pd.to_datetime(expiry_date_text, dayfirst=True, errors="coerce"),
        "Loại Tin": _clean_text(listing_type_text),
        "ngày crawl": pd.Timestamp.now(),
        # debug extras, kept to diagnose bad rows
        "blocked": bool(blocked),
        "source_url": url,
        "raw_price": _clean_text(price_text),
        "raw_area": _clean_text(area_text),
        "legal": _clean_text(legal_text),
        "address": _clean_text(address),
        "page_title": _clean_text(page_title),
    }

    # Hard normalize: make sure the text columns are stripped.
    text_columns = ("ID", "Hướng nhà", "Đường", "Phường", "Quận", "Thành Phố", "Loại Tin")
    row.update({column: _clean_text(row.get(column)) for column in text_columns})

    return row


# ----------------- Playwright fetch (threaded) -----------------

def _fetch_html_sync(url: str, *, cookies=None, headless=True, timeout=45):
    """Load a detail page in Chromium and return its rendered HTML.

    Args:
        url: Detail-page URL to open.
        cookies: Raw ``Cookie`` header value; when falsy the ``BDS_COOKIE``
            environment variable is used instead (if set).
        headless: Passed through to ``chromium.launch``.
        timeout: Navigation timeout in seconds.

    Returns:
        The page HTML as returned by ``page.content()`` after the best-effort waits.

    Raises:
        Whatever Playwright raises from launch/navigation (e.g. a navigation
        timeout). The context and browser are closed before it propagates.
    """
    # NOTE(review): presumably sets the asyncio event-loop policy Playwright needs
    # on Windows — confirm in the bds module.
    bds._ensure_windows_proactor_policy()
    from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=headless)
        try:
            context = browser.new_context(user_agent=bds.DEFAULT_HEADERS["User-Agent"])
            try:
                extra_headers = {"Referer": bds.BASE_URL}
                cookie_header = cookies or os.getenv("BDS_COOKIE")
                if cookie_header:
                    extra_headers["Cookie"] = cookie_header
                context.set_extra_http_headers(extra_headers)

                page = context.new_page()
                page.goto(url, wait_until="domcontentloaded", timeout=timeout * 1000)
                try:
                    page.wait_for_selector(".re__pr-specs", timeout=8000)
                except PlaywrightTimeoutError:
                    # Best effort: return whatever HTML is available.
                    pass
                # The verified-badge block may render after the specs; wait briefly for it.
                try:
                    page.wait_for_selector(".re__pr-listing-verified-section", timeout=3000)
                except PlaywrightTimeoutError:
                    pass

                return page.content()
            finally:
                context.close()
        finally:
            # Runs on the error path too, so a failed navigation does not skip cleanup.
            browser.close()


def fetch_html_threaded(url: str, *, cookies=None, headless=True, timeout=45):
    """Run `_fetch_html_sync` on a single worker thread and return the HTML it produces.

    All arguments are forwarded unchanged; an exception raised by the worker
    is re-raised here by `Future.result()`.
    """
    forwarded = dict(cookies=cookies, headless=headless, timeout=timeout)
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
        pending = pool.submit(_fetch_html_sync, url, **forwarded)
        return pending.result()




Xs = list(range(2, 501))

links_dir = project_root / "data" / "processed"
# Scraped results go to data/raw/scraped; create it up front so to_csv cannot fail on a fresh checkout.
scraped_dir = project_root / "data" / "raw" / "scraped"
scraped_dir.mkdir(parents=True, exist_ok=True)

# Columns that must appear in the CSV, in the requested order.
main_cols = [
    "ID", "Giá (tỷ đồng)", "Giá đã tăng 1 năm qua (%)", "Diện tích", "Số phòng tắm", "Số phòng ngủ",
    "Số tầng", "Có nội thất", "Mặt tiền(m)", "Đường vào(m)", "Hướng nhà",
    "Đường", "Phường", "Quận", "Thành Phố", "Có sổ hồng", "Có xác thực",
    "Ngày đăng", "Ngày hết hạn", "Loại Tin", "ngày crawl"
]

# Debug columns, kept so bad rows can be diagnosed.
dbg_cols = ["blocked", "source_url", "address", "legal", "raw_price", "raw_area", "page_title"]

for X in Xs:
    links_path = links_dir / f"links_p{X}.csv"
    if not links_path.exists():
        # The link crawl may have been interrupted before reaching this page;
        # skip it instead of aborting the remaining pages.
        print(f"Skipping page {X}: {links_path} not found")
        continue

    links_df = pd.read_csv(links_path)
    detail_urls = links_df["detail_url"].dropna().astype(str).tolist()
    print("Loaded:", len(detail_urls), "urls")

    # ----------------- Scrape every URL of this page and save one file -----------------

    all_results = []  # collects one row dict per successfully parsed URL

    for url in detail_urls:
        try:
            # 1. Fetch the HTML
            html = fetch_html_threaded(url, cookies=os.getenv("BDS_COOKIE"), headless=True, timeout=45)

            # 2. Parse it into a row dict
            row = parse_detail_html(html, url)

            # 3. Add it to the page's results
            all_results.append(row)

        except Exception as e:
            # Best effort per URL: report the failure and carry on with the next one.
            print(f"Lỗi tại URL {url}: {e}")
            continue

    # Only build and save a DataFrame when at least one row was collected.
    if all_results:
        # Build a single DataFrame from the collected rows.
        df_final = pd.DataFrame(all_results)

        # Reorder the columns: requested columns first, debug columns last.
        df_final = df_final[main_cols + dbg_cols]

        output_path = scraped_dir / f"scraped_results_p{X}.csv"

        # utf-8-sig writes a BOM so Excel renders the Vietnamese text correctly.
        df_final.to_csv(output_path, index=False, encoding="utf-8-sig")

        print("-" * 30)
        print(f"Xong! Đã lưu {len(df_final)} dòng vào: {output_path}")
    else:
        print("Không có dữ liệu nào được thu thập.")

Loaded: 30 urls
------------------------------
Xong! Đã lưu 30 dòng vào: d:\Vscode\Self\Data Science\Book Notes\Data Visualization\Lab\Lab1\Lab1---Data-Visualization-\data\raw\scraped\scraped_results_p2.csv
Loaded: 30 urls
------------------------------
Xong! Đã lưu 30 dòng vào: d:\Vscode\Self\Data Science\Book Notes\Data Visualization\Lab\Lab1\Lab1---Data-Visualization-\data\raw\scraped\scraped_results_p3.csv
Loaded: 30 urls
------------------------------
Xong! Đã lưu 30 dòng vào: d:\Vscode\Self\Data Science\Book Notes\Data Visualization\Lab\Lab1\Lab1---Data-Visualization-\data\raw\scraped\scraped_results_p4.csv
Loaded: 30 urls
------------------------------
Xong! Đã lưu 30 dòng vào: d:\Vscode\Self\Data Science\Book Notes\Data Visualization\Lab\Lab1\Lab1---Data-Visualization-\data\raw\scraped\scraped_results_p5.csv
Loaded: 30 urls
------------------------------
Xong! Đã lưu 30 dòng vào: d:\Vscode\Self\Data Science\Book Notes\Data Visualization\Lab\Lab1\Lab1---Data-Visualization-\data\

KeyboardInterrupt: 