## Data Crawl từ batdongsan.com.vn

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import undetected_chromedriver as uc
import time
import pandas as pd
import os

# 1 - 200

In [15]:
# Crawl the listing pages named in the hanoi_1 link file and append the scraped
# address/spec fields to the matching output CSV, resuming from any earlier run.

# ==== LOAD LINKS FROM CSV ====
# os.path.join keeps the paths portable instead of hard-coding Windows "\\" separators.
df_links = pd.read_csv(os.path.join("Link", "links_bds_hanoi_1.csv"))
all_links = df_links["Link"].dropna().unique().tolist()

# ==== FIND LINKS THAT WERE ALREADY CRAWLED ====
output_file = os.path.join("Data", "batdongsan_full_data_hanoi_1.csv")
# Create the output folder up front so the first batch save cannot fail on a missing directory.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
if os.path.exists(output_file):
    df_done = pd.read_csv(output_file)
    done_links = df_done["Link"].dropna().unique().tolist()
    print(f"Đã có {len(done_links)} link đã crawl. Sẽ skip...")
else:
    df_done = pd.DataFrame()
    done_links = []

# ==== BUILD THE LIST OF LINKS STILL TO CRAWL ====
# Set lookup avoids rescanning the whole done list for every link; input order is preserved.
done_link_set = set(done_links)
pending_links = [link for link in all_links if link not in done_link_set]
print(f"Còn lại {len(pending_links)} link chưa crawl.")

# ==== SPLIT INTO BATCHES ====
BATCH_SIZE = 20
batches = [pending_links[i:i+BATCH_SIZE] for i in range(0, len(pending_links), BATCH_SIZE)]

# ==== SELENIUM SETUP ====
# The browser is started only after the CSV files have loaded, so a missing or
# malformed file cannot leave a stray Chrome process behind.
options = uc.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--disable-notifications")
options.add_argument("--disable-popup-blocking")

# NOTE(review): version_main presumably has to match the locally installed Chrome major version - confirm.
driver = uc.Chrome(options=options, version_main=136)

try:
    for batch_num, batch_links in enumerate(batches, 1):
        batch_data = []

        print(f"\nBatch {batch_num}/{len(batches)} - gồm {len(batch_links)} link")
        for link in batch_links:
            try:
                driver.get(link)

                # Wait up to 10 s for the address line and for the spec rows to be present.
                address = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".re__pr-short-description.js__pr-address"))
                ).text

                specs_items = WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".re__pr-specs-content-item"))
                )

                # Each spec row contributes one column: title text -> value text.
                features = {}
                for item in specs_items:
                    key = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-title").text.strip()
                    value = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-value").text.strip()
                    features[key] = value

                data = {
                    "Link": link,
                    "Địa chỉ": address,
                    **features
                }

                batch_data.append(data)
                print(f"OK: {link}")

            except Exception as e:
                # Best effort: a failed link is reported and skipped; it produces no
                # row, so it stays pending for the next run.
                print(f"Lỗi với {link}: {e}")

        # SAVE AFTER EVERY BATCH so an interrupted run loses at most one batch.
        df_batch = pd.DataFrame(batch_data)
        if not df_batch.empty:
            df_done = pd.concat([df_done, df_batch], ignore_index=True)
            df_done = df_done.drop_duplicates(subset="Link")
            df_done.to_csv(output_file, index=False, encoding="utf-8-sig")
            print(f"Đã lưu xong batch {batch_num} ({len(df_batch)} link mới)")
finally:
    # Always close the browser, including on KeyboardInterrupt or a failed save.
    driver.quit()

print(f"\nCrawl hoàn tất. Tổng cộng đã lưu {len(df_done)} bài đăng vào '{output_file}'")


Đã có 40 link đã crawl. Sẽ skip...
Còn lại 3763 link chưa crawl.

Batch 1/189 - gồm 20 link
OK: https://batdongsan.com.vn/ban-can-ho-chung-cu-xa-duong-xa-prj-vinhomes-ocean-park-gia-lam/quy-1-ngu-2-ngu-chinh-chu-view-dep-re-phap-ly-ro-rang-tro-vay-80-pr42647132
OK: https://batdongsan.com.vn/ban-nha-rieng-phuong-khuong-mai/thoi-diem-vang-dau-tu-bds-dong-tien-toa-115m2-8-tang-voi-31p-cho-thue-200tr-thang-nga-tu-so-pr42682851
OK: https://batdongsan.com.vn/ban-can-ho-chung-cu-phuong-dong-ngac-prj-sunshine-city/chi-mot-duy-nhat-s5-2pn-1-view-cuc-pham-tang-cao-hiem-co-pr43141310
OK: https://batdongsan.com.vn/ban-can-ho-chung-cu-duong-le-quang-dao-phuong-me-tri-prj-the-matrix-one/mo-ban-gd2-my-dinh-quy-dep-chiet-khau-cao-htls-0-pr42333189
OK: https://batdongsan.com.vn/ban-nha-rieng-duong-trich-sai-phuong-buoi/chinh-chu-can-ban-214m-dat-kem-toa-5-tang-pr42823247
OK: https://batdongsan.com.vn/ban-can-ho-chung-cu-thi-tran-trau-quy-prj-masteri-lakeside/-2n-3n-view-re-dep-nhat-truong-ban-giao-xin-

# 201 - 400

In [16]:
# Crawl the listing pages named in the hanoi_2 link file and append the scraped
# address/spec fields to the matching output CSV, resuming from any earlier run.

# ==== LOAD LINKS FROM CSV ====
# os.path.join keeps the paths portable instead of hard-coding Windows "\\" separators.
df_links = pd.read_csv(os.path.join("Link", "links_bds_hanoi_2.csv"))
all_links = df_links["Link"].dropna().unique().tolist()

# ==== FIND LINKS THAT WERE ALREADY CRAWLED ====
output_file = os.path.join("Data", "batdongsan_full_data_hanoi_2.csv")
# Create the output folder up front so the first batch save cannot fail on a missing directory.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
if os.path.exists(output_file):
    df_done = pd.read_csv(output_file)
    done_links = df_done["Link"].dropna().unique().tolist()
    print(f"Đã có {len(done_links)} link đã crawl. Sẽ skip...")
else:
    df_done = pd.DataFrame()
    done_links = []

# ==== BUILD THE LIST OF LINKS STILL TO CRAWL ====
# Set lookup avoids rescanning the whole done list for every link; input order is preserved.
done_link_set = set(done_links)
pending_links = [link for link in all_links if link not in done_link_set]
print(f"Còn lại {len(pending_links)} link chưa crawl.")

# ==== SPLIT INTO BATCHES ====
BATCH_SIZE = 20
batches = [pending_links[i:i+BATCH_SIZE] for i in range(0, len(pending_links), BATCH_SIZE)]

# ==== SELENIUM SETUP ====
# The browser is started only after the CSV files have loaded, so a missing or
# malformed file cannot leave a stray Chrome process behind.
options = uc.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--disable-notifications")
options.add_argument("--disable-popup-blocking")

# NOTE(review): version_main presumably has to match the locally installed Chrome major version - confirm.
driver = uc.Chrome(options=options, version_main=136)

try:
    for batch_num, batch_links in enumerate(batches, 1):
        batch_data = []

        print(f"\nBatch {batch_num}/{len(batches)} - gồm {len(batch_links)} link")
        for link in batch_links:
            try:
                driver.get(link)

                # Wait up to 10 s for the address line and for the spec rows to be present.
                address = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".re__pr-short-description.js__pr-address"))
                ).text

                specs_items = WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".re__pr-specs-content-item"))
                )

                # Each spec row contributes one column: title text -> value text.
                features = {}
                for item in specs_items:
                    key = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-title").text.strip()
                    value = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-value").text.strip()
                    features[key] = value

                data = {
                    "Link": link,
                    "Địa chỉ": address,
                    **features
                }

                batch_data.append(data)
                print(f"OK: {link}")

            except Exception as e:
                # Best effort: a failed link is reported and skipped; it produces no
                # row, so it stays pending for the next run.
                print(f"Lỗi với {link}: {e}")

        # SAVE AFTER EVERY BATCH so an interrupted run loses at most one batch.
        df_batch = pd.DataFrame(batch_data)
        if not df_batch.empty:
            df_done = pd.concat([df_done, df_batch], ignore_index=True)
            df_done = df_done.drop_duplicates(subset="Link")
            df_done.to_csv(output_file, index=False, encoding="utf-8-sig")
            print(f"Đã lưu xong batch {batch_num} ({len(df_batch)} link mới)")
finally:
    # Always close the browser, including on KeyboardInterrupt or a failed save.
    driver.quit()

print(f"\nCrawl hoàn tất. Tổng cộng đã lưu {len(df_done)} bài đăng vào '{output_file}'")


Đã có 29 link đã crawl. Sẽ skip...
Còn lại 3824 link chưa crawl.

Batch 1/192 - gồm 20 link
OK: https://batdongsan.com.vn/ban-nha-rieng-pho-van-phuc-phuong-van-phuc-1/ban-gap-nr-16-9-ty-62m2-5pn-5wc-oto-tranh-vao-lam-viec-chinh-chu-ha-dong-pr42930993
OK: https://batdongsan.com.vn/ban-dat-duong-lac-long-quan-phuong-nghia-do-1/-lo-goc-2-mat-tien-ngo-o-to-tranh-kinh-doanh-thong-cac-nga-gan-ho-tay-pr43097167
OK: https://batdongsan.com.vn/ban-dat-xa-dong-my_1/ban-my-thanh-tri-60m2-mat-duong-thong-kinh-doanh-o-to-tranh-gia-chi-hon-5-ty-pr43135978
OK: https://batdongsan.com.vn/ban-can-ho-chung-cu-pho-lang-ha-phuong-lang-ha-prj-green-diamond-93-lang-ha/nhuong-2-ngu-91m-nguyen-ban-chua-chinh-sua-moi-100-du-an-diamond-pr41425463
OK: https://batdongsan.com.vn/ban-dat-duong-minh-khai-phuong-minh-khai/-hiem-mt-6m-ngo-o-to-tai-tranh-kinh-doanh-3-buoc-ra-mp-nhinh-9ty-pr43069309
OK: https://batdongsan.com.vn/ban-can-ho-chung-cu-duong-duong-van-be-phuong-vinh-tuy-prj-sunshine-garden/-cao-cap-107m-3pn-n

# 401 - 600

In [3]:
# Crawl the listing pages named in the hanoi_3 link file and append the scraped
# address/spec fields to the matching output CSV, resuming from any earlier run.

# ==== LOAD LINKS FROM CSV ====
# os.path.join keeps the paths portable instead of hard-coding Windows "\\" separators.
df_links = pd.read_csv(os.path.join("Link", "links_bds_hanoi_3.csv"))
all_links = df_links["Link"].dropna().unique().tolist()

# ==== FIND LINKS THAT WERE ALREADY CRAWLED ====
output_file = os.path.join("Data", "batdongsan_full_data_hanoi_3.csv")
# Create the output folder up front so the first batch save cannot fail on a missing directory.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
if os.path.exists(output_file):
    df_done = pd.read_csv(output_file)
    done_links = df_done["Link"].dropna().unique().tolist()
    print(f"Đã có {len(done_links)} link đã crawl. Sẽ skip...")
else:
    df_done = pd.DataFrame()
    done_links = []

# ==== BUILD THE LIST OF LINKS STILL TO CRAWL ====
# Set lookup avoids rescanning the whole done list for every link; input order is preserved.
done_link_set = set(done_links)
pending_links = [link for link in all_links if link not in done_link_set]
print(f"Còn lại {len(pending_links)} link chưa crawl.")

# ==== SPLIT INTO BATCHES ====
BATCH_SIZE = 20
batches = [pending_links[i:i+BATCH_SIZE] for i in range(0, len(pending_links), BATCH_SIZE)]

# ==== SELENIUM SETUP ====
# The browser is started only after the CSV files have loaded, so a missing or
# malformed file cannot leave a stray Chrome process behind.
options = uc.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--disable-notifications")
options.add_argument("--disable-popup-blocking")

# NOTE(review): version_main presumably has to match the locally installed Chrome major version - confirm.
driver = uc.Chrome(options=options, version_main=136)

try:
    for batch_num, batch_links in enumerate(batches, 1):
        batch_data = []

        print(f"\nBatch {batch_num}/{len(batches)} - gồm {len(batch_links)} link")
        for link in batch_links:
            try:
                driver.get(link)

                # Wait up to 10 s for the address line and for the spec rows to be present.
                address = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".re__pr-short-description.js__pr-address"))
                ).text

                specs_items = WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".re__pr-specs-content-item"))
                )

                # Each spec row contributes one column: title text -> value text.
                features = {}
                for item in specs_items:
                    key = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-title").text.strip()
                    value = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-value").text.strip()
                    features[key] = value

                data = {
                    "Link": link,
                    "Địa chỉ": address,
                    **features
                }

                batch_data.append(data)
                print(f"OK: {link}")

            except Exception as e:
                # Best effort: a failed link is reported and skipped; it produces no
                # row, so it stays pending for the next run.
                print(f"Lỗi với {link}: {e}")

        # SAVE AFTER EVERY BATCH so an interrupted run loses at most one batch.
        df_batch = pd.DataFrame(batch_data)
        if not df_batch.empty:
            df_done = pd.concat([df_done, df_batch], ignore_index=True)
            df_done = df_done.drop_duplicates(subset="Link")
            df_done.to_csv(output_file, index=False, encoding="utf-8-sig")
            print(f"Đã lưu xong batch {batch_num} ({len(df_batch)} link mới)")
finally:
    # Always close the browser, including on KeyboardInterrupt or a failed save.
    driver.quit()

print(f"\nCrawl hoàn tất. Tổng cộng đã lưu {len(df_done)} bài đăng vào '{output_file}'")


Còn lại 3813 link chưa crawl.

Batch 1/191 - gồm 20 link
OK: https://batdongsan.com.vn/ban-nha-rieng-pho-tran-cung-phuong-nghia-tan-3/ban-phan-lo-17-5-ty-55m-7-tang-mt-3-5m-thang-may-via-he-o-to-h-cau-giay-pr43127615
OK: https://batdongsan.com.vn/ban-nha-rieng-phuong-xuan-la/ban-giap-ngoai-giao-doan-tay-ho-dien-tich-65m2-6-tang-mt-5m-gia-chi-15-ty-pr43139550
OK: https://batdongsan.com.vn/ban-nha-rieng-phuong-duong-noi/-view-cong-vien-50m-kinh-doanh-van-phong-gia-mem-a-pr42076183
OK: https://batdongsan.com.vn/ban-nha-mat-pho-pho-kham-thien-phuong-tho-quan/ban-dong-da-74m2-23-8-ty-cuc-dep-san-cho-thue-cua-hang-co-dong-tien-san-pr42355003
OK: https://batdongsan.com.vn/ban-nha-rieng-duong-nguyen-chi-thanh-phuong-lang-ha/ban-5-tang-phan-lo-pho-42m2-18-2-ty-pr43018984
OK: https://batdongsan.com.vn/ban-dat-xa-song-phuong-1/-tho-cu-sat-dai-lo-thang-long-duong-truoc-rong-6m-pr43013077
OK: https://batdongsan.com.vn/ban-can-ho-chung-cu-pho-lang-ha-phuong-lang-ha-prj-green-diamond-93-lang-ha/nhuon

# 601 - 800

In [4]:
# Crawl the listing pages named in the hanoi_4 link file and append the scraped
# address/spec fields to the matching output CSV, resuming from any earlier run.

# ==== LOAD LINKS FROM CSV ====
# os.path.join keeps the paths portable instead of hard-coding Windows "\\" separators.
df_links = pd.read_csv(os.path.join("Link", "links_bds_hanoi_4.csv"))
all_links = df_links["Link"].dropna().unique().tolist()

# ==== FIND LINKS THAT WERE ALREADY CRAWLED ====
output_file = os.path.join("Data", "batdongsan_full_data_hanoi_4.csv")
# Create the output folder up front so the first batch save cannot fail on a missing directory.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
if os.path.exists(output_file):
    df_done = pd.read_csv(output_file)
    done_links = df_done["Link"].dropna().unique().tolist()
    print(f"Đã có {len(done_links)} link đã crawl. Sẽ skip...")
else:
    df_done = pd.DataFrame()
    done_links = []

# ==== BUILD THE LIST OF LINKS STILL TO CRAWL ====
# Set lookup avoids rescanning the whole done list for every link; input order is preserved.
done_link_set = set(done_links)
pending_links = [link for link in all_links if link not in done_link_set]
print(f"Còn lại {len(pending_links)} link chưa crawl.")

# ==== SPLIT INTO BATCHES ====
BATCH_SIZE = 20
batches = [pending_links[i:i+BATCH_SIZE] for i in range(0, len(pending_links), BATCH_SIZE)]

# ==== SELENIUM SETUP ====
# The browser is started only after the CSV files have loaded, so a missing or
# malformed file cannot leave a stray Chrome process behind.
options = uc.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--disable-notifications")
options.add_argument("--disable-popup-blocking")

# NOTE(review): version_main presumably has to match the locally installed Chrome major version - confirm.
driver = uc.Chrome(options=options, version_main=136)

try:
    for batch_num, batch_links in enumerate(batches, 1):
        batch_data = []

        print(f"\nBatch {batch_num}/{len(batches)} - gồm {len(batch_links)} link")
        for link in batch_links:
            try:
                driver.get(link)

                # Wait up to 10 s for the address line and for the spec rows to be present.
                address = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".re__pr-short-description.js__pr-address"))
                ).text

                specs_items = WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".re__pr-specs-content-item"))
                )

                # Each spec row contributes one column: title text -> value text.
                features = {}
                for item in specs_items:
                    key = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-title").text.strip()
                    value = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-value").text.strip()
                    features[key] = value

                data = {
                    "Link": link,
                    "Địa chỉ": address,
                    **features
                }

                batch_data.append(data)
                print(f"OK: {link}")

            except Exception as e:
                # Best effort: a failed link is reported and skipped; it produces no
                # row, so it stays pending for the next run.
                print(f"Lỗi với {link}: {e}")

        # SAVE AFTER EVERY BATCH so an interrupted run loses at most one batch.
        df_batch = pd.DataFrame(batch_data)
        if not df_batch.empty:
            df_done = pd.concat([df_done, df_batch], ignore_index=True)
            df_done = df_done.drop_duplicates(subset="Link")
            df_done.to_csv(output_file, index=False, encoding="utf-8-sig")
            print(f"Đã lưu xong batch {batch_num} ({len(df_batch)} link mới)")
finally:
    # Always close the browser, including on KeyboardInterrupt or a failed save.
    driver.quit()

print(f"\nCrawl hoàn tất. Tổng cộng đã lưu {len(df_done)} bài đăng vào '{output_file}'")


Còn lại 3611 link chưa crawl.

Batch 1/181 - gồm 20 link
OK: https://batdongsan.com.vn/ban-can-ho-chung-cu-duong-pham-van-bach-phuong-yen-hoa-2-prj-golden-park-tower/-ban-goc-hau-hiem-115m2-3pn-tang-trung-view-tng-lh-pr43003553
OK: https://batdongsan.com.vn/ban-nha-biet-thu-lien-ke-duong-le-trong-tan-phuong-phu-la-prj-khu-do-thi-moi-van-phu/chinh-chu-ban-lk-ha-dong-90m2-thang-may-vi-tri-dac-dia-2-thoang-nhinh-20-ty-pr41819564
OK: https://batdongsan.com.vn/ban-dat-xa-nguyen-khe/kinh-doanh-duong-6m-oto-tranh-hai-mat-duong-chia-2-lo-duoc-30m-ra-san-bong-va-san-choi-rong-rai-pr43124821
OK: https://batdongsan.com.vn/ban-can-ho-chung-cu-pho-hoang-dao-thuy-phuong-trung-hoa-4-prj-khu-do-thi-trung-hoa-nhan-chinh/ban-gap-cc-3pn-2wc-146m2-tai-34t-cau-giay-ha-noi-gia-9-ty-vnd-pr42434906
OK: https://batdongsan.com.vn/ban-nha-rieng-phuong-xuan-tao/chinh-chu-can-ban-phan-lo-55-60m2-ngo-o-to-tranh-dung-do-thoai-mai-pr43050813
OK: https://batdongsan.com.vn/ban-nha-biet-thu-lien-ke-phuong-phu-thuong-1-p

# 801 - 1000

In [2]:
# Crawl the listing pages named in the hanoi_5 link file and append the scraped
# address/spec fields to the matching output CSV, resuming from any earlier run.

# ==== LOAD LINKS FROM CSV ====
# os.path.join keeps the paths portable instead of hard-coding Windows "\\" separators.
df_links = pd.read_csv(os.path.join("Link", "links_bds_hanoi_5.csv"))
all_links = df_links["Link"].dropna().unique().tolist()

# ==== FIND LINKS THAT WERE ALREADY CRAWLED ====
output_file = os.path.join("Data", "batdongsan_full_data_hanoi_5.csv")
# Create the output folder up front so the first batch save cannot fail on a missing directory.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
if os.path.exists(output_file):
    df_done = pd.read_csv(output_file)
    done_links = df_done["Link"].dropna().unique().tolist()
    print(f"Đã có {len(done_links)} link đã crawl. Sẽ skip...")
else:
    df_done = pd.DataFrame()
    done_links = []

# ==== BUILD THE LIST OF LINKS STILL TO CRAWL ====
# Set lookup avoids rescanning the whole done list for every link; input order is preserved.
done_link_set = set(done_links)
pending_links = [link for link in all_links if link not in done_link_set]
print(f"Còn lại {len(pending_links)} link chưa crawl.")

# ==== SPLIT INTO BATCHES ====
BATCH_SIZE = 20
batches = [pending_links[i:i+BATCH_SIZE] for i in range(0, len(pending_links), BATCH_SIZE)]

# ==== SELENIUM SETUP ====
# The browser is started only after the CSV files have loaded, so a missing or
# malformed file cannot leave a stray Chrome process behind.
options = uc.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--disable-notifications")
options.add_argument("--disable-popup-blocking")

# NOTE(review): version_main presumably has to match the locally installed Chrome major version - confirm.
driver = uc.Chrome(options=options, version_main=136)

try:
    for batch_num, batch_links in enumerate(batches, 1):
        batch_data = []

        print(f"\nBatch {batch_num}/{len(batches)} - gồm {len(batch_links)} link")
        for link in batch_links:
            try:
                driver.get(link)

                # Wait up to 10 s for the address line and for the spec rows to be present.
                address = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".re__pr-short-description.js__pr-address"))
                ).text

                specs_items = WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".re__pr-specs-content-item"))
                )

                # Each spec row contributes one column: title text -> value text.
                features = {}
                for item in specs_items:
                    key = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-title").text.strip()
                    value = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-value").text.strip()
                    features[key] = value

                data = {
                    "Link": link,
                    "Địa chỉ": address,
                    **features
                }

                batch_data.append(data)
                print(f"OK: {link}")

            except Exception as e:
                # Best effort: a failed link is reported and skipped; it produces no
                # row, so it stays pending for the next run.
                print(f"Lỗi với {link}: {e}")

        # SAVE AFTER EVERY BATCH so an interrupted run loses at most one batch.
        df_batch = pd.DataFrame(batch_data)
        if not df_batch.empty:
            df_done = pd.concat([df_done, df_batch], ignore_index=True)
            df_done = df_done.drop_duplicates(subset="Link")
            df_done.to_csv(output_file, index=False, encoding="utf-8-sig")
            print(f"Đã lưu xong batch {batch_num} ({len(df_batch)} link mới)")
finally:
    # Always close the browser, including on KeyboardInterrupt or a failed save.
    driver.quit()

print(f"\nCrawl hoàn tất. Tổng cộng đã lưu {len(df_done)} bài đăng vào '{output_file}'")


Đã có 2999 link đã crawl. Sẽ skip...
Còn lại 761 link chưa crawl.

Batch 1/39 - gồm 20 link
OK: https://batdongsan.com.vn/ban-nha-rieng-duong-doi-can-phuong-doi-can/sieu-pham-ho-khau-lieu-giai-ba-dinh-7-5-ty-35m-moi-o-ngay-pr43135867
OK: https://batdongsan.com.vn/ban-nha-rieng-duong-kim-nguu-phuong-thanh-luong/ban-mat-pho-80m2-370-trieu-tai-hai-ba-trung-ha-noi-pr43115000
OK: https://batdongsan.com.vn/ban-can-ho-chung-cu-duong-giai-phong-phuong-phuong-liet-prj-imperial-plaza/ban-gap-dep-nhat-toa-3pn-95m2-toa-ip3-lh-0902030906-pr30949849
OK: https://batdongsan.com.vn/ban-nha-mat-pho-duong-tan-thuy-phuong-phuc-dong/ban-long-bien-ha-noi-35-ty-90-6m2-chinh-chu-uy-tin-pr43130867
OK: https://batdongsan.com.vn/ban-nha-mat-pho-duong-tay-ho-phuong-quang-an-2/ban-biet-thu-san-vuon-dang-cap-240m2-x-5-tang-mt-8-8m-gia-88-ty-truoc-13m-pr43131007
OK: https://batdongsan.com.vn/ban-nha-rieng-duong-kim-giang-phuong-thanh-xuan-bac/ban-5-tang-36m-tai-gan-o-to-ngo-thong-gia-chi-6-6-ty-pr43075169
OK: https:

# 1001 - 1200

In [None]:
# Crawl the listing pages named in the hanoi_6 link file and append the scraped
# address/spec fields to the matching output CSV, resuming from any earlier run.

# ==== LOAD LINKS FROM CSV ====
# os.path.join keeps the paths portable instead of hard-coding Windows "\\" separators.
df_links = pd.read_csv(os.path.join("Link", "links_bds_hanoi_6.csv"))
all_links = df_links["Link"].dropna().unique().tolist()

# ==== FIND LINKS THAT WERE ALREADY CRAWLED ====
output_file = os.path.join("Data", "batdongsan_full_data_hanoi_6.csv")
# Create the output folder up front so the first batch save cannot fail on a missing directory.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
if os.path.exists(output_file):
    df_done = pd.read_csv(output_file)
    done_links = df_done["Link"].dropna().unique().tolist()
    print(f"Đã có {len(done_links)} link đã crawl. Sẽ skip...")
else:
    df_done = pd.DataFrame()
    done_links = []

# ==== BUILD THE LIST OF LINKS STILL TO CRAWL ====
# Set lookup avoids rescanning the whole done list for every link; input order is preserved.
done_link_set = set(done_links)
pending_links = [link for link in all_links if link not in done_link_set]
print(f"Còn lại {len(pending_links)} link chưa crawl.")

# ==== SPLIT INTO BATCHES ====
BATCH_SIZE = 20
batches = [pending_links[i:i+BATCH_SIZE] for i in range(0, len(pending_links), BATCH_SIZE)]

# ==== SELENIUM SETUP ====
# The browser is started only after the CSV files have loaded, so a missing or
# malformed file cannot leave a stray Chrome process behind.
options = uc.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--disable-notifications")
options.add_argument("--disable-popup-blocking")

# NOTE(review): version_main presumably has to match the locally installed Chrome major version - confirm.
driver = uc.Chrome(options=options, version_main=136)

try:
    for batch_num, batch_links in enumerate(batches, 1):
        batch_data = []

        print(f"\nBatch {batch_num}/{len(batches)} - gồm {len(batch_links)} link")
        for link in batch_links:
            try:
                driver.get(link)

                # Wait up to 10 s for the address line and for the spec rows to be present.
                address = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".re__pr-short-description.js__pr-address"))
                ).text

                specs_items = WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".re__pr-specs-content-item"))
                )

                # Each spec row contributes one column: title text -> value text.
                features = {}
                for item in specs_items:
                    key = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-title").text.strip()
                    value = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-value").text.strip()
                    features[key] = value

                data = {
                    "Link": link,
                    "Địa chỉ": address,
                    **features
                }

                batch_data.append(data)
                print(f"OK: {link}")

            except Exception as e:
                # Best effort: a failed link is reported and skipped; it produces no
                # row, so it stays pending for the next run.
                print(f"Lỗi với {link}: {e}")

        # SAVE AFTER EVERY BATCH so an interrupted run loses at most one batch.
        df_batch = pd.DataFrame(batch_data)
        if not df_batch.empty:
            df_done = pd.concat([df_done, df_batch], ignore_index=True)
            df_done = df_done.drop_duplicates(subset="Link")
            df_done.to_csv(output_file, index=False, encoding="utf-8-sig")
            print(f"Đã lưu xong batch {batch_num} ({len(df_batch)} link mới)")
finally:
    # Always close the browser, including on KeyboardInterrupt or a failed save.
    driver.quit()

print(f"\nCrawl hoàn tất. Tổng cộng đã lưu {len(df_done)} bài đăng vào '{output_file}'")


# 1201 - 1400

In [None]:
# Crawl the listing pages named in the hanoi_7 link file and append the scraped
# address/spec fields to the matching output CSV, resuming from any earlier run.

# ==== LOAD LINKS FROM CSV ====
# os.path.join keeps the paths portable instead of hard-coding Windows "\\" separators.
df_links = pd.read_csv(os.path.join("Link", "links_bds_hanoi_7.csv"))
all_links = df_links["Link"].dropna().unique().tolist()

# ==== FIND LINKS THAT WERE ALREADY CRAWLED ====
output_file = os.path.join("Data", "batdongsan_full_data_hanoi_7.csv")
# Create the output folder up front so the first batch save cannot fail on a missing directory.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
if os.path.exists(output_file):
    df_done = pd.read_csv(output_file)
    done_links = df_done["Link"].dropna().unique().tolist()
    print(f"Đã có {len(done_links)} link đã crawl. Sẽ skip...")
else:
    df_done = pd.DataFrame()
    done_links = []

# ==== BUILD THE LIST OF LINKS STILL TO CRAWL ====
# Set lookup avoids rescanning the whole done list for every link; input order is preserved.
done_link_set = set(done_links)
pending_links = [link for link in all_links if link not in done_link_set]
print(f"Còn lại {len(pending_links)} link chưa crawl.")

# ==== SPLIT INTO BATCHES ====
BATCH_SIZE = 20
batches = [pending_links[i:i+BATCH_SIZE] for i in range(0, len(pending_links), BATCH_SIZE)]

# ==== SELENIUM SETUP ====
# The browser is started only after the CSV files have loaded, so a missing or
# malformed file cannot leave a stray Chrome process behind.
options = uc.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--disable-notifications")
options.add_argument("--disable-popup-blocking")

# NOTE(review): version_main presumably has to match the locally installed Chrome major version - confirm.
driver = uc.Chrome(options=options, version_main=136)

try:
    for batch_num, batch_links in enumerate(batches, 1):
        batch_data = []

        print(f"\nBatch {batch_num}/{len(batches)} - gồm {len(batch_links)} link")
        for link in batch_links:
            try:
                driver.get(link)

                # Wait up to 10 s for the address line and for the spec rows to be present.
                address = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".re__pr-short-description.js__pr-address"))
                ).text

                specs_items = WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".re__pr-specs-content-item"))
                )

                # Each spec row contributes one column: title text -> value text.
                features = {}
                for item in specs_items:
                    key = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-title").text.strip()
                    value = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-value").text.strip()
                    features[key] = value

                data = {
                    "Link": link,
                    "Địa chỉ": address,
                    **features
                }

                batch_data.append(data)
                print(f"OK: {link}")

            except Exception as e:
                # Best effort: a failed link is reported and skipped; it produces no
                # row, so it stays pending for the next run.
                print(f"Lỗi với {link}: {e}")

        # SAVE AFTER EVERY BATCH so an interrupted run loses at most one batch.
        df_batch = pd.DataFrame(batch_data)
        if not df_batch.empty:
            df_done = pd.concat([df_done, df_batch], ignore_index=True)
            df_done = df_done.drop_duplicates(subset="Link")
            df_done.to_csv(output_file, index=False, encoding="utf-8-sig")
            print(f"Đã lưu xong batch {batch_num} ({len(df_batch)} link mới)")
finally:
    # Always close the browser, including on KeyboardInterrupt or a failed save.
    driver.quit()

print(f"\nCrawl hoàn tất. Tổng cộng đã lưu {len(df_done)} bài đăng vào '{output_file}'")


# 1401 - 1600

In [None]:
# Crawl the listing pages named in the hanoi_8 link file and append the scraped
# address/spec fields to the matching output CSV, resuming from any earlier run.

# ==== LOAD LINKS FROM CSV ====
# os.path.join keeps the paths portable instead of hard-coding Windows "\\" separators.
df_links = pd.read_csv(os.path.join("Link", "links_bds_hanoi_8.csv"))
all_links = df_links["Link"].dropna().unique().tolist()

# ==== FIND LINKS THAT WERE ALREADY CRAWLED ====
output_file = os.path.join("Data", "batdongsan_full_data_hanoi_8.csv")
# Create the output folder up front so the first batch save cannot fail on a missing directory.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
if os.path.exists(output_file):
    df_done = pd.read_csv(output_file)
    done_links = df_done["Link"].dropna().unique().tolist()
    print(f"Đã có {len(done_links)} link đã crawl. Sẽ skip...")
else:
    df_done = pd.DataFrame()
    done_links = []

# ==== BUILD THE LIST OF LINKS STILL TO CRAWL ====
# Set lookup avoids rescanning the whole done list for every link; input order is preserved.
done_link_set = set(done_links)
pending_links = [link for link in all_links if link not in done_link_set]
print(f"Còn lại {len(pending_links)} link chưa crawl.")

# ==== SPLIT INTO BATCHES ====
BATCH_SIZE = 20
batches = [pending_links[i:i+BATCH_SIZE] for i in range(0, len(pending_links), BATCH_SIZE)]

# ==== SELENIUM SETUP ====
# The browser is started only after the CSV files have loaded, so a missing or
# malformed file cannot leave a stray Chrome process behind.
options = uc.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--disable-notifications")
options.add_argument("--disable-popup-blocking")

# NOTE(review): version_main presumably has to match the locally installed Chrome major version - confirm.
driver = uc.Chrome(options=options, version_main=136)

try:
    for batch_num, batch_links in enumerate(batches, 1):
        batch_data = []

        print(f"\nBatch {batch_num}/{len(batches)} - gồm {len(batch_links)} link")
        for link in batch_links:
            try:
                driver.get(link)

                # Wait up to 10 s for the address line and for the spec rows to be present.
                address = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".re__pr-short-description.js__pr-address"))
                ).text

                specs_items = WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".re__pr-specs-content-item"))
                )

                # Each spec row contributes one column: title text -> value text.
                features = {}
                for item in specs_items:
                    key = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-title").text.strip()
                    value = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-value").text.strip()
                    features[key] = value

                data = {
                    "Link": link,
                    "Địa chỉ": address,
                    **features
                }

                batch_data.append(data)
                print(f"OK: {link}")

            except Exception as e:
                # Best effort: a failed link is reported and skipped; it produces no
                # row, so it stays pending for the next run.
                print(f"Lỗi với {link}: {e}")

        # SAVE AFTER EVERY BATCH so an interrupted run loses at most one batch.
        df_batch = pd.DataFrame(batch_data)
        if not df_batch.empty:
            df_done = pd.concat([df_done, df_batch], ignore_index=True)
            df_done = df_done.drop_duplicates(subset="Link")
            df_done.to_csv(output_file, index=False, encoding="utf-8-sig")
            print(f"Đã lưu xong batch {batch_num} ({len(df_batch)} link mới)")
finally:
    # Always close the browser, including on KeyboardInterrupt or a failed save.
    driver.quit()

print(f"\nCrawl hoàn tất. Tổng cộng đã lưu {len(df_done)} bài đăng vào '{output_file}'")


# 1601 - 1800

In [None]:
# Crawl the listing pages named in the hanoi_9 link file and append the scraped
# address/spec fields to the matching output CSV, resuming from any earlier run.

# ==== LOAD LINKS FROM CSV ====
# os.path.join keeps the paths portable instead of hard-coding Windows "\\" separators.
df_links = pd.read_csv(os.path.join("Link", "links_bds_hanoi_9.csv"))
all_links = df_links["Link"].dropna().unique().tolist()

# ==== FIND LINKS THAT WERE ALREADY CRAWLED ====
output_file = os.path.join("Data", "batdongsan_full_data_hanoi_9.csv")
# Create the output folder up front so the first batch save cannot fail on a missing directory.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
if os.path.exists(output_file):
    df_done = pd.read_csv(output_file)
    done_links = df_done["Link"].dropna().unique().tolist()
    print(f"Đã có {len(done_links)} link đã crawl. Sẽ skip...")
else:
    df_done = pd.DataFrame()
    done_links = []

# ==== BUILD THE LIST OF LINKS STILL TO CRAWL ====
# Set lookup avoids rescanning the whole done list for every link; input order is preserved.
done_link_set = set(done_links)
pending_links = [link for link in all_links if link not in done_link_set]
print(f"Còn lại {len(pending_links)} link chưa crawl.")

# ==== SPLIT INTO BATCHES ====
BATCH_SIZE = 20
batches = [pending_links[i:i+BATCH_SIZE] for i in range(0, len(pending_links), BATCH_SIZE)]

# ==== SELENIUM SETUP ====
# The browser is started only after the CSV files have loaded, so a missing or
# malformed file cannot leave a stray Chrome process behind.
options = uc.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--disable-notifications")
options.add_argument("--disable-popup-blocking")

# NOTE(review): version_main presumably has to match the locally installed Chrome major version - confirm.
driver = uc.Chrome(options=options, version_main=136)

try:
    for batch_num, batch_links in enumerate(batches, 1):
        batch_data = []

        print(f"\nBatch {batch_num}/{len(batches)} - gồm {len(batch_links)} link")
        for link in batch_links:
            try:
                driver.get(link)

                # Wait up to 10 s for the address line and for the spec rows to be present.
                address = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".re__pr-short-description.js__pr-address"))
                ).text

                specs_items = WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".re__pr-specs-content-item"))
                )

                # Each spec row contributes one column: title text -> value text.
                features = {}
                for item in specs_items:
                    key = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-title").text.strip()
                    value = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-value").text.strip()
                    features[key] = value

                data = {
                    "Link": link,
                    "Địa chỉ": address,
                    **features
                }

                batch_data.append(data)
                print(f"OK: {link}")

            except Exception as e:
                # Best effort: a failed link is reported and skipped; it produces no
                # row, so it stays pending for the next run.
                print(f"Lỗi với {link}: {e}")

        # SAVE AFTER EVERY BATCH so an interrupted run loses at most one batch.
        df_batch = pd.DataFrame(batch_data)
        if not df_batch.empty:
            df_done = pd.concat([df_done, df_batch], ignore_index=True)
            df_done = df_done.drop_duplicates(subset="Link")
            df_done.to_csv(output_file, index=False, encoding="utf-8-sig")
            print(f"Đã lưu xong batch {batch_num} ({len(df_batch)} link mới)")
finally:
    # Always close the browser, including on KeyboardInterrupt or a failed save.
    driver.quit()

print(f"\nCrawl hoàn tất. Tổng cộng đã lưu {len(df_done)} bài đăng vào '{output_file}'")


# 1801 - 2000

In [None]:
# Crawl the listing pages named in the hanoi_10 link file and append the scraped
# address/spec fields to the matching output CSV, resuming from any earlier run.

# ==== LOAD LINKS FROM CSV ====
# os.path.join keeps the paths portable instead of hard-coding Windows "\\" separators.
df_links = pd.read_csv(os.path.join("Link", "links_bds_hanoi_10.csv"))
all_links = df_links["Link"].dropna().unique().tolist()

# ==== FIND LINKS THAT WERE ALREADY CRAWLED ====
output_file = os.path.join("Data", "batdongsan_full_data_hanoi_10.csv")
# Create the output folder up front so the first batch save cannot fail on a missing directory.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
if os.path.exists(output_file):
    df_done = pd.read_csv(output_file)
    done_links = df_done["Link"].dropna().unique().tolist()
    print(f"Đã có {len(done_links)} link đã crawl. Sẽ skip...")
else:
    df_done = pd.DataFrame()
    done_links = []

# ==== BUILD THE LIST OF LINKS STILL TO CRAWL ====
# Set lookup avoids rescanning the whole done list for every link; input order is preserved.
done_link_set = set(done_links)
pending_links = [link for link in all_links if link not in done_link_set]
print(f"Còn lại {len(pending_links)} link chưa crawl.")

# ==== SPLIT INTO BATCHES ====
BATCH_SIZE = 20
batches = [pending_links[i:i+BATCH_SIZE] for i in range(0, len(pending_links), BATCH_SIZE)]

# ==== SELENIUM SETUP ====
# The browser is started only after the CSV files have loaded, so a missing or
# malformed file cannot leave a stray Chrome process behind.
options = uc.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--disable-notifications")
options.add_argument("--disable-popup-blocking")

# NOTE(review): version_main presumably has to match the locally installed Chrome major version - confirm.
driver = uc.Chrome(options=options, version_main=136)

try:
    for batch_num, batch_links in enumerate(batches, 1):
        batch_data = []

        print(f"\nBatch {batch_num}/{len(batches)} - gồm {len(batch_links)} link")
        for link in batch_links:
            try:
                driver.get(link)

                # Wait up to 10 s for the address line and for the spec rows to be present.
                address = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".re__pr-short-description.js__pr-address"))
                ).text

                specs_items = WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".re__pr-specs-content-item"))
                )

                # Each spec row contributes one column: title text -> value text.
                features = {}
                for item in specs_items:
                    key = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-title").text.strip()
                    value = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-value").text.strip()
                    features[key] = value

                data = {
                    "Link": link,
                    "Địa chỉ": address,
                    **features
                }

                batch_data.append(data)
                print(f"OK: {link}")

            except Exception as e:
                # Best effort: a failed link is reported and skipped; it produces no
                # row, so it stays pending for the next run.
                print(f"Lỗi với {link}: {e}")

        # SAVE AFTER EVERY BATCH so an interrupted run loses at most one batch.
        df_batch = pd.DataFrame(batch_data)
        if not df_batch.empty:
            df_done = pd.concat([df_done, df_batch], ignore_index=True)
            df_done = df_done.drop_duplicates(subset="Link")
            df_done.to_csv(output_file, index=False, encoding="utf-8-sig")
            print(f"Đã lưu xong batch {batch_num} ({len(df_batch)} link mới)")
finally:
    # Always close the browser, including on KeyboardInterrupt or a failed save.
    driver.quit()

print(f"\nCrawl hoàn tất. Tổng cộng đã lưu {len(df_done)} bài đăng vào '{output_file}'")


# 2001 - 2200

In [None]:
# ==== SELENIUM CONFIGURATION ====
options = uc.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--disable-notifications")
options.add_argument("--disable-popup-blocking")

driver = uc.Chrome(options=options, version_main=136)

# Everything after the browser launch is wrapped in try/finally so the Chrome
# session is always closed, even if a CSV is missing or the run is interrupted.
try:
    # ==== INPUT / OUTPUT FILES ====
    # Index of the link-list part handled by this cell; it selects both the
    # input link file and the matching output file.
    PART_INDEX = 11
    links_file = os.path.join("Link", f"links_bds_hanoi_{PART_INDEX}.csv")
    output_file = os.path.join("Data", f"batdongsan_full_data_hanoi_{PART_INDEX}.csv")
    # Make sure the output folder exists so the first batch save cannot fail.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    # ==== READ LINKS FROM THE CSV FILE ====
    df_links = pd.read_csv(links_file)
    all_links = df_links["Link"].dropna().unique().tolist()

    # ==== CHECK WHICH LINKS WERE ALREADY CRAWLED ====
    # An existing output file acts as the checkpoint of a previous run.
    if os.path.exists(output_file):
        df_done = pd.read_csv(output_file)
        done_links = df_done["Link"].dropna().unique().tolist()
        print(f"Đã có {len(done_links)} link đã crawl. Sẽ skip...")
    else:
        df_done = pd.DataFrame()
        done_links = []

    # ==== BUILD THE LIST OF LINKS STILL TO CRAWL ====
    # A set gives constant-time membership tests instead of scanning the
    # list of finished links once per candidate link.
    done_link_set = set(done_links)
    pending_links = [link for link in all_links if link not in done_link_set]
    print(f"Còn lại {len(pending_links)} link chưa crawl.")

    # ==== SPLIT INTO BATCHES ====
    BATCH_SIZE = 20
    batches = [pending_links[i:i+BATCH_SIZE] for i in range(0, len(pending_links), BATCH_SIZE)]

    for batch_num, batch_links in enumerate(batches, 1):
        batch_data = []

        print(f"\nBatch {batch_num}/{len(batches)} - gồm {len(batch_links)} link")
        for link in batch_links:
            try:
                driver.get(link)

                # Wait (up to 10 s) for the address element, then read its text.
                address = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".re__pr-short-description.js__pr-address"))
                ).text

                # Wait (up to 10 s) for the specification rows of the listing.
                specs_items = WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".re__pr-specs-content-item"))
                )

                # Each specification row contributes one title -> value pair.
                features = {}
                for item in specs_items:
                    key = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-title").text.strip()
                    value = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-value").text.strip()
                    features[key] = value

                data = {
                    "Link": link,
                    "Địa chỉ": address,
                    **features
                }

                batch_data.append(data)
                print(f"OK: {link}")

            except Exception as e:
                # Best effort: report the failing link and continue with the next one.
                print(f"Lỗi với {link}: {e}")

        # SAVE AFTER EVERY BATCH so an interrupted run can resume from the CSV.
        df_batch = pd.DataFrame(batch_data)
        if not df_batch.empty:
            df_done = pd.concat([df_done, df_batch], ignore_index=True).drop_duplicates(subset="Link")
            df_done.to_csv(output_file, index=False, encoding="utf-8-sig")
            print(f"Đã lưu xong batch {batch_num} ({len(df_batch)} link mới)")
finally:
    driver.quit()

print(f"\nCrawl hoàn tất. Tổng cộng đã lưu {len(df_done)} bài đăng vào '{output_file}'")


# 2201 - 2400

In [None]:
# ==== SELENIUM CONFIGURATION ====
options = uc.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--disable-notifications")
options.add_argument("--disable-popup-blocking")

driver = uc.Chrome(options=options, version_main=136)

# Everything after the browser launch is wrapped in try/finally so the Chrome
# session is always closed, even if a CSV is missing or the run is interrupted.
try:
    # ==== INPUT / OUTPUT FILES ====
    # Index of the link-list part handled by this cell; it selects both the
    # input link file and the matching output file.
    PART_INDEX = 12
    links_file = os.path.join("Link", f"links_bds_hanoi_{PART_INDEX}.csv")
    output_file = os.path.join("Data", f"batdongsan_full_data_hanoi_{PART_INDEX}.csv")
    # Make sure the output folder exists so the first batch save cannot fail.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    # ==== READ LINKS FROM THE CSV FILE ====
    df_links = pd.read_csv(links_file)
    all_links = df_links["Link"].dropna().unique().tolist()

    # ==== CHECK WHICH LINKS WERE ALREADY CRAWLED ====
    # An existing output file acts as the checkpoint of a previous run.
    if os.path.exists(output_file):
        df_done = pd.read_csv(output_file)
        done_links = df_done["Link"].dropna().unique().tolist()
        print(f"Đã có {len(done_links)} link đã crawl. Sẽ skip...")
    else:
        df_done = pd.DataFrame()
        done_links = []

    # ==== BUILD THE LIST OF LINKS STILL TO CRAWL ====
    # A set gives constant-time membership tests instead of scanning the
    # list of finished links once per candidate link.
    done_link_set = set(done_links)
    pending_links = [link for link in all_links if link not in done_link_set]
    print(f"Còn lại {len(pending_links)} link chưa crawl.")

    # ==== SPLIT INTO BATCHES ====
    BATCH_SIZE = 20
    batches = [pending_links[i:i+BATCH_SIZE] for i in range(0, len(pending_links), BATCH_SIZE)]

    for batch_num, batch_links in enumerate(batches, 1):
        batch_data = []

        print(f"\nBatch {batch_num}/{len(batches)} - gồm {len(batch_links)} link")
        for link in batch_links:
            try:
                driver.get(link)

                # Wait (up to 10 s) for the address element, then read its text.
                address = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".re__pr-short-description.js__pr-address"))
                ).text

                # Wait (up to 10 s) for the specification rows of the listing.
                specs_items = WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".re__pr-specs-content-item"))
                )

                # Each specification row contributes one title -> value pair.
                features = {}
                for item in specs_items:
                    key = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-title").text.strip()
                    value = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-value").text.strip()
                    features[key] = value

                data = {
                    "Link": link,
                    "Địa chỉ": address,
                    **features
                }

                batch_data.append(data)
                print(f"OK: {link}")

            except Exception as e:
                # Best effort: report the failing link and continue with the next one.
                print(f"Lỗi với {link}: {e}")

        # SAVE AFTER EVERY BATCH so an interrupted run can resume from the CSV.
        df_batch = pd.DataFrame(batch_data)
        if not df_batch.empty:
            df_done = pd.concat([df_done, df_batch], ignore_index=True).drop_duplicates(subset="Link")
            df_done.to_csv(output_file, index=False, encoding="utf-8-sig")
            print(f"Đã lưu xong batch {batch_num} ({len(df_batch)} link mới)")
finally:
    driver.quit()

print(f"\nCrawl hoàn tất. Tổng cộng đã lưu {len(df_done)} bài đăng vào '{output_file}'")


# 2401 - 2600

In [None]:
# ==== SELENIUM CONFIGURATION ====
options = uc.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--disable-notifications")
options.add_argument("--disable-popup-blocking")

driver = uc.Chrome(options=options, version_main=136)

# Everything after the browser launch is wrapped in try/finally so the Chrome
# session is always closed, even if a CSV is missing or the run is interrupted.
try:
    # ==== INPUT / OUTPUT FILES ====
    # Index of the link-list part handled by this cell; it selects both the
    # input link file and the matching output file.
    PART_INDEX = 13
    links_file = os.path.join("Link", f"links_bds_hanoi_{PART_INDEX}.csv")
    output_file = os.path.join("Data", f"batdongsan_full_data_hanoi_{PART_INDEX}.csv")
    # Make sure the output folder exists so the first batch save cannot fail.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    # ==== READ LINKS FROM THE CSV FILE ====
    df_links = pd.read_csv(links_file)
    all_links = df_links["Link"].dropna().unique().tolist()

    # ==== CHECK WHICH LINKS WERE ALREADY CRAWLED ====
    # An existing output file acts as the checkpoint of a previous run.
    if os.path.exists(output_file):
        df_done = pd.read_csv(output_file)
        done_links = df_done["Link"].dropna().unique().tolist()
        print(f"Đã có {len(done_links)} link đã crawl. Sẽ skip...")
    else:
        df_done = pd.DataFrame()
        done_links = []

    # ==== BUILD THE LIST OF LINKS STILL TO CRAWL ====
    # A set gives constant-time membership tests instead of scanning the
    # list of finished links once per candidate link.
    done_link_set = set(done_links)
    pending_links = [link for link in all_links if link not in done_link_set]
    print(f"Còn lại {len(pending_links)} link chưa crawl.")

    # ==== SPLIT INTO BATCHES ====
    BATCH_SIZE = 20
    batches = [pending_links[i:i+BATCH_SIZE] for i in range(0, len(pending_links), BATCH_SIZE)]

    for batch_num, batch_links in enumerate(batches, 1):
        batch_data = []

        print(f"\nBatch {batch_num}/{len(batches)} - gồm {len(batch_links)} link")
        for link in batch_links:
            try:
                driver.get(link)

                # Wait (up to 10 s) for the address element, then read its text.
                address = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".re__pr-short-description.js__pr-address"))
                ).text

                # Wait (up to 10 s) for the specification rows of the listing.
                specs_items = WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".re__pr-specs-content-item"))
                )

                # Each specification row contributes one title -> value pair.
                features = {}
                for item in specs_items:
                    key = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-title").text.strip()
                    value = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-value").text.strip()
                    features[key] = value

                data = {
                    "Link": link,
                    "Địa chỉ": address,
                    **features
                }

                batch_data.append(data)
                print(f"OK: {link}")

            except Exception as e:
                # Best effort: report the failing link and continue with the next one.
                print(f"Lỗi với {link}: {e}")

        # SAVE AFTER EVERY BATCH so an interrupted run can resume from the CSV.
        df_batch = pd.DataFrame(batch_data)
        if not df_batch.empty:
            df_done = pd.concat([df_done, df_batch], ignore_index=True).drop_duplicates(subset="Link")
            df_done.to_csv(output_file, index=False, encoding="utf-8-sig")
            print(f"Đã lưu xong batch {batch_num} ({len(df_batch)} link mới)")
finally:
    driver.quit()

print(f"\nCrawl hoàn tất. Tổng cộng đã lưu {len(df_done)} bài đăng vào '{output_file}'")


# 2601 - 2800

In [3]:
# ==== SELENIUM CONFIGURATION ====
options = uc.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--disable-notifications")
options.add_argument("--disable-popup-blocking")

driver = uc.Chrome(options=options, version_main=136)

# Everything after the browser launch is wrapped in try/finally so the Chrome
# session is always closed, even if a CSV is missing or the run is interrupted.
try:
    # ==== INPUT / OUTPUT FILES ====
    # Index of the link-list part handled by this cell; it selects both the
    # input link file and the matching output file.
    PART_INDEX = 14
    links_file = os.path.join("Link", f"links_bds_hanoi_{PART_INDEX}.csv")
    output_file = os.path.join("Data", f"batdongsan_full_data_hanoi_{PART_INDEX}.csv")
    # Make sure the output folder exists so the first batch save cannot fail.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    # ==== READ LINKS FROM THE CSV FILE ====
    df_links = pd.read_csv(links_file)
    all_links = df_links["Link"].dropna().unique().tolist()

    # ==== CHECK WHICH LINKS WERE ALREADY CRAWLED ====
    # An existing output file acts as the checkpoint of a previous run.
    if os.path.exists(output_file):
        df_done = pd.read_csv(output_file)
        done_links = df_done["Link"].dropna().unique().tolist()
        print(f"Đã có {len(done_links)} link đã crawl. Sẽ skip...")
    else:
        df_done = pd.DataFrame()
        done_links = []

    # ==== BUILD THE LIST OF LINKS STILL TO CRAWL ====
    # A set gives constant-time membership tests instead of scanning the
    # list of finished links once per candidate link.
    done_link_set = set(done_links)
    pending_links = [link for link in all_links if link not in done_link_set]
    print(f"Còn lại {len(pending_links)} link chưa crawl.")

    # ==== SPLIT INTO BATCHES ====
    BATCH_SIZE = 20
    batches = [pending_links[i:i+BATCH_SIZE] for i in range(0, len(pending_links), BATCH_SIZE)]

    for batch_num, batch_links in enumerate(batches, 1):
        batch_data = []

        print(f"\nBatch {batch_num}/{len(batches)} - gồm {len(batch_links)} link")
        for link in batch_links:
            try:
                driver.get(link)

                # Wait (up to 10 s) for the address element, then read its text.
                address = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".re__pr-short-description.js__pr-address"))
                ).text

                # Wait (up to 10 s) for the specification rows of the listing.
                specs_items = WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".re__pr-specs-content-item"))
                )

                # Each specification row contributes one title -> value pair.
                features = {}
                for item in specs_items:
                    key = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-title").text.strip()
                    value = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-value").text.strip()
                    features[key] = value

                data = {
                    "Link": link,
                    "Địa chỉ": address,
                    **features
                }

                batch_data.append(data)
                print(f"OK: {link}")

            except Exception as e:
                # Best effort: report the failing link and continue with the next one.
                print(f"Lỗi với {link}: {e}")

        # SAVE AFTER EVERY BATCH so an interrupted run can resume from the CSV.
        df_batch = pd.DataFrame(batch_data)
        if not df_batch.empty:
            df_done = pd.concat([df_done, df_batch], ignore_index=True).drop_duplicates(subset="Link")
            df_done.to_csv(output_file, index=False, encoding="utf-8-sig")
            print(f"Đã lưu xong batch {batch_num} ({len(df_batch)} link mới)")
finally:
    driver.quit()

print(f"\nCrawl hoàn tất. Tổng cộng đã lưu {len(df_done)} bài đăng vào '{output_file}'")


Còn lại 3958 link chưa crawl.

Batch 1/198 - gồm 20 link
OK: https://batdongsan.com.vn/ban-nha-rieng-duong-thien-hien-phuong-my-dinh-1/-60m-x-4t-13-x-ty-mat-ngo-thong-ngay-sat-mat-pho-lon-vi-tri-dac-dia-dep-o-nga-pr42594566
OK: https://batdongsan.com.vn/ban-nha-biet-thu-lien-ke-phuong-phuc-la-prj-khu-do-thi-moi-van-quan/-can-goc-view-ho-o-sieu-vip-kinh-doanh-sieu-dinh-230m2-mat-tien-30m-gia-88-79-ty-pr42021187
OK: https://batdongsan.com.vn/ban-nha-rieng-duong-huynh-thuc-khang-phuong-lang-ha/ban-gap-tai-o-to-tranh-7-tang-thang-may-kd-dinh-pr42324451
OK: https://batdongsan.com.vn/ban-nha-rieng-phuong-nghia-do-1/ban-nr-tai-vo-chi-cong-7-98-ty-54m2-6pn-3wc-chinh-chu-bao-dep-pr43139482
OK: https://batdongsan.com.vn/ban-can-ho-chung-cu-duong-vu-pham-ham-phuong-yen-hoa-2-prj-park-view-city/chinh-chu-goc-4pn-vip-e4-full-noi-that-tang-trung-ban-gap-pr42972089
OK: https://batdongsan.com.vn/ban-nha-rieng-duong-bac-cau-phuong-ngoc-thuy/ban-tai-5-25-ty-44-m2-4pn-4wc-4-tang-pr43032421
OK: https://ba

# 2801 - 2900

In [4]:
# ==== SELENIUM CONFIGURATION ====
options = uc.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--disable-notifications")
options.add_argument("--disable-popup-blocking")

driver = uc.Chrome(options=options, version_main=136)

# Everything after the browser launch is wrapped in try/finally so the Chrome
# session is always closed, even if a CSV is missing or the run is interrupted.
try:
    # ==== INPUT / OUTPUT FILES ====
    # Index of the link-list part handled by this cell; it selects both the
    # input link file and the matching output file.
    PART_INDEX = 15
    links_file = os.path.join("Link", f"links_bds_hanoi_{PART_INDEX}.csv")
    output_file = os.path.join("Data", f"batdongsan_full_data_hanoi_{PART_INDEX}.csv")
    # Make sure the output folder exists so the first batch save cannot fail.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    # ==== READ LINKS FROM THE CSV FILE ====
    df_links = pd.read_csv(links_file)
    all_links = df_links["Link"].dropna().unique().tolist()

    # ==== CHECK WHICH LINKS WERE ALREADY CRAWLED ====
    # An existing output file acts as the checkpoint of a previous run.
    if os.path.exists(output_file):
        df_done = pd.read_csv(output_file)
        done_links = df_done["Link"].dropna().unique().tolist()
        print(f"Đã có {len(done_links)} link đã crawl. Sẽ skip...")
    else:
        df_done = pd.DataFrame()
        done_links = []

    # ==== BUILD THE LIST OF LINKS STILL TO CRAWL ====
    # A set gives constant-time membership tests instead of scanning the
    # list of finished links once per candidate link.
    done_link_set = set(done_links)
    pending_links = [link for link in all_links if link not in done_link_set]
    print(f"Còn lại {len(pending_links)} link chưa crawl.")

    # ==== SPLIT INTO BATCHES ====
    BATCH_SIZE = 20
    batches = [pending_links[i:i+BATCH_SIZE] for i in range(0, len(pending_links), BATCH_SIZE)]

    for batch_num, batch_links in enumerate(batches, 1):
        batch_data = []

        print(f"\nBatch {batch_num}/{len(batches)} - gồm {len(batch_links)} link")
        for link in batch_links:
            try:
                driver.get(link)

                # Wait (up to 10 s) for the address element, then read its text.
                address = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".re__pr-short-description.js__pr-address"))
                ).text

                # Wait (up to 10 s) for the specification rows of the listing.
                specs_items = WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".re__pr-specs-content-item"))
                )

                # Each specification row contributes one title -> value pair.
                features = {}
                for item in specs_items:
                    key = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-title").text.strip()
                    value = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-value").text.strip()
                    features[key] = value

                data = {
                    "Link": link,
                    "Địa chỉ": address,
                    **features
                }

                batch_data.append(data)
                print(f"OK: {link}")

            except Exception as e:
                # Best effort: report the failing link and continue with the next one.
                print(f"Lỗi với {link}: {e}")

        # SAVE AFTER EVERY BATCH so an interrupted run can resume from the CSV.
        df_batch = pd.DataFrame(batch_data)
        if not df_batch.empty:
            df_done = pd.concat([df_done, df_batch], ignore_index=True).drop_duplicates(subset="Link")
            df_done.to_csv(output_file, index=False, encoding="utf-8-sig")
            print(f"Đã lưu xong batch {batch_num} ({len(df_batch)} link mới)")
finally:
    driver.quit()

print(f"\nCrawl hoàn tất. Tổng cộng đã lưu {len(df_done)} bài đăng vào '{output_file}'")


Còn lại 1980 link chưa crawl.

Batch 1/99 - gồm 20 link
OK: https://batdongsan.com.vn/ban-loai-bat-dong-san-khac-phuong-hoang-liet/ban-kiot-so-4-toa-nha-n05-ban-dao-linh-dam-mai-ha-noi-ngay-mat-duong-pr43046476
OK: https://batdongsan.com.vn/ban-nha-mat-pho-duong-chinh-trung-thi-tran-trau-quy/ban-137m2-o-to-tranh-kinh-doanh-hoac-o-cuc-dinh-gia-thuong-luong-pr42794714
OK: https://batdongsan.com.vn/ban-nha-rieng-duong-lac-long-quan-phuong-xuan-la/ban-chdv-70m2-22-phong-30m-ra-o-to-tranh-pccc-dat-chuan-dong-tien-lon-pr43045574
OK: https://batdongsan.com.vn/ban-nha-biet-thu-lien-ke-duong-sao-bien-12-xa-duong-xa-prj-vinhomes-ocean-park-gia-lam/song-lap-view-ho-phan-khu-chi-27-ty-duy-nhat-tai-lam-pr42990354
OK: https://batdongsan.com.vn/ban-nha-biet-thu-lien-ke-duong-ngoc-trai-8-xa-da-ton-prj-vinhomes-ocean-park-gia-lam/chi-21-ty-so-huu-xe-khe-08-tai-1-phan-khu-vip-nhat-du-an-pr43094401
OK: https://batdongsan.com.vn/ban-nha-rieng-duong-nguyen-chi-thanh-phuong-lang-thuong/ban-7-tang-pho-dong-d