In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import undetected_chromedriver as uc
import time
import pandas as pd
import os

# 1001 - 1200

In [2]:
# Crawl the address and spec table of every listing in Link/links_bds_hanoi_6.csv
# and append the results to Data/batdongsan_full_data_hanoi_6.csv. The cell is
# resumable: links already present in the output file are skipped, and progress is
# written to disk after every batch.

# ==== SELENIUM CONFIGURATION ====
options = uc.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--disable-notifications")
options.add_argument("--disable-popup-blocking")

# ==== READ LINKS FROM THE CSV FILE ====
# os.path.join instead of a hard-coded "\\" so the paths also resolve off Windows.
df_links = pd.read_csv(os.path.join("Link", "links_bds_hanoi_6.csv"))
all_links = df_links["Link"].dropna().unique().tolist()

# ==== LOAD THE LINKS THAT WERE ALREADY CRAWLED ====
output_file = os.path.join("Data", "batdongsan_full_data_hanoi_6.csv")
# Create the output folder up front so the first save cannot fail on a missing directory.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
if os.path.exists(output_file):
    df_done = pd.read_csv(output_file)
    done_links = df_done["Link"].dropna().unique().tolist()
    print(f"Đã có {len(done_links)} link đã crawl. Sẽ skip...")
else:
    df_done = pd.DataFrame()
    done_links = []

# ==== BUILD THE LIST OF LINKS STILL TO CRAWL ====
# Membership is tested against a set: scanning the list for every link is quadratic.
done_link_set = set(done_links)
pending_links = [link for link in all_links if link not in done_link_set]
print(f"Còn lại {len(pending_links)} link chưa crawl.")

# ==== SPLIT INTO BATCHES ====
BATCH_SIZE = 20
batches = [pending_links[i:i+BATCH_SIZE] for i in range(0, len(pending_links), BATCH_SIZE)]

# The browser is started only after the inputs loaded successfully, and is always
# closed in the `finally` below, so an error or a kernel interrupt does not leave
# an orphaned Chrome process behind.
driver = uc.Chrome(options=options, version_main=136)
try:
    for batch_num, batch_links in enumerate(batches, 1):
        batch_data = []

        print(f"\nBatch {batch_num}/{len(batches)} - gồm {len(batch_links)} link")
        for link in batch_links:
            try:
                driver.get(link)

                # Wait up to 10 s for the address element to be present in the DOM.
                address = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".re__pr-short-description.js__pr-address"))
                ).text

                # Wait up to 10 s for the spec rows; each row holds a title and a value.
                specs_items = WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".re__pr-specs-content-item"))
                )

                features = {}
                for item in specs_items:
                    key = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-title").text.strip()
                    value = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-value").text.strip()
                    features[key] = value

                # One record per listing; the spec titles become columns.
                data = {
                    "Link": link,
                    "Địa chỉ": address,
                    **features
                }

                batch_data.append(data)
                print(f"OK: {link}")

            except Exception as e:
                # Best effort: a failing link is reported and skipped. It is not written
                # to the output file, so it is retried the next time the cell runs.
                print(f"Lỗi với {link}: {e}")

        # SAVE AFTER EVERY BATCH
        df_batch = pd.DataFrame(batch_data)
        if not df_batch.empty:
            df_done = pd.concat([df_done, df_batch], ignore_index=True)
            df_done = df_done.drop_duplicates(subset="Link")
            df_done.to_csv(output_file, index=False, encoding="utf-8-sig")
            print(f"Đã lưu xong batch {batch_num} ({len(df_batch)} link mới)")
finally:
    driver.quit()

print(f"\nCrawl hoàn tất. Tổng cộng đã lưu {len(df_done)} bài đăng vào '{output_file}'")


Còn lại 3756 link chưa crawl.

Batch 1/188 - gồm 20 link
OK: https://batdongsan.com.vn/ban-nha-rieng-duong-vo-chi-cong-phuong-nghia-do-1/ban-toa-chung-cu-mini-50m2-x-6t-xay-moi-du-pccc-10-pkk-dong-tien-600-trieu-nam-pr43130209
OK: https://batdongsan.com.vn/ban-nha-mat-pho-duong-ho-tung-mau-phuong-mai-dich/276-tr-m2-rong-30m-102m2-mt-4-5m-no-hau-5-1m-pr42850262
OK: https://batdongsan.com.vn/ban-dat-duong-hau-ai-xa-van-canh-1/ban-ho-duc-vi-tri-dep-o-to-tranh-chi-cach-3-5m-10m-pr42868257
OK: https://batdongsan.com.vn/ban-dat-thi-tran-xuan-mai-1/hang-hot-lo-goc-hoa-hau-dau-gia-view-song-container-do-cua-tai-khu-dong-khoang-beo-mai-pr43137618
OK: https://batdongsan.com.vn/ban-can-ho-chung-cu-duong-pham-van-dong-phuong-dong-ngac-prj-sunshine-city/chinh-chu-ban-s56-97-7m2-so-do-3pn-view-noi-khu-8-5ty-full-do-roi-pr42994888
OK: https://batdongsan.com.vn/ban-can-ho-chung-cu-duong-pham-van-dong-phuong-dong-ngac-prj-sunshine-city/ban-goc-3-phong-ngu-mau-103-8m2-thong-thuy-co-2-ban-cong-dep-gia-7-

# 1201 - 1400

In [3]:
# Crawl the address and spec table of every listing in Link/links_bds_hanoi_7.csv
# and append the results to Data/batdongsan_full_data_hanoi_7.csv. The cell is
# resumable: links already present in the output file are skipped, and progress is
# written to disk after every batch.

# ==== SELENIUM CONFIGURATION ====
options = uc.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--disable-notifications")
options.add_argument("--disable-popup-blocking")

# ==== READ LINKS FROM THE CSV FILE ====
# os.path.join instead of a hard-coded "\\" so the paths also resolve off Windows.
df_links = pd.read_csv(os.path.join("Link", "links_bds_hanoi_7.csv"))
all_links = df_links["Link"].dropna().unique().tolist()

# ==== LOAD THE LINKS THAT WERE ALREADY CRAWLED ====
output_file = os.path.join("Data", "batdongsan_full_data_hanoi_7.csv")
# Create the output folder up front so the first save cannot fail on a missing directory.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
if os.path.exists(output_file):
    df_done = pd.read_csv(output_file)
    done_links = df_done["Link"].dropna().unique().tolist()
    print(f"Đã có {len(done_links)} link đã crawl. Sẽ skip...")
else:
    df_done = pd.DataFrame()
    done_links = []

# ==== BUILD THE LIST OF LINKS STILL TO CRAWL ====
# Membership is tested against a set: scanning the list for every link is quadratic.
done_link_set = set(done_links)
pending_links = [link for link in all_links if link not in done_link_set]
print(f"Còn lại {len(pending_links)} link chưa crawl.")

# ==== SPLIT INTO BATCHES ====
BATCH_SIZE = 20
batches = [pending_links[i:i+BATCH_SIZE] for i in range(0, len(pending_links), BATCH_SIZE)]

# The browser is started only after the inputs loaded successfully, and is always
# closed in the `finally` below, so an error or a kernel interrupt does not leave
# an orphaned Chrome process behind.
driver = uc.Chrome(options=options, version_main=136)
try:
    for batch_num, batch_links in enumerate(batches, 1):
        batch_data = []

        print(f"\nBatch {batch_num}/{len(batches)} - gồm {len(batch_links)} link")
        for link in batch_links:
            try:
                driver.get(link)

                # Wait up to 10 s for the address element to be present in the DOM.
                address = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".re__pr-short-description.js__pr-address"))
                ).text

                # Wait up to 10 s for the spec rows; each row holds a title and a value.
                specs_items = WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".re__pr-specs-content-item"))
                )

                features = {}
                for item in specs_items:
                    key = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-title").text.strip()
                    value = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-value").text.strip()
                    features[key] = value

                # One record per listing; the spec titles become columns.
                data = {
                    "Link": link,
                    "Địa chỉ": address,
                    **features
                }

                batch_data.append(data)
                print(f"OK: {link}")

            except Exception as e:
                # Best effort: a failing link is reported and skipped. It is not written
                # to the output file, so it is retried the next time the cell runs.
                print(f"Lỗi với {link}: {e}")

        # SAVE AFTER EVERY BATCH
        df_batch = pd.DataFrame(batch_data)
        if not df_batch.empty:
            df_done = pd.concat([df_done, df_batch], ignore_index=True)
            df_done = df_done.drop_duplicates(subset="Link")
            df_done.to_csv(output_file, index=False, encoding="utf-8-sig")
            print(f"Đã lưu xong batch {batch_num} ({len(df_batch)} link mới)")
finally:
    driver.quit()

print(f"\nCrawl hoàn tất. Tổng cộng đã lưu {len(df_done)} bài đăng vào '{output_file}'")


Còn lại 3856 link chưa crawl.

Batch 1/193 - gồm 20 link
OK: https://batdongsan.com.vn/ban-nha-rieng-duong-bo-de-phuong-bo-de/ban-11-95-ty-48m2-3pn-3wc-tai-lo-goc-dau-tu-pr43125657
OK: https://batdongsan.com.vn/ban-can-ho-chung-cu-duong-pham-hung-phuong-my-dinh-1-prj-vinhomes-skylake/minh-ban-1-ngu-dep-hiem-toa-s2-tang-trung-cao-ban-cong-dn-thoang-co-san-dong-tien-pr43100965
OK: https://batdongsan.com.vn/ban-nha-rieng-duong-quoc-lo-6-phuong-dong-mai-1/ban-32-1m2-gan-6-gia-3ty398tr-o-to-gan-so-do-chuan-khong-quy-hoach-khong-i-phong-thuy-pr42903620
OK: https://batdongsan.com.vn/ban-dat-thi-tran-trau-quy/cc-gui-ban-230m2-mt-11-75m-biet-thu-don-lap-khu-31ha-gia-lam-gia-re-nhat-truong-pr42842272
OK: https://batdongsan.com.vn/ban-dat-duong-21a-xa-phu-man/hang-hiem-tai-quoc-oai-gia-nhinh-3-ty-60m2-full-tho-cu-mat-ql-21a-pr42643783
OK: https://batdongsan.com.vn/ban-dat-xa-thach-hoa-1/hang-dep-sieu-hiem-lo-76-1m2-tai-dinh-cu-dai-hoc-quoc-gia-re-hon-ca-phan-lo-pr42957007
OK: https://batdongsan.c

# 1401 - 1600

In [3]:
# Crawl the address and spec table of every listing in Link/links_bds_hanoi_8.csv
# and append the results to Data/batdongsan_full_data_hanoi_8.csv. The cell is
# resumable: links already present in the output file are skipped, and progress is
# written to disk after every batch.

# ==== SELENIUM CONFIGURATION ====
options = uc.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--disable-notifications")
options.add_argument("--disable-popup-blocking")

# ==== READ LINKS FROM THE CSV FILE ====
# os.path.join instead of a hard-coded "\\" so the paths also resolve off Windows.
df_links = pd.read_csv(os.path.join("Link", "links_bds_hanoi_8.csv"))
all_links = df_links["Link"].dropna().unique().tolist()

# ==== LOAD THE LINKS THAT WERE ALREADY CRAWLED ====
output_file = os.path.join("Data", "batdongsan_full_data_hanoi_8.csv")
# Create the output folder up front so the first save cannot fail on a missing directory.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
if os.path.exists(output_file):
    df_done = pd.read_csv(output_file)
    done_links = df_done["Link"].dropna().unique().tolist()
    print(f"Đã có {len(done_links)} link đã crawl. Sẽ skip...")
else:
    df_done = pd.DataFrame()
    done_links = []

# ==== BUILD THE LIST OF LINKS STILL TO CRAWL ====
# Membership is tested against a set: scanning the list for every link is quadratic.
done_link_set = set(done_links)
pending_links = [link for link in all_links if link not in done_link_set]
print(f"Còn lại {len(pending_links)} link chưa crawl.")

# ==== SPLIT INTO BATCHES ====
BATCH_SIZE = 20
batches = [pending_links[i:i+BATCH_SIZE] for i in range(0, len(pending_links), BATCH_SIZE)]

# The browser is started only after the inputs loaded successfully, and is always
# closed in the `finally` below, so an error or a kernel interrupt does not leave
# an orphaned Chrome process behind.
driver = uc.Chrome(options=options, version_main=136)
try:
    for batch_num, batch_links in enumerate(batches, 1):
        batch_data = []

        print(f"\nBatch {batch_num}/{len(batches)} - gồm {len(batch_links)} link")
        for link in batch_links:
            try:
                driver.get(link)

                # Wait up to 10 s for the address element to be present in the DOM.
                address = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".re__pr-short-description.js__pr-address"))
                ).text

                # Wait up to 10 s for the spec rows; each row holds a title and a value.
                specs_items = WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".re__pr-specs-content-item"))
                )

                features = {}
                for item in specs_items:
                    key = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-title").text.strip()
                    value = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-value").text.strip()
                    features[key] = value

                # One record per listing; the spec titles become columns.
                data = {
                    "Link": link,
                    "Địa chỉ": address,
                    **features
                }

                batch_data.append(data)
                print(f"OK: {link}")

            except Exception as e:
                # Best effort: a failing link is reported and skipped. It is not written
                # to the output file, so it is retried the next time the cell runs.
                print(f"Lỗi với {link}: {e}")

        # SAVE AFTER EVERY BATCH
        df_batch = pd.DataFrame(batch_data)
        if not df_batch.empty:
            df_done = pd.concat([df_done, df_batch], ignore_index=True)
            df_done = df_done.drop_duplicates(subset="Link")
            df_done.to_csv(output_file, index=False, encoding="utf-8-sig")
            print(f"Đã lưu xong batch {batch_num} ({len(df_batch)} link mới)")
finally:
    driver.quit()

print(f"\nCrawl hoàn tất. Tổng cộng đã lưu {len(df_done)} bài đăng vào '{output_file}'")


Đã có 2177 link đã crawl. Sẽ skip...
Còn lại 1735 link chưa crawl.

Batch 1/87 - gồm 20 link
OK: https://batdongsan.com.vn/ban-nha-rieng-duong-khuong-dinh-phuong-khuong-dinh/-45m-7-tang-thang-may-full-noi-that-12-ty-3-pr43116440
Lỗi với https://batdongsan.com.vn/ban-dat-duong-421b-xa-dong-yen-2/-nen-200m2-tai-quoc-oai-ha-noi-gia-16-trieu-vnd-pr43116900: Message: 
Stacktrace:
	GetHandleVerifier [0x00B6FC03+61635]
	GetHandleVerifier [0x00B6FC44+61700]
	(No symbol) [0x009905D3]
	(No symbol) [0x009D899E]
	(No symbol) [0x009D8D3B]
	(No symbol) [0x00A20E12]
	(No symbol) [0x009FD2E4]
	(No symbol) [0x00A1E61B]
	(No symbol) [0x009FD096]
	(No symbol) [0x009CC840]
	(No symbol) [0x009CD6A4]
	GetHandleVerifier [0x00DF4523+2701795]
	GetHandleVerifier [0x00DEFCA6+2683238]
	GetHandleVerifier [0x00E0A9EE+2793134]
	GetHandleVerifier [0x00B868C5+155013]
	GetHandleVerifier [0x00B8CFAD+181357]
	GetHandleVerifier [0x00B77458+92440]
	GetHandleVerifier [0x00B77600+92864]
	GetHandleVerifier [0x00B61FF0+5296]
	

# 2400 - 2600

In [4]:
# Crawl the address and spec table of every listing in Link/links_bds_hanoi_13.csv
# and append the results to Data/batdongsan_full_data_hanoi_13.csv. The cell is
# resumable: links already present in the output file are skipped, and progress is
# written to disk after every batch.

# ==== SELENIUM CONFIGURATION ====
options = uc.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--disable-notifications")
options.add_argument("--disable-popup-blocking")

# ==== READ LINKS FROM THE CSV FILE ====
# os.path.join instead of a hard-coded "\\" so the paths also resolve off Windows.
df_links = pd.read_csv(os.path.join("Link", "links_bds_hanoi_13.csv"))
all_links = df_links["Link"].dropna().unique().tolist()

# ==== LOAD THE LINKS THAT WERE ALREADY CRAWLED ====
output_file = os.path.join("Data", "batdongsan_full_data_hanoi_13.csv")
# Create the output folder up front so the first save cannot fail on a missing directory.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
if os.path.exists(output_file):
    df_done = pd.read_csv(output_file)
    done_links = df_done["Link"].dropna().unique().tolist()
    print(f"Đã có {len(done_links)} link đã crawl. Sẽ skip...")
else:
    df_done = pd.DataFrame()
    done_links = []

# ==== BUILD THE LIST OF LINKS STILL TO CRAWL ====
# Membership is tested against a set: scanning the list for every link is quadratic.
done_link_set = set(done_links)
pending_links = [link for link in all_links if link not in done_link_set]
print(f"Còn lại {len(pending_links)} link chưa crawl.")

# ==== SPLIT INTO BATCHES ====
BATCH_SIZE = 20
batches = [pending_links[i:i+BATCH_SIZE] for i in range(0, len(pending_links), BATCH_SIZE)]

# The browser is started only after the inputs loaded successfully, and is always
# closed in the `finally` below, so an error or a kernel interrupt does not leave
# an orphaned Chrome process behind.
driver = uc.Chrome(options=options, version_main=136)
try:
    for batch_num, batch_links in enumerate(batches, 1):
        batch_data = []

        print(f"\nBatch {batch_num}/{len(batches)} - gồm {len(batch_links)} link")
        for link in batch_links:
            try:
                driver.get(link)

                # Wait up to 10 s for the address element to be present in the DOM.
                address = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".re__pr-short-description.js__pr-address"))
                ).text

                # Wait up to 10 s for the spec rows; each row holds a title and a value.
                specs_items = WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".re__pr-specs-content-item"))
                )

                features = {}
                for item in specs_items:
                    key = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-title").text.strip()
                    value = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-value").text.strip()
                    features[key] = value

                # One record per listing; the spec titles become columns.
                data = {
                    "Link": link,
                    "Địa chỉ": address,
                    **features
                }

                batch_data.append(data)
                print(f"OK: {link}")

            except Exception as e:
                # Best effort: a failing link is reported and skipped. It is not written
                # to the output file, so it is retried the next time the cell runs.
                print(f"Lỗi với {link}: {e}")

        # SAVE AFTER EVERY BATCH
        df_batch = pd.DataFrame(batch_data)
        if not df_batch.empty:
            df_done = pd.concat([df_done, df_batch], ignore_index=True)
            df_done = df_done.drop_duplicates(subset="Link")
            df_done.to_csv(output_file, index=False, encoding="utf-8-sig")
            print(f"Đã lưu xong batch {batch_num} ({len(df_batch)} link mới)")
finally:
    driver.quit()

print(f"\nCrawl hoàn tất. Tổng cộng đã lưu {len(df_done)} bài đăng vào '{output_file}'")


Còn lại 3945 link chưa crawl.

Batch 1/198 - gồm 20 link
OK: https://batdongsan.com.vn/ban-can-ho-chung-cu-xa-van-canh-1-prj-moonlight-1-an-lac-green-symphony/ban-3-ngu-truc-09-tang-dep-huong-dep-di-xem-nha-ngay-chot-som-pr42978379
OK: https://batdongsan.com.vn/ban-nha-rieng-phuong-phuc-loi/ban-dep-doc-lap-2-mat-thoang-oto-do-gan-5-tang-moi-tinh-tien-ich-bao-quanh-pr43073866
OK: https://batdongsan.com.vn/ban-dat-xa-bac-son-16/chi-hon-500tr-100m-full-tho-cu-tai-soc-ha-noi-cuc-phu-hop-cho-cac-quy-nha-dau-tu-pr42938414
OK: https://batdongsan.com.vn/ban-dat-xa-trang-viet/ban-trong-de-tai-54-trieu-m2-65m2-ngo-thong-3-5m-phap-ly-day-du-pr43074026
OK: https://batdongsan.com.vn/ban-nha-rieng-duong-quang-trung-phuong-la-khe/ban-gap-6-75-ty-37m2-o-ha-dong-ha-noi-pr43008853
OK: https://batdongsan.com.vn/ban-can-ho-chung-cu-phuong-thach-ban-1-prj-khu-nha-o-thach-ban/-74-6-m2-2-pn-2-wc-gan-aeon-long-bien-tang-cao-thoang-pr42970359
OK: https://batdongsan.com.vn/ban-nha-biet-thu-lien-ke-xa-da-ton-prj