In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import undetected_chromedriver as uc
import time
import pandas as pd
import os

# 1601 - 1800

In [2]:
# ==== SELENIUM CONFIGURATION ====
options = uc.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--disable-notifications")
options.add_argument("--disable-popup-blocking")

# ==== READ THE LISTING LINKS FROM THE CSV FILE ====
df_links = pd.read_csv("Link\\links_bds_hanoi_9.csv")
all_links = df_links["Link"].dropna().unique().tolist()

# ==== FIND LINKS THAT WERE ALREADY CRAWLED ====
output_file = "Data\\batdongsan_full_data_hanoi_9.csv"
if os.path.exists(output_file):
    df_done = pd.read_csv(output_file)
    # A set keeps the membership test below O(1) per link instead of
    # scanning a list for every pending candidate.
    done_links = set(df_done["Link"].dropna())
    print(f"Đã có {len(done_links)} link đã crawl. Sẽ skip...")
else:
    df_done = pd.DataFrame()
    done_links = set()

# Create the output folder up front so the first batch save cannot fail
# after the pages have already been scraped. dirname is empty when the
# path contains no separator recognised on this platform.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# ==== BUILD THE LIST OF LINKS STILL TO CRAWL ====
pending_links = [link for link in all_links if link not in done_links]
print(f"Còn lại {len(pending_links)} link chưa crawl.")

# ==== SPLIT INTO SMALL BATCHES ====
BATCH_SIZE = 20
batches = [pending_links[i:i+BATCH_SIZE] for i in range(0, len(pending_links), BATCH_SIZE)]

# Start the browser only after the input files were read successfully, so a
# bad path does not leave an orphaned Chrome process behind.
driver = uc.Chrome(options=options, version_main=136)
try:
    for batch_num, batch_links in enumerate(batches, 1):
        batch_data = []

        print(f"\nBatch {batch_num}/{len(batches)} - gồm {len(batch_links)} link")
        for link in batch_links:
            try:
                driver.get(link)

                # Wait (up to 10 s) for the address element, then read its text.
                address = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".re__pr-short-description.js__pr-address"))
                ).text

                # Wait (up to 10 s) for the list of specification rows.
                specs_items = WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".re__pr-specs-content-item"))
                )

                # Each specification row contributes one title -> value pair.
                features = {}
                for item in specs_items:
                    key = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-title").text.strip()
                    value = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-value").text.strip()
                    features[key] = value

                data = {
                    "Link": link,
                    "Địa chỉ": address,
                    **features
                }

                batch_data.append(data)
                print(f"OK: {link}")

            except Exception as e:
                # Best effort: report the failing link and continue with the next one.
                print(f"Lỗi với {link}: {e}")

        # SAVE AFTER EVERY BATCH so an interrupted run can resume from the file.
        df_batch = pd.DataFrame(batch_data)
        if not df_batch.empty:
            df_done = pd.concat([df_done, df_batch], ignore_index=True)
            df_done = df_done.drop_duplicates(subset="Link")
            df_done.to_csv(output_file, index=False, encoding="utf-8-sig")
            print(f"Đã lưu xong batch {batch_num} ({len(df_batch)} link mới)")
finally:
    # Always close the browser, even on KeyboardInterrupt or a failed save.
    driver.quit()

print(f"\nCrawl hoàn tất. Tổng cộng đã lưu {len(df_done)} bài đăng vào '{output_file}'")


Còn lại 3922 link chưa crawl.

Batch 1/197 - gồm 20 link
OK: https://batdongsan.com.vn/ban-can-ho-chung-cu-duong-me-tri-phuong-my-dinh-1-prj-the-manor-ha-noi/cap-nhat-quy-cao-cap-dang-giao-ban-189m-193m-3pn-gia-68tr-m2-pr42129584
OK: https://batdongsan.com.vn/ban-nha-mat-pho-pho-pho-vong-phuong-bach-khoa/chinh-chu-ban-hbt-dt-140m-mt-4-2m-7-tang-thang-may-full-noi-that-via-he-rong-pr43088106
OK: https://batdongsan.com.vn/ban-nha-rieng-duong-do-duc-duc-phuong-me-tri/chinh-chu-ban-my-dinh-dt-40m-mt-6-5m-6-thang-may-full-noi-that-cach-oto-dung-10m-pr43102984
OK: https://batdongsan.com.vn/ban-dat-duong-da-ton-xa-da-ton/chinh-chu-ban-tai-gia-lam-hn-48m2-gia-1-ty-700-trieu-pr38379666
OK: https://batdongsan.com.vn/ban-nha-biet-thu-lien-ke-duong-to-huu-phuong-van-phuc-1-prj-tsq-galaxy/chinh-chu-ban-ha-dong-101m2-6-tang-mt-6m-thang-may-truc-chinh-kinh-doanh-sieu-tot-pr43032147
OK: https://batdongsan.com.vn/ban-can-ho-chung-cu-duong-nguyen-huy-tuong-phuong-thanh-xuan-trung-prj-imperia-garden/-gia

# 1801 - 2000

In [3]:
# ==== SELENIUM CONFIGURATION ====
options = uc.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--disable-notifications")
options.add_argument("--disable-popup-blocking")

# ==== READ THE LISTING LINKS FROM THE CSV FILE ====
df_links = pd.read_csv("Link\\links_bds_hanoi_10.csv")
all_links = df_links["Link"].dropna().unique().tolist()

# ==== FIND LINKS THAT WERE ALREADY CRAWLED ====
output_file = "Data\\batdongsan_full_data_hanoi_10.csv"
if os.path.exists(output_file):
    df_done = pd.read_csv(output_file)
    # A set keeps the membership test below O(1) per link instead of
    # scanning a list for every pending candidate.
    done_links = set(df_done["Link"].dropna())
    print(f"Đã có {len(done_links)} link đã crawl. Sẽ skip...")
else:
    df_done = pd.DataFrame()
    done_links = set()

# Create the output folder up front so the first batch save cannot fail
# after the pages have already been scraped. dirname is empty when the
# path contains no separator recognised on this platform.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# ==== BUILD THE LIST OF LINKS STILL TO CRAWL ====
pending_links = [link for link in all_links if link not in done_links]
print(f"Còn lại {len(pending_links)} link chưa crawl.")

# ==== SPLIT INTO SMALL BATCHES ====
BATCH_SIZE = 20
batches = [pending_links[i:i+BATCH_SIZE] for i in range(0, len(pending_links), BATCH_SIZE)]

# Start the browser only after the input files were read successfully, so a
# bad path does not leave an orphaned Chrome process behind.
driver = uc.Chrome(options=options, version_main=136)
try:
    for batch_num, batch_links in enumerate(batches, 1):
        batch_data = []

        print(f"\nBatch {batch_num}/{len(batches)} - gồm {len(batch_links)} link")
        for link in batch_links:
            try:
                driver.get(link)

                # Wait (up to 10 s) for the address element, then read its text.
                address = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".re__pr-short-description.js__pr-address"))
                ).text

                # Wait (up to 10 s) for the list of specification rows.
                specs_items = WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".re__pr-specs-content-item"))
                )

                # Each specification row contributes one title -> value pair.
                features = {}
                for item in specs_items:
                    key = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-title").text.strip()
                    value = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-value").text.strip()
                    features[key] = value

                data = {
                    "Link": link,
                    "Địa chỉ": address,
                    **features
                }

                batch_data.append(data)
                print(f"OK: {link}")

            except Exception as e:
                # Best effort: report the failing link and continue with the next one.
                print(f"Lỗi với {link}: {e}")

        # SAVE AFTER EVERY BATCH so an interrupted run can resume from the file.
        df_batch = pd.DataFrame(batch_data)
        if not df_batch.empty:
            df_done = pd.concat([df_done, df_batch], ignore_index=True)
            df_done = df_done.drop_duplicates(subset="Link")
            df_done.to_csv(output_file, index=False, encoding="utf-8-sig")
            print(f"Đã lưu xong batch {batch_num} ({len(df_batch)} link mới)")
finally:
    # Always close the browser, even on KeyboardInterrupt or a failed save.
    driver.quit()

print(f"\nCrawl hoàn tất. Tổng cộng đã lưu {len(df_done)} bài đăng vào '{output_file}'")


Còn lại 3957 link chưa crawl.

Batch 1/198 - gồm 20 link
OK: https://batdongsan.com.vn/ban-dat-xa-ba-trai/n-300m2-tai-vi-ha-noi-gia-tot-em-hoang-pr41264803
OK: https://batdongsan.com.vn/ban-dat-duong-21-1-xa-phu-cat-1/155m2-truc-chinh-thon-gia-sieu-re-gan-cho-truong-hoc-to-hop-y-te-pr42531721
OK: https://batdongsan.com.vn/ban-nha-rieng-duong-phuc-loi-phuong-phuc-loi/ban-tai-ngo-321-4-38-ty-30m2-2pn-2wc-3-tang-pr43106233
OK: https://batdongsan.com.vn/ban-can-ho-chung-cu-duong-ho-tung-mau-phuong-phu-dien-prj-goldmark-city/cc-ban-3n-2wc-93-4m2-san-so-full-do-du-an-gia-chi-6-5-ty-lh-ms-ngan-pr43113707
OK: https://batdongsan.com.vn/ban-nha-rieng-duong-nguyen-thoi-trung-phuong-thach-ban-1/-77m2-x-6t-thang-may-oto-tranh-pho-dong-dinh-pr43085644
OK: https://batdongsan.com.vn/ban-dat-xa-ha-bang/cat-lo-sau-chi-2-67-ty-so-huu-72-3m2-full-tho-tai-hoa-lac-cam-ket-re-nt-thi-truong-pr43143465
OK: https://batdongsan.com.vn/ban-dat-duong-da-ton-xa-da-ton/chinh-chu-ban-tai-gia-lam-hn-48m2-gia-1-ty-700-t

# 2001 - 2200

In [2]:
# ==== SELENIUM CONFIGURATION ====
options = uc.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--disable-notifications")
options.add_argument("--disable-popup-blocking")

# ==== READ THE LISTING LINKS FROM THE CSV FILE ====
df_links = pd.read_csv("Link\\links_bds_hanoi_11.csv")
all_links = df_links["Link"].dropna().unique().tolist()

# ==== FIND LINKS THAT WERE ALREADY CRAWLED ====
output_file = "Data\\batdongsan_full_data_hanoi_11.csv"
if os.path.exists(output_file):
    df_done = pd.read_csv(output_file)
    # A set keeps the membership test below O(1) per link instead of
    # scanning a list for every pending candidate.
    done_links = set(df_done["Link"].dropna())
    print(f"Đã có {len(done_links)} link đã crawl. Sẽ skip...")
else:
    df_done = pd.DataFrame()
    done_links = set()

# Create the output folder up front so the first batch save cannot fail
# after the pages have already been scraped. dirname is empty when the
# path contains no separator recognised on this platform.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# ==== BUILD THE LIST OF LINKS STILL TO CRAWL ====
pending_links = [link for link in all_links if link not in done_links]
print(f"Còn lại {len(pending_links)} link chưa crawl.")

# ==== SPLIT INTO SMALL BATCHES ====
BATCH_SIZE = 20
batches = [pending_links[i:i+BATCH_SIZE] for i in range(0, len(pending_links), BATCH_SIZE)]

# Start the browser only after the input files were read successfully, so a
# bad path does not leave an orphaned Chrome process behind.
driver = uc.Chrome(options=options, version_main=136)
try:
    for batch_num, batch_links in enumerate(batches, 1):
        batch_data = []

        print(f"\nBatch {batch_num}/{len(batches)} - gồm {len(batch_links)} link")
        for link in batch_links:
            try:
                driver.get(link)

                # Wait (up to 10 s) for the address element, then read its text.
                address = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".re__pr-short-description.js__pr-address"))
                ).text

                # Wait (up to 10 s) for the list of specification rows.
                specs_items = WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".re__pr-specs-content-item"))
                )

                # Each specification row contributes one title -> value pair.
                features = {}
                for item in specs_items:
                    key = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-title").text.strip()
                    value = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-value").text.strip()
                    features[key] = value

                data = {
                    "Link": link,
                    "Địa chỉ": address,
                    **features
                }

                batch_data.append(data)
                print(f"OK: {link}")

            except Exception as e:
                # Best effort: report the failing link and continue with the next one.
                print(f"Lỗi với {link}: {e}")

        # SAVE AFTER EVERY BATCH so an interrupted run can resume from the file.
        df_batch = pd.DataFrame(batch_data)
        if not df_batch.empty:
            df_done = pd.concat([df_done, df_batch], ignore_index=True)
            df_done = df_done.drop_duplicates(subset="Link")
            df_done.to_csv(output_file, index=False, encoding="utf-8-sig")
            print(f"Đã lưu xong batch {batch_num} ({len(df_batch)} link mới)")
finally:
    # Always close the browser, even on KeyboardInterrupt or a failed save.
    driver.quit()

print(f"\nCrawl hoàn tất. Tổng cộng đã lưu {len(df_done)} bài đăng vào '{output_file}'")


Đã có 1599 link đã crawl. Sẽ skip...
Còn lại 2323 link chưa crawl.

Batch 1/117 - gồm 20 link
OK: https://batdongsan.com.vn/ban-can-ho-chung-cu-duong-truong-sa-xa-dong-hoi-prj-masteri-grand-avenue/tang-26-2pn-2vs-vinhomes-co-loa-huong-bac-view-song-view-biet-thu-ky-moi-cdt-pr41982269
OK: https://batdongsan.com.vn/ban-nha-rieng-pho-giang-vo-phuong-giang-vo/ban-sieu-pham-phan-lo-ba-dinh-hang-xom-vihomes-o-to-tranh-an-sinh-dinh-131m2-65-ti-pr43094942
OK: https://batdongsan.com.vn/ban-nha-rieng-duong-hau-duong-xa-kim-chung/-dong-anh-75m2-full-tho-cu-12mt-ban-dat-tang-c4-chu-xay-kien-co-vai-buoc-pr43064631
OK: https://batdongsan.com.vn/ban-nha-rieng-duong-an-thuong-xa-an-thuong-1/chi-con-3-4-c-lien-ke-gia-tot-nhat-hoai-duc-pr42833319
OK: https://batdongsan.com.vn/ban-nha-rieng-duong-an-thuong-xa-an-thuong-1/chinh-chu-gui-b-51m2-thuong-pr42934143
OK: https://batdongsan.com.vn/ban-can-ho-chung-cu-duong-minh-khai-phuong-vinh-tuy-prj-vinhomes-times-city-park-hill/-1pn-53m-full-do-dep-view-mat-t

# 2201 - 2400

In [3]:
# ==== SELENIUM CONFIGURATION ====
options = uc.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--disable-notifications")
options.add_argument("--disable-popup-blocking")

# ==== READ THE LISTING LINKS FROM THE CSV FILE ====
df_links = pd.read_csv("Link\\links_bds_hanoi_12.csv")
all_links = df_links["Link"].dropna().unique().tolist()

# ==== FIND LINKS THAT WERE ALREADY CRAWLED ====
output_file = "Data\\batdongsan_full_data_hanoi_12.csv"
if os.path.exists(output_file):
    df_done = pd.read_csv(output_file)
    # A set keeps the membership test below O(1) per link instead of
    # scanning a list for every pending candidate.
    done_links = set(df_done["Link"].dropna())
    print(f"Đã có {len(done_links)} link đã crawl. Sẽ skip...")
else:
    df_done = pd.DataFrame()
    done_links = set()

# Create the output folder up front so the first batch save cannot fail
# after the pages have already been scraped. dirname is empty when the
# path contains no separator recognised on this platform.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# ==== BUILD THE LIST OF LINKS STILL TO CRAWL ====
pending_links = [link for link in all_links if link not in done_links]
print(f"Còn lại {len(pending_links)} link chưa crawl.")

# ==== SPLIT INTO SMALL BATCHES ====
BATCH_SIZE = 20
batches = [pending_links[i:i+BATCH_SIZE] for i in range(0, len(pending_links), BATCH_SIZE)]

# Start the browser only after the input files were read successfully, so a
# bad path does not leave an orphaned Chrome process behind.
driver = uc.Chrome(options=options, version_main=136)
try:
    for batch_num, batch_links in enumerate(batches, 1):
        batch_data = []

        print(f"\nBatch {batch_num}/{len(batches)} - gồm {len(batch_links)} link")
        for link in batch_links:
            try:
                driver.get(link)

                # Wait (up to 10 s) for the address element, then read its text.
                address = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".re__pr-short-description.js__pr-address"))
                ).text

                # Wait (up to 10 s) for the list of specification rows.
                specs_items = WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".re__pr-specs-content-item"))
                )

                # Each specification row contributes one title -> value pair.
                features = {}
                for item in specs_items:
                    key = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-title").text.strip()
                    value = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-value").text.strip()
                    features[key] = value

                data = {
                    "Link": link,
                    "Địa chỉ": address,
                    **features
                }

                batch_data.append(data)
                print(f"OK: {link}")

            except Exception as e:
                # Best effort: report the failing link and continue with the next one.
                print(f"Lỗi với {link}: {e}")

        # SAVE AFTER EVERY BATCH so an interrupted run can resume from the file.
        df_batch = pd.DataFrame(batch_data)
        if not df_batch.empty:
            df_done = pd.concat([df_done, df_batch], ignore_index=True)
            df_done = df_done.drop_duplicates(subset="Link")
            df_done.to_csv(output_file, index=False, encoding="utf-8-sig")
            print(f"Đã lưu xong batch {batch_num} ({len(df_batch)} link mới)")
finally:
    # Always close the browser, even on KeyboardInterrupt or a failed save.
    driver.quit()

print(f"\nCrawl hoàn tất. Tổng cộng đã lưu {len(df_done)} bài đăng vào '{output_file}'")


Còn lại 3911 link chưa crawl.

Batch 1/196 - gồm 20 link
OK: https://batdongsan.com.vn/ban-nha-biet-thu-lien-ke-duong-ly-son-phuong-gia-thuy-prj-khai-son-city/can-ban-nh-can-pho-100m2-doi-dien-chung-cu-mt6-2m-lh-pr41271624
OK: https://batdongsan.com.vn/ban-can-ho-chung-cu-duong-to-huu-phuong-yen-nghia-1-prj-xuan-mai-complex/ban-hh2f-73m2-sdcc-view-tang-o-ngay-pr42993619
OK: https://batdongsan.com.vn/ban-can-ho-chung-cu-duong-to-huu-phuong-trung-van-prj-ecolife-capitol/chinh-chu-ban-goc-3-ngu-capil-gia-6-ty-dt-111m2-day-du-noi-that-pr41204675
OK: https://batdongsan.com.vn/ban-can-ho-chung-cu-phuong-thach-ban-1-prj-khu-nha-o-thach-ban/-74-6-m2-2-pn-2-wc-gan-aeon-long-bien-tang-cao-thoang-pr42970359
OK: https://batdongsan.com.vn/ban-nha-biet-thu-lien-ke-xa-da-ton-prj-vinhomes-ocean-park-gia-lam/re-nhat-18-ty-ngoc-trai-dao-lon-gan-goc-70-8m2-tay-bac-lam-pr42217191
OK: https://batdongsan.com.vn/ban-can-ho-chung-cu-phuong-long-bien-prj-northern-diamond/ban-tai-100m2-co-slot-oto-pr42228541
OK