In [None]:
import numpy as np
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import pandas as pd

# Declare browser
chromedriver_path = "chromedriver.exe"
service = Service(chromedriver_path)
driver = webdriver.Chrome(service=service)

# Open URL
driver.get("https://batdongsan.com.vn/ban-can-ho-chung-cu-ha-noi?cIds=362&tpl=list")
driver.implicitly_wait(10)

# ============================ Pagination Function ============================
def go_to_next_page(driver, count, max_page):
    try:
        # Check if there are still pages to crawl (count <= max_page)
        if count <= max_page:
            # Create the next page link using the current count (page number)
            link_next_page = f"https://batdongsan.com.vn/ban-can-ho-chung-cu-ha-noi/p{count}?cIds=362&tpl=list"

            # Navigate to the next page
            driver.get(link_next_page)
            driver.implicitly_wait(10)
            return True  # Successfully navigated to the next page
        else:
            print("No more pages left to crawl!")
            return False  # No more pages to crawl, break the loop
    except Exception as e:
        print(f"Error navigating to next page: {e}")
        return False  # If there is an error, stop crawling

id = []
links = []
title1 = []

count = 1
max_pages = 2430

while count <= max_pages:
    try:
        print("Crawl Page " + str(count))
        # ================================ GET id/links/title1
        elems_id_links_title1 = driver.find_elements(By.CSS_SELECTOR, ".js__product-link-for-product-id")
        id += [elem.get_attribute('data-product-id') for elem in elems_id_links_title1]
        links += [elem.get_attribute('href') for elem in elems_id_links_title1]
        title1 += [elem.get_attribute('title') for elem in elems_id_links_title1]

        # Move to next page
        if not go_to_next_page(driver, count + 1, max_pages):
            break  # If there is no next page, exit the loop
        print("Crawl Page " + str(count) + " done!")
        count += 1  # Increment the page number to move to the next page

    except Exception as e:
        print(f"Error on page {count}: {e}")
        break  # If there is an error on the current page, stop the loop

df1 = pd.DataFrame(list(zip(id, links, title1)), columns = ['id', 'links', 'title1'])
df1['index_'] = np.arange(1, len(df1) + 1)

df1.to_csv('df1.csv', index=False, encoding='utf-8')

# ============================GET INFOMATION OF ALL ITEMS
def getDetailItems(link):
    driver.get(link)

    title2 , title3, district, city, floors, price, price_per_m2, area, balcony, toilets, bedrooms, street_frontage, legal, interior, date = [], [], [], [], [], [], [], [], [], [], [], [], [], [], []

    # ================================ GET title2
    elems_title2 = driver.find_elements(By.CSS_SELECTOR, '.re__link-se[level="4"]')
    title2 = [elem.text for elem in elems_title2]

    # ================================ GET district
    elems_district = driver.find_elements(By.CSS_SELECTOR, '.re__link-se[level="3"]')
    district = [elem.text for elem in elems_district]

    # ================================ GET city
    elems_city = driver.find_elements(By.CSS_SELECTOR, '.re__link-se[level="2"]')
    city = [elem.text for elem in elems_city]

    # ================================ GET title3
    elems_title3 = driver.find_elements(By.CSS_SELECTOR, ".re__pr-short-description.js__pr-address")
    title3 = [elem.text for elem in elems_title3]

    # ================================ GET floors
    items = driver.find_elements(By.CSS_SELECTOR, ".re__pr-specs-content-item")
    for item in items:
        title_floors = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-title").text
        floors = [item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-value").text]
        if title_floors == "Số tầng":
            found = True
            break
        else: found = False
        if found == False:
            floors = ["N/A"]

    # ================================ GET price
    items = driver.find_elements(By.CSS_SELECTOR, ".re__pr-short-info-item.js__pr-short-info-item")
    for item in items:
        title_price = item.find_element(By.CSS_SELECTOR, ".title").text
        price = [item.find_element(By.CSS_SELECTOR, ".value").text]
        if title_price == "Mức giá":
            found = True
            break
        else: found = False
        if found == False:
            price = ["N/A"]

    # ================================ GET area
    items = driver.find_elements(By.CSS_SELECTOR, ".re__pr-short-info-item.js__pr-short-info-item")
    for item in items:
        title_area = item.find_element(By.CSS_SELECTOR, ".title").text
        area = [item.find_element(By.CSS_SELECTOR, ".value").text]
        if title_area == "Diện tích":
            found = True
            break
        else: found = False
        if found == False:
            area = ["N/A"]

    # ================================ GET price_per_m2
    try:
        items = driver.find_elements(By.CLASS_NAME, "re__pr-short-info-item")
        for item in items:
            if "triệu/m²" in item.text:
                price_per_m2 = [item.find_element(By.CLASS_NAME, "ext").text]
                break
            else: price_per_m2 = ["N/A"]
    except ValueError:
        price_per_m2 = ["N/A"]

    # ================================ GET balcony
    items = driver.find_elements(By.CSS_SELECTOR, ".re__pr-specs-content-item")
    for item in items:
        title_balcony = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-title").text
        balcony = [item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-value").text]
        if title_balcony == "Hướng ban công":
            found = True
            break
        else: found = False
        if found == False:
            balcony = ["N/A"]

     # ================================ GET toilets
    items = driver.find_elements(By.CSS_SELECTOR, ".re__pr-specs-content-item")
    for item in items:
        title_toilets = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-title").text
        toilets = [item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-value").text]
        if title_toilets == "Số phòng tắm, vệ sinh":
            found = True
            break
        else: found = False
        if found == False:
            toilets = ["N/A"]

     # ================================ GET bedrooms
    items = driver.find_elements(By.CSS_SELECTOR, ".re__pr-specs-content-item")
    for item in items:
        title_bedrooms = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-title").text
        bedrooms = [item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-value").text]
        if title_bedrooms == "Số phòng ngủ":
            found = True
            break
        else: found = False
        if found == False:
            bedrooms = ["N/A"]

    # ================================ GET street frontage
    items = driver.find_elements(By.CSS_SELECTOR, ".re__pr-specs-content-item")
    for item in items:
        title_street_frontage = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-title").text
        street_frontage = [item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-value").text]
        if title_street_frontage == "Mặt tiền":
            found = True
            break
        else: found = False
        if found == False:
            street_frontage = ["N/A"]

    # ================================ GET legal
    items = driver.find_elements(By.CSS_SELECTOR, ".re__pr-specs-content-item")
    for item in items:
        title_legal = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-title").text
        legal = [item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-value").text]
        if title_legal== "Pháp lý":
            found = True
            break
        else: found = False
        if found == False:
            legal = ["N/A"]

    # ================================ GET interior
    items = driver.find_elements(By.CSS_SELECTOR, ".re__pr-specs-content-item")
    for item in items:
        title_interior = item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-title").text
        interior = [item.find_element(By.CSS_SELECTOR, ".re__pr-specs-content-item-value").text]
        if title_interior== "Nội thất":
           found = True
           break
        else: found = False
        if found == False:
            interior = ["N/A"]

     # ================================ GET date
    items = driver.find_elements(By.CSS_SELECTOR, ".re__pr-short-info-item.js__pr-config-item")
    for item in items:
        title_date = item.find_element(By.CSS_SELECTOR, ".title").text
        date = [item.find_element(By.CSS_SELECTOR, ".value").text]
        if title_date == "Ngày đăng":
            found = True
            break
        else: found = False
        if found == False:
            date = ["N/A"]
    print ("Crawl Row " + str (count_rows) + " done!")

    df_temp = pd.DataFrame(list(zip(price, price_per_m2, area, title2 , title3, district, city, floors, toilets, bedrooms, balcony, street_frontage, legal, interior, date)),
                   columns = ['price', 'price_per_m2', 'area', 'title2' , 'title3', 'district', 'city', 'floors', 'toilets', 'bedrooms', 'balcony', 'street_frontage', 'legal', 'interior', 'date'])
    df_temp.insert(0, "links", link)
    return df_temp

df2 = pd.DataFrame()
count_rows = 1
for link in links:
    print ("Crawl Row " + str (count_rows))
    df_temp = getDetailItems(link)
    df2 = pd.concat([df2, df_temp], ignore_index=True)
    count_rows += 1

df2.to_csv('df2.csv', index=False, encoding='utf-8')

# Close the driver
driver.quit()

Crawl Page 1
Crawl Page 1 done!
Crawl Page 2
Crawl Page 2 done!
Crawl Page 3
Crawl Page 3 done!
Crawl Page 4
Crawl Page 4 done!
Crawl Page 5
Crawl Page 5 done!
Crawl Page 6
Crawl Page 6 done!
Crawl Page 7
Crawl Page 7 done!
Crawl Page 8
Crawl Page 8 done!
Crawl Page 9
Crawl Page 9 done!
Crawl Page 10
Crawl Page 10 done!
Crawl Page 11
Crawl Page 11 done!
Crawl Page 12
Crawl Page 12 done!
Crawl Page 13
Crawl Page 13 done!
Crawl Page 14
Crawl Page 14 done!
Crawl Page 15
Crawl Page 15 done!
Crawl Page 16
Crawl Page 16 done!
Crawl Page 17
Crawl Page 17 done!
Crawl Page 18
Crawl Page 18 done!
Crawl Page 19
Crawl Page 19 done!
Crawl Page 20
Crawl Page 20 done!
Crawl Page 21
Crawl Page 21 done!
Crawl Page 22
Crawl Page 22 done!
Crawl Page 23
Crawl Page 23 done!
Crawl Page 24
Crawl Page 24 done!
Crawl Page 25
Crawl Page 25 done!
Crawl Page 26
Crawl Page 26 done!
Crawl Page 27
Crawl Page 27 done!
Crawl Page 28
Crawl Page 28 done!
Crawl Page 29
Crawl Page 29 done!
Crawl Page 30
Crawl Page 30 done

ReadTimeoutError: HTTPConnectionPool(host='localhost', port=24789): Read timed out. (read timeout=120)

In [2]:
# Merge df1 and df2 based on 'links'
df3 = pd.merge(df1, df2, how='left', on='links')
df = pd.merge(df1, df2, how='left', on='links')

df = df.drop_duplicates(keep='first')

# Save to file csv
df3.to_csv('df.csv', index=False, encoding='utf-8')
df.to_csv('df.csv', index=False, encoding='utf-8')


In [3]:
print(f"Unique links in df1: {df1['links'].nunique()}")
print(f"Unique links in df2: {df2['links'].nunique()}")
print(f"Unique links after merge: {df['links'].nunique()}")

Unique links in df1: 48052
Unique links in df2: 3908
Unique links after merge: 48052
