## Setting up

In [10]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
import random
import re
# Chrome configuration shared by every browser session started from this notebook.
options = webdriver.ChromeOptions()
# options.add_argument('--disable-blink-features=AutomationControlled')  # Disable automation flags
options.add_experimental_option('excludeSwitches', ['enable-logging'])
# Block image loading.
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
# options.add_argument('--headless')  # Uncomment for headless mode

# Resolve the ChromeDriver binary through webdriver_manager and wrap it in a Service.
service = Service(ChromeDriverManager().install())
# `driver` and `wait` are intentionally not created here; the run cell further
# down instantiates them right before scraping starts.

## Supporting functions

In [11]:
def remove_unwanted_div():
    """Remove the first element matching
    ``.re__listing-verified-similar-v2.js__listing-verified-similar`` from the page.

    The removal is done in the browser by running a JavaScript snippet through
    the module-level ``driver``; nothing happens when no element matches.
    """
    driver.execute_script("""
    var element = document.querySelector('.re__listing-verified-similar-v2.js__listing-verified-similar');
    if (element) {
        element.remove();
    }
    """)
def remove_search_form():
    """Remove the element with id ``boxSearchForm`` from the page, if it exists.

    The removal is done in the browser by running a JavaScript snippet through
    the module-level ``driver``.
    """
    driver.execute_script("""
    var element = document.querySelector('#boxSearchForm');
    if (element) {
        element.remove();
    }
    """)

def scroll_to_bottom():
    """Scroll the current page down to its full height, then pause for a random 1-3 seconds."""
    jump_to_end_js = "window.scrollTo(0, document.body.scrollHeight);"
    driver.execute_script(jump_to_end_js)
    human_like_delay(1, 3)
def human_like_delay(min_time=1, max_time=3):
    """Sleep for a random duration drawn uniformly between ``min_time`` and ``max_time`` seconds."""
    pause_seconds = random.uniform(min_time, max_time)
    time.sleep(pause_seconds)


## Scraping function (main page)

In [12]:
# One dict per scraped listing; scrape_page() appends to this list.
data_list = list()

# Keys every record is padded with: scrape_page() stores None for any of these
# that a listing does not provide.
STANDARD_KEYS = [
    'Diện tích',
    'Mức giá',
    'Mặt tiền',
    'Đường vào',
    'Hướng nhà',
    'Hướng ban công',
    'Số phòng ngủ',
    'Số toilet',
    'Pháp lý',
    'Số tầng',
    'Nội thất',
    'Longitude',
    'Latitude',
]

In [13]:
def scrape_main_page(url):
    """Open one search-results page and scrape the listing cards found on it.

    For every card the function re-locates the cards on the results page
    (presumably because element references from before a navigation are no
    longer usable), clicks the card, calls ``scrape_page()`` on the page that
    opens, and then navigates back to the results page.

    Args:
        url: Address of the search-results page to process.

    Errors raised while handling an individual card are printed and the loop
    continues with the next card. Errors raised before the loop starts (for
    example the initial wait timing out) propagate to the caller.
    """
    card_locator = (By.CLASS_NAME, 'js__product-link-for-product-id')

    driver.get(url)
    scroll_to_bottom()
    human_like_delay(2, 4)
    wait.until(EC.presence_of_all_elements_located(card_locator))
    remove_unwanted_div()
    cards = driver.find_elements(*card_locator)

    # NOTE(review): the last two matches are skipped, exactly as before;
    # presumably they are not regular listing cards - confirm.
    for i in range(len(cards) - 2):
        # Tracks whether the click actually left the results page. Going
        # "back" without having navigated would abandon the results page and
        # make every following card run against the wrong document.
        navigated = False
        try:
            print(len(cards))
            print(f"Clicking on card {i + 1}...")
            wait.until(EC.presence_of_all_elements_located(card_locator))
            remove_unwanted_div()
            cards = driver.find_elements(*card_locator)
            remove_search_form()
            human_like_delay(2, 3)

            driver.execute_script("arguments[0].scrollIntoView();", cards[i])
            human_like_delay(1, 2)
            cards[i].click()
            navigated = True

            human_like_delay(1, 3)

            scrape_page()

            human_like_delay(2, 5)

        except Exception as e:
            print(f"An error occurred while clicking on card {i + 1}: {e}")
        finally:
            if navigated:
                try:
                    driver.back()
                except Exception:
                    # History navigation failed; reload the results page directly.
                    driver.get(url)

## Scraping function (card)

In [14]:
# Matches "q=<lat>,<lng>" followed by "&" or the end of the URL. Either number
# may carry a leading minus sign (southern / western hemispheres).
_LAT_LNG_PATTERN = re.compile(r"q=(-?[\d.]+),(-?[\d.]+)(?:&|$)")


def _parse_lat_lng(data_src):
    """Return ``(lat, lng)`` strings parsed from a URL, or ``(None, None)`` if absent."""
    if not data_src:
        return None, None
    match = _LAT_LNG_PATTERN.search(data_src)
    if match is None:
        return None, None
    return match.group(1), match.group(2)


def extract_latitude_longitude():
    """Read the listing's coordinates from the ``data-src`` URL of an iframe.

    Waits (using the module-level ``wait``) for an ``iframe`` element that has
    a ``data-src`` attribute and parses the ``q=<lat>,<lng>`` pair out of that
    attribute.

    Returns:
        ``(latitude, longitude)`` as strings, or ``(None, None)`` when the
        iframe cannot be located, its ``data-src`` is empty, or the URL does
        not contain a coordinate pair. Errors are printed, never raised.
    """
    try:
        iframe = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'iframe[data-src]')))
        data_src = iframe.get_attribute('data-src')

        latitude, longitude = _parse_lat_lng(data_src)
        if latitude is None:
            print("Latitude and Longitude not found in the URL.")
            return None, None

        print(f"Latitude: {latitude}, Longitude: {longitude}")
        return latitude, longitude
    except Exception as e:
        print(f"Error extracting latitude and longitude: {e}")
        return None, None

In [15]:
def scrape_page():
    """Scrape the spec rows and coordinates of the page currently open in ``driver``.

    Builds a dict that maps each spec title to its value, adds ``Latitude`` and
    ``Longitude`` when both are available, fills every key of ``STANDARD_KEYS``
    that is still missing with ``None``, prints the dict and appends it to the
    module-level ``data_list``.

    A spec row whose title or value element cannot be read is skipped on its
    own, so one malformed row does not discard the whole listing. If the spec
    values do not appear within the wait timeout, a message is printed and
    nothing is appended.
    """
    data = {}
    try:
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, 're__pr-specs-content-item-value')))

        specs_items = driver.find_elements(By.CSS_SELECTOR, '.re__pr-specs-content-item')

        for item in specs_items:
            try:
                title_element = item.find_element(By.CLASS_NAME, 're__pr-specs-content-item-title')
                value_element = item.find_element(By.CLASS_NAME, 're__pr-specs-content-item-value')

                title = title_element.text.strip()
                value = value_element.text.strip()
            except (NoSuchElementException, StaleElementReferenceException) as e:
                # Keep the rest of the listing; only this row is unusable.
                print(f"Skipping an unreadable spec item: {e}")
                continue

            data[title] = value

        latitude, longitude = extract_latitude_longitude()
        if latitude and longitude:
            data['Latitude'] = latitude
            data['Longitude'] = longitude

        # Pad the record so every row ends up with the same set of keys.
        for key in STANDARD_KEYS:
            data.setdefault(key, None)

        print(data)
        data_list.append(data)
    except TimeoutException:
        print("Timeout reached while trying to load the card page.")

## Checkpoint

In [16]:
# Range of result pages to visit: START_PAGE inclusive, END_PAGE exclusive.
# NOTE(review): the start is above 1, presumably because this cell resumes an
# earlier run - adjust both bounds as needed.
START_PAGE = 5
END_PAGE = 152
BASE_URL = 'https://batdongsan.com.vn/ban-loai-bat-dong-san-khac-da-nang'
OUTPUT_CSV = 'scraped_data_only_gianha_batdongsancomvn.csv'

driver = webdriver.Chrome(service=service, options=options)
wait = WebDriverWait(driver, 10)
try:
    for i in range(START_PAGE, END_PAGE):
        # The first results page has no "/p<n>" path segment.
        if i == 1:
            url = f'{BASE_URL}?cIds=362'
        else:
            url = f'{BASE_URL}/p{i}?cIds=362'
        try:
            scrape_main_page(url)
        except Exception as e:
            # Stop at the first page that fails, but say why instead of
            # silently breaking out of the loop.
            print(f"An error occurred while scraping the main page (page {i}): {e}")
            break
finally:
    # Runs on normal completion, on errors and on a manual interrupt, so the
    # records collected so far are always written and the browser is always closed.
    try:
        print(f"Collected {len(data_list)} records")
        df = pd.DataFrame(data_list)
        df.to_csv(OUTPUT_CSV, encoding="utf-8", index=False)
        print("Saved successfully")
    finally:
        driver.quit()

23
Clicking on card 1...
Latitude: 16.0783986649086, Longitude: 108.236456036436
{'Diện tích': '76 m²', 'Mức giá': '8,5 tỷ', 'Mặt tiền': '45 m', 'Đường vào': '75 m', 'Số tầng': '2 tầng', 'Số phòng ngủ': '3 phòng', 'Số toilet': '2 phòng', 'Pháp lý': 'Sổ đỏ/ Sổ hồng', 'Nội thất': 'Cơ bản', 'Latitude': '16.0783986649086', 'Longitude': '108.236456036436', 'Hướng nhà': None, 'Hướng ban công': None}
23
Clicking on card 2...
Latitude: 16.0676815614428, Longitude: 108.234862780888
{'Diện tích': '79 m²', 'Mức giá': '6,8 tỷ', 'Mặt tiền': '4,5 m', 'Đường vào': '75 m', 'Hướng nhà': 'Đông', 'Hướng ban công': 'Đông', 'Số tầng': '4 tầng', 'Số phòng ngủ': '4 phòng', 'Số toilet': '4 phòng', 'Pháp lý': 'Sổ đỏ/ Sổ hồng', 'Nội thất': 'Đầy đủ', 'Latitude': '16.0676815614428', 'Longitude': '108.234862780888'}
23
Clicking on card 3...
Latitude: 16.0420371248445, Longitude: 108.243851399054
{'Diện tích': '97,3 m²', 'Mức giá': '5,5 tỷ', 'Mặt tiền': '4 m', 'Đường vào': '11,25 m', 'Hướng nhà': 'Tây - Nam', 'Hướn