In [9]:
# 1) Importing libraries
import re
import time
from urllib.parse import urljoin  # Safe joining of the site root with ad hrefs.

from bs4 import BeautifulSoup  # HTML document analysis for web scraping.
from selenium import webdriver  # Web browser automation for web scraping and automated testing.
from selenium.common.exceptions import TimeoutException  # Raised when an explicit wait expires.
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.firefox.service import Service  # Browser Service Control for Selenium.
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.firefox import GeckoDriverManager  # Automatic Firefox driver management for Selenium.

In [2]:
# 2) Price range used to filter the house and apartment listings.
#    Both bounds are inserted into the price filter of the search URL.
lower_price = 400_000
upper_price = 2_000_000

In [3]:
# 3) Build the URL of the results page the data will be extracted from.
def get_page_url(page_number, rango_i, rango_s):
    """Return the encuentra24 search URL for one page of results.

    Args:
        page_number: Results page index, placed right after "guatemala-es-guatemala.".
        rango_i: Lower bound of the price filter ("f_price.<rango_i>-<rango_s>").
        rango_s: Upper bound of the price filter.

    Returns:
        The URL as a string. The query fixes the currency to GTQ and restricts
        the categories to houses and apartments for sale.
    """
    url_template = "https://www.encuentra24.com/guatemala-es/bienes-raices-venta-de-propiedades-casas/guatemala-es-guatemala.{}?q=f_price.{}-{}|f_currency.GTQ|withcat.bienes-raices-venta-de-propiedades-casas,bienes-raices-venta-de-propiedades-apartamentos"
    page_url = url_template.format(page_number, rango_i, rango_s)
    return page_url

In [4]:
# 4) Create a driver object representing an automatically controlled Firefox web browser.
# NOTE(review): the driver is not closed in this cell; make sure a later cell
# calls driver.quit() once scraping is finished.
driver = webdriver.Firefox()
# Maximize the browser window.
driver.maximize_window()
# Go to the first results page to scrape.
driver.get(get_page_url(1, lower_price, upper_price))

# Maximum time (seconds) to wait for the ad tiles to be rendered.
PAGE_LOAD_TIMEOUT_S = 15

# Wait until at least one ad title is present instead of sleeping a fixed time:
# a fixed sleep is too short on a slow connection and wasted time on a fast one.
try:
    WebDriverWait(driver, PAGE_LOAD_TIMEOUT_S).until(
        EC.presence_of_element_located((By.CLASS_NAME, "d3-ad-tile__title"))
    )
except TimeoutException:
    # Keep the previous best-effort behaviour: parse whatever has loaded
    # (for example, a search that returned no ads), but say so.
    print("Warning: no ad tiles found within {} s; parsing the page as loaded.".format(PAGE_LOAD_TIMEOUT_S))

# Get the HTML code of the loaded page.
html_code = driver.page_source
# Parse the HTML with BeautifulSoup.
soup = BeautifulSoup(html_code, 'html.parser')

In [10]:
# Icon keyword (as it appears in the <use xlink:href="..."> reference) -> output field.
# Order matters: the first keyword contained in the icon reference wins.
_ICON_FIELDS = (
    ('resize', 'square_meters'),
    ('bed', 'rooms'),
    ('parking', 'parking'),
    ('bath', 'baths'),
)


def _first_int(text):
    """Return the first integer found in ``text``, or None when it has no digits.

    A comma followed by exactly three digits is treated as a thousands
    separator, so "1,200 m²" yields 1200 rather than 1.
    """
    match = re.search(r'\d+(?:,\d{3})*', text or '')
    if match is None:
        return None
    return int(match.group().replace(',', ''))


def _tile_text(house_div, css_class):
    """Return the stripped text of the first descendant with ``css_class``, or None."""
    node = house_div.find(class_=css_class)
    return node.text.strip() if node is not None else None


def div_html_to_obj(house_div):
    """Convert one ad tile element into a plain dictionary.

    Args:
        house_div: Parsed HTML element of a single ad tile. It must support the
            BeautifulSoup element API used here (``find``, ``find_all``,
            ``get`` and ``text``).

    Returns:
        dict with the keys:
            title, location: stripped text, or None when the element is missing.
            price: list of price strings (currency symbol "Q" or "$" followed
                by digits and separators) found in the price element; empty
                list when nothing matches.
            square_meters, rooms, parking, baths: int, or None when the detail
                is missing or has no digits.
            img_url: "data-original" attribute of the image, else "src", else None.
            url: absolute ad URL, or None when the tile has no link.
    """
    # Prefer the "data-original" attribute of the image; fall back to "src".
    img = house_div.find("img")
    img_url = (img.get('data-original') or img.get('src')) if img is not None else None

    # Location and title are both stripped so they are formatted consistently.
    location = _tile_text(house_div, 'd3-ad-tile__location')
    title = _tile_text(house_div, 'd3-ad-tile__title')

    # The price element may hold extra text; keep only the price tokens.
    price_text = _tile_text(house_div, 'd3-ad-tile__price') or ''
    original_price = re.findall(r'([Q$]\d[\d,.]+)', price_text)

    # Each detail item is identified by the icon it shows, not by its position.
    details = house_div.find(class_='d3-ad-tile__details')
    if details is not None:
        attributes = details.find_all('li', class_='d3-ad-tile__details-item')
    else:
        attributes = []

    parsed = {'square_meters': None, 'rooms': None, 'parking': None, 'baths': None}
    for attribute in attributes:
        icon = attribute.find('svg', class_='d3-icon d3-ad-tile__icon')
        use = icon.find('use') if icon is not None else None
        icon_ref = (use.get('xlink:href') if use is not None else None) or ''
        for keyword, field in _ICON_FIELDS:
            if keyword in icon_ref:
                parsed[field] = _first_int(attribute.text)
                break

    # urljoin keeps absolute hrefs intact and inserts the "/" a relative one may lack.
    link = house_div.find("a")
    href = link.get("href") if link is not None else None
    url = urljoin('https://www.encuentra24.com', href) if href else None

    # Return a dictionary with all extracted information.
    return {
        "title": title,
        "price": original_price,
        "square_meters": parsed['square_meters'],
        "rooms": parsed['rooms'],
        "parking": parsed['parking'],
        "baths": parsed['baths'],
        "location": location,
        "img_url": img_url,
        "url": url
    }