In [2]:
# 1) Importing libraries
from selenium import webdriver # Web browser automation for web scraping and automated testing.
from selenium.webdriver.firefox.service import Service # Browser Service Control for Selenium.
from webdriver_manager.firefox import GeckoDriverManager # Automatic Firefox driver management for Selenium.
from selenium.webdriver.firefox.options import Options as FirefoxOptions

from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from bs4 import BeautifulSoup # HTML document analysis  for web scraping.

import time

import re

import math

In [3]:
# 2) Search configuration: price range used to filter house and apartment ads.
lower_price, upper_price = 400_000, 2_000_000
# Accumulator for the parsed ads (one dictionary per house/apartment ad).
house_apartment_list = list()

In [4]:
# 3) Web page URL from which the data will be extracted
def get_page_url(page_number, rango_i, rango_s):
    """Build the results-page URL for one page of the price-filtered search.

    Parameters
    ----------
    page_number :
        Page index inserted after ``guatemala-es-guatemala.`` in the path.
    rango_i :
        Lower bound of the ``f_price`` filter.
    rango_s :
        Upper bound of the ``f_price`` filter.

    Returns
    -------
    str
        The complete URL, with the currency filter fixed to ``GTQ`` and the
        category filter covering both houses and apartments.
    """
    listing_root = "https://www.encuentra24.com/guatemala-es/bienes-raices-venta-de-propiedades-casas/guatemala-es-guatemala"
    category_filter = "bienes-raices-venta-de-propiedades-casas,bienes-raices-venta-de-propiedades-apartamentos"
    return f"{listing_root}.{page_number}?q=f_price.{rango_i}-{rango_s}|f_currency.GTQ|withcat.{category_filter}"

In [5]:
# 3) Create a driver object representing an automatically controlled Firefox web browser.
driver = webdriver.Firefox()
# Maximize the browser window.
driver.maximize_window()
# Open the first results page for the configured price range.
driver.get(get_page_url(1, lower_price, upper_price))
# Wait explicitly (up to 15 s) for the results counter instead of sleeping a fixed
# 5 s: web_page_number() reads this element from `soup`, so the snapshot below must
# not be taken before it exists. Raises selenium's TimeoutException if the element
# never appears, which surfaces a failed load here rather than later.
WebDriverWait(driver, 15).until(
    EC.presence_of_element_located((By.CLASS_NAME, "d3-category-list__results"))
)
# Snapshot the HTML of the loaded page.
html_code = driver.page_source
# Parse the snapshot with BeautifulSoup; `soup` is read later by web_page_number().
soup = BeautifulSoup(html_code, 'html.parser')

In [6]:
# 4) Function that extracts the basic information of each ad (title, price, square meters,
# number of rooms, parking spaces and bathrooms, location, image URL and ad URL).
def div_html_to_obj(house_div):
    """Convert one ad tile element into a flat dictionary.

    Every lookup tolerates missing markup: a tile lacking some element yields
    ``None`` for that field (``[]`` for ``price``) instead of raising, so one
    incomplete ad does not abort the loop that parses a whole results page.

    Parameters
    ----------
    house_div : bs4.element.Tag
        Tile element of a single ad; its parts are located through the
        ``d3-ad-tile__*`` CSS classes and the first ``img`` / ``a`` tags.

    Returns
    -------
    dict
        Keys ``title``, ``price``, ``square_meters``, ``rooms``, ``parking``,
        ``baths``, ``location``, ``img_url`` and ``url``. ``price`` is the list
        of every ``Q``/``$`` amount found in the price text (strings, as
        returned by ``re.findall``); the four numeric details are ``int`` or
        ``None``.
    """
    def text_of(css_class):
        # Text of the first descendant carrying `css_class`, or None when absent.
        node = house_div.find(class_=css_class)
        return node.text if node is not None else None

    def first_int(text):
        # First integer in `text`. A properly comma-grouped number ("1,200") is
        # read as a whole instead of being cut at the comma; anything else falls
        # back to the first plain digit run ("2.5" -> 2). None when no digits.
        # Assumes the comma is a thousands separator here - TODO confirm on site.
        match = re.search(r'\d{1,3}(?:,\d{3})+(?!\d)|\d+', text)
        return int(match.group().replace(',', '')) if match else None

    # Image URL: prefer the "data-original" attribute, fall back to "src".
    img = house_div.find("img")
    img_url = None
    if img is not None:
        img_url = img.get('data-original') or img.get('src')

    # Location (stripped) and title (kept verbatim, as before).
    location = text_of('d3-ad-tile__location')
    if location is not None:
        location = location.strip()
    title = text_of('d3-ad-tile__title')

    # Every currency-prefixed amount in the raw price text.
    price_dirt = text_of('d3-ad-tile__price')
    price_regex = r'([Q$]\d[\d,.]+)'
    original_price = re.findall(price_regex, price_dirt) if price_dirt is not None else []

    # Detail items are identified by a keyword in their icon reference. The
    # dict order is the match precedence (first keyword found wins per item).
    details = {'resize': None, 'bed': None, 'parking': None, 'bath': None}
    details_box = house_div.find(class_='d3-ad-tile__details')
    items = []
    if details_box is not None:
        items = details_box.find_all('li', class_='d3-ad-tile__details-item')

    for item in items:
        icon = item.find('svg', class_='d3-icon d3-ad-tile__icon')
        use_tag = icon.find('use') if icon is not None else None
        icon_use = use_tag.get('xlink:href') if use_tag is not None else None
        if not icon_use:
            # No recognizable icon: this item cannot be classified.
            continue
        for keyword in details:
            if keyword in icon_use:
                details[keyword] = first_int(item.text)
                break

    # Full ad URL: site root plus the href of the tile's first link.
    link = house_div.find("a")
    href = link.get("href") if link is not None else None
    url = 'https://www.encuentra24.com' + href if href else None

    return {
        "title": title,
        "price": original_price,
        "square_meters": details['resize'],
        "rooms": details['bed'],
        "parking": details['parking'],
        "baths": details['bath'],
        "location": location,
        "img_url": img_url,
        "url": url
    }

In [8]:
def parser_and_save_items(page_number):
    """Load one results page and append every ad found on it to the global list.

    Parameters
    ----------
    page_number :
        Despite the name, this value is handed straight to ``driver.get`` and
        is therefore treated as the URL of the page to open.

    Side effects
    ------------
    Navigates the module-level ``driver`` and appends one dictionary per
    ``div.d3-ad-tile`` element (built by ``div_html_to_obj``) to the
    module-level ``house_apartment_list``. Returns ``None``.
    """
    # Open the page and parse whatever HTML the browser currently holds.
    driver.get(page_number)
    page_soup = BeautifulSoup(driver.page_source, 'html.parser')

    # One tile per ad; convert each and store it as soon as it is parsed.
    for ad_tile in page_soup.find_all("div", class_="d3-ad-tile"):
        house_apartment_list.append(div_html_to_obj(ad_tile))

In [9]:
# 5) Computes how many result pages the search has, from the results counter of a parsed page.
def web_page_number(page_soup=None):
    """Return the number of result pages, rounded up to a whole page.

    Parameters
    ----------
    page_soup : optional
        Parsed results page to read the counter from. When omitted, the
        module-level ``soup`` is used, which keeps existing no-argument calls
        working.

    Returns
    -------
    int
        ``ceil(total ads / ads per page)``, or ``0`` when the per-page count
        is zero.

    Raises
    ------
    AttributeError
        If the page has no ``d3-category-list__results`` element.
    IndexError, ValueError
        If the counter text does not have numbers at the expected positions.
    """
    if page_soup is None:
        page_soup = soup

    # Look the counter up once. Commas are removed before tokenizing so that
    # both numbers may carry a thousands separator.
    # Assumes whitespace-separated token 5 is the total and token 3 the per-page
    # count (presumably text like "Mostrando 1 - 30 de 1,234 resultados") - TODO confirm.
    tokens = page_soup.find(class_="d3-category-list__results").text.replace(",", "").split()
    total_number_ads = int(tokens[5])
    total_ads_page = int(tokens[3])

    # Nothing listed per page means there are no pages to visit.
    if total_ads_page == 0:
        return 0

    # Round up so a partially filled last page is still counted.
    return math.ceil(total_number_ads / total_ads_page)

In [None]:
# 6) Extracts the total number of ads and the ads per page, then calculates the number of pages needed, rounded up.
def web_page_number(page_soup=None):
    """Return the number of result pages, rounded up to a whole page.

    Parameters
    ----------
    page_soup : optional
        Parsed results page to read the counter from. When omitted, the
        module-level ``soup`` is used, which keeps existing no-argument calls
        working.

    Returns
    -------
    int
        ``ceil(total ads / ads per page)``, or ``0`` when the per-page count
        is zero.

    Raises
    ------
    AttributeError
        If the page has no ``d3-category-list__results`` element.
    IndexError, ValueError
        If the counter text does not have numbers at the expected positions.
    """
    if page_soup is None:
        page_soup = soup

    # Look the counter up once. Commas are removed before tokenizing so that
    # both numbers may carry a thousands separator.
    # Assumes whitespace-separated token 5 is the total and token 3 the per-page
    # count (presumably text like "Mostrando 1 - 30 de 1,234 resultados") - TODO confirm.
    tokens = page_soup.find(class_="d3-category-list__results").text.replace(",", "").split()
    total_number_ads = int(tokens[5])
    total_ads_page = int(tokens[3])

    # Nothing listed per page means there are no pages to visit.
    if total_ads_page == 0:
        return 0

    # Round up so a partially filled last page is still counted.
    return math.ceil(total_number_ads / total_ads_page)

In [1]:
# 7) Get basic information from each ad.
def div_html_to_obj(house_div):
    """Convert one ad tile element into a flat dictionary.

    Every lookup tolerates missing markup: a tile lacking some element yields
    ``None`` for that field (``[]`` for ``price``) instead of raising, so one
    incomplete ad does not abort the loop that parses a whole results page.

    Parameters
    ----------
    house_div : bs4.element.Tag
        Tile element of a single ad; its parts are located through the
        ``d3-ad-tile__*`` CSS classes and the first ``img`` / ``a`` tags.

    Returns
    -------
    dict
        Keys ``title``, ``price``, ``square_meters``, ``rooms``, ``parking``,
        ``baths``, ``location``, ``img_url`` and ``url``. ``price`` is the list
        of every ``Q``/``$`` amount found in the price text (strings, as
        returned by ``re.findall``); the four numeric details are ``int`` or
        ``None``.
    """
    def text_of(css_class):
        # Text of the first descendant carrying `css_class`, or None when absent.
        node = house_div.find(class_=css_class)
        return node.text if node is not None else None

    def first_int(text):
        # First integer in `text`. A properly comma-grouped number ("1,200") is
        # read as a whole instead of being cut at the comma; anything else falls
        # back to the first plain digit run ("2.5" -> 2). None when no digits.
        # Assumes the comma is a thousands separator here - TODO confirm on site.
        match = re.search(r'\d{1,3}(?:,\d{3})+(?!\d)|\d+', text)
        return int(match.group().replace(',', '')) if match else None

    # Image URL: prefer the 'data-original' attribute, fall back to 'src'.
    img = house_div.find("img")
    img_url = None
    if img is not None:
        img_url = img.get('data-original') or img.get('src')

    # Location (stripped) and title (kept verbatim, as before).
    location = text_of('d3-ad-tile__location')
    if location is not None:
        location = location.strip()
    title = text_of('d3-ad-tile__title')

    # Every currency-prefixed amount (Q or $) in the raw price text.
    price_dirt = text_of('d3-ad-tile__price')
    price_regex = r'([Q$]\d[\d,.]+)'
    original_price = re.findall(price_regex, price_dirt) if price_dirt is not None else []

    # Detail items are identified by a keyword in their icon reference. The
    # dict order is the match precedence (first keyword found wins per item).
    details = {'resize': None, 'bed': None, 'parking': None, 'bath': None}
    details_box = house_div.find(class_='d3-ad-tile__details')
    items = []
    if details_box is not None:
        items = details_box.find_all('li', class_='d3-ad-tile__details-item')

    for item in items:
        icon = item.find('svg', class_='d3-icon d3-ad-tile__icon')
        use_tag = icon.find('use') if icon is not None else None
        # Icon type comes from the 'xlink:href' attribute of the <use> tag.
        icon_use = use_tag.get('xlink:href') if use_tag is not None else None
        if not icon_use:
            # No recognizable icon: this item cannot be classified.
            continue
        for keyword in details:
            if keyword in icon_use:
                details[keyword] = first_int(item.text)
                break

    # Property listing URL: site root plus the href of the tile's first link.
    link = house_div.find("a")
    href = link.get("href") if link is not None else None
    url = 'https://www.encuentra24.com' + href if href else None

    # Return the extracted information as a dictionary.
    return {
        "title": title,
        "price": original_price,
        "square_meters": details['resize'],
        "rooms": details['bed'],
        "parking": details['parking'],
        "baths": details['bath'],
        "location": location,
        "img_url": img_url,
        "url": url
    }

12