In [60]:
# 1) Importing libraries
from selenium import webdriver # Web browser automation for web scraping and automated testing.
from selenium.webdriver.firefox.service import Service # Browser Service Control for Selenium.
from webdriver_manager.firefox import GeckoDriverManager # Automatic Firefox driver management for Selenium.
from selenium.webdriver.firefox.options import Options as FirefoxOptions # Run the browser in headless mode or set other preferences.
from selenium.webdriver.common.by import By # Locate elements on the web page in different ways, such as by ID, name, class, label, among others.
from selenium.webdriver.support.ui import WebDriverWait # Wait until a specific condition is met on the web page, such as an element being present or visible.
from selenium.webdriver.support import expected_conditions as EC # Expect certain events to occur on the web page.

from bs4 import BeautifulSoup # HTML document analysis  for web scraping.

import time # This module provides functions to manage time.

import re # Provides functions for working with regular expressions, which are used to find and manipulate patterns in text.

import math # Provides standard mathematical functions, such as arithmetic operations and algebra calculations.

import pandas as pd # It is a library for data manipulation and analysis in Python.

import requests # This module makes it easy to send HTTP requests in Python.

In [61]:
# 2) Scraping configuration and result accumulators.
# Price range used to filter house and apartment ads: (lower bound, upper bound).
lower_price, upper_price = 400000, 2000000
# Accumulators filled by later cells: house_apartment_list receives one dictionary per ad,
# list_description receives one description per ad.
house_apartment_list, list_description = [], []

In [62]:
# 3) Web page URL from which the data will be extracted.
def get_page_url(page_number,rango_i,rango_s):
    """Build the listing URL for one page of search results.

    Args:
        page_number: Number of the results page, inserted after the region slug.
        rango_i: Lower bound of the price filter.
        rango_s: Upper bound of the price filter.

    Returns:
        The URL string with the three values substituted, in that order, into
        the three placeholders of the template.
    """
    url_template = "https://www.encuentra24.com/guatemala-es/bienes-raices-venta-de-propiedades-casas/guatemala-es-guatemala.{}?q=f_price.{}-{}|f_currency.GTQ|withcat.bienes-raices-venta-de-propiedades-casas,bienes-raices-venta-de-propiedades-apartamentos"
    return url_template.format(page_number, rango_i, rango_s)

In [63]:
# 3) Create a driver object representing an automatically controlled Firefox web browser.
driver = webdriver.Firefox()
# Maximize the browser window.
driver.maximize_window()
# Open the first results page for the configured price range.
driver.get(get_page_url(1,lower_price,upper_price))
# Wait until the results counter (the element web_page_number() reads) is present in the
# DOM, instead of sleeping for a fixed time: the wait returns as soon as the element
# exists and raises TimeoutException after 15 seconds if it never appears, rather than
# letting a half-loaded page fail later with an AttributeError.
WebDriverWait(driver, 15).until(
    EC.presence_of_element_located((By.CLASS_NAME, "d3-category-list__results"))
)
# Get HTML code from the loaded page.
html_code = driver.page_source
# Parse HTML with BeautifulSoup.
soup = BeautifulSoup(html_code, 'html.parser')

In [64]:
# 4) Function that extracts the basic information of each ad (title, price, square meters,
# number of rooms, parking spaces and bathrooms, location, image url and ad url).
def _first_int(text):
    """Return the first integer found in ``text``, or None when it contains no digits.

    A comma followed by exactly three digits is read as a thousands separator,
    so "1,200 m2" gives 1200 instead of stopping at the comma and giving 1.
    """
    match = re.search(r'\d{1,3}(?:,\d{3})+(?!\d)|\d+', text)
    if match is None:
        return None
    return int(match.group().replace(',', ''))


def _class_text(node, css_class):
    """Return the text of the first element under ``node`` with ``css_class``, or None."""
    element = node.find(class_=css_class)
    return None if element is None else element.text


def div_html_to_obj(house_div):
    """Convert one ad tile into a dictionary of its basic fields.

    Args:
        house_div: Parsed element of one ad tile (as returned by BeautifulSoup).

    Returns:
        dict with the keys ``title``, ``price``, ``square_meters``, ``rooms``,
        ``parking``, ``baths``, ``location``, ``img_url`` and ``url``.
        ``price`` is the list of all price strings matched in the price text
        (empty when there is none). Any other field whose element is missing,
        or whose numeric text holds no digits, is None.

    Raises:
        TypeError / KeyError: if the tile has no ``<a>`` element or the link has
            no ``href``. The URL is deliberately not optional, so every
            returned record can be visited later.
    """
    # Prefer the "data-original" attribute of the image; fall back to "src".
    # A tile without <img> simply has no image URL.
    img = house_div.find("img")
    img_url = None
    if img is not None:
        img_url = img.get('data-original')
        if img_url is None:
            img_url = img.get('src')

    # Extract ad location (stripped) and title (as found).
    location = _class_text(house_div, 'd3-ad-tile__location')
    if location is not None:
        location = location.strip()
    title = _class_text(house_div, 'd3-ad-tile__title')

    # Every price in the price text: a currency sign (Q or $) followed by digits,
    # commas and dots.
    price_dirt = _class_text(house_div, "d3-ad-tile__price") or ""
    original_price = re.findall(r'([Q$]\d[\d,.]+)', price_dirt)

    # Detail items (square meters, rooms, parking, baths); none if the list is absent.
    details = house_div.find(class_='d3-ad-tile__details')
    if details is None:
        attributes = []
    else:
        attributes = details.find_all('li', class_='d3-ad-tile__details-item')

    parsed = {"square_meters": None, "rooms": None, "parking": None, "baths": None}
    # Keyword looked for in the icon reference -> field it identifies. The order
    # matters: the first keyword found decides the field.
    icon_fields = (
        ('resize', 'square_meters'),
        ('bed', 'rooms'),
        ('parking', 'parking'),
        ('bath', 'baths'),
    )
    for attribute in attributes:
        # The kind of detail is only identifiable through its icon reference.
        icon = attribute.find('svg', class_='d3-icon d3-ad-tile__icon')
        use = icon.find('use') if icon is not None else None
        icon_use = use.get('xlink:href') if use is not None else None
        if not icon_use:
            continue
        for keyword, field in icon_fields:
            if keyword in icon_use:
                # All four fields share the same tolerant parsing, so values such
                # as "3+" no longer abort the whole scrape.
                parsed[field] = _first_int(attribute.text.strip())
                break

    # Build full ad URL.
    url = 'https://www.encuentra24.com' + house_div.find("a")["href"]

    # Return a dictionary with all extracted information.
    return {
        "title": title,
        "price": original_price,
        "square_meters": parsed["square_meters"],
        "rooms": parsed["rooms"],
        "parking": parsed["parking"],
        "baths": parsed["baths"],
        "location": location,
        "img_url": img_url,
        "url": url
    }

In [65]:
def parser_and_save_items(page_number):
    """Load one results page and append every ad found on it to ``house_apartment_list``.

    Despite its name, ``page_number`` is handed straight to ``driver.get`` and
    therefore has to be the full URL of the results page.

    Relies on the notebook-level ``driver`` and ``house_apartment_list``.
    Returns None; the result is the ads appended to the list.
    """
    # Open the page in the browser and parse whatever HTML it currently holds.
    driver.get(page_number)
    page_soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Each 'd3-ad-tile' div is one ad; turn it into a dictionary and store it.
    house_apartment_list.extend(
        div_html_to_obj(tile)
        for tile in page_soup.find_all("div", class_="d3-ad-tile")
    )

In [66]:
# 5) Work out how many result pages the current search has.
def web_page_number():
    """Return the number of result pages, read from the results counter of the global ``soup``.

    The text of the 'd3-category-list__results' element is split on whitespace:
    token 5 (after removing commas) is taken as the total number of ads and
    token 3 as the number of ads per page. The quotient is rounded up, so a
    partially filled last page still counts as a page.
    """
    counter_text = soup.find(class_ = "d3-category-list__results").text
    ads_total = int(counter_text.replace(",", "").split()[5])
    ads_per_page = int(counter_text.split()[3])
    return math.ceil(ads_total / ads_per_page)

In [67]:
# 6) Extracts the total number of ads and the number of ads per page, and calculates the
# total number of pages needed, rounding up to the nearest integer.
# NOTE(review): an earlier cell already defines web_page_number with the same body; this
# definition replaces it when the notebook runs top to bottom, so one of the two can go.
def web_page_number():
    """Return how many result pages the search held in the global ``soup`` spans.

    Reads the text of the 'd3-category-list__results' element. Split on
    whitespace, token 5 (commas removed) is the total number of ads and
    token 3 is the number of ads per page.
    """
    results_text = soup.find(class_ = "d3-category-list__results").text
    total_token = results_text.replace(",", "").split()[5]
    per_page_token = results_text.split()[3]

    # Round up: a last page that is only partly filled is still a page to visit.
    return math.ceil(int(total_token) / int(per_page_token))

In [68]:
# 7) Scrape and process property listings from a web page.
# NOTE(review): an earlier cell already defines parser_and_save_items with the same body;
# this definition replaces it when the notebook runs top to bottom, so one of the two can go.
def parser_and_save_items(page_number):
    """Visit one results page and store each of its listings in ``house_apartment_list``.

    ``page_number`` is given unchanged to ``driver.get``, so in practice it is
    the URL of the results page rather than a number.

    Uses the notebook-level ``driver`` and ``house_apartment_list``; returns None.
    """
    # Point the browser at the page and parse its current HTML.
    driver.get(page_number)
    listing_soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Every 'div' with class 'd3-ad-tile' is one property listing.
    for listing in listing_soup.find_all("div", class_="d3-ad-tile"):
        house_apartment_list.append(div_html_to_obj(listing))

In [69]:
# 8) Walk every results page, from page 1 through the page count returned by
# web_page_number(), and collect the ads of each page.
for actual_page in range(1, web_page_number() + 1):
    # Build the listing URL of this page for the configured price range.
    a = get_page_url(actual_page, lower_price, upper_price)

    # Load that URL and append every ad found on it to house_apartment_list.
    parser_and_save_items(a)
# NOTE(review): no visible cell calls driver.quit(), so the Firefox session presumably
# stays open after scraping; consider closing it once this loop has finished.

In [70]:
# 9) Build a DataFrame with one row per scraped ad; its columns are the keys of the
# dictionaries returned by div_html_to_obj (title, price, square_meters, rooms, parking,
# baths, location, img_url, url).
df = pd.DataFrame(house_apartment_list)

In [72]:
# 10) Scrape property descriptions function.
def obtain_description(ad_url, timeout=10):
    """Download one ad page and return the text of its description.

    Args:
        ad_url: URL of the ad detail page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` may wait indefinitely on an unresponsive
            server and stall the whole scrape.

    Returns:
        The description with surrounding whitespace stripped and newlines
        replaced by spaces. None when the page has no
        'd3-property-about__text' element or when the request or parsing
        fails; a message is printed in both cases.
    """
    # HTTP headers that mimic a real web browser request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        response = requests.get(ad_url, headers=headers, timeout=timeout)
        # Turn 4xx/5xx responses into an exception handled below.
        response.raise_for_status()

        # Parse the page and look for the element holding the description.
        soup = BeautifulSoup(response.content, 'html.parser')
        description_element = soup.find('div', class_='d3-property-about__text')

        if description_element is None:
            print(f"Description not found in {ad_url}")
            return None

        # Collapse the description onto a single line.
        return description_element.text.strip().replace("\n", " ")

    except Exception as e:
        # Deliberately broad: one failing ad (timeout, HTTP error, bad page) must not
        # abort the scraping of all the others.
        print(f"Error getting description of {ad_url}: {e}")
        return None

In [75]:
# 11) Obtain the description of every scraped ad.

# URLs of the ads in `house_apartment_list` whose 'url' field is defined.
ad_urls = [ad["url"] for ad in house_apartment_list if ad["url"]]

# Start from an empty list, so that running this cell again does not append a second copy
# of the descriptions (list_description must end up with exactly one entry per ad).
list_description.clear()

# Visit the ads in the order of house_apartment_list, so that position k of
# list_description belongs to ad k (and therefore to row k of a DataFrame built from it).
for ad in house_apartment_list:
    # An ad without URL gets None instead of being skipped: skipping it would shift every
    # later description up by one row.
    obt_description = obtain_description(ad["url"]) if ad["url"] else None
    # Append obtained description to 'list_description'.
    list_description.append(obt_description)

Description not found in https://www.encuentra24.com/guatemala-es/bienes-raices-venta-de-propiedades-apartamentos/vendo-apartamento-en-zona-11-acodi/25598378?q=f_price.400000-2000000|f_currency.GTQ|withcat.bienes-raices-venta-de-propiedades-casas,bienes-raices-venta-de-propiedades-apartamentos&regionslug=guatemala-es-guatemala&list=categoryregion&categoryslug=bienes-raices-venta-de-propiedades-casas
Description not found in https://www.encuentra24.com/guatemala-es/bienes-raices-venta-de-propiedades-casas/vendo-casa-en-zona-4-de-mixco-acodi/25598170?q=f_price.400000-2000000|f_currency.GTQ|withcat.bienes-raices-venta-de-propiedades-casas,bienes-raices-venta-de-propiedades-apartamentos&regionslug=guatemala-es-guatemala&list=categoryregion&categoryslug=bienes-raices-venta-de-propiedades-casas


In [80]:
# 12) Attach the scraped descriptions to the DataFrame as the `adv_description` column.
# list_description needs exactly one entry per row of df; pandas raises a
# length-mismatch error otherwise.
df = df.assign(adv_description=list_description)

In [83]:
# 13) Save the raw scraped DataFrame to disk.

# Directory that receives the file.
# NOTE(review): absolute path tied to one Windows user profile; on any other machine the
# file will not land in the intended place. Consider a configurable, relative data directory.
directory = 'C:\\Users\\DAV\\Documents\\Python\\Python_Project\\House_price_project\\data\\raw\\'
# File name of the export.
filename = 'raw-data.csv'
# Full file path: the directory string already ends with a separator, so plain
# concatenation is enough.
output_path = directory + filename
# Export DataFrame `df` to the CSV file 'raw-data.csv', without the index column.
df.to_csv(output_path,index=False)