In [10]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
import os


In [11]:
def download_webpage_with_selenium(url, output_file="webpage.html", wait_seconds=5, page_load_timeout=30):
    """
    Download a page's rendered HTML with headless Chrome and save it to disk.

    The page is loaded through Selenium with a desktop Chrome user agent, the
    function then sleeps for ``wait_seconds`` so client-side JavaScript can run,
    and finally writes ``driver.page_source`` to ``output_file`` as UTF-8.

    Args:
        url (str): The URL to download.
        output_file (str): Path to save the HTML content.
        wait_seconds (float): Fixed delay after navigation before the page
            source is read. Defaults to 5.
        page_load_timeout (float): Page-load timeout handed to the driver, in
            seconds. Defaults to 30.

    Returns:
        str | None: ``output_file`` when the HTML was written, ``None`` when any
        step failed (the error is printed, not raised).
    """
    # Kept outside the try so the finally clause can tell whether a browser
    # was actually started before something went wrong.
    driver = None
    try:
        print(f"Setting up Chrome browser to access: {url}")

        # Configure Chrome options
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # Run in headless mode (no UI)
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--window-size=1920,1080")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--no-sandbox")

        # Add user agent to appear as a regular browser
        chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36")

        # Initialize the Chrome driver
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
        driver.set_page_load_timeout(page_load_timeout)

        print("Accessing webpage...")
        driver.get(url)

        # Give client-side JavaScript time to populate the DOM
        time.sleep(wait_seconds)

        html_content = driver.page_source

        with open(output_file, "w", encoding="utf-8") as file:
            file.write(html_content)

        print(f"HTML content successfully saved to {output_file}")
        print(f"HTML size: {len(html_content)} characters")

        return output_file

    except Exception as e:
        # Deliberate best-effort: callers test the return value for None.
        print(f"Error: {e}")
        return None

    finally:
        # Always shut the browser down, including on failure; otherwise every
        # failed run leaves a headless Chrome process behind.
        if driver is not None:
            try:
                driver.quit()
            except Exception as quit_error:
                print(f"Warning: could not close the browser cleanly: {quit_error}")


In [12]:
# Search-results page to fetch
target_url = "https://www.zonaprop.com.ar/departamentos-alquiler-la-plata-1-habitacion-desde-2-hasta-3-ambientes-1-garage.html"

# Destination for the downloaded HTML
output_file = "zonaprop_results.html"

# Fetch the page; a falsy result means the download failed
result = download_webpage_with_selenium(target_url, output_file)

if not result:
    print("Failed to download the webpage.")
else:
    # Report the size on disk in kilobytes
    file_size = os.path.getsize(output_file) / 1024
    print(f"File saved: {output_file} ({file_size:.2f} KB)")

    # Read back the first 300 characters as a quick sanity check
    with open(output_file, "r", encoding="utf-8") as file:
        preview = file.read(300)
    print("\nHTML Preview:")
    print(preview + "...\n")


Setting up Chrome browser to access: https://www.zonaprop.com.ar/departamentos-alquiler-la-plata-1-habitacion-desde-2-hasta-3-ambientes-1-garage.html
Accessing webpage...
HTML content successfully saved to zonaprop_results.html
HTML size: 738043 characters
File saved: zonaprop_results.html (721.73 KB)

HTML Preview:
<html lang="es-AR" class="DEFAULT"><head>
			<meta charset="utf-8">
			<meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=5.0 shrink-to-fit=no">
			<title>Departamentos con 1 Dormitorio con desde 2 hasta 3 Ambientes con mas de 1 Cochera en alquiler en La Plata, GBA S...



In [13]:
import requests
from bs4 import BeautifulSoup

def extract_elements_by_class(html_content, class_name):
    """
    Return the markup of every element carrying a given CSS class.

    Args:
        html_content (str): The HTML content to parse
        class_name (str): The class name to search for

    Returns:
        list: One string per matching element; each string is the element
        serialized together with its children
    """
    parsed = BeautifulSoup(html_content, 'html.parser')

    # str() on a match serializes the whole subtree, so children are preserved
    return list(map(str, parsed.find_all(class_=class_name)))

def extract_from_file(file_path, class_name):
    """
    Read an HTML file and return the markup of every element with a given class.

    Args:
        file_path (str): Path to the HTML file
        class_name (str): The class name to search for

    Returns:
        list: One string per matching element, or an empty list when an
        IOError occurs (the error is printed instead of raised)
    """
    try:
        with open(file_path, mode='r', encoding='utf-8') as handle:
            return extract_elements_by_class(handle.read(), class_name)
    except IOError as e:
        print(f"Error reading file: {e}")
    return []

# Saved search-results page and the class that marks each listing card
source = "zonaprop_results.html"
class_name = "postingCardLayout-module__posting-card-layout"

elements = extract_from_file(source, class_name)

print(f"Found {len(elements)} elements with class '{class_name}':")
for i, element in enumerate(elements, start=1):
    # One call emits the blank line, the header and the element markup
    print(f"\nElement {i}:\n{element}")

Found 30 elements with class 'postingCardLayout-module__posting-card-layout':

Element 1:
<div class="postingCardLayout-module__posting-card-layout" data-id="55835970" data-posting-type="PROPERTY" data-qa="posting PROPERTY" data-to-posting="/propiedades/clasificado/alclapin-departamento-en-alquiler-1-dormitorio-1-bano-47-55835970.html"><div class="postingCardLayout-module__posting-card-container"><div class="postingGallery-module__gallery-container postingGallery-module__container-width postingGallery-module__tablet-width" data-qa="POSTING_CARD_GALLERY"><div class="lazyload-wrapper" style="width:100%;height:100%;position:absolute"><div class="gallery-container"><div class="multimediaGallery flickity-enabled is-draggable" tabindex="0"><div class="flickity-viewport" style="height: 260px; touch-action: pan-y;"><div class="flickity-slider" style="left: 0px; transform: translateX(0%);"><img alt="Departamento · 47m² · 2 Ambientes · 1 Cochera · Departamento en Alquiler - 1 Dormitorio 1 Baño -

In [20]:
import re
import json
from bs4 import BeautifulSoup
from math import ceil

# Base used to turn site-relative listing paths into absolute URLs.
_ZONAPROP_BASE_URL = "https://www.zonaprop.com.ar"

# One rule per main-feature kind, checked in order (first marker found wins):
# (marker substring, pattern capturing the number, key for the parsed int,
#  key used for the raw text when no number can be captured)
_SPEC_RULES = (
    ('m²', r'(\d+)\s*m²', "total_area_m2", "total_area"),
    ('amb.', r'(\d+)\s*amb', "rooms", "rooms_text"),
    ('dorm.', r'(\d+)\s*dorm', "bedrooms", "bedrooms_text"),
    ('baño', r'(\d+)\s*baño', "bathrooms", "bathrooms_text"),
    ('coch.', r'(\d+)\s*coch', "parking_spaces", "parking_text"),
)

# Lower-case keyword looked up in the description -> feature tag, in output order.
_DESCRIPTION_FEATURES = (
    ('ascensor', "elevator"),
    ('balcón', "balcony"),
    ('placard', "built_in_closet"),
    ('cocina equipada', "equipped_kitchen"),
    ('luminoso', "bright"),
    ('amplio', "spacious"),
)


def _parse_price(price_str):
    """Return all digits of ``price_str`` as one int, or None if there are none."""
    if not price_str:
        return None
    digits = re.sub(r"[^\d]", "", price_str)
    return int(digits) if digits else None


def _text_of(soup, selector):
    """Return the stripped text of the first match for ``selector``, or None."""
    node = soup.select_one(selector)
    return node.text.strip() if node is not None else None


def _listing_url(soup, card):
    """Return the absolute listing URL from the first link, else from the card's data-to-posting."""
    # Local import keeps this block self-contained; urljoin is standard library.
    from urllib.parse import urljoin

    anchor = soup.select_one('a[href]')
    if anchor is not None:
        href = anchor['href']
    elif card is not None:
        href = card.get('data-to-posting')
    else:
        href = None

    if href is None:
        return None
    # urljoin leaves absolute hrefs alone and inserts the missing "/" for
    # relative ones, which plain string concatenation got wrong.
    return urljoin(_ZONAPROP_BASE_URL, href)


def _extract_specifications(soup):
    """Return the numeric specs (area, rooms, ...) read from the main-features block."""
    specs = {}
    container = soup.select_one('.postingMainFeatures-module__posting-main-features-block')
    if container is None:
        return specs

    for span in container.select('.postingMainFeatures-module__posting-main-features-span'):
        text = span.text.strip()
        for marker, pattern, numeric_key, text_key in _SPEC_RULES:
            if marker not in text:
                continue
            match = re.search(pattern, text)
            if match:
                specs[numeric_key] = int(match.group(1))
            else:
                # Marker present but no number: keep the raw text instead
                specs[text_key] = text
            break
    return specs


def _prune_empty(record):
    """Return a copy of ``record`` without None values, empty lists, or dicts left empty after dropping Nones."""
    pruned = {}
    for key, value in record.items():
        if value is None:
            continue
        if isinstance(value, dict):
            value = {k: v for k, v in value.items() if v is not None}
            if not value:
                continue
        elif isinstance(value, list) and not value:
            continue
        pruned[key] = value
    return pruned


def parse_apartment_listings(html_elements):
    """
    Parse a list of HTML apartment listing elements into structured dictionaries.

    Each element is parsed on its own. The resulting dictionary may contain
    ``id``, ``price`` (``monthly_rent`` and ``maintenance_fee`` as scraped text,
    ``total_price`` and ``price_per_sqm`` as ints), ``location`` (``address``,
    ``neighborhood``), ``specifications``, ``features``, ``url``,
    ``description`` and ``agency``. Keys whose value is None, an empty list, or
    a dict with no non-None values are omitted from the output.

    Args:
        html_elements (list): List of HTML strings representing apartment listings

    Returns:
        list: A list of dictionaries containing structured apartment information,
        one per input element and in the same order
    """
    apartments = []

    for html_element in html_elements:
        soup = BeautifulSoup(html_element, 'html.parser')

        # First <div> of the fragment; None when the fragment has no <div>,
        # in which case the data attributes are simply unavailable.
        card = soup.div

        specifications = _extract_specifications(soup)
        features = []

        # Description text, plus keyword-derived features
        description = None
        description_element = soup.select_one('.postingCard-module__posting-description a')
        if description_element is not None:
            description = description_element.text.strip()
            desc_text = description_element.text.lower()
            if 'piso alto' in desc_text:
                specifications["floor_level"] = "high"
            features.extend(
                tag for keyword, tag in _DESCRIPTION_FEATURES if keyword in desc_text
            )

        # Laundry can be announced by any pill, not only the first one
        pills = soup.select('.pills-module__trigger-pill-item-span')
        if any('lavadero' in pill.text.lower() for pill in pills):
            features.append("laundry")

        # Real estate agency, taken from the logo's alt text
        agency = None
        agency_logo = soup.select_one('.postingPublisher-module__logo')
        if agency_logo is not None and 'alt' in agency_logo.attrs:
            agency = agency_logo['alt'].replace('logo publisher', '').strip()

        price = {
            "monthly_rent": _text_of(soup, '.postingPrices-module__price'),
            "maintenance_fee": _text_of(soup, '.postingPrices-module__expenses'),
            "total_price": None,
            "price_per_sqm": None,
        }

        # NOTE(review): rent and expenses are added as bare numbers; if the two
        # can be quoted in different currencies the total is not meaningful —
        # confirm against real listings.
        monthly_rent = _parse_price(price["monthly_rent"])
        maintenance_fee = _parse_price(price["maintenance_fee"])
        area = specifications.get("total_area_m2")
        if monthly_rent is not None:
            total_price = monthly_rent + (maintenance_fee if maintenance_fee else 0)
            price["total_price"] = total_price
            if area:
                price["price_per_sqm"] = ceil(total_price / area)

        apartment_data = {
            "id": card.get('data-id') if card is not None else None,
            "price": price,
            "location": {
                "address": _text_of(soup, '.postingLocations-module__location-address'),
                "neighborhood": _text_of(soup, '.postingLocations-module__location-text'),
            },
            "specifications": specifications,
            "features": features,
            "url": _listing_url(soup, card),
            "description": description,
            "agency": agency,
            # Image extraction is currently disabled, so this list stays empty
            # and is dropped by the pruning step below.
            "images": [],
        }

        apartments.append(_prune_empty(apartment_data))

    return apartments

def process_apartment_listings(file_path):
    """
    Turn a saved search-results page into a JSON document of listings.

    Args:
        file_path (str): Path to the HTML file to read

    Returns:
        str: JSON text (2-space indent, non-ASCII characters left unescaped)
        holding the list produced by parse_apartment_listings
    """
    card_class = "postingCardLayout-module__posting-card-layout"
    listing_cards = extract_from_file(file_path, card_class)
    structured = parse_apartment_listings(listing_cards)
    return json.dumps(structured, indent=2, ensure_ascii=False)

# Build the JSON document from the saved search-results page
result = process_apartment_listings("zonaprop_results.html")

# Persist it as UTF-8, since the JSON is produced with ensure_ascii=False
with open("apartments.json", mode="w", encoding="utf-8") as f:
    f.write(result)