# Real Estate Web Scraping

In [48]:
import requests

# Quick reachability check of the target site before starting the browser-based scrape.
# A browser-like User-Agent is sent with the request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36'}
url = 'https://tibiaan.com/'
# requests applies no timeout by default; set one so an unresponsive server
# cannot block this cell indefinitely.
response = requests.get(url, headers=headers, timeout=30)
# Last expression of the cell: the notebook displays the HTTP status code.
response.status_code

200

In [61]:
import time
import csv
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import re

def setup_driver():
    """
    Build the Chrome WebDriver used by the scraper.

    The browser is started with a fixed set of command-line flags and a
    chromedriver binary resolved through webdriver_manager.

    Returns:
        The started WebDriver, or None when start-up fails (the error is
        printed rather than raised).
    """
    chrome_flags = (
        "--no-sandbox",             # Bypass OS security model
        "--disable-dev-shm-usage",  # Overcome limited resource problems
        "--window-size=1920,1080",  # Consistent viewport size
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    )

    options = Options()
    for flag in chrome_flags:
        options.add_argument(flag)

    try:
        # webdriver_manager resolves a chromedriver binary and returns its path
        chromedriver_path = ChromeDriverManager().install()
        return webdriver.Chrome(service=Service(chromedriver_path), options=options)
    except Exception as e:
        print(f"Error initializing Chrome driver: {e}")
        return None

def _select_sale_from_dropdown(driver):
    """Pick the first <select> option whose text contains 'sale', then try to submit."""
    for select_elem in driver.find_elements(By.TAG_NAME, "select"):
        select_obj = Select(select_elem)

        for option in select_obj.options:
            option_label = option.text
            print(f"      Found dropdown option: '{option_label}'")

            # 'for sale' also contains 'sale', so one substring test covers both
            if 'sale' not in option_label.lower():
                continue

            print(f"   Successfully located Sale option: '{option_label}'")
            select_obj.select_by_visible_text(option_label)
            time.sleep(3)  # Allow filter to process

            # Look for an associated submit/search button
            submit_buttons = driver.find_elements(
                By.CSS_SELECTOR,
                "button[type='submit'], input[type='submit'], .btn, .search-btn, button")

            for btn in submit_buttons:
                try:
                    if btn.is_displayed() and btn.is_enabled():
                        btn_text = btn.text.lower()
                        # NOTE: an empty label also qualifies, so the first visible
                        # text-less button is clicked.
                        if 'search' in btn_text or 'submit' in btn_text or 'go' in btn_text or btn_text == '':
                            print(f"   Clicking submit button: '{btn.text}'")
                            btn.click()
                            time.sleep(5)
                            return True
                except Exception:
                    # A button that cannot be inspected or clicked must not stop
                    # us from trying the remaining ones. Unlike a bare `except:`,
                    # this still lets KeyboardInterrupt/SystemExit propagate.
                    continue

            # If no explicit button was clicked, assume auto-submit behavior
            time.sleep(3)
            return True

    return False


def _select_sale_radio(driver):
    """Click the first radio button whose parent element's text mentions 'sale'."""
    for radio in driver.find_elements(By.CSS_SELECTOR, "input[type='radio']"):
        # The radio's label text is looked up on its direct parent element
        parent = radio.find_element(By.XPATH, "./..")
        if 'sale' in parent.text.lower():
            print(f"   Found Sale radio button: '{parent.text}'")
            radio.click()
            time.sleep(3)
            return True
    return False


def _click_sale_text_element(driver):
    """Click a visible button/link/div/span whose whole text is a known 'sale' label."""
    sale_elements = driver.find_elements(
        By.XPATH,
        "//button[contains(text(), 'Sale')] | //a[contains(text(), 'Sale')] | //div[contains(text(), 'Sale')] | //span[contains(text(), 'Sale')]")

    for element in sale_elements:
        if element.is_displayed():
            element_text = element.text.strip()
            print(f"   Found Sale element: '{element_text}'")

            # Only click elements whose entire text is one of these labels
            if element_text.lower() in ['sale', 'for sale', 'properties for sale']:
                print(f"   Clicking Sale element: '{element_text}'")
                element.click()
                time.sleep(5)
                return True
    return False


def _click_sale_tab(driver):
    """Click the first visible element matching one of the sale-tab CSS selectors."""
    tab_selectors = [
        "a[href*='sale']",
        "button[data-tab*='sale']",
        ".tab-sale",
        "#sale-tab",
        "a[href='#sale']"
    ]

    for selector in tab_selectors:
        for element in driver.find_elements(By.CSS_SELECTOR, selector):
            if element.is_displayed():
                print(f"   Found Sale tab with selector: {selector}")
                element.click()
                time.sleep(5)
                return True
    return False


def select_sale_filter(driver):
    """
    Try to activate the site's 'For Sale' filter on the currently loaded page.

    Four strategies are attempted in order, stopping at the first that succeeds:
    a dropdown option, a radio button, a clickable element with sale text, and
    a sale tab/link. A strategy that raises is reported and the next is tried.

    Args:
        driver: Selenium WebDriver with the target page already loaded.

    Returns:
        True if one strategy selected/clicked a sale control, False otherwise
        (including when an unexpected error occurs).
    """
    try:
        print("Step 1: Locating and selecting the Sale filter...")

        # Wait for the page body before interacting with it
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
        time.sleep(3)  # Additional buffer for dynamic content

        # (announcement, strategy, failure-message prefix), in priority order
        strategies = (
            ("   Searching for dropdown menu with Sale option...",
             _select_sale_from_dropdown,
             "   Dropdown method unsuccessful: "),
            ("   Searching for radio button filters...",
             _select_sale_radio,
             "   Radio button method unsuccessful: "),
            ("   Searching for clickable Sale elements...",
             _click_sale_text_element,
             "   Clickable elements method unsuccessful: "),
            ("   Searching for Sale tab navigation...",
             _click_sale_tab,
             "   Tab navigation method unsuccessful: "),
        )

        for announcement, strategy, failure_prefix in strategies:
            print(announcement)
            try:
                if strategy(driver):
                    return True
            except Exception as e:
                # Best effort: report and fall through to the next strategy
                print(f"{failure_prefix}{e}")

        print("   Warning: Could not locate sale filter, proceeding with all properties")
        return False

    except Exception as e:
        print(f"Error in sale filter selection: {e}")
        return False

def detect_pagination(driver, default_pages=21):
    """
    Estimate how many result pages are available on the current page.

    First looks for a visible pagination element and takes the largest numeric
    link text inside it. If that fails, searches the raw page source for
    textual hints such as "page 1 of N". If nothing conclusive is found, or an
    unexpected error occurs, the fallback count is returned.

    Args:
        driver: Selenium WebDriver with the results page loaded.
        default_pages: Page count returned when detection is inconclusive or
            fails (defaults to 21, the value previously hard-coded here).

    Returns:
        The detected (or fallback) number of pages as an int.
    """
    try:
        print("Step 2: Analyzing pagination structure...")

        # Allow time for pagination elements to load after filter selection
        time.sleep(3)

        # Candidate selectors, most specific first; the last one matches any
        # element whose class attribute contains "page".
        pagination_selectors = [
            ".pagination",
            ".page-numbers",
            ".paging",
            "ul.pagination",
            ".pagination-container",
            "[class*='page']"
        ]

        for selector in pagination_selectors:
            try:
                pagination = driver.find_element(By.CSS_SELECTOR, selector)
                if pagination.is_displayed():
                    print(f"   Located pagination using selector: {selector}")

                    # Collect the links whose text is purely numeric
                    page_links = pagination.find_elements(By.TAG_NAME, "a")
                    page_numbers = []

                    for link in page_links:
                        link_text = link.text.strip()
                        if link_text.isdigit():
                            page_numbers.append(int(link_text))
                            print(f"      Detected page: {link_text}")

                    # The highest page number shown is taken as the page count
                    if page_numbers:
                        total_pages = max(page_numbers)
                        print(f"   Total pages identified: {total_pages}")
                        return total_pages

            except NoSuchElementException:
                continue

        # Alternative approach: search for pagination indicators in page text
        page_source = driver.page_source

        page_patterns = [
            r'page\s+\d+\s+of\s+(\d+)',
            r'(\d+)\s+pages?',
            r'showing\s+\d+-\d+\s+of\s+\d+\s+(?:.*?(\d+)\s+pages?)?',
        ]

        for pattern in page_patterns:
            matches = re.findall(pattern, page_source, re.IGNORECASE)
            # The last pattern's capture group is optional, so findall can yield
            # empty strings; int('') would raise, so keep only real captures.
            page_counts = [match for match in matches if match]
            if page_counts:
                total_pages = int(page_counts[0])
                print(f"   Detected {total_pages} pages from text analysis")
                return total_pages

        print(f"   Pagination detection inconclusive - defaulting to {default_pages} pages")
        return default_pages

    except Exception as e:
        print(f"Error in pagination detection: {e}")
        return default_pages

def extract_properties_from_current_page(driver, page_num):
    """
    Collect the property records shown on the page the driver currently displays.

    The page source is parsed with BeautifulSoup, the listing containers are
    located with the first CSS selector that matches anything, and each
    container is passed to extract_property_details.

    Args:
        driver: Selenium WebDriver positioned on a results page.
        page_num: Page number, used for log messages and passed on to the
            per-property extractor.

    Returns:
        A list of property dicts; empty when nothing is found or an error occurs.
    """
    try:
        print(f"Step 3.{page_num}: Extracting properties from page {page_num}...")

        # Give the page time to finish rendering before reading its source
        time.sleep(3)

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Listing-container selectors, tried in order until one matches
        candidate_selectors = (
            ".listing-item",
            ".property-item",
            ".property-listing",
            "[data-id]",
            ".property-card",
            ".listing-card",
        )

        listings = []
        for css in candidate_selectors:
            found = soup.select(css)
            if not found:
                continue
            print(f"   Located {len(found)} properties using selector: {css}")
            listings = found
            break

        if not listings:
            print(f"   No properties found on page {page_num}")
            return []

        results = []
        for index, listing in enumerate(listings, start=1):
            try:
                details = extract_property_details(listing, page_num, index)
                if not details:
                    continue
                results.append(details)
                print(f"   Processed property {index}: {details.get('Title', 'No title')[:40]}...")
            except Exception as e:
                # One bad listing must not abort the rest of the page
                print(f"   Error processing property {index}: {e}")
                continue

        print(f"   Page {page_num} summary: {len(results)} properties extracted")
        return results

    except Exception as e:
        print(f"Error extracting properties from page {page_num}: {e}")
        return []

# Comma-grouped number ("1,250,000") or a plain run of digits ("150000").
# The alternation keeps an unformatted price from being cut to its first three digits.
_PRICE_NUMBER = re.compile(r'\d{1,3}(?:,\d{3})+|\d+')

# Substrings that mark a line next to the map marker as a location.
_LOCATION_HINTS = ('Al ', 'Muscat', 'Bausher', 'Ghala', 'Seeb')

# Fallback location patterns. They are matched case-insensitively, so the \b
# anchors stop them from firing inside other words (e.g. the "al" that ends
# "Commercial" or "Residential").
_LOCATION_PATTERNS = (
    r'\b(Al\s+[A-Za-z]+(?:\s+[A-Za-z]+)*)',
    r'\b(Muscat\s+[A-Za-z]+(?:\s+[A-Za-z]+)*)',
    r'\b(Bausher|Azaiba|Ghala|Seeb|Mabela|Hail|Amrat|Rusayl|Khoudh|Qurum|Maabela|Mawaleh|Khuwair)\b',
)

# (pattern, unit label) pairs for free-text area measurements, in priority order.
_SIZE_PATTERNS = (
    (r'(\d+(?:,\d+)?\s*(?:sq\.?\s*)?m\b)', 'sqm'),
    (r'(\d+(?:,\d+)?\s*sqm\b)', 'sqm'),
    (r'(\d+(?:,\d+)?\s*(?:sq\.?\s*)?ft\b)', 'sqft'),
    (r'(\d+(?:,\d+)?)\s*(?:sq\.?\s*)?(?:m|meter|metre)', 'sqm'),
)


def _extract_listing_url(property_element):
    """Return the listing's absolute URL, or None when no link is found."""
    url = None
    # Image link first, then the title link, then any link into /property/
    for selector in ('a.listing-img-container', '.listing-title h4 a', 'a[href*="/property/"]'):
        link = property_element.select_one(selector)
        if link and link.get('href'):
            url = link['href']
            break

    # Turn site-relative links into absolute ones
    if url and not url.startswith('http'):
        if url.startswith('/'):
            url = 'https://tibiaan.com' + url
        else:
            url = 'https://tibiaan.com/' + url
    return url


def _extract_listing_title(property_element, page_num, prop_num):
    """Return the listing title, falling back to image alt text, then a placeholder."""
    title = None
    for selector in ('.listing-title h4 a', '.listing-title h4'):
        heading = property_element.select_one(selector)
        if heading:
            title = heading.get_text(strip=True)
        if title:
            break

    if not title:
        img = property_element.select_one('img[alt]')
        if img:
            title = img.get('alt', '')

    return title if title else f"Property {page_num}-{prop_num}"


def _extract_listing_price(property_element):
    """Return the price as '<amount> <unit>' text, or None when no price is found."""
    price = None
    price_value = property_element.select_one('.priceValue')
    price_unit = property_element.select_one('.priceUnit')

    if price_value and price_unit:
        price = f"{price_value.get_text(strip=True)} {price_unit.get_text(strip=True)}"
    elif price_value:
        # No unit element: the currency is assumed to be OMR
        price = f"{price_value.get_text(strip=True)} OMR"

    if not price:
        price_div = property_element.select_one('.listing-price')
        if price_div:
            price_match = _PRICE_NUMBER.search(price_div.get_text())
            if price_match:
                price = f"{price_match.group(0)} OMR"
    return price


def _extract_listing_location(property_element):
    """Return the location text near the map marker, else a pattern match, else None."""
    map_marker = property_element.select_one('i.fa.fa-map-marker')
    if map_marker:
        parent = map_marker.find_parent()
        if parent:
            for part in parent.get_text().split('\n'):
                part = part.strip()
                if part and part != 'Details':
                    if any(hint in part for hint in _LOCATION_HINTS):
                        return part

    # Join text nodes with a space so words from adjacent tags are not glued
    # together, which would defeat the \b anchors in the patterns.
    all_text = property_element.get_text(' ')
    for pattern in _LOCATION_PATTERNS:
        match = re.search(pattern, all_text, re.IGNORECASE)
        if match:
            return match.group(1)
    return None


def _first_number(element):
    """Return the first run of digits in the element's text, or None."""
    if not element:
        return None
    match = re.search(r'(\d+)', element.get_text(strip=True))
    return match.group(1) if match else None


def _extract_icon_count(property_element, icon_selector):
    """Return the number shown in the <li> holding the given icon (beds/baths), or None."""
    # Method 1: list item that contains the icon
    count = _first_number(property_element.select_one(f'li:has({icon_selector})'))
    if count:
        return count

    # Method 2: start from the icon and walk up to its enclosing list item
    icon = property_element.select_one(icon_selector)
    if icon:
        return _first_number(icon.find_parent('li'))
    return None


def _extract_listing_size(property_element):
    """Return the property area as '<number> <unit>' text, or None."""
    # Method 1: hidden input inside the area section (labelled sqm, as before)
    area_li = property_element.select_one('li.areaSection')
    if area_li:
        area_input = area_li.select_one('input.oldArea')
        if area_input:
            area_value = area_input.get('value')
            if area_value:
                return f"{area_value} sqm"

    # Method 2: free-text measurements anywhere in the listing
    all_text = property_element.get_text()
    for pattern, unit in _SIZE_PATTERNS:
        matches = re.findall(pattern, all_text, re.IGNORECASE)
        if not matches:
            continue

        sizes = []
        for match in matches:
            size_num = re.sub(r'[^\d,]', '', match).replace(',', '')
            if size_num.isdigit():
                sizes.append(int(size_num))

        if sizes:
            # The largest measurement is taken as the main property size
            max_size = max(sizes)
            if max_size >= 50:  # Filter unreasonably small values
                # Label with the unit the pattern matched, so square-feet
                # values are no longer reported as sqm
                return f"{max_size:,} {unit}"
    return None


def extract_property_details(property_element, page_num, prop_num):
    """
    Extract one listing's attributes from its parsed HTML element.

    Args:
        property_element: BeautifulSoup element for a single listing.
        page_num: Page the listing was found on (used for the placeholder
            title and error messages).
        prop_num: 1-based position of the listing on that page.

    Returns:
        A dict with keys 'URL', 'Title', 'Price', 'Location', 'Bedrooms',
        'Bathrooms' and 'Size' (values may be None), plus 'Property_ID' when
        the element has a data-id attribute. Returns None if extraction raises.
    """
    data = {}

    try:
        data['URL'] = _extract_listing_url(property_element)

        # Unique identifier, only recorded when the attribute is present
        data_id = property_element.get('data-id')
        if data_id:
            data['Property_ID'] = data_id

        data['Title'] = _extract_listing_title(property_element, page_num, prop_num)
        data['Price'] = _extract_listing_price(property_element)
        data['Location'] = _extract_listing_location(property_element)
        data['Bedrooms'] = _extract_icon_count(property_element, 'span.fa.fa-bed')
        data['Bathrooms'] = _extract_icon_count(property_element, 'span.fa.fa-bath')
        data['Size'] = _extract_listing_size(property_element)

        return data

    except Exception as e:
        print(f"      Error extracting data from property {page_num}-{prop_num}: {e}")
        return None

def navigate_to_page(driver, page_num):
    """
    Move the browser to the requested results page.

    A pagination link whose text equals the page number is clicked when one is
    present and visible. Otherwise a series of candidate URLs is loaded until
    one shows listing elements.

    Args:
        driver: Selenium WebDriver currently on a results page.
        page_num: Target page number.

    Returns:
        True when the link was clicked or a candidate URL showed listings,
        False otherwise (including on unexpected errors).
    """
    try:
        print(f"Step 4.{page_num}: Navigating to page {page_num}...")

        # Method 1: click the numbered pagination link
        try:
            numbered_link = driver.find_element(By.XPATH, f"//a[text()='{page_num}']")
            if numbered_link.is_displayed():
                numbered_link.click()
                time.sleep(3)
                return True
        except NoSuchElementException:
            pass

        # Method 2: guess the paginated URL
        here = driver.current_url
        root = here.split('?')[0]
        joiner = '&' if '?' in here else '?'

        candidate_urls = [
            f"{root}?page={page_num}",
            f"{root}?p={page_num}",
            f"{root}/page/{page_num}",
            f"{root}?paged={page_num}",
            f"{here}{joiner}page={page_num}",
        ]

        for candidate in candidate_urls:
            try:
                print(f"   Attempting URL pattern: {candidate}")
                driver.get(candidate)
                time.sleep(3)

                # A candidate counts as working when it shows any listing element
                listings = driver.find_elements(By.CSS_SELECTOR, ".listing-item, .property-item, [data-id]")
                if listings:
                    print(f"   Successfully navigated to page {page_num}")
                    return True
            except Exception:
                continue

        print(f"   Navigation to page {page_num} unsuccessful")
        return False

    except Exception as e:
        print(f"Error navigating to page {page_num}: {e}")
        return False

def _write_properties_csv(properties, filename):
    """Write the collected property dicts to a UTF-8 CSV file with a fixed column order."""
    fieldnames = ['Property_ID', 'Title', 'Location', 'Price', 'Size', 'Bedrooms', 'Bathrooms', 'URL']
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(properties)


def _print_collection_summary(properties, filename, pages_processed, total_pages):
    """Print totals, data-quality counts and a five-item sample of the collected data."""
    print(f"\nData collection completed successfully")
    print(f"Total properties scraped: {len(properties)}")
    print(f"Output file: {filename}")

    with_prices = sum(1 for p in properties if p.get('Price'))
    with_locations = sum(1 for p in properties if p.get('Location'))
    with_bedrooms = sum(1 for p in properties if p.get('Bedrooms'))

    print(f"\nData quality summary:")
    print(f"   Properties with price information: {with_prices}")
    print(f"   Properties with location data: {with_locations}")
    print(f"   Properties with bedroom details: {with_bedrooms}")
    # Report pages that actually yielded data, not merely the detected page count
    print(f"   Pages successfully processed: {pages_processed} of {total_pages}")

    print(f"\nSample of collected properties:")
    for i, prop in enumerate(properties[:5], 1):
        print(f"\n{i}. {prop.get('Title', 'No Title')}")
        print(f"   Price: {prop.get('Price', 'Not specified')}")
        print(f"   Location: {prop.get('Location', 'Not specified')}")
        print(f"   Size: {prop.get('Size', 'Not specified')}")
        print(f"   Bedrooms: {prop.get('Bedrooms', 'Not specified')}")
        print(f"   Bathrooms: {prop.get('Bathrooms', 'Not specified')}")
        print(f"   URL: {prop.get('URL', '')}")


def scrape_all_sale_properties():
    """
    Run the whole scrape: start the browser, apply the sale filter, walk every
    results page, and write the collected properties to a CSV file.

    Progress and a final summary are printed. Errors on a single page are
    reported and that page is skipped. The browser is always closed at the end.

    Returns:
        None. Output is the CSV file 'tibiaan_sale_properties_final.csv',
        written only when at least one property was collected.
    """
    print("Tibiaan Properties Web Scraper - Sale Properties Collection")
    print("=" * 65)

    driver = setup_driver()
    if not driver:
        return

    all_sale_properties = []
    pages_processed = 0  # pages that yielded at least one property

    try:
        # Phase 1: Load initial page
        print("Phase 1: Loading homepage and configuring filters...")
        driver.get("https://tibiaan.com/")
        time.sleep(5)

        # Phase 2: Restrict results to sale properties where possible
        sale_filter_selected = select_sale_filter(driver)
        if sale_filter_selected:
            print("Sale filter successfully applied")
        else:
            print("Warning: Sale filter could not be applied, proceeding with all properties")

        # Phase 3: Determine how many pages to visit
        total_pages = detect_pagination(driver)
        print(f"Pagination analysis complete: {total_pages} pages identified")

        # Phase 4: Collect data page by page
        for page_num in range(1, total_pages + 1):
            try:
                print(f"\nProcessing page {page_num} of {total_pages}")
                print("-" * 45)

                # Page 1 is already displayed; later pages need navigation
                if page_num > 1:
                    if not navigate_to_page(driver, page_num):
                        print(f"Skipping page {page_num} due to navigation failure")
                        continue

                page_properties = extract_properties_from_current_page(driver, page_num)

                if page_properties:
                    all_sale_properties.extend(page_properties)
                    pages_processed += 1
                    print(f"Page {page_num} complete: {len(page_properties)} properties collected")
                    print(f"Running total: {len(all_sale_properties)} properties")
                else:
                    print(f"No properties found on page {page_num}")

                # Respectful delay between page requests
                time.sleep(2)

            except Exception as e:
                print(f"Error processing page {page_num}: {e}")
                continue

        # Phase 5: Output and summary
        if all_sale_properties:
            filename = 'tibiaan_sale_properties_final.csv'
            _write_properties_csv(all_sale_properties, filename)
            _print_collection_summary(all_sale_properties, filename, pages_processed, total_pages)
        else:
            print("No sale properties were successfully collected")

    except Exception as e:
        print(f"Critical error in main process: {e}")

    finally:
        # Ensure proper cleanup of browser resources
        driver.quit()

if __name__ == "__main__":
    # Start the full scrape only when this code is run directly, not when imported
    scrape_all_sale_properties()

Tibiaan Properties Web Scraper - Sale Properties Collection
Phase 1: Loading homepage and configuring filters...
Step 1: Locating and selecting the Sale filter...
   Searching for dropdown menu with Sale option...
      Found dropdown option: ''
      Found dropdown option: ''
      Found dropdown option: ''
      Found dropdown option: ''
      Found dropdown option: ''
      Found dropdown option: ''
      Found dropdown option: ''
      Found dropdown option: ''
      Found dropdown option: 'For Rent'
      Found dropdown option: 'For Sale'
   Successfully located Sale option: 'For Sale'
   Clicking submit button: ''
Sale filter successfully applied
Step 2: Analyzing pagination structure...
   Located pagination using selector: .pagination
      Detected page: 1
      Detected page: 2
      Detected page: 3
      Detected page: 17
      Detected page: 18
      Detected page: 19
   Total pages identified: 19
Pagination analysis complete: 19 pages identified

Processing page 1 of 19
-