# Real Estate Data Collection and Cleaning Project

### for https://vistaoman.com/ 

#### Project Objective
#### This project is designed to simulate a real-world data science task. You will scrape property data from two real estate websites, clean and integrate the data, engineer features, and frame a predictive modeling challenge based on pricing.


In [23]:
import pandas as pd
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from bs4 import BeautifulSoup
import time
import json
import re
from urllib.parse import urljoin, urlparse
import logging
from datetime import datetime

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

print("All libraries imported successfully, we are good to go!")

All libraries imported successfully, we are good to go!


In [74]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import pandas as pd
import time
import re
import requests
from bs4 import BeautifulSoup

def comprehensive_property_scraper():
    """
    Advanced property scraper using multiple extraction methods
    for maximum data retrieval from Vista Oman real estate listings
    """
    
    print("Initializing comprehensive property scraper...")
    print("Target: Vista Oman property listings - Sale category")
    
    # Configure Chrome options for anti-detection
    options = Options()
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
    
    driver = webdriver.Chrome(options=options)
    all_properties = []
    
    try:
        print("Navigating to Vista Oman sale listings...")
        driver.get("https://vistaoman.com/offer-type/sale/")
        time.sleep(5)
        
        # Handle Cloudflare protection if present
        if "cloudflare" in driver.page_source.lower():
            print("Cloudflare protection detected. Please complete verification manually.")
            input("Press ENTER after completing verification: ")
        
        print("Method 1: Automated 'Load More' button clicking")
        
        # Method 1: Systematic Load More button interaction
        # This approach attempts to load all available properties by repeatedly clicking pagination
        for attempt in range(100):
            print(f"Load attempt {attempt + 1}/100")
            
            # Count current properties before attempting to load more
            current_props = len(driver.find_elements(By.CSS_SELECTOR, "article"))
            print(f"   Current property count: {current_props}")
            
            # Define multiple selector patterns for Load More buttons
            # Different sites use different button implementations
            load_more_selectors = [
                "button:contains('LOAD MORE')",
                "button:contains('Load More')", 
                "button:contains('load more')",
                "a:contains('LOAD MORE')",
                "a:contains('Load More')",
                ".mdl-button--lg",
                "button.mdl-button",
                ".mh-search__more",
                "[class*='load']",
                "[class*='more']",
                "button[class*='primary']",
                "input[value*='more']"
            ]
            
            clicked = False
            # Iterate through each selector pattern to find clickable element
            for selector in load_more_selectors:
                try:
                    # Handle pseudo-selectors containing text content
                    if ":contains" in selector:
                        text = selector.split("'")[1]
                        xpath = f"//button[contains(text(), '{text}')] | //a[contains(text(), '{text}')]"
                        buttons = driver.find_elements(By.XPATH, xpath)
                    else:
                        buttons = driver.find_elements(By.CSS_SELECTOR, selector)
                    
                    # Attempt to click each found button
                    for button in buttons:
                        try:
                            if button.is_displayed():
                                # Scroll element into view before clicking
                                driver.execute_script("arguments[0].scrollIntoView();", button)
                                time.sleep(0.5)
                                
                                # Try standard click first, fallback to JavaScript click
                                try:
                                    button.click()
                                    clicked = True
                                    print(f"   Successfully clicked using {selector}")
                                    break
                                except:
                                    # JavaScript click as fallback
                                    try:
                                        driver.execute_script("arguments[0].click();", button)
                                        clicked = True
                                        print(f"   Successfully JS clicked using {selector}")
                                        break
                                    except:
                                        continue
                        except:
                            continue
                    if clicked:
                        break
                except:
                    continue
            
            # If no button found, try alternative navigation methods
            if not clicked:
                print(f"   No clickable button found - attempting alternative methods...")
                
                # Method 1a: Keyboard navigation approach
                # Some sites respond to keyboard navigation better than mouse clicks
                try:
                    body = driver.find_element(By.TAG_NAME, "body")
                    body.send_keys(Keys.END)  # Navigate to page bottom
                    time.sleep(1)
                    body.send_keys(Keys.TAB)  # Tab to next focusable element
                    body.send_keys(Keys.ENTER)  # Activate focused element
                    print("   Attempted keyboard navigation method")
                except:
                    pass
                
                # Method 1b: Direct JavaScript execution
                # Bypass Selenium limitations with direct DOM manipulation
                try:
                    driver.execute_script("""
                        var buttons = document.querySelectorAll('button, a, input');
                        for(var i = 0; i < buttons.length; i++) {
                            var text = buttons[i].textContent.toLowerCase();
                            if(text.includes('load') || text.includes('more')) {
                                buttons[i].click();
                                break;
                            }
                        }
                    """)
                    print("   Attempted direct JavaScript method")
                except:
                    pass
            
            # Wait for page response and verify if new content loaded
            time.sleep(3)
            new_props = len(driver.find_elements(By.CSS_SELECTOR, "article"))
            
            if new_props > current_props:
                print(f"   Success! Properties increased: {current_props} -> {new_props}")
            else:
                print(f"   No change detected: {new_props}")
                
                # If no progress for several attempts, switch methods
                if attempt >= 2:
                    print("Switching to Method 2: Direct pagination URLs")
                    break
        
        # Method 2: Direct URL pagination brute force
        # When Load More fails, try accessing pagination URLs directly
        print("\nMethod 2: Direct pagination URL access")
        
        # Common URL patterns for pagination
        base_urls = [
            "https://vistaoman.com/offer-type/sale/page/{}",
            "https://vistaoman.com/offer-type/sale/?page={}",
            "https://vistaoman.com/offer-type/sale/?p={}",
            "https://vistaoman.com/Properties-for-sale-rent/page/{}/?offer-type=sale"
        ]
        
        # Attempt to access pages 2-50 using different URL patterns
        for page_num in range(2, 50):
            print(f"Testing page {page_num}")
            
            for base_url in base_urls:
                try:
                    url = base_url.format(page_num)
                    print(f"   Accessing: {url}")
                    
                    driver.get(url)
                    time.sleep(3)
                    
                    # Verify page contains property listings
                    page_props = driver.find_elements(By.CSS_SELECTOR, "article")
                    if len(page_props) > 5:  # Threshold for valid property page
                        print(f"   Found {len(page_props)} properties on page {page_num}")
                        
                        # Extract property data from current page
                        for prop in page_props:
                            try:
                                text = prop.text.strip()
                                # Basic validation: check for minimum content and price indicator
                                if len(text) > 50 and 'OMR' in text:
                                    prop_data = extract_property_data(text, len(all_properties) + 1)
                                    if prop_data:
                                        all_properties.append(prop_data)
                            except:
                                continue
                        
                        print(f"   Total properties collected: {len(all_properties)}")
                        break
                        
                except Exception as e:
                    continue
            
            # Stop if target threshold reached
            if len(all_properties) >= 300:
                print(f"Target reached! Collected {len(all_properties)} properties")
                break
        
        # Method 3: API endpoint discovery
        # Attempt to find and use internal API endpoints
        if len(all_properties) < 100:
            print("\nMethod 3: API endpoint discovery")
            try:
                # Return to main page to analyze network traffic
                driver.get("https://vistaoman.com/offer-type/sale/")
                time.sleep(5)
                
                # Analyze browser performance logs for API calls
                logs = driver.get_log('performance')
                for log in logs:
                    message = log.get('message', {})
                    if 'Network.response' in str(message):
                        print(f"   Network request detected: {message}")
                        
            except:
                print("   API discovery method failed")
        
        print(f"\nScraping operation completed")
        print(f"Total properties extracted: {len(all_properties)}")
        
        # Method 4: Emergency fallback extraction
        # If all methods fail, perform basic text extraction from available content
        if len(all_properties) == 0:
            print("Method 4: Emergency content extraction")
            
            # Navigate back to main page for content extraction
            driver.get("https://vistaoman.com/offer-type/sale/")
            time.sleep(5)
            
            # Parse entire page source for property-related content
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')
            
            # Extract all text containing OMR prices using regex
            omr_matches = re.findall(r'(.*?)OMR\s*([\d,]+)(.*?)(?=OMR|$)', page_source, re.DOTALL | re.IGNORECASE)
            
            # Process each price match as potential property
            for i, (before, price, after) in enumerate(omr_matches[:100]):
                try:
                    # Create context around price for property data extraction
                    context = (before[-200:] + f"OMR {price}" + after[:200]).strip()
                    
                    if len(context) > 50:
                        prop_data = extract_property_data(context, i + 1)
                        if prop_data:
                            all_properties.append(prop_data)
                except:
                    continue
            
            print(f"Emergency extraction completed: {len(all_properties)} properties")
        
        # Save results regardless of extraction method success
        if all_properties:
            df = pd.DataFrame(all_properties)
            filename = f"vista_oman_properties_{len(all_properties)}_listings.csv"
            df.to_csv(filename, index=False, encoding='utf-8')
            print(f"Data saved to: {filename}")
            
            # Display extraction summary
            print(f"\nExtraction Summary:")
            print(f"   Total properties: {len(all_properties)}")
            print(f"   Properties with prices: {len([p for p in all_properties if 'OMR' in str(p.get('price', ''))])}")
            
            # Show sample of extracted data
            for i in range(min(3, len(all_properties))):
                prop = all_properties[i]
                print(f"\n--- Sample Property {i+1} ---")
                print(f"   Title: {prop.get('title', 'Not available')}")
                print(f"   Price: {prop.get('price', 'Not available')}")
                print(f"   Location: {prop.get('location', 'Not available')}")
            
            return df
        else:
            print("Extraction failed: No properties found")
            return None
            
    except Exception as e:
        print(f"Critical error occurred: {e}")
        return None
        
    finally:
        # Ensure browser cleanup regardless of success/failure
        try:
            driver.quit()
        except:
            pass

def extract_property_data(text, prop_id):
    """
    Extract structured property data from raw text content
    
    Args:
        text (str): Raw text containing property information
        prop_id (int): Unique identifier for the property
    
    Returns:
        dict: Structured property data or None if extraction fails
    """
    try:
        # Initialize property data structure
        data = {
            'id': prop_id,
            'title': 'Not available',
            'price': 'Not available', 
            'location': 'Not available',
            'bedrooms': 'Not available',
            'raw_text': text[:300]  # Store first 300 chars for debugging
        }
        
        # Extract price using regex pattern for OMR currency
        price_match = re.search(r'OMR\s*([\d,]+)', text, re.IGNORECASE)
        if price_match:
            data['price'] = f"OMR {price_match.group(1)}"
        
        # Extract location using common Oman area names
        location_match = re.search(r'(Al \w+|Muscat|Qurm|Khuwair|Bawshar|Madinat|Salalah|Ruwi)', text, re.IGNORECASE)
        if location_match:
            data['location'] = location_match.group(1)
        
        # Extract bedroom count using common patterns
        bed_match = re.search(r'(\d+)\s*(?:BR|bed|bedroom)', text, re.IGNORECASE)
        if bed_match:
            data['bedrooms'] = bed_match.group(1)
        
        # Extract title from the longest suitable line
        # Look for meaningful text that could serve as property title
        lines = text.split('\n')
        for line in lines:
            line = line.strip()
            # Filter for lines that could be titles (length and content criteria)
            if len(line) > 15 and len(line) < 150 and 'OMR' not in line:
                data['title'] = line
                break
        
        # Return data only if price was successfully extracted
        return data if data['price'] != 'Not available' else None
        
    except:
        return None

# Main execution block
if __name__ == "__main__":
    print("=== Vista Oman Property Scraper ===")
    print("Advanced multi-method property data extraction system")
    print("Target: Sale listings from vistaoman.com")
    print("========================================")
    
    # Execute the comprehensive scraping operation
    result = comprehensive_property_scraper()
    
    if result is not None:
        print("\n=== Scraping Operation Completed Successfully ===")
        print("Property data has been extracted and saved to CSV file")
        print("Check the generated CSV file for complete results")
    else:
        print("\n=== Scraping Operation Failed ===")
        print("Unable to extract property data - website may have restrictions")
        print("Consider reviewing target website structure or anti-bot measures")

=== Vista Oman Property Scraper ===
Advanced multi-method property data extraction system
Target: Sale listings from vistaoman.com
Initializing comprehensive property scraper...
Target: Vista Oman property listings - Sale category
Navigating to Vista Oman sale listings...
Method 1: Automated 'Load More' button clicking
Load attempt 1/100
   Current property count: 10
   Successfully clicked using .mdl-button--lg
   Success! Properties increased: 10 -> 20
Load attempt 2/100
   Current property count: 20
   Successfully clicked using button.mdl-button
   No change detected: 0
Load attempt 3/100
   Current property count: 0
   Successfully clicked using button.mdl-button
   Success! Properties increased: 0 -> 10
Load attempt 4/100
   Current property count: 10
   Successfully clicked using button.mdl-button
   No change detected: 10
Switching to Method 2: Direct pagination URLs

Method 2: Direct pagination URL access
Testing page 2
   Accessing: https://vistaoman.com/offer-type/sale/page/