In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re 
import time

# 1. Define the base URL of the website
base_url = "https://hilalprp.com.om/"
# Initialize the current page URL to ONLY for-sale properties
current_page_url = "https://hilalprp.com.om/properties-search/?status=for-sale"

# List to store all scraped property data
all_properties_data = []

print(f"Starting to scrape ONLY FOR-SALE properties from: {current_page_url}")

# Function to verify if property is actually for sale
def is_property_for_sale(prop_soup):
    """Check if the property page confirms it's for sale"""
    page_text = prop_soup.get_text().lower()
    
    # Look for "for sale" indicators
    sale_indicators = ['for sale', 'forsale', 'sale price', 'price omr']
    rent_indicators = ['for rent', 'forrent', 'rent price', 'monthly rent', 'rental']
    
    has_sale = any(indicator in page_text for indicator in sale_indicators)
    has_rent = any(indicator in page_text for indicator in rent_indicators)
    
    # If it has rent indicators and no sale indicators, it's a rental
    if has_rent and not has_sale:
        return False
    
    return True  # Default to assuming it's for sale if from sale page

# Loop through pages - FIXED with correct URL format from start
page_counter = 1
max_pages = 7  # Exactly 7 pages as you mentioned

# Define the correct URL pattern for each page
def get_page_url(page_num):
    if page_num == 1:
        return "https://hilalprp.com.om/properties-search/?status=for-sale"
    else:
        return f"https://hilalprp.com.om/properties-search/page/{page_num}/?status=for-sale"

while page_counter <= max_pages:
    # Get the correct URL for this page
    current_page_url = get_page_url(page_counter)
    
    print(f"Scraping page {page_counter}: {current_page_url}")
    try:
        # Add headers to mimic a real browser
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(current_page_url, headers=headers)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching page {current_page_url}: {e}")
        break # Exit loop on request error
    
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Check if this page shows FOR-SALE properties (verify we're on the right page)
    page_text = soup.get_text()
    if "1 to 10 out of 63" in page_text:
        print(f"✅ Confirmed: Page {page_counter} shows '1 to 10 out of 63' - correct for-sale section")
    elif "properties" in page_text.lower() and "for sale" in page_text.lower():
        print(f"✅ Confirmed: Page {page_counter} contains for-sale property listings")
    else:
        print(f"⚠️ Warning: Page {page_counter} might not be the correct for-sale section")
    
    # Find property links on this page - ONLY from for-sale listings
    property_links = soup.find_all('a', href=True)
    property_urls = []
    
    # Extract property URLs - be more selective
    for link in property_links:
        href = link.get('href', '')
        if '/property/' in href:
            # Additional check: make sure the link is from a property listing context
            parent_context = str(link.parent) if link.parent else ""
            
            # Skip if the link seems to be from navigation or unrelated sections
            if any(skip_term in parent_context.lower() for skip_term in ['nav', 'menu', 'footer', 'header']):
                continue
                
            full_url = requests.compat.urljoin(base_url, href)
            if full_url not in property_urls:  # Avoid duplicates within same page
                property_urls.append(full_url)
    
    if not property_urls:
        print(f"⚠️ No property links found on page {page_counter}.")
        # Still try to go to next page in case this page had issues
    else:
        print(f"Found {len(property_urls)} properties on page {page_counter}")
    
    # Process properties on this page
    for property_url in property_urls:
        # Simple duplicate check - only check the last part of URL to avoid false positives
        property_id = property_url.split('/')[-2] if property_url.endswith('/') else property_url.split('/')[-1]
        already_processed = any(prop['URL_Page'].endswith(property_id) for prop in all_properties_data)
        
        if already_processed:
            print(f"  Skipping duplicate: {property_id}")
            continue
            
        try:
            print(f"  Visiting: {property_url}")
            prop_response = requests.get(property_url, headers=headers)
            prop_response.raise_for_status()
            prop_soup = BeautifulSoup(prop_response.text, 'html.parser')
            
            # IMPORTANT: Verify this property is actually for sale
            if not is_property_for_sale(prop_soup):
                print(f"    ❌ Skipping: Property appears to be for rent, not sale")
                continue
            
            # Extract property data (similar to tutor's title, price, rating extraction)
            
            # Title - look for main heading
            title = None
            title_selectors = ['h1.rh_page_title', 'h1', '.rh_page_property_title h1']
            for selector in title_selectors:
                title_elem = prop_soup.select_one(selector)
                if title_elem:
                    title = title_elem.get_text(strip=True)
                    break
            
            # Price - look for price element
            price = None
            price_elem = prop_soup.select_one('p.price, .price')
            if price_elem:
                price = price_elem.get_text(strip=True)
            
            # Bedrooms - look for bedroom text and associated numbers
            bedrooms = None
            page_text = prop_soup.get_text()
            
            # Method 1: Look for patterns like "Bedrooms 3" or "3 Bedrooms"
            bedroom_patterns = [
                r'Bedrooms\s+(\d+)',
                r'(\d+)\s+Bedrooms?',
                r'Bedroom[s]?\s*:?\s*(\d+)',
                r'(\d+)\s*-?\s*bedroom'
            ]
            
            for pattern in bedroom_patterns:
                match = re.search(pattern, page_text, re.IGNORECASE)
                if match:
                    bedrooms = match.group(1)
                    break
            
            # Method 2: Look in meta sections with specific class names
            if not bedrooms:
                bedroom_meta = prop_soup.find('div', class_=re.compile(r'prop_bedrooms|bedroom'))
                if bedroom_meta:
                    figure = bedroom_meta.find(['span', 'div'], class_='figure')
                    if figure:
                        bedrooms = figure.get_text(strip=True)
            
            # Bathrooms - similar approach to bedrooms
            bathrooms = None
            
            # Method 1: Look for patterns like "Bathrooms 3" or "3 Bathrooms"
            bathroom_patterns = [
                r'Bathrooms\s+(\d+)',
                r'(\d+)\s+Bathrooms?',
                r'Bathroom[s]?\s*:?\s*(\d+)',
                r'(\d+)\s*-?\s*bathroom'
            ]
            
            for pattern in bathroom_patterns:
                match = re.search(pattern, page_text, re.IGNORECASE)
                if match:
                    bathrooms = match.group(1)
                    break
            
            # Method 2: Look in meta sections
            if not bathrooms:
                bathroom_meta = prop_soup.find('div', class_=re.compile(r'prop_bathrooms|bathroom'))
                if bathroom_meta:
                    figure = bathroom_meta.find(['span', 'div'], class_='figure')
                    if figure:
                        bathrooms = figure.get_text(strip=True)
            
            # Area - look for area patterns in text
            area = None
            
            # Look for area patterns like "380 SQM" or "Area: 250"
            area_patterns = [
                r'(\d+)\s*(?:sq\.?\s*m|sqm|SQM)',
                r'Area\s*:?\s*(\d+)',
                r'Size\s*:?\s*(\d+)',
                r'(\d+)\s*square\s*meter'
            ]
            
            for pattern in area_patterns:
                match = re.search(pattern, page_text, re.IGNORECASE)
                if match:
                    area = match.group(1)
                    break
            
            # Location - extract from breadcrumb navigation (FIXED to get "Bausher")
            location = None
            
            # Method 1: Look for breadcrumb location 
            breadcrumb_selectors = [
                '.page-breadcrumbs a[href*="property-city"]',  # Direct breadcrumb link
                '.property-breadcrumbs a',
                'nav.property-breadcrumbs a',
                '.page-breadcrumbs-modern a'
            ]
            
            for selector in breadcrumb_selectors:
                breadcrumb_links = prop_soup.select(selector)
                for link in breadcrumb_links:
                    href = link.get('href', '')
                    # Look for city/location links in breadcrumbs
                    if 'property-city' in href or 'bausher' in href.lower():
                        location = link.get_text(strip=True)
                        break
                if location:
                    break
            
            # Method 2: Extract from breadcrumb text directly
            if not location:
                breadcrumb_nav = prop_soup.select_one('nav.property-breadcrumbs, .page-breadcrumbs')
                if breadcrumb_nav:
                    # Look for text that's not "Home" or property title
                    breadcrumb_text = breadcrumb_nav.get_text()
                    parts = breadcrumb_text.split('>')
                    for part in parts:
                        part = part.strip()
                        # Skip common breadcrumb parts, keep location names
                        if (part and part not in ['Home', 'Property', 'Properties'] and 
                            len(part) > 2 and not part.startswith('http')):
                            location = part
                            break
            
            # Method 3: Look for URL pattern to extract location
            if not location:
                # Extract from current URL if it contains location info
                url_parts = property_url.split('/')
                for part in url_parts:
                    # Common Oman location names
                    if any(loc.lower() in part.lower() for loc in 
                          ['bausher', 'al-mouj', 'azaiba', 'qurum', 'mawaleh', 'hail']):
                        location = part.replace('-', ' ').title()
                        break
            
            # Method 4: Fallback - extract from title if location keywords present
            if not location and title:
                location_match = re.search(r'IN\s*\(([^)]+)\)', title, re.IGNORECASE)
                if location_match:
                    location = location_match.group(1).strip()
            
            # Property Type - determine from title
            property_type = None
            if title:
                title_lower = title.lower()
                if any(word in title_lower for word in ['villa', 'house']):
                    property_type = 'Villa'
                elif any(word in title_lower for word in ['apartment', 'flat']):
                    property_type = 'Apartment'
                elif 'townhouse' in title_lower:
                    property_type = 'Townhouse'
                elif 'duplex' in title_lower:
                    property_type = 'Duplex'
                else:
                    property_type = 'Other'
            
            # Create property data dictionary 
            property_data = {
                'Title': title,
                'Price': price, 
                'Bedrooms': bedrooms,
                'Bathrooms': bathrooms,
                'Area': area,
                'Location': location,
                'Property_Type': property_type,
                'URL_Page': property_url
            }
            
            all_properties_data.append(property_data)
            print(f"    Extracted: {title}")
            
        except Exception as e:
            print(f"    Error processing {property_url}: {e}")
            continue
        
        # Add delay between property requests
        time.sleep(1)

    # Move to next page (FIXED to ensure exactly 7 pages)
    page_counter += 1
    
    if page_counter <= max_pages:
        # Construct next page URL manually to ensure we get all pages
        if page_counter == 2:
            # First pagination - from base URL to page 2
            current_page_url = "https://hilalprp.com.om/properties-search/?status=for-sale&page=2"
        else:
            # Subsequent pages
            current_page_url = f"https://hilalprp.com.om/properties-search/?status=for-sale&page={page_counter}"
        
        print(f"Moving to page {page_counter}: {current_page_url}")
        time.sleep(2)  # Pause between pages
    else:
        print(f"Completed all {max_pages} pages.")
        break

# Create DataFrame
if all_properties_data:
    df = pd.DataFrame(all_properties_data)
    print(f"\nScraping completed! Found {len(df)} properties.")
    print("\nDataFrame shape:", df.shape)
    print("\nFirst few rows:")
    print(df.head())
    
    # Save to CSV
    df.to_csv('hilal_properties.csv', index=False)
    print("\nData saved to 'hilal_properties.csv'")
else:
    print("No property data was scraped.")

Starting to scrape ONLY FOR-SALE properties from: https://hilalprp.com.om/properties-search/?status=for-sale
Scraping page 1: https://hilalprp.com.om/properties-search/?status=for-sale
✅ Confirmed: Page 1 contains for-sale property listings
Found 10 properties on page 1
  Visiting: https://hilalprp.com.om/property/3-bedroom-apartment-35/
    Extracted: 3-BEDROOM APARTMENT
  Visiting: https://hilalprp.com.om/property/3-bedroom-villa-3/
    Extracted: 3-BEDROOM VILLA
  Visiting: https://hilalprp.com.om/property/6-bedroom-twin-villa-13/
    Extracted: 6-BEDROOM TWIN VILLA
  Visiting: https://hilalprp.com.om/property/7-bedroom-detached-villa-9/
    Extracted: 7-BEDROOM DETACHED VILLA
  Visiting: https://hilalprp.com.om/property/4-bedroom-detached-villa-19/
    Extracted: 4-BEDROOM DETACHED VILLA
  Visiting: https://hilalprp.com.om/property/5-bedroom-twin-villa-21/
    Extracted: 5-BEDROOM TWIN VILLA
  Visiting: https://hilalprp.com.om/property/7-bedroom-twin-villa-3/
    Extracted: 7-BEDRO

In [5]:
df

Unnamed: 0,Title,Price,Bedrooms,Bathrooms,Area,Location,Property_Type,URL_Page
0,3-BEDROOM APARTMENT,"OMR45,000",3,3,2,Bausher,Apartment,https://hilalprp.com.om/property/3-bedroom-apa...
1,3-BEDROOM VILLA,"OMR290,000",3,4,35,Al Mawaleh,Villa,https://hilalprp.com.om/property/3-bedroom-vil...
2,6-BEDROOM TWIN VILLA,"OMR180,000",6,7,85,Bausher,Villa,https://hilalprp.com.om/property/6-bedroom-twi...
3,7-BEDROOM DETACHED VILLA,"OMR300,000",7,9,130,Al Ansab,Villa,https://hilalprp.com.om/property/7-bedroom-det...
4,4-BEDROOM DETACHED VILLA,"OMR80,000",4,6,300,Al Hail,Villa,https://hilalprp.com.om/property/4-bedroom-det...
...,...,...,...,...,...,...,...,...
58,6-BEDROOM DETACHED VILLA,"OMR300,000",6,5,180,Al Hail,Villa,https://hilalprp.com.om/property/6-bedroom-det...
59,3 BEDROOM TOWNHOUSE,"OMR80,000",3,3,290,Al Khoudh,Villa,https://hilalprp.com.om/property/3-bedroom-tow...
60,8 BEDROOM DETACHED VILLA IN (MAWALLEH),"OMR-320,000OMR290,000",6,8,670,Al Mawaleh,Villa,https://hilalprp.com.om/property/5-bedroom-det...
61,7 BEDROOM DETACHED VILLA IN (AL KHUWAIR),"OMR85,000",7,8,130,Al Khuwair,Villa,https://hilalprp.com.om/property/7-bedroom-det...


In [6]:
# Save with custom filename
df.to_csv('my_hilal_properties_data.csv', index=False)
print("Data saved to 'my_hilal_properties_data.csv'")

Data saved to 'my_hilal_properties_data.csv'


In [7]:
from IPython.display import FileLink
FileLink('hilal_properties.csv')

In [8]:
# Re-save with explicit UTF-8 encoding and proper CSV format
df.to_csv('hilal_properties_fixed.csv', index=False, encoding='utf-8-sig', sep=',')
print("Fixed CSV saved as 'hilal_properties_fixed.csv'")

Fixed CSV saved as 'hilal_properties_fixed.csv'


In [9]:
from IPython.display import FileLink
FileLink('hilal_properties_fixed.csv')