## 🗺️ Extract Address Information from OLX Listings

Let's scrape the OLX pages again to check if there's address information available on the listing pages.

In [None]:
# Step 1: Load URLs from the CSV file
print("=" * 80)
print("LOADING URLS FROM CSV FILE")
print("=" * 80)

csv_path = '../data/sarajevo_flats_merged_olx_cleaned.csv'

# Read the CSV; only its 'url' column is used in this cell.
df_urls = pd.read_csv(csv_path)

if 'url' in df_urls.columns:
    urls_to_check = df_urls['url'].tolist()
    print(f"\n✅ Loaded {len(urls_to_check)} URLs from CSV file")
    # Show a short preview so the reader can sanity-check the loaded values.
    print("\nSample URLs (first 5):")
    for i, url in enumerate(urls_to_check[:5], 1):
        print(f"  {i}. {url}")
else:
    # Fall back to an empty list so `urls_to_check` is always defined.
    print("\n❌ No 'url' column found in CSV file")
    urls_to_check = []

print("\n" + "=" * 80)

: 

In [None]:
# Step 2: Create enhanced scraper to extract address information
import time
import random
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service

# Labels tried, in priority order, when looking for a labelled address field.
_ADDRESS_LABELS = ('Adresa', 'Lokacija', 'Ulica', 'Mjesto')

# Text inside these tags is not page content; a label found there (for
# example inside embedded JSON) must not anchor the value lookup.
_NON_CONTENT_TAGS = frozenset({'script', 'style', 'noscript', 'title'})


def _address_from_labels(soup):
    """Return ``(address, label)`` for the first usable labelled field, else ``(None, None)``."""
    for label in _ADDRESS_LABELS:
        candidates = soup.find_all(string=lambda text: bool(text) and label in text)
        label_node = next(
            (
                node
                for node in candidates
                if node.parent is not None
                and node.parent.name not in _NON_CONTENT_TAGS
            ),
            None,
        )
        if label_node is None:
            continue

        container = label_node.parent
        # NOTE: find_next() scans the rest of the document, so an <h4> anywhere
        # after the label is preferred over a closer <span> or <div>.
        value_tag = (
            container.find_next('h4')
            or container.find_next('span')
            or container.find_next('div')
        )
        if value_tag is None:
            continue

        address_text = clean_text(value_tag.get_text())
        # Very short values are ignored and the next label is tried instead.
        if address_text and len(address_text) > 3:
            return address_text, label

    return None, None


def _address_from_location_pill(soup):
    """Return the text of the ``div.btn-pill.city`` element, or ``None``."""
    # A CSS selector matches the two classes regardless of their order or of
    # extra classes; class_="btn-pill city" only matches that exact string.
    pill = soup.select_one('div.btn-pill.city')
    if pill is None:
        return None
    # Remove inline icons so they do not contribute to the extracted text.
    for svg in pill.find_all('svg'):
        svg.decompose()
    return clean_text(pill.get_text()) or None


def _has_map(soup):
    """Return True if any <iframe>/<div> has a class containing 'map'."""
    map_element = soup.find(
        ['iframe', 'div'],
        attrs={'class': lambda value: bool(value) and 'map' in value.lower()},
    )
    return map_element is not None


def _address_from_meta(soup):
    """Return the stripped ``og:street-address`` meta content, or ``None``."""
    meta_tag = soup.find('meta', attrs={'property': 'og:street-address'})
    if meta_tag is None:
        return None
    content = meta_tag.get('content')
    if not content:
        return None
    return content.strip() or None


def extract_address_from_olx(url: str, driver, *, wait_seconds=2) -> dict:
    """
    Extract address information from an OLX listing page.

    The page is loaded through ``driver`` and parsed with BeautifulSoup
    (``lxml`` parser). The address is taken from the first source that
    yields a value: a labelled field, then the location pill, then the
    ``og:street-address`` meta tag. Map presence is checked independently.

    Args:
        url: Address of the listing page to load.
        driver: Browser driver object; only ``get(url)`` and ``page_source``
            are used.
        wait_seconds: Seconds to sleep after ``driver.get`` before reading
            the page source. Defaults to 2.

    Returns:
        A dict with the keys ``'url'``, ``'address'``, ``'location_details'``
        and ``'map_available'``. On any exception the error is printed and
        the dict carries ``address=None`` and
        ``location_details="Error: <message>"``.
    """
    try:
        driver.get(url)
        time.sleep(wait_seconds)  # Wait for page to load

        soup = BeautifulSoup(driver.page_source, "lxml")

        address_info = {
            'url': url,
            'address': None,
            'location_details': None,
            'map_available': False,
        }

        # Method 1: labelled field such as "Adresa" or "Lokacija".
        address, label = _address_from_labels(soup)
        if address:
            address_info['address'] = address
            address_info['location_details'] = f"Found via '{label}' label"

        # Method 2: location pill, only when no labelled field was usable.
        if not address_info['address']:
            pill_text = _address_from_location_pill(soup)
            if pill_text:
                address_info['address'] = pill_text
                address_info['location_details'] = "Found in location pill"

        # Method 3: map presence is recorded whether or not an address was found.
        address_info['map_available'] = _has_map(soup)

        # Method 4: meta tag as the last resort.
        if not address_info['address']:
            meta_address = _address_from_meta(soup)
            if meta_address:
                address_info['address'] = meta_address
                address_info['location_details'] = "Found in meta tags"

        return address_info

    except Exception as e:
        # Deliberate best-effort boundary: one failing listing must not stop
        # the run, so the failure is reported in the returned record.
        print(f"  ❌ Error processing {url}: {e}")
        return {
            'url': url,
            'address': None,
            'location_details': f"Error: {e}",
            'map_available': False,
        }

# Confirm that the extraction function is defined and summarise what it does.
print("=" * 80)
print("ADDRESS EXTRACTION FUNCTION READY")
print("=" * 80)
print("\n✅ Created function: extract_address_from_olx()")
print("   This function will:")
print("   - Load each OLX listing page")
print("   - Search for address/location fields")
print("   - Check for map availability")
print("   - Return structured address information")
print("\n" + "=" * 80)