## 🗺️ Extract Address Information from OLX Listings

Let's scrape the OLX pages again to check if there's address information available on the listing pages.

In [1]:
# Step 1: Load URLs from the CSV file
print("=" * 80, "LOADING URLS FROM CSV FILE", "=" * 80, sep="\n")

import pandas as pd

csv_path = '../data/sarajevo_flats_olx.csv'

# Read the listings table; only its 'url' column is used in this cell.
df_urls = pd.read_csv(csv_path)

if 'url' not in df_urls.columns:
    # No URL column: report it and leave the list empty.
    print("\n‚ùå No 'url' column found in CSV file")
    urls_to_check = []
else:
    urls_to_check = df_urls['url'].tolist()
    print(f"\n‚úÖ Loaded {len(urls_to_check)} URLs from CSV file")
    print("\nSample URLs (first 5):")
    # Show a short numbered preview (1-based) of what was loaded.
    for i, url in enumerate(urls_to_check[:5], 1):
        print(f"  {i}. {url}")

print("\n" + "=" * 80)

LOADING URLS FROM CSV FILE

‚úÖ Loaded 1806 URLs from CSV file

Sample URLs (first 5):
  1. https://olx.ba/artikal/71698283
  2. https://olx.ba/artikal/71573692
  3. https://olx.ba/artikal/70756410
  4. https://olx.ba/artikal/71554658
  5. https://olx.ba/artikal/71474528


‚úÖ Loaded 1806 URLs from CSV file

Sample URLs (first 5):
  1. https://olx.ba/artikal/71698283
  2. https://olx.ba/artikal/71573692
  3. https://olx.ba/artikal/70756410
  4. https://olx.ba/artikal/71554658
  5. https://olx.ba/artikal/71474528



In [2]:
import time
import random
import re
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service


def clean_text(text):
    """Normalise whitespace in ``text``.

    Leading and trailing whitespace is dropped and every internal run of
    whitespace is collapsed to a single space. Falsy input (``None`` or an
    empty string) yields ``None``.
    """
    if text:
        # split() with no separator already discards surrounding whitespace.
        tokens = text.split()
        return ' '.join(tokens)
    return None


def _match_to_coords(match):
    """Turn a two-group regex match into a ``(lat, lon)`` float pair.

    Returns ``None`` when either captured group is not a valid float
    (the permissive ``[-\\d.]+`` patterns can capture tokens such as ``-``
    or ``1.2.3``).
    """
    try:
        return float(match.group(1)), float(match.group(2))
    except ValueError:
        return None


def extract_coordinates_from_google_maps(soup):
    """
    Extract latitude and longitude from map data embedded in a parsed page.

    The following sources are tried in order; the first usable pair wins:

    1. ``<a>`` links to google.com/maps containing ``ll=LAT,LON``
    2. ``<iframe>`` ``src`` values containing ``@LAT,LON,<zoom>z`` or
       ``!3dLAT!4dLON``
    3. ``<script>`` text containing a JS-style ``lat: X ... lng: Y`` pair
    4. ``<script>`` text containing a raw ``[LAT, LON]`` array whose values
       fall inside the Sarajevo bounding box (43.5-44.0 N, 18.0-18.7 E)

    Args:
        soup: Parsed document. Only ``find_all`` is called on it; the returned
            elements must provide ``.get(attr, default)`` and ``.string``.

    Returns:
        ``(lat, lon)`` as floats, or ``(None, None)`` when nothing usable is
        found or an unexpected error occurs (the error is printed).
    """
    try:
        # Method 1: direct Google Maps links (ll=)
        google_links = soup.find_all("a", href=re.compile(r"google\.com/maps"))
        for link in google_links:
            href = link.get("href", "")

            ll_match = re.search(r"ll=([-\d.]+),([-\d.]+)", href)
            coords = _match_to_coords(ll_match) if ll_match else None
            if coords is not None:
                print("  ‚úÖ Found coords via ll=")
                return coords

        # Method 2: iframe src (@lat,lon,zoom or !3d/!4d)
        iframes = soup.find_all("iframe")
        for iframe in iframes:
            src = iframe.get("src", "")

            at_match = re.search(r"@([-\d.]+),([-\d.]+),\d+z", src)
            coords = _match_to_coords(at_match) if at_match else None
            if coords is not None:
                print("  ‚úÖ Found coords via @lat,lon")
                return coords

            d_match = re.search(r"!3d([-\d.]+)!4d([-\d.]+)", src)
            coords = _match_to_coords(d_match) if d_match else None
            if coords is not None:
                print("  ‚úÖ Found coords via !3d !4d")
                return coords

        # Method 3: JavaScript objects (lat: X ... lng: Y).
        # NOTE: "." does not match newlines here, so both keys must sit on
        # the same line of the script text.
        scripts = soup.find_all("script")
        for script in scripts:
            if not script.string:
                continue

            js_match = re.search(
                r"lat[\"']?\s*:\s*([-\d.]+).*?lng[\"']?\s*:\s*([-\d.]+)",
                script.string
            )
            coords = _match_to_coords(js_match) if js_match else None
            if coords is not None:
                print("  ‚úÖ Found coords via JS object")
                return coords

        # Method 4: raw array format [lat, lon].
        # Every array in every script is examined (not just the first one),
        # because unrelated numeric arrays commonly precede the real pair.
        for script in scripts:
            if not script.string:
                continue

            for array_match in re.finditer(
                r"\[([-\d.]+)\s*,\s*([-\d.]+)\]", script.string
            ):
                coords = _match_to_coords(array_match)
                if coords is None:
                    continue
                lat, lon = coords

                # Only accept pairs inside the Sarajevo bounding box, since
                # a bare two-number array is otherwise too ambiguous.
                if 43.5 < lat < 44.0 and 18.0 < lon < 18.7:
                    print("  ‚úÖ Found coords via raw array")
                    return lat, lon

        return None, None

    except Exception as e:
        print(f"  ‚ö†Ô∏è Coordinate extraction error: {e}")
        return None, None


def _blank_address_info(url, location_details=None):
    """Return the result record for ``url`` with every finding unset."""
    return {
        "url": url,
        "address": None,
        "location_details": location_details,
        "map_available": False,
        "latitude": None,
        "longitude": None
    }


def extract_address_from_olx(url, driver, page_load_wait=3):
    """
    Extract address info + coordinates from an OLX listing page.

    Args:
        url: Listing URL to load.
        driver: Browser driver exposing ``get(url)`` and ``page_source``
            (a Selenium WebDriver in this notebook).
        page_load_wait: Seconds to sleep after ``driver.get`` before the page
            source is read. Defaults to 3, the previously hard-coded value.

    Returns:
        dict with keys ``url``, ``address``, ``location_details``,
        ``map_available``, ``latitude`` and ``longitude``. On any exception
        the error is printed and a record with all findings unset is
        returned, with ``location_details`` set to ``"Error: <message>"``.
    """
    try:
        driver.get(url)
        time.sleep(page_load_wait)

        soup = BeautifulSoup(driver.page_source, "lxml")

        address_info = _blank_address_info(url)

        # Coordinates. Compare against None explicitly: a truthiness test
        # would treat a legitimate 0.0 as "missing".
        lat, lon = extract_coordinates_from_google_maps(soup)
        if lat is not None and lon is not None:
            address_info["latitude"] = lat
            address_info["longitude"] = lon
            address_info["map_available"] = True
            print(f"  ‚úÖ Coordinates: {lat}, {lon}")

        # Address labels: take the first text node containing one of these
        # labels, then read the next h4/span/div after its parent element.
        address_labels = ["Adresa", "Lokacija", "Ulica", "Mjesto"]

        for label in address_labels:
            label_element = soup.find(string=lambda t: t and label in t)
            if label_element:
                parent = label_element.find_parent()
                if parent:
                    value = parent.find_next("h4") or parent.find_next("span") or parent.find_next("div")
                    if value:
                        address_info["address"] = clean_text(value.get_text())
                        address_info["location_details"] = f"Found via '{label}'"
                        break

        # Fallback 1: the location pill; embedded <svg> icons are removed
        # before its text is read.
        if not address_info["address"]:
            location_pill = soup.find("div", class_="btn-pill city")
            if location_pill:
                for svg in location_pill.find_all("svg"):
                    svg.decompose()
                address_info["address"] = clean_text(location_pill.get_text())
                address_info["location_details"] = "Found in location pill"

        # Map detection: without coordinates, any div with "map" in a class
        # name still counts as a map being present.
        if not address_info["map_available"]:
            map_divs = soup.find_all("div", class_=lambda x: x and "map" in x.lower())
            if map_divs:
                address_info["map_available"] = True

        # Fallback 2: og:street-address meta tag, normalised with clean_text
        # like the other address sources.
        if not address_info["address"]:
            meta = soup.find("meta", property="og:street-address")
            meta_address = clean_text(meta.get("content")) if meta else None
            if meta_address:
                address_info["address"] = meta_address
                address_info["location_details"] = "Found in meta tag"

        return address_info

    except Exception as e:
        print(f"  ‚ùå Error scraping {url}: {e}")
        return _blank_address_info(url, f"Error: {str(e)}")


# Announce that the scraper helpers above have been defined.
print(
    "=" * 80,
    "‚úÖ Google Maps Latitude/Longitude Scraper Ready",
    "=" * 80,
    sep="\n",
)


‚úÖ Google Maps Latitude/Longitude Scraper Ready

‚úÖ Google Maps Latitude/Longitude Scraper Ready


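### Scrape Coordinates for All Listings

The cell below runs the coordinate extraction over every URL in the input CSV using a small thread pool, and appends each result to the output CSV as soon as it is available.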

In [None]:
import pandas as pd
import time
import random
import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from selenium import webdriver
from selenium.webdriver.firefox.options import Options


# Input/output locations and the column that holds the listing URL.
INPUT_CSV = "../data/sarajevo_flats_olx.csv"
OUTPUT_CSV = "../data/sarajevo_flats_olx_with_coordinates.csv"
URL_COLUMN = "url"

# Thread-pool size and the bounds (in seconds) of the random per-task delay.
MAX_WORKERS = 4
DELAY_MIN = 1.5
DELAY_MAX = 3.5

# Held by workers while they append a row to OUTPUT_CSV.
csv_lock = threading.Lock()

df = pd.read_csv(INPUT_CSV)

# Fail fast if the expected URL column is absent.
if URL_COLUMN not in df.columns:
    raise ValueError(f"Column '{URL_COLUMN}' not found in CSV!")

if os.path.exists(OUTPUT_CSV):
    print("‚úÖ Output CSV already exists ‚Äî continuing...")
else:
    # Start the output file with a header row only; workers append to it.
    (
        pd.DataFrame(columns=["url", "latitude", "longitude"])
        .to_csv(OUTPUT_CSV, index=False)
    )
    print("‚úÖ Created new output CSV")



def create_driver():
    """Build and return a new Firefox WebDriver started with ``--headless``."""
    firefox_options = Options()
    firefox_options.add_argument("--headless")
    browser = webdriver.Firefox(options=firefox_options)
    return browser


def _append_result_row(url, latitude, longitude):
    """Append one ``url, latitude, longitude`` row to OUTPUT_CSV.

    The write happens while holding ``csv_lock`` so rows from different
    worker threads are not interleaved.
    """
    row = {
        "url": url,
        "latitude": latitude,
        "longitude": longitude
    }
    with csv_lock:
        pd.DataFrame([row]).to_csv(
            OUTPUT_CSV,
            mode="a",
            header=False,
            index=False
        )


def process_single_url(index, url):
    """
    Scrape one listing and append its coordinates to OUTPUT_CSV.

    A fresh browser is created for the call and always shut down afterwards.
    Any failure, including failure to start the browser, is printed and
    recorded as a row with empty latitude/longitude, so every processed URL
    appears in the output. The call ends with a random sleep between
    DELAY_MIN and DELAY_MAX seconds.

    Args:
        index: Number used only in log messages.
        url: Listing URL to scrape.
    """
    # Created inside the try block so a browser start-up failure is handled
    # by the same failure path as a scraping error.
    driver = None

    try:
        print(f"üîç [{index}] Processing ‚Üí {url}")

        driver = create_driver()
        result = extract_address_from_olx(url, driver)

        latitude = result.get("latitude")
        longitude = result.get("longitude")

        _append_result_row(url, latitude, longitude)

        print(f"‚úÖ [{index}] Saved ‚Üí {latitude}, {longitude}")

    except Exception as e:
        print(f"‚ùå [{index}] Failed ‚Üí {e}")

        # Save the failure too, so the URL is still present in the output.
        _append_result_row(url, None, None)

    finally:
        if driver is not None:
            try:
                driver.quit()
            except Exception as quit_error:
                # A failed shutdown must not skip the delay below.
                print(f"[{index}] Warning: driver.quit() failed: {quit_error}")

        # Anti-ban delay, applied per task in the worker thread.
        time.sleep(random.uniform(DELAY_MIN, DELAY_MAX))



print(f"\nStarting multithreaded scraping with {MAX_WORKERS} workers...\n")

# Resume support: URLs that already have a row in OUTPUT_CSV (from this or an
# earlier run) are not submitted again, so re-running the cell does not append
# duplicate rows.
already_done = set()
if os.path.exists(OUTPUT_CSV):
    already_done = set(pd.read_csv(OUTPUT_CSV, usecols=["url"])["url"].dropna())
    if already_done:
        print(f"Skipping {len(already_done)} URLs already present in {OUTPUT_CSV}")

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = []

    for index, url in zip(df.index, df[URL_COLUMN]):
        # Rows with a missing URL cannot be scraped.
        if pd.isna(url) or url in already_done:
            continue
        futures.append(
            executor.submit(process_single_url, index + 1, url)
        )

    for future in as_completed(futures):
        # result() re-raises anything that escaped the worker; report it
        # instead of silently dropping it.
        try:
            future.result()
        except Exception as e:
            print(f"‚ùå Worker raised an unhandled error ‚Üí {e}")


print("\n‚úÖ SCRAPING COMPLETE ‚Äî ALL THREADS FINISHED")


‚úÖ Created new output CSV

Starting multithreaded scraping with 4 workers...

üîç [2] Processing ‚Üí https://olx.ba/artikal/71573692
üîç [2] Processing ‚Üí https://olx.ba/artikal/71573692
üîç [3] Processing ‚Üí https://olx.ba/artikal/70756410
üîç [3] Processing ‚Üí https://olx.ba/artikal/70756410
üîç [1] Processing ‚Üí https://olx.ba/artikal/71698283
üîç [1] Processing ‚Üí https://olx.ba/artikal/71698283
üîç [4] Processing ‚Üí https://olx.ba/artikal/71554658
üîç [4] Processing ‚Üí https://olx.ba/artikal/71554658
‚úÖ [1] Saved ‚Üí None, None
  ‚úÖ Found coords via ll=
  ‚úÖ Coordinates: 43.714503, 18.288786
‚úÖ [4] Saved ‚Üí 43.714503, 18.288786
  ‚úÖ Found coords via ll=
  ‚úÖ Coordinates: 43.901206, 18.354117
‚úÖ [3] Saved ‚Üí 43.901206, 18.354117
  ‚úÖ Found coords via ll=
  ‚úÖ Coordinates: 43.850305, 18.37447
‚úÖ [2] Saved ‚Üí 43.850305, 18.37447
‚úÖ [1] Saved ‚Üí None, None
  ‚úÖ Found coords via ll=
  ‚úÖ Coordinates: 43.714503, 18.288786
‚úÖ [4] Saved ‚Üí 43.714503, 18.