In [None]:
import time
import pandas as pd
import requests
import logging
from datetime import datetime

# pip install shapely
from shapely.geometry import shape

# Logging setup: every run writes to its own timestamped log file and mirrors
# each record to the console.
log_filename = f"nominatim_lookup_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
_file_handler = logging.FileHandler(log_filename)
_console_handler = logging.StreamHandler()
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=[_file_handler, _console_handler],
)
logger = logging.getLogger(__name__)

# Start-of-run banner.
for _banner_line in ("="*80, "Starting Nominatim Street Centroid Lookup", "="*80):
    logger.info(_banner_line)

# Input street dimension and the enriched copy written at the end of the run.
IN_FILE = "../../../data/malta_street_dimension_final_v2.csv"
OUT_FILE = "../../../data/malta_street_dimension_with_centroids.csv"
logger.info(f"Input file: {IN_FILE}")
logger.info(f"Output file: {OUT_FILE}")

df = pd.read_csv(IN_FILE)
logger.info(f"Loaded {len(df)} street records")

# Only rows flagged canonical are looked up, one row per distinct
# (street_id, street, town_name), so street-name variants do not trigger
# additional lookups.
_is_canonical = df["is_canonical"].eq(True)
canon = df.loc[_is_canonical, ["street_id", "street", "town_name"]].drop_duplicates()
logger.info(f"Found {len(canon)} canonical streets to look up")

def nominatim_lookup(street, town, max_retries=3):
    """Look up a street via the Nominatim search API and return a coordinate.

    Sends a free-text query ``"<street>, <town>, Malta"`` and inspects only the
    first hit. When the hit carries a GeoJSON geometry, the centroid of that
    geometry is returned; otherwise (or if the geometry cannot be used) the
    hit's own ``lat``/``lon`` point is returned.

    Transient failures (HTTP 429, HTTP 5xx, connection errors and timeouts)
    are retried with exponential backoff. Other HTTP errors are not retried,
    since repeating a rejected request cannot succeed.

    Args:
        street: Street name used in the query string.
        town: Town name used in the query string.
        max_retries: Maximum number of request attempts.

    Returns:
        A ``(latitude, longitude, method)`` tuple, where ``method`` is
        ``"osm_centroid"`` or ``"osm_point"``, or ``None`` when Nominatim
        returns no results or the lookup fails. Request and parsing errors
        are logged and reported as ``None`` rather than raised.
    """
    # Ask for geometry where available
    params = {
        "q": f"{street}, {town}, Malta",
        "format": "jsonv2",
        "limit": 1,
        "polygon_geojson": 1
    }
    headers = {
        # Required by Nominatim usage policy: identify your app
        "User-Agent": "ICS5110-Assignment/1.0 (Malta Street Centroids)"
    }

    for attempt in range(max_retries):
        try:
            r = requests.get(
                "https://nominatim.openstreetmap.org/search",
                params=params,
                headers=headers,
                timeout=30,
            )
            r.raise_for_status()
            data = r.json()
            if not data:
                logger.warning(f"No results from Nominatim for: {street}, {town}")
                return None

            hit = data[0]
            # If geojson exists, compute centroid from geometry
            geojson = hit.get("geojson")
            if geojson:
                try:
                    c = shape(geojson).centroid
                    # An empty geometry has no usable centroid coordinates;
                    # fall through to the point location instead.
                    if not c.is_empty:
                        logger.debug(f"Found OSM centroid for {street}, {town}: ({c.y}, {c.x})")
                        return float(c.y), float(c.x), "osm_centroid"
                    logger.debug(f"Empty geometry in geojson for {street}, {town}")
                except Exception as e:
                    logger.debug(f"Failed to compute centroid from geojson for {street}, {town}: {e}")

            # Fallback: point location from Nominatim result
            logger.debug(f"Using OSM point location for {street}, {town}: ({hit['lat']}, {hit['lon']})")
            return float(hit["lat"]), float(hit["lon"]), "osm_point"
        except requests.exceptions.HTTPError as e:
            status = e.response.status_code if e.response is not None else None
            # Only rate limiting (429) and server-side errors (5xx) are worth
            # retrying; any other client error will fail the same way again.
            if status is not None and status != 429 and status < 500:
                logger.error(f"HTTPError for {street}, {town}: {status}. Not retrying.")
                return None
            failure = f"HTTPError for {street}, {town}: {status}"
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
            # Network-level problems are usually transient, so retry them too.
            failure = f"{type(e).__name__} for {street}, {town}: {e}"
        except Exception as e:
            logger.error(f"Unexpected error for {street}, {town}: {type(e).__name__}: {e}")
            return None

        # Reached only after a retryable failure.
        if attempt < max_retries - 1:
            # Exponential backoff between attempts: 1s, 2s, 4s, ...
            wait_time = 2 ** attempt
            logger.warning(f"{failure}. Retrying in {wait_time}s... (Attempt {attempt+1}/{max_retries})")
            time.sleep(wait_time)
        else:
            logger.error(f"Failed after {max_retries} attempts: {street}, {town}")
            return None

    # No attempt was made (max_retries <= 0).
    return None

# Column layout of the per-street lookup results. Declared explicitly so the
# results frame keeps its columns (including the merge key) even when there
# are no canonical streets to look up.
RESULT_COLUMNS = [
    "street_id",
    "street_latitude",
    "street_longitude",
    "street_location_method",
]

results = []
total = len(canon)
failed_count = 0
success_count = 0
not_found_count = 0

logger.info(f"Starting lookup of {total} canonical streets...")

for idx, (_, row) in enumerate(canon.iterrows(), 1):
    street_id, street, town = row["street_id"], row["street"], row["town_name"]
    logger.info(f"[{idx}/{total}] Looking up: {street}, {town}")
    try:
        res = nominatim_lookup(street, town)
        if res:
            lat, lon, method = res
            success_count += 1
            logger.info(f"  ✓ Found via {method}")
        else:
            # NOTE: nominatim_lookup logs its own request errors and returns
            # None for them, so those failures are counted here as well.
            lat, lon, method = None, None, "not_found"
            not_found_count += 1
            logger.warning("  ✗ Not found in Nominatim")
    except Exception as e:
        lat, lon, method = None, None, f"error:{type(e).__name__}"
        failed_count += 1
        logger.error(f"  ✗ Exception: {e}")

    results.append({
        "street_id": street_id,
        "street_latitude": lat,
        "street_longitude": lon,
        "street_location_method": method
    })

    # Be nice to Nominatim (rate limit); no need to wait after the last request
    if idx < total:
        time.sleep(1.5)

logger.info("="*80)
logger.info("Lookup complete!")
logger.info(f"  Successful: {success_count}")
logger.info(f"  Not found: {not_found_count}")
logger.info(f"  Failed: {failed_count}")
logger.info("="*80)

# Keep exactly one result per street_id so the left join below cannot
# multiply rows of the dimension. When a street_id was looked up more than
# once, prefer the first result that actually has coordinates.
best_by_id = {}
for rec in results:
    prev = best_by_id.get(rec["street_id"])
    if prev is None or (prev["street_latitude"] is None and rec["street_latitude"] is not None):
        best_by_id[rec["street_id"]] = rec
if len(best_by_id) < len(results):
    logger.warning(f"Collapsed {len(results) - len(best_by_id)} duplicate street_id result(s) before merging")

cent = pd.DataFrame(list(best_by_id.values()), columns=RESULT_COLUMNS)

# Join back to full dimension (so variants inherit the canonical street centroid)
out = df.merge(cent, on="street_id", how="left")

out.to_csv(OUT_FILE, index=False)
logger.info(f"Output written to: {OUT_FILE}")
logger.info(f"Log file saved to: {log_filename}")

print("\n" + "="*80)
print("SUMMARY")
print("="*80)
print(out[["street", "town_name", "street_latitude", "street_longitude", "street_location_method"]].head(20))
print("="*80)

Wrote: ../../../data/malta_street_dimension_with_centroids.csv
           street town_name street_latitude street_longitude  \
0  aldo moro road     marsa            None             None   
1  aldo moro road     marsa            None             None   
2  aldo moro road     marsa            None             None   
3  aldo moro road     marsa            None             None   
4  aldo moro road     marsa            None             None   

  street_location_method  
0        error:HTTPError  
1        error:HTTPError  
2        error:HTTPError  
3        error:HTTPError  
4        error:HTTPError  
