In [None]:
# Author: Matheus Gomes Correia
# License: CC-BY-NC-SA 4.0

# Section 1: Import libraries

In [1]:
import osmnx as ox
import pandas as pd
import geopandas as gpd
import numpy as np
import mapillary.interface as mly
import requests
import json
from shapely.geometry import Point, Polygon, LineString
import fiona
from packaging import version
import warnings
import os
from contextlib import redirect_stderr
import logging
from fiona.errors import DriverError
import re
import time
from collections import OrderedDict
import unicodedata
import glob
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import urllib.parse
import ast 
from dotenv import load_dotenv



In [2]:
# --- Filter out DeprecationWarnings ---
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Configure logging
#logging.basicConfig(level=logging.WARNING)
logging.basicConfig(level=logging.INFO)

In [3]:
# Temporal filter
existed_at = '2018-01-01 00:00:00'
existed_before = '2090-12-31 23:59:59'

way_tags = {
        "crossing": ["uncontrolled", "traffic_signals", "unmarked"],
        "highway": ["traffic_signals", "stop", "give_way"]
    }
traffic_sign_tags = {"traffic_sign": True}

In [4]:
# Initialize Nominatim geocoder
geolocator = Nominatim(user_agent="cav-assessment-matheus-correia")
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1) #rate limit

# --- Fallback Dictionary (Option 1) ---
place_to_country = {
    "nuku'alofa": "TO",
    "apia": "WS",
    "honiara": "SB",
    "reykjavik": "IS",
    "suva": "FJ",
    "national capital district": "PG",
    "wellington city": "NZ",
    "kigali": "RW",
    "oslo": "NO",
    "zurich": "CH",
    "san josé": "CR",
    "auckland": "NZ",
    "sofia": "BG",
    "belgrade": "RS",
    "montevideo": "UY",
    "calgary": "CA",
    "distrito de panama": "PA",
    "vienna": "AT",
    "dublin": "IE",
    "perth": "AU",
    "bucharest": "RO",
    "brussels": "BE",
    "la paz": "BO",
    "baku": "AZ",
    "doha": "QA",
    "beirut": "LB",
    "amsterdam": "NL",
    "brisbane": "AU",
    "vancouver": "CA",
    "ciudad de túnez": "TN",
    "caracas": "VE",
    "budapest": "HU",
    "tashkent": "UZ",
    "kyiv": "UA",
    "warsaw": "PL",
    "quito": "EC",
    "dakar": "SN",
    "lusaka": "ZM",
    "dubai": "AE",
    "algiers": "DZ",
    "casablanca": "MA",
    "kampala": "UG",
    "medellín": "CO",
    "berlin": "DE",
    "rome": "IT",
    "montreal": "CA",
    "amman": "JO",
    "san francisco": "US",
    "city of cape town": "ZA",
    "boston": "US",
    "accra": "GH",
    "monterrey": "MX",
    "nairobi": "KE",
    "melbourne": "AU",
    "guadalajara": "MX",
    "sydney": "AU",
    "alexandria": "EG",
    "colombo": "LK",
    "yangon": "MM",
    "singapore": "SG",
    "miami": "US",
    "toronto": "CA",
    "madrid": "ES",
    "santiago": "CL",
    "houston": "US",
    "riyadh governorate": "SA",
    "baghdad": "IQ",
    "dar es salaam": "TZ",
    "kuala lumpur": "MY",
    "chicago": "US",
    "tehran": "IR",
    "bangkok": "TH",
    "provincia de lima": "PE",
    "paris": "FR",
    "bogotá": "CO",
    "chennai": "IN",
    "ho chi minh city": "VN",
    "manila": "PH",
    "london": "GB",
    "buenos aires": "AR",
    "istanbul": "TR",
    "lagos": "NG",
    "karachi": "PK",
    "moscow": "RU",
    "los angeles": "US",
    "new york": "US",
    "mumbai": "IN",
    "beijing": "CN",
    "mexico city": "MX",
    "cairo": "EG",
    "município de são paulo": "BR",
    "dhaka": "BD",
    "shanghai": "CN",
    "delhi": "IN",
    "jakarta": "ID",
    "coimbra": "PT",
    "hamburg": "DE",
    "fortaleza": "BR",
    "tokyo": "JP"
}

# Section 2: Define Constants and File Paths

In [5]:
notebook_dir = os.path.abspath('')
PROJECT_ROOT = os.path.dirname(notebook_dir)
INPUT_DATA_DIR = os.path.join(PROJECT_ROOT, "input_data")
RESULTS_DIR = os.path.join(PROJECT_ROOT, "results")
DATA_DIR = os.path.join(PROJECT_ROOT, "data")

dotenv_path = os.path.join(PROJECT_ROOT, '.env')
load_dotenv()

MLY_ACCESS_TOKEN = os.getenv("MLY_ACCESS_TOKEN")

if not MLY_ACCESS_TOKEN:
    raise ValueError("MLY_ACCESS_TOKEN not found! Please create a .env file with your token.")

mly.set_access_token(MLY_ACCESS_TOKEN)

UTM_CRS = 'EPSG:3857'
BUFFER_DISTANCE = 10
OSM_TAG_COLS = [f'osm_tag_{i}' for i in range(1, 6)]

POINTS_CSV = os.path.join(INPUT_DATA_DIR, "points.csv")
TRAFFIC_SIGNS_CSV = os.path.join(INPUT_DATA_DIR, "traffic_signs.csv")
TRANSPORT_AGENCY_DATA = os.path.join(INPUT_DATA_DIR, "transport_agency.geojson")

# --- Initialize and Load Transport Agency Data ---
transport_agency_osm_df = None
transport_agency_data_exists = False

try:
    transport_agency_osm_df = gpd.read_file(TRANSPORT_AGENCY_DATA)
    if transport_agency_osm_df.crs != UTM_CRS:
        transport_agency_osm_df = transport_agency_osm_df.to_crs(UTM_CRS)
    transport_agency_osm_df.geometry = transport_agency_osm_df.geometry.buffer(0)
    transport_agency_data_exists = True
except (FileNotFoundError, DriverError) as e:
    logging.warning(f"Transport agency data file '{TRANSPORT_AGENCY_DATA}' not found. Proceeding without it. Error: {e}")

ERROR:fiona._env:/Users/Matheus/Library/CloudStorage/OneDrive-Pessoal/Estudos/2 - Pesquisas/3 - CITTA/1 - RAMCCAV/3 - Assessment of Physical Road Infrastructure/Codigos/input_data/transport_agency.geojson: No such file or directory


# Section 3: Helper Functions

In [6]:
def load_and_preprocess_csv(filepath, osm_tag_cols=OSM_TAG_COLS):
    """Loads a CSV, handles NaN in OSM tag columns, and converts them to strings."""
    try:
        df = pd.read_csv(filepath, sep=';')
        for col in osm_tag_cols:
            if col in df.columns:
                df[col] = df[col].astype(str).replace('nan', '')
        return df
    except FileNotFoundError:
        print(f"Error: File '{filepath}' not found. Creating an empty DataFrame.")
        return pd.DataFrame(columns=osm_tag_cols)

In [7]:
def load_mappings():
    # Load global Mapillary -> OSM mappings (points.csv and traffic_signs.csv)
    mapillary_points_df = load_and_preprocess_csv(POINTS_CSV)
    mapillary_traffic_signs_df = load_and_preprocess_csv(TRAFFIC_SIGNS_CSV)

    # Concatenate and handle NaNs
    mapillary_osm_mapping = pd.concat([mapillary_points_df, mapillary_traffic_signs_df], ignore_index=True).dropna(subset=OSM_TAG_COLS, how='all')
    mapillary_osm_mapping['mapillary_feature'] = mapillary_osm_mapping['mapillary_feature'].astype(str)
    mapillary_osm_mapping = mapillary_osm_mapping.fillna('')  # Replace any remaining NaN with empty string

    # Load country-specific mappings
    country_mappings = {}
    country_signs_path_pattern = os.path.join(INPUT_DATA_DIR, "countries_signs", "*_traffic_signs.csv")
    for filename in glob.glob(country_signs_path_pattern):
        country_code = os.path.basename(filename).split("_")[0]
        try:
            # Specify the delimiter as a semicolon!
            country_df = pd.read_csv(filename, encoding='utf-8', sep=';')
            country_mappings[country_code] = country_df
        except UnicodeDecodeError:
            try:
                # If utf-8 fails, try latin-1
                country_df = pd.read_csv(filename, encoding='latin-1', sep=';')
                country_mappings[country_code] = country_df
            except Exception as e:
                print(f"Error reading {filename}: {e}")
                continue
        except Exception as e: #Catch other exceptions
            print(f"Error reading {filename}: {e}")
            continue

    return mapillary_osm_mapping, country_mappings

In [8]:
# --- sanitize_filename function ---
def sanitize_filename(name):
    """Sanitizes a string for use as a filename."""
    name = name.split(',')[0].strip()  # Keep only the part before the first comma
    name = name.lower()
    name = unicodedata.normalize('NFKD', name).encode('ascii', 'ignore').decode('ascii')
    name = re.sub(r'[^\w\s-]', '', name).strip()
    name = re.sub(r'[-\s]+', '_', name)
    return name

In [9]:
# --- create_city_directories function ---
def create_city_directories(base_dir, city_id):
    """Creates the directory structure for a city."""
    city_dir = os.path.join(base_dir, city_id)
    os.makedirs(city_dir, exist_ok=True)  # Create city directory

    for subdir in ["raw", "OSM", "Mapillary", "missing", "combined"]:
        os.makedirs(os.path.join(city_dir, subdir), exist_ok=True)

In [10]:
def convert_lists_to_strings(gdf):
    """Converts list-type columns in a GeoDataFrame to strings. Prints warnings for unsupported data types."""
    for col in gdf.columns:
        if any(isinstance(val, list) for val in gdf[col]):
            print(f"Converting list-type column '{col}' to string.")
            gdf[col] = gdf[col].apply(lambda x: ', '.join(map(str, x)) if isinstance(x, list) else x)
        elif any(isinstance(val, (dict, set)) for val in gdf[col]):
            print(f"Warning: Column '{col}' contains unsupported data types (dict/set). Skipping.")
    return gdf

In [11]:
def create_schema(gdf):
    """
    Creates a Fiona schema from a GeoDataFrame.  Handles empty GeoDataFrames
    correctly and dynamically determines geometry type.
    """
    if gdf.empty:
        return {
            'geometry': 'Point',
            'properties': {'id': 'int'}
        }

    # Get GeoDataFrame's geometry type.
    # Correctly check for empty geometry series

    try:
        geom_type = gdf.geometry.geom_type[0]
    except (KeyError, IndexError, AttributeError): #handle errors when accessing geom_type
        geom_type = 'Point' # Default

    schema = {
        'geometry': geom_type,
        'properties': OrderedDict()  # Use OrderedDict to preserve column order
    }
    for col_name, col_type in gdf.dtypes.items():
        if col_name == 'geometry':
            continue  # Geometry is handled separately

        # Convert pandas/numpy types to Fiona types
        if col_type == 'object':
            fiona_type = 'str'
        elif col_type == 'int64':
            fiona_type = 'int'
        elif col_type == 'float64':
            fiona_type = 'float'
        elif col_type == 'bool':
            fiona_type = 'bool'
        else:
            fiona_type = 'str'  # Default to string for unknown types.
            print(f"Warning: Unknown column type {col_type} for column {col_name}.  Using 'str'.")

        schema['properties'][col_name] = fiona_type

    return schema

In [12]:
def load_data(processed_filepath, raw_filepath, download_func, *download_args):
    """
    Attempts to load data with prioritized reading (processed, raw, download).
    Saves an empty file if download returns no features.
    Renames 'index_left' and 'index_right' columns.
    """
    try:
        # 1. Try to read processed data
        logging.info(f"Attempting to read processed data from: {processed_filepath}")
        data = gpd.read_file(processed_filepath, driver='GeoJSON')
        # Rename columns if they exist
        if 'index_left' in data.columns:
            data = data.rename(columns={'index_left': 'orig_index_left'})
        if 'index_right' in data.columns:
            data = data.rename(columns={'index_right': 'orig_index_right'})
        return data
    except (FileNotFoundError, DriverError):
        logging.warning(f"Processed data file not found or unreadable: {processed_filepath}")
        try:
            # 2. Try to read raw data
            logging.info(f"Attempting to read raw data from: {raw_filepath}")
            data = gpd.read_file(raw_filepath, driver='GeoJSON')
            # Rename columns if they exist
            if 'index_left' in data.columns:
                data = data.rename(columns={'index_left': 'orig_index_left'})
            if 'index_right' in data.columns:
                data = data.rename(columns={'index_right': 'orig_index_right'})

            return data
        except (FileNotFoundError, DriverError):
            logging.warning(f"Raw data file not found or unreadable: {raw_filepath}")
            try:
                # 3. Download data
                logging.info(f"Downloading data...")
                data = download_func(*download_args)

                if not data.empty:
                    # Save downloaded data to raw file
                    logging.info(f"Saving {len(data)} downloaded raw features to: {raw_filepath}") # Log count
                    # Use a copy to avoid modifying original 'data' before list conversion
                    data_to_save_raw = convert_lists_to_strings(data.copy())
                    try:
                         # Let GeoPandas/Fiona infer schema for raw GeoJSON, which can handle mixed types better.
                         # Do NOT provide explicit schema here for non-empty raw data.
                         data_to_save_raw.to_file(raw_filepath, driver="GeoJSON")
                         logging.info(f"Successfully saved raw data to: {raw_filepath}")
                    except Exception as save_err:
                         # Log the error, potentially including the geometry types present
                         geom_types_present = data_to_save_raw.geom_type.unique() if not data_to_save_raw.empty else []
                         logging.error(f"!!! Failed to save raw data to {raw_filepath}. Geometry types present: {geom_types_present}. Error: {save_err}")
                         # Decide if you want to return empty or raise error
                         return gpd.GeoDataFrame(geometry=[], crs=UTM_CRS)
                else:
                    # Save an EMPTY GeoDataFrame - Here it is needed a schema
                    logging.warning(f"No features found during download. Saving empty raw file to: {raw_filepath}")
                    data = gpd.GeoDataFrame(geometry=[], crs=UTM_CRS) # Use data here (which is now empty GDF)
                    try:
                        # Create schema for empty GDF (required by Fiona)
                        schema = create_schema(data)
                        data.to_file(raw_filepath, driver="GeoJSON", schema=schema)
                        logging.info(f"Successfully saved empty raw file to: {raw_filepath}")
                    except Exception as save_err:
                         logging.error(f"!!! Failed to save empty raw data file to {raw_filepath}. Error: {save_err}")
                         # Decide if you want to return empty or raise error
                         return gpd.GeoDataFrame(geometry=[], crs=UTM_CRS)

                #Rename columns:
                if 'index_left' in data.columns:
                    data = data.rename(columns={'index_left': 'orig_index_left'})
                if 'index_right' in data.columns:
                    data = data.rename(columns={'index_right': 'orig_index_right'})

                return data
            except Exception as e:
                logging.error(f"Error during download or raw save: {e}")
                return gpd.GeoDataFrame(geometry=[], crs=UTM_CRS)  # Return empty GeoDataFrame on failure

In [13]:
def download_osm_data(tags, polygon, crs=UTM_CRS):
    """Downloads OSM data based on tags and polygon, projects to UTM, and preprocesses."""
    osm_data = gpd.GeoDataFrame()
    for tag in tags:
        if "=" in tag:
            key, value = tag.split("=")
            try:
                osm_data_part = ox.geometries_from_polygon(polygon, tags={key: value})
                osm_data = pd.concat([osm_data, osm_data_part])
            except ox._errors.EmptyOverpassResponse:
                print(f"Warning: No OSM data found for tag {tag} in the specified area.")
                continue  # Skip to the next tag

    if osm_data.empty:
        print("Warning: No OSM data found for any of the specified tags.")
        return gpd.GeoDataFrame(geometry=[], crs=crs)  # Return empty but valid GeoDataFrame

    osm_data = osm_data.to_crs(crs)
    osm_data = osm_data.reset_index(drop=True)
    return osm_data

In [14]:
def download_mapillary_data(bbox, filter_values, country_code, country_mappings, existed_at=None, existed_before=None, crs=UTM_CRS):
    """
    Downloads Mapillary data and filters it locally.
    """
    print(f"download_mapillary_data called with filter_values: {filter_values}")
    
    # --- Prepare Temporal Filters ---
    temporal_filters = {}
    if existed_at:
        temporal_filters['existed_at'] = existed_at
    if existed_before:
        temporal_filters['existed_before'] = existed_before

    # --- Download ALL traffic signs and points ---
    try:
        print(f"Calling Mapillary API with bbox: {bbox}")
        data_traffic_signs = json.loads(
            mly.traffic_signs_in_bbox(bbox, **temporal_filters)  # NO filter_values here
        )
        data_points = json.loads(
            mly.map_feature_points_in_bbox(bbox, **temporal_filters)  # NO filter_values here
        )
        features = data_traffic_signs.get('features', []) + data_points.get('features', [])

    except Exception as e:
        print(f"Error in Mapillary API calls: {e}")
        return gpd.GeoDataFrame(crs=crs)

    if not features:
        print("No features returned from Mapillary API calls.")
        return gpd.GeoDataFrame(crs=crs)

    mapillary_data_gdf = gpd.GeoDataFrame.from_features(features, crs='EPSG:4326')

    # --- Deduplicate Columns ---
    if not mapillary_data_gdf.empty:
        new_cols = []
        seen_cols = set()
        for col in mapillary_data_gdf.columns:
            if col in seen_cols:
                i = 1
                new_col = f"{col}_{i}"
                while new_col in seen_cols:
                    i += 1
                    new_col = f"{col}_{i}"
                new_cols.append(new_col)
            else:
                new_cols.append(col)
            seen_cols.add(new_cols[-1])
        mapillary_data_gdf.columns = new_cols
        mapillary_data_gdf = mapillary_data_gdf.to_crs(crs)

        # --- *NOW* Filter the DataFrame ---
        if 'value' in mapillary_data_gdf.columns:  # Check if 'value' exists
             mapillary_data_gdf = mapillary_data_gdf[mapillary_data_gdf['value'].isin(filter_values)]
        else:
             return gpd.GeoDataFrame(crs=crs) #Return empty if value column does not exist


    return mapillary_data_gdf

In [15]:
def filter_mapillary_data(mapillary_data_gdf, polygon):
    """
    Filters Mapillary data to retain only items within the original search area (polygon).
    """
    if not mapillary_data_gdf.empty:
        # Ensure both GeoDataFrames are in the same CRS
        mapillary_data_gdf = mapillary_data_gdf.to_crs(polygon.crs)

        # --- Rename columns BEFORE sjoin ---
        if 'index_left' in mapillary_data_gdf.columns:
            mapillary_data_gdf = mapillary_data_gdf.rename(columns={'index_left': 'orig_index_left'})
        if 'index_right' in mapillary_data_gdf.columns:
            mapillary_data_gdf = mapillary_data_gdf.rename(columns={'index_right': 'orig_index_right'})
        if 'index_left' in polygon.columns:
            polygon = polygon.rename(columns={'index_left': 'orig_index_left'})
        if 'index_right' in polygon.columns:
            polygon = polygon.rename(columns={'index_right': 'orig_index_right'})

        # Perform spatial join to filter items within the polygon
        filtered_data = gpd.sjoin(mapillary_data_gdf, polygon, predicate="within", how = 'inner')
        return filtered_data
    return gpd.GeoDataFrame()

In [16]:
def analyze_overlap_2dfs(df2, df3, properties_to_keep=None, buffer_distance=BUFFER_DISTANCE):
    """
    Analyzes overlap (2 DataFrames), with detailed debugging for 'highway'.
    """
    def combine_dataframes(df_base, df_add, properties_to_keep):

        # --- Explicitly handle 'highway' ---
        df_add['highway'] = ''
        for osm_tag_col in OSM_TAG_COLS:
            if osm_tag_col in df_add.columns:
                df_add['highway'] = np.where(
                    (df_add[osm_tag_col].notna()) & (df_add[osm_tag_col].str.startswith('highway=')),
                    df_add[osm_tag_col].str.replace('highway=', '', 1),
                    df_add['highway']
                )
    
        exclude_cols = df_add.columns.intersection(['index_left', 'index_right'])
    
        # --- Prepare df_add_processed ---
        if properties_to_keep is None:
            cols_to_add = df_add.columns.difference(df_base.columns).union(['geometry'])
            cols_to_add = cols_to_add.difference(exclude_cols)
             # Ensure geometry column is present if df_add is a GeoDataFrame
            if 'geometry' not in cols_to_add and isinstance(df_add, gpd.GeoDataFrame) and 'geometry' in df_add.columns:
                cols_to_add = cols_to_add.union(['geometry'])
            df_add_processed = df_add[list(cols_to_add)].copy() # Use list for explicit column order
        else:
            keep_cols = [col for col in properties_to_keep if col in df_add.columns]
            # Ensure geometry column is present if df_add is a GeoDataFrame
            if 'geometry' not in keep_cols and isinstance(df_add, gpd.GeoDataFrame) and 'geometry' in df_add.columns:
                keep_cols.append('geometry')
            df_add_processed = df_add[keep_cols].copy()
            df_add_processed = df_add_processed.drop(columns=exclude_cols.intersection(df_add_processed.columns), errors='ignore')
    
        # --- Ensure Both are GeoDataFrames with Geometry (if possible) ---
        base_is_gdf = isinstance(df_base, gpd.GeoDataFrame) and 'geometry' in df_base.columns
        add_is_gdf = isinstance(df_add_processed, gpd.GeoDataFrame) and 'geometry' in df_add_processed.columns
    
        if base_is_gdf and add_is_gdf:
            base_crs = df_base.crs # Store CRS
    
            # --- Align CRS BEFORE concatenation ---
            if df_base.crs != df_add_processed.crs:
                print(f"Aligning df_add_processed CRS from {df_add_processed.crs} to {base_crs}")
                df_add_processed = df_add_processed.to_crs(base_crs)
    
            # --- Concatenate ---
            try:
                # Use sort=False to prevent potential column reordering issues
                combined_df = pd.concat([df_base, df_add_processed], ignore_index=True, sort=False)
    
                # --- Create GeoDataFrame after concat ---
                print("Creating GeoDataFrame after concat...")
                # Ensure the geometry column exists before creating GeoDataFrame
                if 'geometry' in combined_df.columns:
                    combined_gdf = gpd.GeoDataFrame(combined_df, geometry='geometry', crs=base_crs) # Use base CRS
                    print("----- Exiting combine_dataframes (GeoDataFrame) -----")
                    return combined_gdf
                else:
                    print("Warning: Geometry column lost during concat. Returning DataFrame.")
                    print("----- Exiting combine_dataframes (DataFrame - No Geometry) -----")
                    return combined_df # Return as DataFrame if geometry is lost
    
            except Exception as e:
                print(f"Error during GeoDataFrame creation after concat: {e}. Returning DataFrame.")
                # Fallback to regular DataFrame, ensure geometry is dropped if it caused issues
                combined_df = pd.concat([df_base, df_add_processed], ignore_index=True, sort=False)
                if 'geometry' in combined_df.columns:
                     combined_df = combined_df.drop(columns=['geometry'], errors='ignore')
                print("----- Exiting combine_dataframes (DataFrame - Fallback) -----")
                return combined_df
    
        # --- Fallback cases if inputs are not GDFs ---
        else:
             print("Warning: One or both inputs are not GeoDataFrames with geometry. Concatenating attributes only.")
             df_base_attrs = df_base.drop(columns=['geometry'], errors='ignore') if base_is_gdf else df_base
             df_add_processed_attrs = df_add_processed.drop(columns=['geometry'], errors='ignore') if add_is_gdf else df_add_processed
             combined_attrs = pd.concat([df_base_attrs, df_add_processed_attrs], ignore_index=True, sort=False)
             print("----- Exiting combine_dataframes (DataFrame - Attrs Only) -----")
             return combined_attrs

    for df in [df2, df3]:
        df.drop(columns=['index_left', 'index_right'], inplace=True, errors='ignore')
    df2 = df2.drop_duplicates(subset='geometry').reset_index(drop=True)
    df3 = df3.drop_duplicates(subset='geometry').reset_index(drop=True)
    df2_buffered = df2.copy()
    df3_buffered = df3.copy()
    df2_buffered.geometry = df2_buffered.geometry.buffer(buffer_distance)
    df3_buffered.geometry = df3_buffered.geometry.buffer(buffer_distance)
    overlap_2_3_buffered = gpd.sjoin(df3_buffered, df2_buffered, how="inner", predicate="intersects")
    overlap_2_3 = df3.loc[overlap_2_3_buffered.index].copy()
    overlap_2_3 = overlap_2_3.merge(overlap_2_3_buffered.drop(columns='geometry'), left_index=True, right_index=True)
    missing_indices = df3.index.difference(overlap_2_3.index)
    missing_2_3 = df3.loc[missing_indices].copy()

    df2['source'] = 'OSM'
    df3['source'] = 'Mapillary'
    overlap_2_3['source'] = df3['source']
    missing_2_3['source'] = df3['source']
    if 'highway' in df2.columns: #Keep highway if it already exists
        df2['highway'] = df2['highway']


    combined_2_3 = combine_dataframes(df2, missing_2_3, properties_to_keep)
    print("--- combined_2_3 (first 5 rows):\n", combined_2_3.head())  # Debug print
    print("--- combined_2_3 dtypes:\n", combined_2_3.dtypes)  # Debug print
    return combined_2_3, overlap_2_3, missing_2_3

In [17]:
def analyze_overlap_3dfs(df1, df2, df3, properties_to_keep=None, buffer_distance=BUFFER_DISTANCE):
    """
    Analyzes overlap (3 DataFrames), with detailed debugging for 'highway'.
    """
    def combine_dataframes(df_base, df_add, properties_to_keep):

        # --- Explicitly handle 'highway' ---
        df_add['highway'] = ''
        for osm_tag_col in OSM_TAG_COLS:
            if osm_tag_col in df_add.columns:
                df_add['highway'] = np.where(
                    (df_add[osm_tag_col].notna()) & (df_add[osm_tag_col].str.startswith('highway=')),
                    df_add[osm_tag_col].str.replace('highway=', '', 1),
                    df_add['highway']
                )
    
        exclude_cols = df_add.columns.intersection(['index_left', 'index_right'])
    
        # --- Prepare df_add_processed ---
        if properties_to_keep is None:
            cols_to_add = df_add.columns.difference(df_base.columns).union(['geometry'])
            cols_to_add = cols_to_add.difference(exclude_cols)
             # Ensure geometry column is present if df_add is a GeoDataFrame
            if 'geometry' not in cols_to_add and isinstance(df_add, gpd.GeoDataFrame) and 'geometry' in df_add.columns:
                cols_to_add = cols_to_add.union(['geometry'])
            df_add_processed = df_add[list(cols_to_add)].copy() # Use list for explicit column order
        else:
            keep_cols = [col for col in properties_to_keep if col in df_add.columns]
            # Ensure geometry column is present if df_add is a GeoDataFrame
            if 'geometry' not in keep_cols and isinstance(df_add, gpd.GeoDataFrame) and 'geometry' in df_add.columns:
                keep_cols.append('geometry')
            df_add_processed = df_add[keep_cols].copy()
            df_add_processed = df_add_processed.drop(columns=exclude_cols.intersection(df_add_processed.columns), errors='ignore')
    
        # --- Ensure Both are GeoDataFrames with Geometry (if possible) ---
        base_is_gdf = isinstance(df_base, gpd.GeoDataFrame) and 'geometry' in df_base.columns
        add_is_gdf = isinstance(df_add_processed, gpd.GeoDataFrame) and 'geometry' in df_add_processed.columns
    
        if base_is_gdf and add_is_gdf:
            base_crs = df_base.crs # Store CRS
    
            # --- Align CRS BEFORE concatenation ---
            if df_base.crs != df_add_processed.crs:
                print(f"Aligning df_add_processed CRS from {df_add_processed.crs} to {base_crs}")
                df_add_processed = df_add_processed.to_crs(base_crs)
    
            # --- Concatenate ---
            try:
                # Use sort=False to prevent potential column reordering issues
                combined_df = pd.concat([df_base, df_add_processed], ignore_index=True, sort=False)
    
                # --- Create GeoDataFrame after concat ---
                print("Creating GeoDataFrame after concat...")
                # Ensure the geometry column exists before creating GeoDataFrame
                if 'geometry' in combined_df.columns:
                    combined_gdf = gpd.GeoDataFrame(combined_df, geometry='geometry', crs=base_crs) # Use base CRS
                    print("----- Exiting combine_dataframes (GeoDataFrame) -----")
                    return combined_gdf
                else:
                    print("Warning: Geometry column lost during concat. Returning DataFrame.")
                    print("----- Exiting combine_dataframes (DataFrame - No Geometry) -----")
                    return combined_df # Return as DataFrame if geometry is lost
    
            except Exception as e:
                print(f"Error during GeoDataFrame creation after concat: {e}. Returning DataFrame.")
                # Fallback to regular DataFrame, ensure geometry is dropped if it caused issues
                combined_df = pd.concat([df_base, df_add_processed], ignore_index=True, sort=False)
                if 'geometry' in combined_df.columns:
                     combined_df = combined_df.drop(columns=['geometry'], errors='ignore')
                print("----- Exiting combine_dataframes (DataFrame - Fallback) -----")
                return combined_df
    
        # --- Fallback cases if inputs are not GDFs ---
        else:
             print("Warning: One or both inputs are not GeoDataFrames with geometry. Concatenating attributes only.")
             df_base_attrs = df_base.drop(columns=['geometry'], errors='ignore') if base_is_gdf else df_base
             df_add_processed_attrs = df_add_processed.drop(columns=['geometry'], errors='ignore') if add_is_gdf else df_add_processed
             combined_attrs = pd.concat([df_base_attrs, df_add_processed_attrs], ignore_index=True, sort=False)
             print("----- Exiting combine_dataframes (DataFrame - Attrs Only) -----")
             return combined_attrs

    for df in [df1, df2, df3]:
        df.drop(columns=['index_left', 'index_right'], inplace=True, errors='ignore')

    df1 = df1.drop_duplicates(subset='geometry').reset_index(drop=True)
    df2 = df2.drop_duplicates(subset='geometry').reset_index(drop=True)
    df3 = df3.drop_duplicates(subset='geometry').reset_index(drop=True) #Add this

    df1_buffered = df1.copy()
    df2_buffered = df2.copy()
    df3_buffered = df3.copy()
    df1_buffered.geometry = df1_buffered.geometry.buffer(buffer_distance)
    df2_buffered.geometry = df2_buffered.geometry.buffer(buffer_distance)
    df3_buffered.geometry = df3_buffered.geometry.buffer(buffer_distance)

    overlap_1_2_buffered = gpd.sjoin(df2_buffered, df1_buffered, how="inner", predicate="intersects")
    overlap_1_2 = df2.loc[overlap_1_2_buffered.index].copy()
    overlap_1_2 = overlap_1_2.merge(overlap_1_2_buffered.drop(columns='geometry'), left_index=True, right_index=True)
    missing_indices_1_2 = df2.index.difference(overlap_1_2.index)
    missing_1_2 = df2.loc[missing_indices_1_2].copy()

    df1['source'] = 'transit_agency'
    df2['source'] = 'OSM'
    df3['source'] = 'Mapillary'
    overlap_1_2['source'] = df2['source']
    missing_1_2['source'] = df2['source']

    # --- Add 'highway' from df2 if it exists ---
    if 'highway' in df2.columns:
        df2['highway'] = df2['highway']
    combined_1_2 = combine_dataframes(df1, missing_1_2, properties_to_keep)

    overlap_2_3_buffered = gpd.sjoin(df3_buffered, combined_1_2, how="inner", predicate="intersects")
    overlap_2_3 = df3.loc[overlap_2_3_buffered.index].copy()
    overlap_2_3 = overlap_2_3.merge(overlap_2_3_buffered.drop(columns='geometry'), left_index=True, right_index=True)
    missing_indices_2_3 = df3.index.difference(overlap_2_3.index)
    missing_2_3 = df3.loc[missing_indices_2_3].copy()

    overlap_2_3['source'] = df3['source']
    missing_2_3['source'] = df3['source']

    combined_2_3 = combine_dataframes(combined_1_2, missing_2_3, properties_to_keep)
    print("--- combined_2_3 (first 5 rows):\n", combined_2_3.head())  # Debug print
    print("--- combined_2_3 dtypes:\n", combined_2_3.dtypes)
    return combined_2_3, combined_1_2, overlap_1_2, missing_1_2, overlap_2_3, missing_2_3

In [18]:
def convert_osm_tags_to_kv(gdf, osm_tag_cols=OSM_TAG_COLS):
    """Converts osm_tag_N columns to standard OSM key-value pairs, AND handles country_tag."""

    original_crs = getattr(gdf, 'crs', UTM_CRS) # Get CRS early

    if not isinstance(gdf, gpd.GeoDataFrame) or gdf.empty:
        # logging.debug("Input (Geo)DataFrame to convert_osm_tags_to_kv is empty. Returning empty.")
        return gpd.GeoDataFrame(geometry=[], crs=original_crs), [] # Return empty GDF with CRS

    gdf_copy = gdf.copy()
    if 'mapillary_feature' in gdf_copy.columns:
        gdf_copy = gdf_copy.drop(columns=['mapillary_feature'], errors='ignore')

    osm_data = {}
    original_indices = gdf_copy.index

    # --- Preserve Existing Tags (More Robust) ---
    essential_cols = ['geometry', 'source'] # Keep these separate
    # Identify potential existing tag columns (anything not essential or osm_tag/country_tag)
    # Explicitly exclude 'nodes' which is often a list from OSMnx
    other_cols = gdf_copy.columns.difference(essential_cols + osm_tag_cols + ['country_tag', 'nodes']) # Added 'nodes' exclusion

    for index, row in gdf_copy.iterrows():
        osm_data[index] = {}
        # Add existing non-tag columns as potential tags
        for col in other_cols:
            # Check if it's likely an attribute, not internal GeoPandas stuff
            if col not in ['level_0', 'index_right', 'orig_index_left', 'orig_index_right']: # Keep internal GPD excludes
                value_at_row_col = row[col] # Get value once

                # Check if the value is scalar BEFORE applying pd.notna or str()
                if np.isscalar(value_at_row_col):
                    # Now safe to treat as scalar
                    if pd.notna(value_at_row_col) and str(value_at_row_col) != '':
                        osm_data[index][col] = str(value_at_row_col)
                # Optional: Log if we skip a non-scalar value? Helps debugging.
                # else:
                #    if isinstance(value_at_row_col, (list, np.ndarray)):
                #       logging.debug(f"Skipping non-scalar (list/array) value in column '{col}' for index {index}")
                #    else:
                #       logging.debug(f"Skipping non-scalar type '{type(value_at_row_col)}' in column '{col}' for index {index}")

    # --- Process osm_tag_N columns ---
    for col in osm_tag_cols:
        if col in gdf_copy.columns:
            for index, row in gdf_copy.iterrows():
                if index not in osm_data: osm_data[index] = {}
                tag_value = row[col]
                # Make sure tag_value is scalar here too before splitting (less likely an issue here, but good practice)
                if np.isscalar(tag_value) and pd.notna(tag_value) and tag_value != '':
                    if isinstance(tag_value, str) and "=" in tag_value:
                        key, value = tag_value.split("=", 1)
                        # Check key/value validity after split
                        if pd.notna(key) and pd.notna(value) and key != '' and value != '':
                            if key not in osm_data[index]:
                                osm_data[index][key] = value
                            # Avoid duplication if value already exists in a semi-colon list
                            elif value not in osm_data[index][key].split(';'):
                                osm_data[index][key] = f"{osm_data[index][key]};{value}"
            # Don't drop columns here

    # --- Handle country_tag ---
    if 'country_tag' in gdf_copy.columns:
        for index, row in gdf_copy.iterrows():
            if index not in osm_data: osm_data[index] = {}
            country_tag_value = row['country_tag']
            # Ensure it's scalar and notna
            if np.isscalar(country_tag_value) and pd.notna(country_tag_value) and country_tag_value != '':
                # Extract value after 'traffic_sign=' if present
                value_part = country_tag_value.split("traffic_sign=", 1)[-1] # Takes last element, works if prefix exists or not
                if value_part: # Ensure extracted part is not empty
                   osm_data[index]['traffic_sign'] = value_part # Assign directly to 'traffic_sign' key
        # Don't drop column here

    processed_tags_df = pd.DataFrame.from_dict(osm_data, orient='index')

    # --- Combine essential columns with processed tags ---
    existing_essential_cols = [col for col in essential_cols if col in gdf_copy.columns]
    if not existing_essential_cols and processed_tags_df.empty:
        logging.warning("convert_osm_tags_to_kv: No essential columns or processed tags found.")
        return gpd.GeoDataFrame(geometry=[], crs=original_crs), []
    elif not existing_essential_cols:
        essential_df = gpd.GeoDataFrame(index=original_indices, crs=original_crs) # Create empty with index
    else:
         essential_df = gdf_copy.loc[original_indices, existing_essential_cols].copy()

    if processed_tags_df.empty:
        gdf_merged = essential_df # If no tags, result is just essential cols
    else:
        # Ensure index alignment before concatenation/merge
        processed_tags_df = processed_tags_df.reindex(essential_df.index)
        # Use concat which might be safer than merge for just adding columns
        gdf_merged = pd.concat([essential_df, processed_tags_df], axis=1)


    # --- Geometry Handling ---
    is_geo = False
    if 'geometry' in gdf_merged.columns:
         # Convert to GeoDataFrame if it's not already (might be DataFrame after concat)
         if not isinstance(gdf_merged, gpd.GeoDataFrame):
              try:
                  gdf_merged = gpd.GeoDataFrame(gdf_merged, geometry='geometry', crs=original_crs)
                  is_geo = True
              except Exception as gdf_err:
                  logging.warning(f"Could not create GeoDataFrame during conversion: {gdf_err}")
                  is_geo = False
         else: # It was already a GeoDataFrame
             is_geo = True
             # Ensure CRS consistency
             if gdf_merged.crs is None: gdf_merged = gdf_merged.set_crs(original_crs)
             elif gdf_merged.crs != original_crs: gdf_merged = gdf_merged.to_crs(original_crs)
             # Ensure active geometry
             if gdf_merged.geometry.name != 'geometry': gdf_merged = gdf_merged.set_geometry('geometry')


         # Validate and fix only if we successfully have a GeoDataFrame
         if is_geo and not gdf_merged.geometry.is_valid.all():
             logging.warning("Invalid geometries found. Attempting buffer(0)...")
             try:
                 original_geom_count = len(gdf_merged)
                 buffered_geoms = gdf_merged.geometry.buffer(0)
                 # Filter out empty geometries resulting from buffer(0)
                 gdf_merged = gdf_merged[~buffered_geoms.is_empty].copy() # Use copy to avoid SettingWithCopyWarning
                 if not gdf_merged.empty:
                      gdf_merged.geometry = gdf_merged.geometry.buffer(0) # Re-apply buffer on filtered subset if needed for validity
                      if not gdf_merged.geometry.is_valid.all():
                          logging.warning("buffer(0) failed to fix all. Dropping remaining invalid.")
                          gdf_merged = gdf_merged[gdf_merged.geometry.is_valid]
                      logging.info(f"Geometry fixing removed {original_geom_count - len(gdf_merged)} empty/invalid features.")
                 else:
                      logging.warning("All geometries became empty after buffer(0).")

                 # Ensure CRS is maintained
                 if not gdf_merged.empty and gdf_merged.crs != original_crs:
                      gdf_merged = gdf_merged.set_crs(original_crs)

             except Exception as e:
                  logging.error(f"Error during buffer(0): {e}. Attempting to keep valid only.")
                  gdf_merged = gdf_merged[gdf_merged.geometry.is_valid]

    # --- Final Cleanup ---
    cols_to_fill = gdf_merged.columns.difference(['geometry']) if is_geo and 'geometry' in gdf_merged.columns else gdf_merged.columns
    gdf_merged[cols_to_fill] = gdf_merged[cols_to_fill].fillna('')
    cols_to_drop_final = list(set(osm_tag_cols + ['country_tag']).intersection(gdf_merged.columns))
    gdf_merged = gdf_merged.drop(columns=cols_to_drop_final, errors='ignore')

    # Return correct type
    if is_geo and 'geometry' in gdf_merged.columns and not gdf_merged.empty:
         if not isinstance(gdf_merged, gpd.GeoDataFrame): # Final check
             try: gdf_merged = gpd.GeoDataFrame(gdf_merged, geometry='geometry', crs=original_crs)
             except: return pd.DataFrame(gdf_merged.drop(columns=['geometry'], errors='ignore')), []
         return gdf_merged, []
    elif 'geometry' in gdf_merged.columns: # Attempt final conversion
         try: return gpd.GeoDataFrame(gdf_merged, geometry='geometry', crs=original_crs), []
         except: return pd.DataFrame(gdf_merged.drop(columns=['geometry'], errors='ignore')), []
    else:
        return pd.DataFrame(gdf_merged), []

In [19]:
def get_area_from_input(user_input, city_id, manual_bounds_dir=os.path.join(INPUT_DATA_DIR, "manual_boundaries")):
    """
    Converts user input (place name or polygon) into a GeoDataFrame and its bounding box.
    PRIORITIZES loading a manual GeoJSON boundary file if it exists.

    Args:
        user_input (str): The original place name string (used for fallback geocoding).
        city_id (str): The sanitized city ID used for filename matching.
        manual_bounds_dir (str): The directory where manual boundary files are stored.

    Returns:
        tuple: (GeoDataFrame containing the boundary polygon, dictionary containing bbox)
               Returns (None, None) if area cannot be determined.
    """
    manual_geojson_path = os.path.join(manual_bounds_dir, f"{city_id}.geojson")
    logging.info(f"Checking for manual boundary file: {manual_geojson_path}")

    area_gdf = None
    polygon = None

    # --- Attempt 1: Load Manual GeoJSON ---
    try:
        if os.path.exists(manual_geojson_path):
            area_gdf = gpd.read_file(manual_geojson_path)
            if area_gdf.empty or 'geometry' not in area_gdf.columns:
                raise ValueError("Manual GeoJSON is empty or lacks geometry column.")
            if len(area_gdf) > 1:
                logging.warning(f"Manual GeoJSON {manual_geojson_path} has multiple features. Using the first one.")
                area_gdf = area_gdf.iloc[[0]].copy() # Select only the first feature

            # Ensure CRS is EPSG:4326 initially (like geocode_to_gdf) for consistency downstream
            if area_gdf.crs is None:
                 logging.warning(f"Manual GeoJSON {manual_geojson_path} has no CRS defined. Assuming EPSG:4326.")
                 area_gdf.set_crs("EPSG:4326", inplace=True)
            elif area_gdf.crs.to_epsg() != 4326:
                 logging.info(f"Projecting manual GeoJSON from {area_gdf.crs} to EPSG:4326.")
                 area_gdf = area_gdf.to_crs("EPSG:4326")

            # Validate geometry type (should be Polygon or MultiPolygon)
            geom_type = area_gdf.geometry.iloc[0].geom_type
            if not geom_type.endswith('Polygon'):
                 raise ValueError(f"Geometry in {manual_geojson_path} is not a Polygon/MultiPolygon ({geom_type}).")

            polygon = area_gdf.geometry.iloc[0]
            logging.info(f"Successfully loaded manual boundary from {manual_geojson_path}")

        else:
             raise FileNotFoundError # Explicitly raise if file doesn't exist to trigger fallback

    except (FileNotFoundError, DriverError, ValueError, Exception) as e_manual:
        if not isinstance(e_manual, FileNotFoundError): # Log errors other than file not found
             logging.warning(f"Could not load or validate manual boundary file {manual_geojson_path}: {e_manual}. Falling back to geocoding.")
        else:
             logging.info(f"Manual boundary file not found. Falling back to geocoding '{user_input}'.")

        # --- Attempt 2: Fallback to Geocoding ---
        try:
            logging.debug(f"Attempting ox.geocode_to_gdf for: '{user_input}'")
            area_gdf = ox.geocode_to_gdf(user_input)
            if area_gdf.empty or 'geometry' not in area_gdf.columns:
                 raise ValueError(f"ox.geocode_to_gdf returned empty or no geometry for '{user_input}'")
            # OSMnx usually returns 4326, but check just in case
            if area_gdf.crs is None: area_gdf.set_crs("EPSG:4326", inplace=True)
            elif area_gdf.crs.to_epsg() != 4326: area_gdf = area_gdf.to_crs("EPSG:4326")

            polygon = area_gdf.geometry.iloc[0] # Get the first (and usually only) polygon
            logging.info(f"Successfully geocoded boundary for '{user_input}'")

        except Exception as e_geocode:
            logging.error(f"Failed to determine area boundary for '{user_input}' using both manual file and geocoding: {e_geocode}", exc_info=True)
            return None, None # Indicate failure

    # --- Post-processing (applies to both manual and geocoded) ---
    # Rename conflicting columns potential added by GeoPandas/OSMnx internal ops
    if 'index_left' in area_gdf.columns:
        area_gdf = area_gdf.rename(columns={'index_left': 'orig_index_left'})
    if 'index_right' in area_gdf.columns:
        area_gdf = area_gdf.rename(columns={'index_right': 'orig_index_right'})

    # Extract bbox from the final polygon
    try:
        bbox = {
            'west': polygon.bounds[0],
            'south': polygon.bounds[1],
            'east': polygon.bounds[2],
            'north': polygon.bounds[3]
        }
    except Exception as e_bbox:
         logging.error(f"Failed to extract bounds from polygon for {city_id}: {e_bbox}")
         return None, None

    return area_gdf, bbox

In [20]:
def get_country_code(place_name, country_mappings):
    """Uses geopy to get the country code, with fallbacks, URL encoding, and place name simplification."""

    # 1. Try the dictionary first (case-insensitive lookup)
    lookup_key = place_name.lower().strip() # Use lowercase, stripped key
    logging.debug(f"Attempting dictionary lookup for key: '{lookup_key}'")
    country_code = place_to_country.get(lookup_key) # Use the processed key
    if country_code:
        logging.info(f"Found country code '{country_code}' in dictionary for '{place_name}'")
        return country_code
    logging.debug("Not found in dictionary.")

    # 2. If not in the dictionary, try geopy
    logging.info(f"Attempting geopy lookup for: '{place_name}'")
    try:
        # --- Simplification Loop ---
        parts = place_name.split(',')
        # Start with the most specific query and simplify
        for i in range(len(parts), 0, -1):
            simplified_place_name = ','.join(parts[:i]).strip()
            # Skip if simplified name is empty
            if not simplified_place_name:
                 continue

            encoded_simplified_name = urllib.parse.quote(simplified_place_name)
            logging.debug(f"Trying geopy with simplified name: '{simplified_place_name}' (Encoded: {encoded_simplified_name})")

            # --- Geocoding Attempts ---
            location = None
            try:
                 # Attempt 1: Direct geocode
                 logging.debug(f"  Querying Nominatim (direct): '{simplified_place_name}'")
                 location = geocode(simplified_place_name, timeout=10) # Use RateLimiter wrapper, add timeout
                 logging.debug(f"  Nominatim direct result: {location.raw if location else 'None'}")
                 if location: # Check if found before proceeding
                      address = location.raw.get('address', {})
                      cc = address.get('country_code')
                      if cc: logging.info(f"  Found country code '{cc.upper()}' via direct geocode."); return cc.upper()
            except Exception as e_geo1:
                 logging.warning(f"  Nominatim direct query failed for '{simplified_place_name}': {e_geo1}")

            # Attempt 2: Try with addressdetails=True if direct failed or didn't have code
            if not location or not location.raw.get('address', {}).get('country_code'):
                 try:
                      logging.debug(f"  Querying Nominatim (addressdetails): '{simplified_place_name}'")
                      location_details = geocode(simplified_place_name, addressdetails=True, timeout=10)
                      logging.debug(f"  Nominatim addressdetails result: {location_details.raw if location_details else 'None'}")
                      if location_details:
                           address_details = location_details.raw.get('address', {})
                           cc = address_details.get('country_code')
                           if cc: logging.info(f"  Found country code '{cc.upper()}' via addressdetails."); return cc.upper()
                 except Exception as e_geo2:
                      logging.warning(f"  Nominatim addressdetails query failed for '{simplified_place_name}': {e_geo2}")

        # If loop finishes without finding code
        print(f"Warning: Could not find country code for {place_name}") # Keep user-facing warning
        logging.warning(f"Failed to find country code for '{place_name}' after all attempts.") # Log internal warning
        return None

    except Exception as e:
        print(f"Error during geocoding: {e}") # Keep user-facing error
        logging.error(f"Exception during geocoding process for '{place_name}': {e}", exc_info=True) # Log internal error
        return None

# Section 4: Download and Process OSM Graph

In [21]:
def download_and_process_osm_graph(polygon):
    """
    Downloads the OSM graph for the specified polygon, projects it,
    and returns projected nodes, edges, and the projected graph object.
    """
    # Download more stuff than the standard ones from OSMnx
    ox.settings.useful_tags_way = {
        "lanes": True,
        "highway": True,
        "maxspeed": True,
        "cycleway": ["lane", "opposite_lane", "shared_lane", "share_busway", "opposite_share_busway"],
        "cycleway:both": ["lane", "shared_lane", "share_busway"],
        "cycleway:left": ["lane", "opposite_lane", "shared_lane", "share_busway", "opposite_share_busway"],
        "cycleway:right": ["lane", "opposite_lane", "shared_lane", "share_busway", "opposite_share_busway"],
        "busway": True,
        "busway:both": True,
        "busway:right": True,
        "busway:left": True,
        "lanes:bus": True,
        "lanes:psv": True
    }

    logging.info("Downloading OSM graph from polygon...")
    try:
        # Download graph using polygon
        G = ox.graph_from_polygon(polygon, network_type="drive", truncate_by_edge=True)
        logging.info("OSM graph download successful.")

        # --- Project the graph ---
        logging.debug(f"Projecting graph to CRS: {UTM_CRS}")
        G_projected = ox.project_graph(G, to_crs=UTM_CRS)
        logging.debug("Graph projection successful.")
        # ------------------------

    except Exception as e_graph_poly:
         logging.error(f"Failed to download or project graph from polygon: {e_graph_poly}", exc_info=True)
         # Return empty GDFs and None for the graph on failure
         return gpd.GeoDataFrame(), gpd.GeoDataFrame(), None # Add None for graph

    # Make undirected (optional, depends on analysis needs)
    G_projected = ox.utils_graph.get_undirected(G_projected)

    try:
        # --- Convert PROJECTED graph to GDFs ---
        logging.debug("Converting projected graph to GeoDataFrames...")
        # Pass projected graph to graph_to_gdfs
        gdf_nodes_proj, gdf_edges_proj = ox.graph_to_gdfs(G_projected)
        logging.debug("Graph conversion successful.")
        # ----------------------------------------

        # Reset index and ensure correct types AFTER conversion
        gdf_edges_proj = gdf_edges_proj.reset_index()
        if 'u' in gdf_edges_proj.columns: gdf_edges_proj['u'] = gdf_edges_proj['u'].astype(int)
        if 'v' in gdf_edges_proj.columns: gdf_edges_proj['v'] = gdf_edges_proj['v'].astype(int)

    except Exception as e_gdfs:
        logging.error(f"Failed to convert graph to GeoDataFrames: {e_gdfs}", exc_info=True)
        # Return empty GDFs and the potentially valid projected graph (or None if download failed)
        return gpd.GeoDataFrame(), gpd.GeoDataFrame(), G_projected if 'G_projected' in locals() else None

    # --- Modify return statement ---
    # Return projected nodes, projected edges, and the projected graph object
    return gdf_nodes_proj, gdf_edges_proj, G_projected
    # --- End modification ---

In [22]:
def process_way_tags(gdf_edges, combined_all):
    """
    Processes way tags, adds counts, and handles NaN values correctly.  ONLY for 'crossing'.
    """
    gdf_edges_buffered = gdf_edges.copy()
    gdf_edges_buffered.geometry = gdf_edges_buffered.geometry.buffer(BUFFER_DISTANCE)
    gdf_edges_buffered = gdf_edges_buffered.to_crs(UTM_CRS)
    combined_all = combined_all.to_crs(UTM_CRS)

    for item_key in way_tags:
        if item_key == 'crossing':  # Only process 'crossing' here
            if item_key not in combined_all.columns:
                logging.warning(f"Column '{item_key}' not found in combined_all. Skipping processing for this key.")
                # Ensure count columns are created with 0 if the key doesn't exist at all
                for item_value in way_tags[item_key]:
                     category_name = f"{item_key}_{item_value}" if item_value is not True else item_key
                     count_col_name = f"{category_name}_count"
                     if count_col_name not in gdf_edges.columns:
                          gdf_edges[count_col_name] = 0
                continue # Skip to the next item_key in way_tags

            for item_value in way_tags[item_key]:
                logging.info(f"Processing {item_key}={item_value}")
                category_name = f"{item_key}_{item_value}" if item_value is not True else item_key
                count_col_name = f"{category_name}_count" # Define count column name

                # Filter combined_all for the specific category
                gdf_category = combined_all[(combined_all[item_key] == item_value)]

                if not gdf_category.empty:
                    # Perform spatial join (ensure CRS match - already done above)
                    intersections = gpd.sjoin(gdf_category, gdf_edges, how="inner", predicate='intersects') # Use original gdf_edges here for u,v keys

                    # --- Deduplication Logic ---
                    unique_crossings = gpd.GeoDataFrame(columns=intersections.columns) # Initialize correctly
                    # Ensure correct dtypes if possible, though concat handles it later
                    for col, dtype in intersections.dtypes.items():
                         if col == 'geometry':
                             unique_crossings[col] = gpd.GeoSeries(dtype=dtype, crs=intersections.crs)
                         else:
                              unique_crossings[col] = pd.Series(dtype=dtype)

                    if not intersections.empty: # Only iterate if intersections exist
                        for index, row in intersections.iterrows():
                            crossing_geom = row['geometry']
                            # Ensure 'u' and 'v' exist in the row before accessing
                            if 'u' in row and 'v' in row:
                                u, v = row['u'], row['v']
                                # Check if unique_crossings is not empty before filtering
                                if not unique_crossings.empty and 'u' in unique_crossings.columns and 'v' in unique_crossings.columns:
                                    existing_crossing = unique_crossings[
                                        (unique_crossings['u'] == u) & (unique_crossings['v'] == v)
                                        & (unique_crossings.geometry.distance(crossing_geom) < 1e-6) # Use a small tolerance for float comparison
                                    ]
                                else:
                                    existing_crossing = gpd.GeoDataFrame() # Empty if unique_crossings is empty or lacks columns

                                if existing_crossing.empty:
                                    # Convert row to DataFrame before concat
                                    row_gdf = gpd.GeoDataFrame([row], crs=intersections.crs)
                                    unique_crossings = pd.concat([unique_crossings, row_gdf], ignore_index=True)
                            else:
                                logging.warning(f"Skipping intersection row due to missing 'u' or 'v': {row.get('osmid', 'N/A')}")
                        intersections = unique_crossings
                    # --- End Deduplication Logic ---

                    if not intersections.empty and 'u' in intersections.columns and 'v' in intersections.columns:
                         counts = intersections.groupby(['u', 'v']).size().reset_index(name=count_col_name)
                         # Ensure gdf_edges has 'u', 'v' before merging
                         if 'u' in gdf_edges.columns and 'v' in gdf_edges.columns:
                            gdf_edges = gdf_edges.merge(counts, on=['u', 'v'], how='left')
                            # Fill NaN AFTER merge, then convert to int
                            gdf_edges[count_col_name] = gdf_edges[count_col_name].fillna(0).astype(int)
                         else:
                              logging.warning(f"Cannot merge counts for {category_name} as 'u' or 'v' missing in gdf_edges.")
                              if count_col_name not in gdf_edges.columns: gdf_edges[count_col_name] = 0

                    else: # Handle case where intersections became empty or lacked u/v
                        if count_col_name not in gdf_edges.columns: gdf_edges[count_col_name] = 0
                        else: gdf_edges[count_col_name] = gdf_edges[count_col_name].fillna(0) # Fill existing column if intersections were empty

                else: # gdf_category was empty
                    if count_col_name not in gdf_edges.columns: gdf_edges[count_col_name] = 0
                    else: gdf_edges[count_col_name] = gdf_edges[count_col_name].fillna(0) # Ensure fillna even if category is empty

        else:
            # If other item_keys were added to way_tags, handle them or skip
            continue # Skip if it is highway (or any other key added later)

    # Final check to ensure all expected count columns exist and are int
    for item_key in way_tags:
         if item_key == 'crossing':
             for item_value in way_tags[item_key]:
                 category_name = f"{item_key}_{item_value}" if item_value is not True else item_key
                 count_col_name = f"{category_name}_count"
                 if count_col_name in gdf_edges.columns:
                      gdf_edges[count_col_name] = gdf_edges[count_col_name].fillna(0).astype(int)
                 else:
                      gdf_edges[count_col_name] = 0 # Create if completely missing

    return gdf_edges

In [23]:
def process_traffic_signs(gdf_edges, place, current_city_combined, all_city_mapillary_signs, city_id, data_dir, G_projected, boundary_polygon):
    """
    Processes traffic signs: loads existing combined file if possible, otherwise generates
    by loading/downloading OSM (Processed > Raw > Download using the boundary polygon),
    conflating with Mapillary, saving results, calculating counts, and processing highway counts.
    Uses the pre-downloaded and projected graph G_projected.
    Requires the original boundary_polygon used to create G_projected.
    """
    logging.debug(f"--- Starting process_traffic_signs for {city_id} ---")

    # --- Initialization and Graph Projection (Passed In) ---
    if G_projected is None:
        logging.error("Projected graph (G_projected) is None. Cannot process traffic signs.")
        # Initialize counts to 0 and exit if graph is None
        if 'traffic_sign_count' not in gdf_edges.columns: gdf_edges['traffic_sign_count'] = 0
        else: gdf_edges['traffic_sign_count'] = gdf_edges['traffic_sign_count'].fillna(0).astype(int)
        for item_value in way_tags.get('highway', []): # Ensure hwy counts init
            count_col_highway = f"highway_{item_value}_count"; gdf_edges[count_col_highway] = gdf_edges.get(count_col_highway, 0)
        return gdf_edges

    try: # Ensure edges are projected
        if gdf_edges.crs != UTM_CRS: gdf_edges_proj = gdf_edges.to_crs(UTM_CRS)
        else: gdf_edges_proj = gdf_edges.copy()
    except Exception as edge_proj_err:
        logging.error(f"Failed to project gdf_edges to UTM: {edge_proj_err}. Cannot proceed.", exc_info=True)
        return gdf_edges # Cannot proceed without projected edges

    base_crs = gdf_edges.crs # Original CRS of edges (or UTM if already projected)

    # --- Define Filepaths ---
    osm_traffic_signs_raw_filepath = os.path.join(data_dir, city_id, "raw", f"OSM_raw_{city_id}_traffic_signs.geojson")
    osm_traffic_signs_processed_filepath = os.path.join(data_dir, city_id, "OSM", f"OSM_{city_id}_traffic_signs.geojson")
    combined_traffic_signs_filepath = os.path.join(data_dir, city_id, "combined", f"combined_{city_id}_traffic_signs.geojson")
    missing_traffic_signs_filepath = os.path.join(data_dir, city_id, "missing", f"missing_{city_id}_traffic_signs.geojson")
    missing_no_country_tag_filepath = os.path.join(data_dir, city_id, "missing", f"missing_{city_id}_traffic_signs_no_country_tag.geojson")

    # --- Check if Combined Traffic Signs File Exists (Priority 1) ---
    signs_loaded = False
    combined_signs_for_counting = gpd.GeoDataFrame(crs=UTM_CRS) # Initialize empty GDF with target CRS

    try:
        logging.info(f"Checking for existing combined traffic signs file: {combined_traffic_signs_filepath}")
        combined_signs_for_counting = gpd.read_file(combined_traffic_signs_filepath)
        logging.info(f"LOADED {len(combined_signs_for_counting)} features from existing combined traffic signs file.")

        # Ensure loaded data is in UTM_CRS for counting
        if combined_signs_for_counting.crs != UTM_CRS:
            logging.debug(f"Projecting loaded combined signs from {combined_signs_for_counting.crs} to {UTM_CRS}")
            combined_signs_for_counting = combined_signs_for_counting.to_crs(UTM_CRS)

        signs_loaded = True
        if not os.path.exists(missing_traffic_signs_filepath):
             logging.warning(f"Combined traffic signs loaded, but missing file not found: {missing_traffic_signs_filepath}")
        if not os.path.exists(missing_no_country_tag_filepath):
             logging.warning(f"Combined traffic signs loaded, but missing (no country tag) file not found: {missing_no_country_tag_filepath}")

    except (FileNotFoundError, DriverError, Exception) as load_err:
        logging.info(f"Combined traffic signs file not found or failed to load ({type(load_err).__name__}). Will generate files.")
        signs_loaded = False
        combined_signs_for_counting = gpd.GeoDataFrame(crs=UTM_CRS) # Ensure it's reset

    # --- Generate Sign Files ONLY if Not Loaded ---
    if not signs_loaded:
        gdf_traffic_signs_processed = gpd.GeoDataFrame(geometry=[], crs=base_crs) # Initialize for generation path
        osm_data_source = "None" # Track source

        # --- 1. Acquire and Process OSM Traffic Signs (Processed > Raw > Download) ---
        try:
            # --- Attempt 1: Load Processed OSM ---
            logging.info(f"Attempting to load processed OSM traffic signs: {osm_traffic_signs_processed_filepath}")
            gdf_traffic_signs_processed = gpd.read_file(osm_traffic_signs_processed_filepath)
            # Ensure correct base_crs if loaded
            if gdf_traffic_signs_processed.crs != base_crs:
                 gdf_traffic_signs_processed = gdf_traffic_signs_processed.to_crs(base_crs)
            osm_data_source = "Processed"
            logging.info(f"LOADED {len(gdf_traffic_signs_processed)} features from processed OSM file.")

        except (FileNotFoundError, DriverError, Exception) as e1:
            logging.info(f"Processed OSM TS file fail ({type(e1).__name__}). Trying raw file.")
            try:
                # --- Attempt 2: Load Raw OSM ---
                logging.info(f"Attempting to load raw OSM traffic signs: {osm_traffic_signs_raw_filepath}")
                gdf_traffic_signs_raw = gpd.read_file(osm_traffic_signs_raw_filepath)
                osm_data_source = "Raw"
                logging.info(f"LOADED {len(gdf_traffic_signs_raw)} features from raw OSM file.")

                # --- Process Raw Data ---
                if not gdf_traffic_signs_raw.empty:
                    gdf_traffic_signs_processed = gdf_traffic_signs_raw.copy()
                    try: # Project to base_crs
                         if gdf_traffic_signs_processed.crs != base_crs:
                             gdf_traffic_signs_processed = gdf_traffic_signs_processed.to_crs(base_crs)
                    except Exception as crs_err:
                         logging.error(f"Failed proj raw OSM TS: {crs_err}")
                         gdf_traffic_signs_processed = gpd.GeoDataFrame(geometry=[], crs=base_crs) # Reset

                    if 'traffic_sign' in gdf_traffic_signs_processed.columns:
                         gdf_traffic_signs_processed = gdf_traffic_signs_processed[gdf_traffic_signs_processed['traffic_sign'].notna()]
                    else: # Tag missing
                         logging.warning("Column 'traffic_sign' not found in loaded raw OSM geometries.")
                         gdf_traffic_signs_processed = gpd.GeoDataFrame(geometry=[], crs=base_crs) # Reset

                    logging.info(f"Processed raw OSM data: {len(gdf_traffic_signs_processed)} features remain.")

                    # --- Save Processed (since generated from raw) ---
                    if not gdf_traffic_signs_processed.empty:
                         gdf_ts_proc_to_save = convert_lists_to_strings(gdf_traffic_signs_processed.copy())
                         cols_to_drop_osm_proc = ['element_type', 'osmid', 'nodes']
                         gdf_ts_proc_to_save = gdf_ts_proc_to_save.drop(columns=cols_to_drop_osm_proc, errors='ignore')
                         if not gdf_ts_proc_to_save.geometry.is_valid.all(): # Validation
                              try:
                                  orig_crs = gdf_ts_proc_to_save.crs
                                  gdf_ts_proc_to_save.geometry = gdf_ts_proc_to_save.geometry.buffer(0)
                                  gdf_ts_proc_to_save = gdf_ts_proc_to_save[~gdf_ts_proc_to_save.geometry.is_empty]
                                  if not gdf_ts_proc_to_save.empty and gdf_ts_proc_to_save.crs != orig_crs:
                                       gdf_ts_proc_to_save = gdf_ts_proc_to_save.set_crs(orig_crs, allow_override=True)
                              except Exception as buf_err: logging.error(f"Buffer error proc OSM (raw): {buf_err}")

                         if not gdf_ts_proc_to_save.empty:
                              try:
                                  gdf_ts_proc_to_save.to_file(osm_traffic_signs_processed_filepath, driver="GeoJSON")
                                  logging.info(f"Saved processed OSM TS file (from raw): {osm_traffic_signs_processed_filepath}")
                              except Exception as save_proc_err:
                                  logging.error(f"Failed save processed OSM TS (raw): {save_proc_err}")
                         else: # Buffer empty
                              empty_gdf = gpd.GeoDataFrame({'geometry': []}, geometry='geometry', crs=base_crs)
                              schema = create_schema(empty_gdf)
                              try:
                                  empty_gdf.to_file(osm_traffic_signs_processed_filepath, driver="GeoJSON", schema=schema)
                              except Exception as e:
                                  logging.error(f"Failed save empty proc OSM TS (raw, buffer): {e}")
                    else: # Processing raw empty
                         empty_gdf = gpd.GeoDataFrame({'geometry': []}, geometry='geometry', crs=base_crs)
                         schema = create_schema(empty_gdf)
                         try:
                             empty_gdf.to_file(osm_traffic_signs_processed_filepath, driver="GeoJSON", schema=schema)
                         except Exception as e:
                             logging.error(f"Failed save empty proc OSM TS (raw): {e}")
                else: # Raw file was empty
                     gdf_traffic_signs_processed = gpd.GeoDataFrame(geometry=[], crs=base_crs)
                     empty_gdf = gpd.GeoDataFrame({'geometry': []}, geometry='geometry', crs=base_crs)
                     schema = create_schema(empty_gdf)
                     try:
                         empty_gdf.to_file(osm_traffic_signs_processed_filepath, driver="GeoJSON", schema=schema)
                     except Exception as e:
                         logging.error(f"Failed save empty proc OSM TS (empty raw): {e}")

            except (FileNotFoundError, DriverError, Exception) as e2:
                logging.info(f"Raw OSM TS file fail ({type(e2).__name__}). Attempting download.")
                try:
                    # --- Attempt 3: Download OSM ---
                    # --- Check if boundary_polygon is valid ---
                    if boundary_polygon is None or not hasattr(boundary_polygon, 'geom_type') or not boundary_polygon.is_valid:
                        logging.error("Valid boundary_polygon is required for download but was not provided or is invalid.")
                        raise ValueError("Invalid boundary polygon for download.")
                    # -------------------------------------------

                    logging.info(f"Downloading OSM traffic signs using provided boundary polygon...")
                    gdf_traffic_signs_raw = ox.geometries_from_polygon(boundary_polygon, tags=traffic_sign_tags) # <-- USE boundary_polygon
                    logging.info(f"DOWNLOADED {len(gdf_traffic_signs_raw)} raw OSM traffic sign features.")
                    osm_data_source = "Download"

                    # --- Save Raw (since downloaded) ---
                    if not gdf_traffic_signs_raw.empty:
                        gdf_ts_raw_to_save = convert_lists_to_strings(gdf_traffic_signs_raw.copy())
                        try:
                            gdf_ts_raw_to_save.to_file(osm_traffic_signs_raw_filepath, driver="GeoJSON") # No schema
                        except Exception as save_raw_err:
                            logging.error(f"Failed save downloaded raw OSM TS: {save_raw_err}")
                    else: # Save empty raw if download empty
                         empty_gdf_ts_raw_d = gpd.GeoDataFrame({'geometry': []}, geometry='geometry', crs='EPSG:4326'); schema_ts_raw_d = create_schema(empty_gdf_ts_raw_d)
                         try:
                             empty_gdf_ts_raw_d.to_file(osm_traffic_signs_raw_filepath, driver="GeoJSON", schema=schema_ts_raw_d)
                         except Exception as e:
                             logging.error(f"Failed save empty raw OSM TS (download empty): {e}")

                    # --- Process Downloaded Data ---
                    if not gdf_traffic_signs_raw.empty:
                        gdf_traffic_signs_processed = gdf_traffic_signs_raw.copy()
                        try: # Project
                             if gdf_traffic_signs_processed.crs != base_crs:
                                 gdf_traffic_signs_processed = gdf_traffic_signs_processed.to_crs(base_crs)
                        except Exception as crs_err_d:
                             logging.error(f"Failed proj downloaded OSM TS: {crs_err_d}")
                             gdf_traffic_signs_processed = gpd.GeoDataFrame(geometry=[], crs=base_crs) # Reset
                        # Filter
                        if 'traffic_sign' in gdf_traffic_signs_processed.columns:
                             gdf_traffic_signs_processed = gdf_traffic_signs_processed[gdf_traffic_signs_processed['traffic_sign'].notna()]
                        else:
                             logging.warning("Column 'traffic_sign' not found in downloaded OSM geometries.")
                             gdf_traffic_signs_processed = gpd.GeoDataFrame(geometry=[], crs=base_crs) # Reset
                        logging.info(f"Processed downloaded OSM data: {len(gdf_traffic_signs_processed)} features remain.")
                    else: # Download was empty
                         gdf_traffic_signs_processed = gpd.GeoDataFrame(geometry=[], crs=base_crs)

                    # --- Save Processed (since generated from download) ---
                    if not gdf_traffic_signs_processed.empty:
                         gdf_ts_proc_to_save = convert_lists_to_strings(gdf_traffic_signs_processed.copy())
                         cols_to_drop_osm_proc = ['element_type', 'osmid', 'nodes']
                         gdf_ts_proc_to_save = gdf_ts_proc_to_save.drop(columns=cols_to_drop_osm_proc, errors='ignore')
                         # Validation buffer(0) logic
                         if not gdf_ts_proc_to_save.geometry.is_valid.all():
                              try:
                                   orig_crs = gdf_ts_proc_to_save.crs
                                   gdf_ts_proc_to_save.geometry = gdf_ts_proc_to_save.geometry.buffer(0)
                                   gdf_ts_proc_to_save = gdf_ts_proc_to_save[~gdf_ts_proc_to_save.geometry.is_empty]
                                   if not gdf_ts_proc_to_save.empty and gdf_ts_proc_to_save.crs != orig_crs:
                                        gdf_ts_proc_to_save = gdf_ts_proc_to_save.set_crs(orig_crs, allow_override=True)
                              except Exception as buf_err: logging.error(f"Buffer error proc OSM (download): {buf_err}")

                         if not gdf_ts_proc_to_save.empty:
                              try:
                                  gdf_ts_proc_to_save.to_file(osm_traffic_signs_processed_filepath, driver="GeoJSON")
                                  logging.info(f"Saved processed OSM TS file (from download): {osm_traffic_signs_processed_filepath}")
                              except Exception as save_proc_err_d:
                                  logging.error(f"Failed save processed OSM TS (download): {save_proc_err_d}")
                         else: # Buffer empty
                              empty_gdf = gpd.GeoDataFrame({'geometry': []}, geometry='geometry', crs=base_crs)
                              schema = create_schema(empty_gdf)
                              try:
                                  empty_gdf.to_file(osm_traffic_signs_processed_filepath, driver="GeoJSON", schema=schema)
                              except Exception as e:
                                  logging.error(f"Failed save empty proc OSM TS (download, buffer): {e}")
                    else: # Process download empty
                         empty_gdf = gpd.GeoDataFrame({'geometry': []}, geometry='geometry', crs=base_crs)
                         schema = create_schema(empty_gdf)
                         try:
                             empty_gdf.to_file(osm_traffic_signs_processed_filepath, driver="GeoJSON", schema=schema)
                         except Exception as e:
                             logging.error(f"Failed save empty proc OSM TS (download): {e}")

                except (ox._errors.InsufficientResponseError, ValueError, Exception) as e3: # Catch ValueError for polygon issue
                    logging.error(f"OSM traffic sign download failed: {e3}", exc_info=True)
                    gdf_traffic_signs_processed = gpd.GeoDataFrame(geometry=[], crs=base_crs) # Ensure empty
                    # Save empty files if they don't exist
                    if not os.path.exists(osm_traffic_signs_raw_filepath):
                         empty_gdf_ts_raw_err = gpd.GeoDataFrame({'geometry': []}, geometry='geometry', crs='EPSG:4326'); schema_ts_raw_err = create_schema(empty_gdf_ts_raw_err)
                         try: empty_gdf_ts_raw_err.to_file(osm_traffic_signs_raw_filepath, driver="GeoJSON", schema=schema_ts_raw_err)
                         except Exception as e_save: logging.error(f"Failed save empty raw OSM TS on download error: {e_save}")
                    if not os.path.exists(osm_traffic_signs_processed_filepath):
                         empty_gdf_ts_proc_err = gpd.GeoDataFrame({'geometry': []}, geometry='geometry', crs=base_crs); schema_ts_proc_err = create_schema(empty_gdf_ts_proc_err)
                         try: empty_gdf_ts_proc_err.to_file(osm_traffic_signs_processed_filepath, driver="GeoJSON", schema=schema_ts_proc_err)
                         except Exception as e_save: logging.error(f"Failed save empty processed OSM TS on download error: {e_save}")

        # --- End OSM Data Acquisition ---

        # --- 1.5 Conflate OSM and Mapillary Traffic Signs ---
        logging.info(f"--- Starting Traffic Sign Conflation (using OSM from {osm_data_source}) ---")
        combined_signs = gpd.GeoDataFrame(crs=UTM_CRS); missing_signs = gpd.GeoDataFrame(crs=UTM_CRS)
        osm_signs_proj = gpd.GeoDataFrame(crs=UTM_CRS); mply_signs_proj = gpd.GeoDataFrame(crs=UTM_CRS)

        # --- Prepare OSM signs ---
        if isinstance(gdf_traffic_signs_processed, gpd.GeoDataFrame) and not gdf_traffic_signs_processed.empty:
             try:
                  if gdf_traffic_signs_processed.crs != UTM_CRS: osm_signs_proj = gdf_traffic_signs_processed.to_crs(UTM_CRS)
                  else: osm_signs_proj = gdf_traffic_signs_processed.copy()
                  osm_signs_proj['source'] = 'OSM'
                  if 'traffic_sign' not in osm_signs_proj.columns: osm_signs_proj['traffic_sign'] = ''
                  else: osm_signs_proj['traffic_sign'] = osm_signs_proj['traffic_sign'].astype(str).str.strip().replace(['nan', 'None', 'NULL'], '', regex=False)
                  logging.debug(f"CONFLATION - Prepared {len(osm_signs_proj)} OSM signs.")
             except Exception as osm_proj_err: logging.error(f"CONFLATION - Failed prep OSM signs: {osm_proj_err}")

        # --- Prepare Mapillary signs ---
        if isinstance(all_city_mapillary_signs, gpd.GeoDataFrame) and not all_city_mapillary_signs.empty:
             try:
                  if all_city_mapillary_signs.crs != UTM_CRS: mply_signs_proj = all_city_mapillary_signs.to_crs(UTM_CRS)
                  else: mply_signs_proj = all_city_mapillary_signs.copy()
                  if 'source' not in mply_signs_proj.columns: mply_signs_proj['source'] = 'Mapillary'

                  # --- Tag Extraction Logic ---
                  logging.debug("CONFLATION - Determining 'traffic_sign' tag for Mapillary...")
                  if 'traffic_sign' not in mply_signs_proj.columns: mply_signs_proj['traffic_sign'] = ''
                  mply_signs_proj['traffic_sign'] = mply_signs_proj['traffic_sign'].astype(str).str.strip().replace(['nan', 'None', 'NULL'], '', regex=False)
                  needs_derivation = mply_signs_proj['traffic_sign'] == ''

                  if needs_derivation.any() and 'country_tag' in mply_signs_proj.columns:
                       logging.debug("CONFLATION - Trying 'country_tag'...")
                       split_tags = mply_signs_proj.loc[needs_derivation, 'country_tag'].astype(str).str.split('traffic_sign=', n=1, expand=True)
                       derived_country = split_tags[1].where(split_tags[1].notna(), split_tags[0]).str.strip().replace(['nan', 'None', 'NULL'], '', regex=False)
                       mply_signs_proj.loc[needs_derivation, 'traffic_sign'] = mply_signs_proj.loc[needs_derivation, 'traffic_sign'].replace('', np.nan).fillna(derived_country)
                       needs_derivation = mply_signs_proj['traffic_sign'] == ''

                  if needs_derivation.any() and 'osm_tag_1' in mply_signs_proj.columns:
                       logging.debug("CONFLATION - Trying 'osm_tag_1'...")
                       looks_like_kv = mply_signs_proj.loc[needs_derivation, 'osm_tag_1'].astype(str).str.contains('=', na=False)
                       if looks_like_kv.any():
                           split_tags = mply_signs_proj.loc[needs_derivation & looks_like_kv, 'osm_tag_1'].astype(str).str.split('=', n=1, expand=True)
                           derived_osm = split_tags[1].where(split_tags[1].notna(), split_tags[0]).str.strip().replace(['nan', 'None', 'NULL'], '', regex=False)
                           mply_signs_proj.loc[needs_derivation & looks_like_kv, 'traffic_sign'] = mply_signs_proj.loc[needs_derivation & looks_like_kv, 'traffic_sign'].replace('', np.nan).fillna(derived_osm)
                       needs_derivation = mply_signs_proj['traffic_sign'] == ''

                  if needs_derivation.any() and 'mapillary_feature' in mply_signs_proj.columns:
                       logging.debug("CONFLATION - Falling back to 'mapillary_feature'...")
                       derived_raw = mply_signs_proj.loc[needs_derivation, 'mapillary_feature'].astype(str).str.strip().replace(['nan', 'None', 'NULL'], '', regex=False)
                       mply_signs_proj.loc[needs_derivation, 'traffic_sign'] = mply_signs_proj.loc[needs_derivation, 'traffic_sign'].replace('', np.nan).fillna(derived_raw)

                  mply_signs_proj['traffic_sign'] = mply_signs_proj['traffic_sign'].fillna('').astype(str)
                  # --- End Tag Extraction Logic ---
                  logging.debug(f"CONFLATION - Prepared {len(mply_signs_proj)} Mapillary signs.")
                  if not mply_signs_proj.empty: logging.debug(f"CONFLATION - Unique Mapillary 'traffic_sign' tags after derivation: {mply_signs_proj['traffic_sign'].unique()}")

             except Exception as mply_prep_err: logging.error(f"CONFLATION - Failed prep Mapillary: {mply_prep_err}"); mply_signs_proj = gpd.GeoDataFrame(crs=UTM_CRS)

        # --- Perform conflation ---
        if not osm_signs_proj.empty and not mply_signs_proj.empty:
             # (Buffer, sjoin, loop/compare tags, build matched_mply_indices)
             osm_buffered = osm_signs_proj[osm_signs_proj.geometry.is_valid].copy()
             mply_buffered = mply_signs_proj[mply_signs_proj.geometry.is_valid].copy()
             if osm_buffered.empty or mply_buffered.empty:
                 combined_signs = pd.concat([osm_buffered, mply_buffered], ignore_index=True, sort=False)
                 combined_signs = gpd.GeoDataFrame(combined_signs, geometry='geometry', crs=UTM_CRS)
                 missing_signs = mply_buffered.copy() if not osm_buffered.empty else gpd.GeoDataFrame(crs=UTM_CRS)
             else:
                 osm_buffered['geometry'] = osm_buffered.geometry.buffer(BUFFER_DISTANCE); mply_buffered['geometry'] = mply_buffered.geometry.buffer(BUFFER_DISTANCE)
                 possible_matches = gpd.sjoin(mply_buffered, osm_buffered, how='inner', predicate='intersects', lsuffix='mply', rsuffix='osm')
                 matched_mply_indices = set(); osm_index_col = 'index_osm' if 'index_osm' in possible_matches.columns else 'index_right'
                 missing_mply_indices = mply_signs_proj.index
                 if osm_index_col not in possible_matches.columns and not possible_matches.empty: pass
                 elif not possible_matches.empty:
                     for mply_idx, group in possible_matches.groupby(possible_matches.index):
                          if mply_idx in matched_mply_indices or mply_idx not in mply_signs_proj.index: continue
                          mply_sign_tag = mply_signs_proj.loc[mply_idx, 'traffic_sign']
                          if not mply_sign_tag: continue
                          for osm_match_idx in group[osm_index_col].unique():
                               if osm_match_idx not in osm_signs_proj.index: continue
                               osm_sign_tag = osm_signs_proj.loc[osm_match_idx, 'traffic_sign']
                               if not osm_sign_tag:
                                   continue
                               if mply_sign_tag.lower() == osm_sign_tag.lower():
                                   matched_mply_indices.add(mply_idx); break
                     missing_mply_indices = mply_signs_proj.index.difference(matched_mply_indices)
                 missing_signs = mply_signs_proj.loc[missing_mply_indices].copy()
                 try:
                     combined_signs = pd.concat([osm_signs_proj, missing_signs], ignore_index=True, sort=False)
                     combined_signs = gpd.GeoDataFrame(combined_signs, geometry='geometry', crs=UTM_CRS)
                 except Exception as concat_err:
                     combined_signs = osm_signs_proj.copy() # Fallback
        elif not osm_signs_proj.empty:
            combined_signs = osm_signs_proj.copy()
            missing_signs = gpd.GeoDataFrame(crs=UTM_CRS)
        elif not mply_signs_proj.empty:
            combined_signs = mply_signs_proj.copy()
            missing_signs = mply_signs_proj.copy()
        else:
            combined_signs = gpd.GeoDataFrame(crs=UTM_CRS)
            missing_signs = gpd.GeoDataFrame(crs=UTM_CRS)


        # --- Save Conflation Results (Apply Tag Processing Before Save) ---
        cols_to_drop_save = ['index_mply', 'index_osm', 'index_right', 'index_left', 'geom_wkt_temp', 'element_type', 'osmid', 'nodes'] + [f'osm_tag_{i}_country' for i in range(1,6)] + [f'traffic_sign_{suffix}' for suffix in ['mply','osm','left','right']]

        # Save Combined Signs
        logging.info(f"Saving {len(combined_signs)} combined traffic signs to: {combined_traffic_signs_filepath}")
        if not combined_signs.empty:
            combined_signs_processed_tags, _ = convert_osm_tags_to_kv(combined_signs.copy())
            if isinstance(combined_signs_processed_tags, gpd.GeoDataFrame) and not combined_signs_processed_tags.empty:
                combined_signs_to_save = convert_lists_to_strings(combined_signs_processed_tags).drop(columns=cols_to_drop_save, errors='ignore')
                try: combined_signs_to_save.to_file(combined_traffic_signs_filepath, driver="GeoJSON")
                except Exception as save_comb_ts_err: logging.error(f"!!! Failed save combined TS: {save_comb_ts_err}")
            else: # Tag fail
                 empty_gdf = gpd.GeoDataFrame({'geometry': []}, geometry='geometry', crs=UTM_CRS)
                 schema = create_schema(empty_gdf)
                 try:
                     empty_gdf.to_file(combined_traffic_signs_filepath, driver="GeoJSON", schema=schema)
                 except Exception as e:
                     pass
        else: # Empty combined
             empty_gdf = gpd.GeoDataFrame({'geometry': []}, geometry='geometry', crs=UTM_CRS)
             schema = create_schema(empty_gdf)
             try:
                 empty_gdf.to_file(combined_traffic_signs_filepath, driver="GeoJSON", schema=schema)
             except Exception as e:
                 pass

        # --- FILTER and Save Missing Signs ---
        missing_signs_filtered = gpd.GeoDataFrame(crs=UTM_CRS); missing_signs_no_country_tag = gpd.GeoDataFrame(crs=UTM_CRS)
        if not missing_signs.empty:
            if 'country_tag' in missing_signs.columns:
                condition_not_na = missing_signs['country_tag'].notna()
                condition_not_empty_or_null_str = missing_signs['country_tag'].astype(str).str.strip().isin(['', 'nan', 'None', 'NULL']) == False
                rows_to_keep = condition_not_na & condition_not_empty_or_null_str; rows_to_discard = ~rows_to_keep
                missing_signs_filtered = missing_signs[rows_to_keep].copy(); missing_signs_no_country_tag = missing_signs[rows_to_discard].copy()
            else: missing_signs_no_country_tag = missing_signs.copy(); missing_signs_filtered = gpd.GeoDataFrame(crs=UTM_CRS)
        else: pass # Both remain empty

        # Save Filtered Missing Signs (WITH country tag)
        logging.info(f"Saving {len(missing_signs_filtered)} filtered missing traffic signs to: {missing_traffic_signs_filepath}")
        if not missing_signs_filtered.empty:
            missing_signs_processed_tags, _ = convert_osm_tags_to_kv(missing_signs_filtered.copy())
            if isinstance(missing_signs_processed_tags, gpd.GeoDataFrame) and not missing_signs_processed_tags.empty:
                missing_signs_to_save = convert_lists_to_strings(missing_signs_processed_tags).drop(columns=cols_to_drop_save, errors='ignore')
                try:
                    missing_signs_to_save.to_file(missing_traffic_signs_filepath, driver="GeoJSON")
                except Exception as save_miss_ts_err:
                    logging.error(f"!!! Failed save filtered missing TS: {save_miss_ts_err}")
            else: # Tag fail
                 empty_gdf = gpd.GeoDataFrame({'geometry': []}, geometry='geometry', crs=UTM_CRS)
                 schema = create_schema(empty_gdf)
                 try:
                     empty_gdf.to_file(missing_traffic_signs_filepath, driver="GeoJSON", schema=schema)
                 except Exception as e:
                     pass
        else: # Empty filtered
             empty_gdf = gpd.GeoDataFrame({'geometry': []}, geometry='geometry', crs=UTM_CRS)
             schema = create_schema(empty_gdf)
             try:
                 empty_gdf.to_file(missing_traffic_signs_filepath, driver="GeoJSON", schema=schema)
             except Exception as e:
                 pass

        # Save Missing Signs WITHOUT country tag
        logging.info(f"Saving {len(missing_signs_no_country_tag)} missing signs without country tag to: {missing_no_country_tag_filepath}")
        if not missing_signs_no_country_tag.empty:
            missing_no_ct_processed_tags, _ = convert_osm_tags_to_kv(missing_signs_no_country_tag.copy())
            if isinstance(missing_no_ct_processed_tags, gpd.GeoDataFrame) and not missing_no_ct_processed_tags.empty:
                missing_no_ct_to_save = convert_lists_to_strings(missing_no_ct_processed_tags).drop(columns=cols_to_drop_save, errors='ignore')
                try: missing_no_ct_to_save.to_file(missing_no_country_tag_filepath, driver="GeoJSON")
                except Exception as save_miss_no_ct_err: logging.error(f"!!! Failed save missing TS (no country tag): {save_miss_no_ct_err}")
            else: # Tag fail
                empty_gdf = gpd.GeoDataFrame({'geometry': []}, geometry='geometry', crs=UTM_CRS)
                schema = create_schema(empty_gdf)
                try:
                    empty_gdf.to_file(missing_no_country_tag_filepath, driver="GeoJSON", schema=schema)
                except Exception as e:
                    pass
        else: # Empty no country tag
            empty_gdf = gpd.GeoDataFrame({'geometry': []}, geometry='geometry', crs=UTM_CRS)
            schema = create_schema(empty_gdf)
            try:
                empty_gdf.to_file(missing_no_country_tag_filepath, driver="GeoJSON", schema=schema)
            except Exception as e:
                pass
        # --- End Conflation Save Section ---

        # Assign the FULL combined signs (tag processed) for counting
        if 'combined_signs_processed_tags' in locals() and isinstance(locals().get('combined_signs_processed_tags'), gpd.GeoDataFrame) and not locals().get('combined_signs_processed_tags').empty:
             combined_signs_for_counting = locals().get('combined_signs_processed_tags').copy()
             logging.debug("Using newly generated & tag-processed combined signs for counting.")
        elif isinstance(combined_signs, gpd.GeoDataFrame) and not combined_signs.empty:
             combined_signs_for_counting = combined_signs.copy() # Fallback if tag processing failed
             logging.warning("Using newly generated combined signs *before* tag processing for counting.")
        else:
             combined_signs_for_counting = gpd.GeoDataFrame(crs=UTM_CRS) # Ensure empty if generation failed

    # --- End File Generation Block ('if not signs_loaded:') ---


    # --- 2. Calculate Counts from Combined Traffic Signs ---
    logging.info("Calculating final traffic sign counts from combined data...")
    # Initialize count column safely
    if 'traffic_sign_count' not in gdf_edges.columns: gdf_edges['traffic_sign_count'] = 0
    else: gdf_edges['traffic_sign_count'] = gdf_edges['traffic_sign_count'].fillna(0)

    if not combined_signs_for_counting.empty and isinstance(combined_signs_for_counting, gpd.GeoDataFrame) and 'geometry' in combined_signs_for_counting.columns:
        combined_signs_points = combined_signs_for_counting[combined_signs_for_counting.geometry.geom_type == 'Point'].copy()
        if not combined_signs_points.empty:
             if combined_signs_points.crs != UTM_CRS: # Project points if needed
                 try:
                     combined_signs_points = combined_signs_points.to_crs(UTM_CRS)
                 except Exception as crs_err_pts:
                     combined_signs_points = gpd.GeoDataFrame()

             if not combined_signs_points.empty: # Check again after projection
                 centroids_comb = combined_signs_points.geometry
                 gdf_nearest_edges_comb = ox.distance.nearest_edges(G_projected, X=centroids_comb.x, Y=centroids_comb.y, return_dist=False)
                 final_sign_counts = []
                 if 'traffic_sign' not in combined_signs_points.columns: combined_signs_points['traffic_sign'] = ''
                 for idx in combined_signs_points.index:
                    sign_value = combined_signs_points.loc[idx, 'traffic_sign']
                    sign_count = 1 + (sign_value.count(';') if isinstance(sign_value, str) and sign_value else 0)
                    final_sign_counts.append(sign_count)
                 normalized_edges_comb = [(min(u, v), max(u, v)) for u, v, _ in gdf_nearest_edges_comb]
                 if len(final_sign_counts) == len(normalized_edges_comb):
                    final_intersections = pd.DataFrame({'u': [u for u,v in normalized_edges_comb], 'v': [v for u,v in normalized_edges_comb], 'final_sign_count': final_sign_counts})
                    if not final_intersections.empty:
                        final_counts_agg = final_intersections.groupby(['u', 'v'])['final_sign_count'].sum().reset_index(name="traffic_sign_count_new")
                        gdf_edges = gdf_edges.merge(final_counts_agg, on=['u', 'v'], how='left')
                        gdf_edges['traffic_sign_count'] = gdf_edges['traffic_sign_count_new'].fillna(0) # Overwrite
                        gdf_edges = gdf_edges.drop(columns=['traffic_sign_count_new']) # Drop temp
                        logging.info(f"Final combined traffic sign counts merged ({int(gdf_edges['traffic_sign_count'].sum())} total signs).")
                    else: gdf_edges['traffic_sign_count'] = 0; logging.info("No combined TS intersections.") # Overwrite
                 else: logging.warning(f"Length mismatch counts/edges. Counts not updated."); gdf_edges['traffic_sign_count'] = gdf_edges['traffic_sign_count'].fillna(0)
             else: logging.warning("Combined points empty after proj. Counts not updated."); gdf_edges['traffic_sign_count'] = gdf_edges['traffic_sign_count'].fillna(0)
        else: logging.info("No Point geoms in combined signs. Counts not updated."); gdf_edges['traffic_sign_count'] = gdf_edges['traffic_sign_count'].fillna(0)
    else: logging.warning("Combined signs data empty/invalid for counting. Counts not updated."); gdf_edges['traffic_sign_count'] = gdf_edges['traffic_sign_count'].fillna(0)
    # Final assurance: ensure column exists and is integer
    gdf_edges['traffic_sign_count'] = gdf_edges.get('traffic_sign_count', 0).fillna(0).astype(int)
    # --- End Revised Count Calculation ---


    # --- 3. Process 'highway' Counts (using current_city_combined) ---
    logging.info("Processing highway feature counts")
    highway_tags_to_process = way_tags.get('highway', [])
    processed_hwy_input = gpd.GeoDataFrame() # Initialize
    if isinstance(current_city_combined, gpd.GeoDataFrame) and not current_city_combined.empty:
        if current_city_combined.crs != UTM_CRS:
             try: processed_hwy_input = current_city_combined.to_crs(UTM_CRS)
             except Exception as crs_err_hwy: logging.error(f"Failed proj highway input: {crs_err_hwy}")
        else: processed_hwy_input = current_city_combined.copy()
    else: logging.warning("current_city_combined invalid/empty for highway counts.")

    if not processed_hwy_input.empty and 'highway' in processed_hwy_input.columns and highway_tags_to_process:
        # Use processed_hwy_input which is guaranteed to be in UTM_CRS if not empty
        for item_value in highway_tags_to_process:
            category_name = f"highway_{item_value}"; count_col_name = f"{category_name}_count"
            if count_col_name not in gdf_edges.columns: gdf_edges[count_col_name] = 0
            gdf_category = processed_hwy_input[processed_hwy_input['highway'] == item_value]
            if not gdf_category.empty:
                 gdf_category_buffered = gdf_category.copy()
                 if (gdf_category_buffered.geom_type == 'Point').any(): gdf_category_buffered.geometry = gdf_category_buffered.buffer(0.1) # Small buffer
                 intersections = gpd.sjoin(gdf_category_buffered, gdf_edges_proj, how="inner", predicate='intersects') # Use projected edges
                 if not intersections.empty:
                     counts = intersections.groupby(['u', 'v']).size().reset_index(name=f"{count_col_name}_new")
                     gdf_edges = gdf_edges.merge(counts, on=['u', 'v'], how='left')
                     gdf_edges[count_col_name] = gdf_edges[count_col_name].fillna(0) + gdf_edges[f"{count_col_name}_new"].fillna(0)
                     gdf_edges = gdf_edges.drop(columns=[f"{count_col_name}_new"])
                     gdf_edges[count_col_name] = gdf_edges[count_col_name].astype(int)
                     logging.debug(f"Highway counts for {category_name} added.")
                 else: gdf_edges[count_col_name] = gdf_edges[count_col_name].fillna(0).astype(int) # Ensure 0 if no intersections
            else: gdf_edges[count_col_name] = gdf_edges[count_col_name].fillna(0).astype(int) # Ensure 0 if no category data
    else:
         logging.warning("Skipping highway counts: Input data invalid/empty, 'highway' missing, or no tags defined.")
         for item_value in highway_tags_to_process: gdf_edges[f"highway_{item_value}_count"] = gdf_edges.get(f"highway_{item_value}_count", 0)


    # --- Final CRS Check ---
    if gdf_edges.crs != UTM_CRS:
        try: gdf_edges = gdf_edges.to_crs(UTM_CRS)
        except Exception as final_crs_err: logging.error(f"Failed final CRS conversion to UTM: {final_crs_err}")

    logging.info("Finished processing traffic signs (conflated) and highway counts.")
    return gdf_edges

In [24]:
def save_processed_network(gdf_edges, filepath):
    """Saves the processed network GeoDataFrame to a file after converting list-type columns to strings."""
    list_columns = [col for col in gdf_edges.columns if any(isinstance(val, list) for val in gdf_edges[col])]
    for col in list_columns:
        gdf_edges[col] = gdf_edges[col].apply(lambda x: ', '.join(map(str, x)) if isinstance(x, list) else x)
    gdf_edges.to_file(filepath, driver="GPKG")
    print(f"Processed network saved to '{filepath}'.")

In [25]:
def process_cycleways_and_busways(gdf_edges):
    """
    Processes cycleways and busways in the OSM data.
    """

    gdf_edges['cycle_lane'] = 'no'
    gdf_edges['shared_cycle'] = 'no'
    gdf_edges['bus_lane'] = 'no'

    for cycleway_col in ['cycleway', 'cycleway:left', 'cycleway:right', 'cycleway:both']:
        if cycleway_col in gdf_edges.columns:
            gdf_edges['cycle_lane'] = np.where(gdf_edges[cycleway_col].isin(['lane', 'opposite_lane']), 'yes', gdf_edges['cycle_lane'])
            gdf_edges['shared_cycle'] = np.where(gdf_edges[cycleway_col].isin(['shared_lane', 'share_busway', 'opposite_share_busway']), 'yes', gdf_edges['shared_cycle'])

    for busway_col in ['busway', 'busway:left', 'busway:right', 'busway:both', 'lanes:bus', 'lanes:psv']:
        if busway_col in gdf_edges.columns:
            # Force string conversion and stripping of whitespace
            gdf_edges[busway_col] = gdf_edges[busway_col].astype(str).str.strip()

            if pd.api.types.is_numeric_dtype(gdf_edges[busway_col]):
                gdf_edges['bus_lane'] = np.where(
                    (gdf_edges[busway_col].notna()) & (gdf_edges[busway_col] != '0'), # Compare as strings
                    'yes',
                    gdf_edges['bus_lane']
                )
            else:
                # Explicitly handle common string values, including 'nan'
                gdf_edges['bus_lane'] = np.where(
                    gdf_edges[busway_col].isin(['yes', 'designated', 'official', 'permissive', '1', 'lane']), # Expanded list
                    'yes',
                    gdf_edges['bus_lane']
                )


    gdf_edges['bus_lane'] = gdf_edges['bus_lane'].fillna('no') # Keep

    return gdf_edges

# Main Function

In [26]:
def main(user_inputs):
    global transport_agency_data_exists, transport_agency_osm_df # Declare globals

    # --- Initialize ---
    all_gdf_edges = [] # List to track which cities were successfully processed
    os.makedirs(RESULTS_DIR, exist_ok=True) # Ensure results dir exists
    city_processing_log_file = os.path.join(RESULTS_DIR, "city_processing_times.csv")
    city_times_list = []

    # --- Load Mappings ---
    mapillary_osm_mapping, country_mappings = load_mappings()

    # --- Load Existing Times Log ---
    processed_city_ids = set()
    if os.path.exists(city_processing_log_file):
        try:
            existing_times_df = pd.read_csv(city_processing_log_file, sep=';')
            city_times_list = existing_times_df.to_dict('records')
            processed_city_ids = set(existing_times_df['city_id'])
            logging.info(f"Loaded existing processing times for {len(processed_city_ids)} cities.")
        except Exception as e_load_times:
             logging.warning(f"Could not load existing city times log: {e_load_times}")
             processed_city_ids = set() # Ensure it's a set even if loading fails
    # ----------------------------------------

    overall_start_time = time.time()
    # data_dir is already defined globally as "data"

    logging.info("Starting processing...")

    if transport_agency_data_exists:
        logging.info("Preprocessing transport agency data...")
        agency_gdf_processed, _ = convert_osm_tags_to_kv(transport_agency_osm_df)
        if isinstance(agency_gdf_processed, gpd.GeoDataFrame):
                agency_gdf_processed['source'] = 'transit_agency'
                transport_agency_osm_df = agency_gdf_processed # Update the global variable
                logging.info("Transport agency data preprocessed.")
        else:
                logging.warning("Agency data preprocessing failed.")
                transport_agency_data_exists = False

    os.makedirs(DATA_DIR, exist_ok=True) # Ensure data dir exists

    for user_input in user_inputs:
        logging.info(f"\n=== Processing City: {user_input} ===")
        start_city_time = time.time()
        city_id = sanitize_filename(user_input)

        current_city_combined_list = []
        city_mapillary_signs_processed_list = []
        processed_mapillary_filters = set()

        if city_id in processed_city_ids:
            logging.info(f"City '{city_id}' skipped (already processed according to log).")
            continue

        country_code = get_country_code(user_input, country_mappings)
        if country_code is None:
            logging.warning(f"No country code for {city_id}. Skipping.")
            continue
        city_dir = os.path.join(DATA_DIR, city_id)
        create_city_directories(DATA_DIR, city_id)

        processed_network_filepath = os.path.join(city_dir, f"{city_id}_processed_network.gpkg")
        processed_network_loaded = False
        gdf_edges = None

        try: # Try loading existing network
                if not os.path.exists(processed_network_filepath):
                    raise FileNotFoundError("Network file not found.")
                gdf_edges = gpd.read_file(processed_network_filepath)
                if gdf_edges.empty:
                    raise ValueError("Loaded network file is empty.")
                new_cols = []; seen_cols = set() # Deduplicate columns
                for col in gdf_edges.columns:
                    current_col_name = col
                    if col in seen_cols:
                        i = 1
                        new_col_candidate = f"{col}_{i}"
                        while new_col_candidate in seen_cols or new_col_candidate in gdf_edges.columns or new_col_candidate in new_cols:
                            i += 1
                            new_col_candidate = f"{col}_{i}"
                        current_col_name = new_col_candidate
                    new_cols.append(current_col_name)
                    seen_cols.add(current_col_name)
                gdf_edges.columns = new_cols
                # Don't add to all_gdf_edges here, just confirm load
                processed_network_loaded = True
                logging.info(f"Loaded pre-existing network for {city_id}. Skipping generation.")
        except Exception as e_load:
            logging.info(f"Network load failed for {city_id}: {e_load}. Generating.")
            processed_network_loaded = False

        if not processed_network_loaded:
                try:
                    area_gdf, bbox = get_area_from_input(user_input, city_id)
                    if area_gdf is None or bbox is None:
                        raise ValueError("Area GDF or BBox could not be determined.")

                    # --- Filter Loop ---
                    for filter_type, filter_values in filters.items():
                        if filter_values:
                            for filter_value in filter_values:
                                    logging.info(f"--- Processing Filter: {filter_value} ---")
                                    filter_value_str = filter_value.replace("=", "_")
                                    combined_filepath = os.path.join(city_dir, "combined", f"combined_{city_id}_{filter_value_str}.geojson")
                                    missing_filepath = os.path.join(city_dir, "missing", f"missing_{city_id}_{filter_value_str}.geojson")
                                    osm_processed_filepath = os.path.join(DATA_DIR, city_id, "OSM", f"OSM_{city_id}_{filter_value_str}.geojson")
                                    osm_raw_filepath = os.path.join(DATA_DIR, city_id, "raw", f"OSM_raw_{city_id}_{filter_value_str}.geojson")
                                    mapillary_processed_filepath = os.path.join(DATA_DIR, city_id, "Mapillary", f"Mapillary_{city_id}_{filter_value_str}.geojson")
                                    mapillary_raw_filepath = os.path.join(DATA_DIR, city_id, "raw", f"Mapillary_raw_{city_id}_{filter_value_str}.geojson")

                                    combined = None; combined_loaded = False
                                    try: # Load combined check...
                                        combined = gpd.read_file(combined_filepath)
                                        new_cols = []; seen_cols = set(); # Dedupe...
                                        for col in combined.columns:
                                            current_col_name = col
                                            if col in seen_cols:
                                                i = 1
                                            new_col_candidate = f"{col}_{i}"
                                            while new_col_candidate in seen_cols or new_col_candidate in combined.columns or new_col_candidate in new_cols:
                                                i += 1; new_col_candidate = f"{col}_{i}"
                                                current_col_name = new_col_candidate
                                            new_cols.append(current_col_name); seen_cols.add(current_col_name)
                                        combined.columns = new_cols
                                        current_city_combined_list.append(combined.copy())
                                        combined_loaded = True; logging.info(f"Loaded existing combined: {combined_filepath}")
                                        if not os.path.exists(missing_filepath): # Ensure missing exists
                                            missing_schema = {'geometry': 'Point', 'properties': OrderedDict([('id', 'int')])}
                                            empty_missing = gpd.GeoDataFrame({'geometry': []}, geometry='geometry', crs=UTM_CRS)
                                            try:
                                                empty_missing.to_file(missing_filepath, driver="GeoJSON", schema=missing_schema)
                                            except Exception as e:
                                                pass
                                        continue
                                    except (FileNotFoundError, DriverError): logging.info(f"Combined not found: {combined_filepath}.")
                                    except Exception as e_lc: logging.error(f"Error loading combined: {e_lc}")

                                    if not combined_loaded:
                                        skip_download_filter = False; osm_data = gpd.GeoDataFrame(geometry=[], crs=UTM_CRS); filtered_mapillary_data = gpd.GeoDataFrame(geometry=[], crs=UTM_CRS)
                                        try: osm_data = gpd.read_file(osm_processed_filepath); filtered_mapillary_data = gpd.read_file(mapillary_processed_filepath); skip_download_filter = True; logging.info("Pre-loaded files.")
                                        except (FileNotFoundError, DriverError): logging.info("Pre-loading failed.")
                                        except Exception as e_pre: logging.error(f"Preload error: {e_pre}")

                                        if not skip_download_filter:
                                               if filter_type == 'osm':
                                                    osm_tags_to_download = {filter_value}
                                                    osm_data = load_data(osm_processed_filepath, osm_raw_filepath, download_osm_data, osm_tags_to_download, area_gdf.geometry[0])
                                                    # --- Save processed OSM data ---
                                                    if not osm_data.empty:
                                                         osm_data_to_save = convert_lists_to_strings(osm_data.copy())
                                                         if not osm_data_to_save.geometry.is_valid.all(): # Validation
                                                             try:
                                                                 orig_crs_osm = osm_data_to_save.crs
                                                                 osm_data_to_save.geometry = osm_data_to_save.geometry.buffer(0)
                                                                 osm_data_to_save = osm_data_to_save[~osm_data_to_save.geometry.is_empty]
                                                                 if osm_data_to_save.empty:
                                                                     logging.warning(f"All proc OSM geoms empty after buffer for {filter_value}.")
                                                                 elif osm_data_to_save.crs != orig_crs_osm:
                                                                     osm_data_to_save = osm_data_to_save.set_crs(orig_crs_osm, allow_override=True)
                                                             except Exception as buffer_err_osm:
                                                                 logging.error(f"Error OSM buffer: {buffer_err_osm}")
                                                                 osm_data_to_save = convert_lists_to_strings(osm_data.copy()) # Revert
                                                         if not osm_data_to_save.empty: # Save if not empty
                                                              try:
                                                                  osm_data_to_save.to_file(osm_processed_filepath, driver="GeoJSON")
                                                                  logging.info(f"Saved proc OSM: {osm_processed_filepath}")
                                                              except Exception as save_osm_err:
                                                                  logging.error(f"Fail save proc OSM: {save_osm_err}")
                                                                  osm_data = gpd.GeoDataFrame(geometry=[], crs=UTM_CRS)
                                                         else: # Save empty (buffer)
                                                              empty_gdf = gpd.GeoDataFrame({'geometry': []}, geometry='geometry', crs=UTM_CRS)
                                                              schema = create_schema(empty_gdf)
                                                              try:
                                                                  empty_gdf.to_file(osm_processed_filepath, driver="GeoJSON", schema=schema)
                                                              except Exception as e_seb:
                                                                  logging.error(f"Failed save empty proc OSM (buffer empty): {e_seb}")
                                                    else: # Save empty (download)
                                                         empty_gdf = gpd.GeoDataFrame({'geometry': []}, geometry='geometry', crs=UTM_CRS)
                                                         schema = create_schema(empty_gdf)
                                                         try:
                                                             empty_gdf.to_file(osm_processed_filepath, driver="GeoJSON", schema=schema)
                                                         except Exception as e_sed:
                                                             logging.error(f"Failed save empty proc OSM (download empty): {e_sed}")
                                                    # --- Determine Mapillary filters ---
                                                    mapillary_filter_values = []
                                                    if 'osm_tag_1' in mapillary_osm_mapping.columns and 'mapillary_feature' in mapillary_osm_mapping.columns:
                                                         mapillary_filter_values.extend(mapillary_osm_mapping[mapillary_osm_mapping['osm_tag_1'] == filter_value]['mapillary_feature'].dropna().tolist())
                                                    if country_code and country_code in country_mappings:
                                                         country_df_map = country_mappings[country_code]
                                                         if 'mapillary_feature' in country_df_map.columns:
                                                             for _, row in country_df_map.iterrows():
                                                                 for osm_col in OSM_TAG_COLS:
                                                                     if osm_col in row and pd.notna(row[osm_col]) and row[osm_col] == filter_value:
                                                                         mapillary_feature = str(row['mapillary_feature'])
                                                                         if pd.notna(mapillary_feature) and mapillary_feature != '':
                                                                             mapillary_filter_values.append(mapillary_feature)
                                                                         break
                                                    mapillary_filter_values = [str(x) for x in set(mapillary_filter_values) if x is not None and x != '']
                                                    processed_mapillary_filters.update(mapillary_filter_values)
                                                    # --- Load/Download/Process Mapillary ---
                                                    if mapillary_filter_values:
                                                         mapillary_data_gdf = load_data(mapillary_processed_filepath, mapillary_raw_filepath, download_mapillary_data, bbox, mapillary_filter_values, country_code, country_mappings, existed_at, existed_before)
                                                         filtered_mapillary_data = filter_mapillary_data(mapillary_data_gdf, area_gdf)
                                                         logging.info(f"Spatially filtered Mply {filter_value}: {len(filtered_mapillary_data)} features")
                                                         # <<< Block 1: Mply Tag Merge/Filter/Accumulate/Save >>>
                                                         if not filtered_mapillary_data.empty:
                                                              if 'mapillary_feature' not in filtered_mapillary_data.columns:
                                                                  if 'value' in filtered_mapillary_data.columns:
                                                                      filtered_mapillary_data['mapillary_feature'] = filtered_mapillary_data['value'].astype(str).str.strip()
                                                                  else:
                                                                      filtered_mapillary_data = gpd.GeoDataFrame(geometry=[], crs=UTM_CRS)
                                                              else:
                                                                  filtered_mapillary_data['mapillary_feature'] = filtered_mapillary_data['mapillary_feature'].astype(str).str.strip()
                                                              if not filtered_mapillary_data.empty: # Continue processing
                                                                  mapillary_osm_mapping_clean = mapillary_osm_mapping.copy()
                                                                  mapillary_osm_mapping_clean['mapillary_feature'] = mapillary_osm_mapping_clean['mapillary_feature'].astype(str).str.strip()
                                                                  global_mapping_cols = ['mapillary_feature'] + [col for col in OSM_TAG_COLS if col in mapillary_osm_mapping_clean.columns]
                                                                  filtered_mapillary_data = filtered_mapillary_data.merge( mapillary_osm_mapping_clean[global_mapping_cols].drop_duplicates(subset=['mapillary_feature']), on='mapillary_feature', how='left' )
                                                                  country_merge_happened = False
                                                                  if country_code and country_code in country_mappings:
                                                                      country_df = country_mappings[country_code].copy()
                                                                      country_df['mapillary_feature'] = country_df['mapillary_feature'].astype(str).str.strip()
                                                                      country_mapping_cols = ['mapillary_feature'] + [col for col in OSM_TAG_COLS if col in country_df.columns]
                                                                      if 'country_tag' in country_df.columns:
                                                                          country_mapping_cols.append('country_tag')
                                                                      filtered_mapillary_data = filtered_mapillary_data.merge( country_df[list(set(country_mapping_cols))].drop_duplicates(subset=['mapillary_feature']), on='mapillary_feature', how='left', suffixes=('', '_country') )
                                                                      country_merge_happened = True
                                                                  else:
                                                                      logging.warning(f"Block 1 - No country map for {country_code}")
                                                                  if country_merge_happened: # Conflict resolution
                                                                      for osm_col in OSM_TAG_COLS:
                                                                          suffixed_col = f"{osm_col}_country"
                                                                          if suffixed_col in filtered_mapillary_data.columns:
                                                                              filtered_mapillary_data[osm_col] = filtered_mapillary_data[osm_col].fillna(filtered_mapillary_data[suffixed_col])
                                                                              filtered_mapillary_data = filtered_mapillary_data.drop(columns=[suffixed_col], errors='ignore')
                                                                  filtered_mapillary_data = filtered_mapillary_data.loc[:, ~filtered_mapillary_data.columns.duplicated()] # Dedupe cols
                                                                  cols_to_keep = ['geometry', 'mapillary_feature'] + OSM_TAG_COLS + ['country_tag']
                                                                  cols_to_keep_actual = [c for c in cols_to_keep if c in filtered_mapillary_data.columns] # Select cols
                                                                  if not cols_to_keep_actual:
                                                                      filtered_mapillary_data = gpd.GeoDataFrame(geometry=[], crs=UTM_CRS)
                                                                  else: # Clean cols
                                                                       filtered_mapillary_data = filtered_mapillary_data[cols_to_keep_actual]
                                                                       tag_columns_present = [col for col in OSM_TAG_COLS if col in filtered_mapillary_data.columns] + (['country_tag'] if 'country_tag' in filtered_mapillary_data.columns else [])
                                                                       for col in tag_columns_present:
                                                                            if pd.api.types.is_object_dtype(filtered_mapillary_data[col]):
                                                                                filtered_mapillary_data[col] = filtered_mapillary_data[col].astype(str).replace('nan', '').fillna('')
                                                                            else:
                                                                                filtered_mapillary_data[col] = filtered_mapillary_data[col].fillna('')
                                                              # CRS Check/Enforce UTM
                                                              if not filtered_mapillary_data.empty and filtered_mapillary_data.crs != UTM_CRS:
                                                                  try:
                                                                      filtered_mapillary_data = filtered_mapillary_data.to_crs(UTM_CRS)
                                                                  except Exception as e_crs_b1:
                                                                      logging.error(f"Block 1 CRS Fail: {e_crs_b1}")
                                                                      filtered_mapillary_data = gpd.GeoDataFrame()
                                                              # --- country_tag Filter Block 1 REMOVED ---
                                                         # Accumulate for city
                                                         if not filtered_mapillary_data.empty:
                                                              if 'source' not in filtered_mapillary_data.columns:
                                                                  filtered_mapillary_data['source'] = 'Mapillary'
                                                              city_mapillary_signs_processed_list.append(filtered_mapillary_data.copy())
                                                         # Save processed Mply for this filter
                                                         if not filtered_mapillary_data.empty:
                                                             try:
                                                                 if 'geometry' in filtered_mapillary_data.columns and filtered_mapillary_data.geometry.name != 'geometry':
                                                                     filtered_mapillary_data = filtered_mapillary_data.set_geometry('geometry')
                                                                 filtered_mapillary_data.to_file(mapillary_processed_filepath, driver="GeoJSON")
                                                                 logging.info(f"Block 1 - Saved proc Mply: {mapillary_processed_filepath}")
                                                             except Exception as save_map_err:
                                                                 logging.error(f"Block 1 - Fail save proc Mply: {save_map_err}")
                                                                 empty_gdf = gpd.GeoDataFrame({'geometry': []}, geometry='geometry', crs=UTM_CRS)
                                                                 schema = create_schema(empty_gdf)
                                                                 try:
                                                                     empty_gdf.to_file(mapillary_processed_filepath, driver="GeoJSON", schema=schema)
                                                                 except Exception as e_fb_save:
                                                                     logging.error(f"Failed fallback save: {e_fb_save}")
                                                         else: # Save empty
                                                              empty_gdf = gpd.GeoDataFrame({'geometry': []}, geometry='geometry', crs=UTM_CRS)
                                                              schema = create_schema(empty_gdf)
                                                              try:
                                                                  empty_gdf.to_file(mapillary_processed_filepath, driver="GeoJSON", schema=schema)
                                                              except Exception as e_save_empty_mply1:
                                                                  logging.error(f"Failed save empty Mply (filter empty): {e_save_empty_mply1}")
                                                    else: # No Mply filters
                                                         filtered_mapillary_data = gpd.GeoDataFrame(geometry=[], crs=UTM_CRS)
                                                         empty_gdf = gpd.GeoDataFrame({'geometry': []}, geometry='geometry', crs=UTM_CRS)
                                                         schema = create_schema(empty_gdf)
                                                         try:
                                                             empty_gdf.to_file(mapillary_processed_filepath, driver="GeoJSON", schema=schema)
                                                         except Exception as e_save_empty_mply_nf:
                                                             logging.error(f"Failed save empty Mply (no filters): {e_save_empty_mply_nf}")
                                                    # --- End Mapillary ---

                                        # --- Conflation Stage (Main Combined/Missing files) ---
                                        start_time = time.time()
                                        df1 = gpd.GeoDataFrame(geometry=[], crs=UTM_CRS); df2 = gpd.GeoDataFrame(geometry=[], crs=UTM_CRS)
                                        if isinstance(osm_data, gpd.GeoDataFrame) and not osm_data.empty:
                                            try:
                                                df1 = osm_data.to_crs(UTM_CRS) if osm_data.crs != UTM_CRS else osm_data.copy()
                                            except Exception as e_p1:
                                                logging.error(f"Error projecting df1 for conflation: {e_p1}")
                                        if isinstance(filtered_mapillary_data, gpd.GeoDataFrame) and not filtered_mapillary_data.empty:
                                            try:
                                                df2 = filtered_mapillary_data.to_crs(UTM_CRS) if filtered_mapillary_data.crs != UTM_CRS else filtered_mapillary_data.copy()
                                            except Exception as e_p2:
                                                logging.error(f"Error projecting df2 for conflation: {e_p2}")

                                        combined = None; missing = gpd.GeoDataFrame(geometry=[], crs=UTM_CRS); overlap = gpd.GeoDataFrame(); properties_to_keep_conflation = ['mapillary_feature'] + OSM_TAG_COLS + ['country_tag', 'source']
                                        logging.info(f"----- Conflation Check (Main Combined/Missing) Filter: {filter_value} -----")
                                        logging.info(f"df1 (OSM) empty: {df1.empty}, df2 (Mply) empty: {df2.empty}")
                                        # Perform Conflation...
                                        if not df1.empty and not df2.empty:
                                            if transport_agency_data_exists:
                                                agency_df_proj = gpd.GeoDataFrame(crs=UTM_CRS)
                                                try:
                                                    if isinstance(transport_agency_osm_df, gpd.GeoDataFrame) and not transport_agency_osm_df.empty:
                                                        agency_df_proj = transport_agency_osm_df.to_crs(UTM_CRS) if transport_agency_osm_df.crs != UTM_CRS else transport_agency_osm_df.copy()
                                                except Exception as e_pa:
                                                    logging.error(f"Error projecting agency_df: {e_pa}")
                                                if not agency_df_proj.empty:
                                                    base_df, add1_df, add2_df = agency_df_proj.copy(), df1.copy(), df2.copy()
                                                    osm_mapper_flag = False
                                                    combined, _, _, _, overlap, missing = analyze_overlap_3dfs( base_df, add1_df, add2_df, properties_to_keep=properties_to_keep_conflation, osm_mapper_mode=osm_mapper_flag, buffer_distance=BUFFER_DISTANCE)
                                                else:
                                                    base_df, add_df = df1.copy(), df2.copy()
                                                    combined, overlap, missing = analyze_overlap_2dfs( base_df, add_df, properties_to_keep=properties_to_keep_conflation, buffer_distance=BUFFER_DISTANCE)
                                            else:
                                                base_df, add_df = df1.copy(), df2.copy()
                                                combined, overlap, missing = analyze_overlap_2dfs( base_df, add_df, properties_to_keep=properties_to_keep_conflation, buffer_distance=BUFFER_DISTANCE)
                                        elif not df1.empty:
                                            combined = df1.copy(); missing = gpd.GeoDataFrame(geometry=[], crs=UTM_CRS); overlap = gpd.GeoDataFrame(); combined['source'] = combined.get('source', 'OSM')
                                        elif not df2.empty:
                                            combined = df2.copy(); missing = df2.copy(); overlap = gpd.GeoDataFrame(); combined['source'] = combined.get('source', 'Mapillary'); missing['source'] = missing.get('source', 'Mapillary')
                                        else:
                                            combined = gpd.GeoDataFrame({'geometry': []}, geometry='geometry', crs=UTM_CRS); missing = gpd.GeoDataFrame({'geometry': []}, geometry='geometry', crs=UTM_CRS); overlap = gpd.GeoDataFrame()

                                        # --- Process and Save Conflation Results ---
                                        if combined is not None:
                                            # Convert/save combined...
                                            combined_gdf, _ = convert_osm_tags_to_kv(combined.copy())
                                            if isinstance(combined_gdf, gpd.GeoDataFrame) and not combined_gdf.empty:
                                                if combined_gdf.crs != UTM_CRS:
                                                    try:
                                                        combined_gdf = combined_gdf.to_crs(UTM_CRS)
                                                    except Exception:
                                                        combined_gdf = gpd.GeoDataFrame()
                                                if not combined_gdf.empty and not combined_gdf.geometry.is_valid.all(): # Validation...
                                                    try:
                                                        orig_crs = combined_gdf.crs
                                                        combined_gdf.geometry = combined_gdf.geometry.buffer(0)
                                                        combined_gdf = combined_gdf[~combined_gdf.geometry.is_empty]
                                                        if not combined_gdf.empty and combined_gdf.crs != orig_crs:
                                                            combined_gdf = combined_gdf.set_crs(orig_crs, allow_override=True)
                                                    except Exception: pass
                                                if not combined_gdf.empty: # Save if still valid
                                                    try:
                                                        combined_gdf_to_save = combined_gdf.drop(columns=['city_id'], errors='ignore')
                                                        combined_gdf_to_save.to_file(combined_filepath, driver="GeoJSON")
                                                    except Exception as e:
                                                        logging.error(f"Failed save combined: {e}")
                                                    current_city_combined_list.append(combined_gdf.copy())
                                                else: # Save empty combined
                                                    empty_gdf = gpd.GeoDataFrame({'geometry': []}, geometry='geometry', crs=UTM_CRS)
                                                    schema = create_schema(empty_gdf)
                                                    try:
                                                        empty_gdf.to_file(combined_filepath, driver="GeoJSON", schema=schema)
                                                    except Exception: pass
                                            else: # Save empty combined
                                                empty_gdf = gpd.GeoDataFrame({'geometry': []}, geometry='geometry', crs=UTM_CRS)
                                                schema = create_schema(empty_gdf)
                                                try:
                                                    empty_gdf.to_file(combined_filepath, driver="GeoJSON", schema=schema)
                                                except Exception: pass
                                            # Convert/save missing...
                                            missing_gdf, _ = convert_osm_tags_to_kv(missing.copy())
                                            if isinstance(missing_gdf, gpd.GeoDataFrame) and not missing_gdf.empty:
                                                missing_gdf = missing_gdf.dropna(axis=1, how='all')
                                                if missing_gdf.crs != UTM_CRS:
                                                    try:
                                                        missing_gdf = missing_gdf.to_crs(UTM_CRS)
                                                    except Exception:
                                                        missing_gdf = gpd.GeoDataFrame()
                                                if not missing_gdf.empty and not missing_gdf.geometry.is_valid.all(): # Validation...
                                                    try:
                                                        orig_crs = missing_gdf.crs
                                                        missing_gdf.geometry = missing_gdf.geometry.buffer(0)
                                                        missing_gdf = missing_gdf[~missing_gdf.geometry.is_empty]
                                                        if not missing_gdf.empty and missing_gdf.crs != orig_crs:
                                                            missing_gdf = missing_gdf.set_crs(orig_crs, allow_override=True)
                                                    except Exception: pass
                                                if not missing_gdf.empty: # Save if still valid
                                                    try:
                                                        missing_gdf_to_save = missing_gdf.drop(columns=['city_id'], errors='ignore')
                                                        missing_gdf_to_save.to_file(missing_filepath, driver="GeoJSON")
                                                    except Exception as e: logging.error(f"Failed save missing: {e}")
                                                else: # Save empty missing
                                                    empty_gdf = gpd.GeoDataFrame({'geometry': []}, geometry='geometry', crs=UTM_CRS)
                                                    schema = create_schema(empty_gdf)
                                                    try:
                                                        empty_gdf.to_file(missing_filepath, driver="GeoJSON", schema=schema)
                                                    except Exception: pass
                                            else: # Save empty missing
                                                empty_gdf = gpd.GeoDataFrame({'geometry': []}, geometry='geometry', crs=UTM_CRS)
                                                schema = create_schema(empty_gdf)
                                                try:
                                                    empty_gdf.to_file(missing_filepath, driver="GeoJSON", schema=schema)
                                                except Exception: pass
                                        else: logging.error(f"Combined df None for {filter_value}.")

                                    # --- End Filter Processing ('if not combined_loaded:') ---
                except Exception as e:
                    logging.error(f"A major error occurred during the data loading/generation phase for city '{city_id}': {e}", exc_info=True)
                    continue
            # --- End Filters Loop --- ('for filter_value in filter_values:')
        # --- End Filter Type Loop --- ('for filter_type, ...')

        # --- Download and Process Remaining Mapillary Traffic Signs ---
        if not processed_network_loaded:
            all_traffic_sign_mapillary_features = []
            processed_mapillary_filters_set = set(processed_mapillary_filters)
            for code, df_country in country_mappings.items():
                if 'mapillary_feature' in df_country.columns:
                    all_traffic_sign_mapillary_features.extend(df_country['mapillary_feature'].dropna().astype(str).str.strip().tolist())
            all_traffic_sign_mapillary_features = list(set(all_traffic_sign_mapillary_features) - processed_mapillary_filters_set)
            all_traffic_sign_mapillary_features = [x for x in all_traffic_sign_mapillary_features if x]

            mapillary_traffic_signs_raw_filepath = os.path.join(DATA_DIR, city_id, "raw", f"Mapillary_raw_{city_id}_traffic_signs.geojson")
            mapillary_traffic_signs_processed_filepath = os.path.join(DATA_DIR, city_id, "Mapillary", f"Mapillary_{city_id}_traffic_signs.geojson")

            if 'area_gdf' not in locals():
                try:
                    area_gdf, bbox = get_area_from_input(user_input, city_id)
                except Exception as e:
                    bbox = None
                    area_gdf = None
            if bbox and area_gdf is not None and all_traffic_sign_mapillary_features:
                logging.info(f"Processing {len(all_traffic_sign_mapillary_features)} remaining Mply traffic signs...")
                filtered_mapillary_traffic_signs = gpd.GeoDataFrame(geometry=[], crs=UTM_CRS)
                loaded_remaining = False
                try: # Try loading existing
                    filtered_mapillary_traffic_signs = gpd.read_file(mapillary_traffic_signs_processed_filepath)
                    loaded_remaining=True
                    logging.info(f"Loaded existing remaining signs.")
                    if filtered_mapillary_traffic_signs.crs != UTM_CRS:
                        filtered_mapillary_traffic_signs = filtered_mapillary_traffic_signs.to_crs(UTM_CRS)
                except: # Download if load fails
                    logging.info(f"Remaining TS file not found, downloading.")
                    mapillary_traffic_signs_gdf = load_data(mapillary_traffic_signs_processed_filepath, mapillary_traffic_signs_raw_filepath, download_mapillary_data, bbox, all_traffic_sign_mapillary_features, country_code, country_mappings, existed_at, existed_before )
                    filtered_mapillary_traffic_signs = filter_mapillary_data(mapillary_traffic_signs_gdf, area_gdf)
                    # Process ONLY if downloaded/filtered
                    # <<< START: Mapillary Tag Merging & Filtering Block 2 >>>
                    if not filtered_mapillary_traffic_signs.empty:
                            if 'mapillary_feature' not in filtered_mapillary_traffic_signs.columns:
                                if 'value' in filtered_mapillary_traffic_signs.columns:
                                    filtered_mapillary_traffic_signs['mapillary_feature'] = filtered_mapillary_traffic_signs['value'].astype(str).str.strip()
                                else:
                                    filtered_mapillary_traffic_signs = gpd.GeoDataFrame(geometry=[], crs=UTM_CRS)
                            else:
                                filtered_mapillary_traffic_signs['mapillary_feature'] = filtered_mapillary_traffic_signs['mapillary_feature'].astype(str).str.strip()
                            if not filtered_mapillary_traffic_signs.empty:
                                mapillary_osm_mapping_clean = mapillary_osm_mapping.copy()
                                mapillary_osm_mapping_clean['mapillary_feature'] = mapillary_osm_mapping_clean['mapillary_feature'].astype(str).str.strip()
                                global_mapping_cols = ['mapillary_feature'] + [col for col in OSM_TAG_COLS if col in mapillary_osm_mapping_clean.columns]
                                filtered_mapillary_traffic_signs = filtered_mapillary_traffic_signs.merge( mapillary_osm_mapping_clean[global_mapping_cols].drop_duplicates(subset=['mapillary_feature']), on='mapillary_feature', how='left' )
                                country_merge_happened_b2 = False
                                if country_code and country_code in country_mappings:
                                    country_df = country_mappings[country_code].copy()
                                    country_df['mapillary_feature'] = country_df['mapillary_feature'].astype(str).str.strip()
                                    country_mapping_cols = ['mapillary_feature'] + [col for col in OSM_TAG_COLS if col in country_df.columns]
                                    if 'country_tag' in country_df.columns:
                                        country_mapping_cols.append('country_tag')
                                    filtered_mapillary_traffic_signs = filtered_mapillary_traffic_signs.merge( country_df[list(set(country_mapping_cols))].drop_duplicates(subset=['mapillary_feature']), on='mapillary_feature', how='left', suffixes=('', '_country') )
                                    country_merge_happened_b2 = True
                                if country_merge_happened_b2:
                                    for osm_col in OSM_TAG_COLS:
                                        suffixed_col = f"{osm_col}_country"
                                        if suffixed_col in filtered_mapillary_traffic_signs.columns:
                                            filtered_mapillary_traffic_signs[osm_col] = filtered_mapillary_traffic_signs[osm_col].fillna(filtered_mapillary_traffic_signs[suffixed_col])
                                            filtered_mapillary_traffic_signs = filtered_mapillary_traffic_signs.drop(columns=[suffixed_col], errors='ignore')
                                filtered_mapillary_traffic_signs = filtered_mapillary_traffic_signs.loc[:, ~filtered_mapillary_traffic_signs.columns.duplicated()]
                                cols_to_keep = ['geometry', 'mapillary_feature'] + OSM_TAG_COLS + ['country_tag']
                                cols_to_keep_actual = [c for c in cols_to_keep if c in filtered_mapillary_traffic_signs.columns]
                                if not cols_to_keep_actual:
                                    filtered_mapillary_traffic_signs = gpd.GeoDataFrame(geometry=[], crs=UTM_CRS)
                                else:
                                    filtered_mapillary_traffic_signs = filtered_mapillary_traffic_signs[cols_to_keep_actual]
                                    tag_columns_present = [col for col in OSM_TAG_COLS if col in filtered_mapillary_traffic_signs.columns] + (['country_tag'] if 'country_tag' in filtered_mapillary_traffic_signs.columns else [])
                                    for col in tag_columns_present:
                                        if pd.api.types.is_object_dtype(filtered_mapillary_traffic_signs[col]):
                                            filtered_mapillary_traffic_signs[col] = filtered_mapillary_traffic_signs[col].astype(str).replace('nan', '').fillna('')
                                        else:
                                            filtered_mapillary_traffic_signs[col] = filtered_mapillary_traffic_signs[col].fillna('')
                            # CRS Check/Enforce UTM
                            if not filtered_mapillary_traffic_signs.empty and filtered_mapillary_traffic_signs.crs != UTM_CRS:
                                try:
                                    filtered_mapillary_traffic_signs = filtered_mapillary_traffic_signs.to_crs(UTM_CRS)
                                except Exception as crs_err_b2:
                                    logging.error(f"Block 2 - Failed re-projection: {crs_err_b2}")
                                    filtered_mapillary_traffic_signs = gpd.GeoDataFrame()
                    # <<< END: Mapillary Tag Merging & Filtering Block 2 >>>

                # --- Accumulate remaining Mapillary data ---
                if not filtered_mapillary_traffic_signs.empty:
                        if 'source' not in filtered_mapillary_traffic_signs.columns:
                            filtered_mapillary_traffic_signs['source'] = 'Mapillary'
                        if filtered_mapillary_traffic_signs.crs != UTM_CRS:
                            try:
                                filtered_mapillary_traffic_signs = filtered_mapillary_traffic_signs.to_crs(UTM_CRS)
                            except Exception:
                                pass # If conversion fails, log below
                        if filtered_mapillary_traffic_signs.crs == UTM_CRS:
                            city_mapillary_signs_processed_list.append(filtered_mapillary_traffic_signs.copy())
                        else:
                            logging.warning("Skipping accumulation remaining signs: CRS issue.")

                # Save remaining processed ONLY IF GENERATED this run
                if not loaded_remaining:
                        if not filtered_mapillary_traffic_signs.empty:
                            try:
                                if 'geometry' in filtered_mapillary_traffic_signs.columns and filtered_mapillary_traffic_signs.geometry.name != 'geometry':
                                    filtered_mapillary_traffic_signs = filtered_mapillary_traffic_signs.set_geometry('geometry')
                                filtered_mapillary_traffic_signs_to_save = filtered_mapillary_traffic_signs.drop(columns=['city_id'], errors='ignore')
                                filtered_mapillary_traffic_signs_to_save.to_file(mapillary_traffic_signs_processed_filepath, driver="GeoJSON")
                                logging.info(f"Block 2 - Saved processed remaining signs: {mapillary_traffic_signs_processed_filepath} ({len(filtered_mapillary_traffic_signs_to_save)} features).")
                            except Exception as save_map_rem_err:
                                logging.error(f"Block 2 - Failed save remaining Mply: {save_map_rem_err}")
                                # Save empty fallback...
                                empty_gdf = gpd.GeoDataFrame({'geometry': []}, geometry='geometry', crs=UTM_CRS)
                                schema = create_schema(empty_gdf)
                                try:
                                    empty_gdf.to_file(mapillary_traffic_signs_processed_filepath, driver="GeoJSON", schema=schema)
                                except Exception as e_fb_save_rem:
                                    logging.error(f"Failed fallback save remaining Mply: {e_fb_save_rem}")
                        else: # Save empty
                            empty_gdf = gpd.GeoDataFrame({'geometry': []}, geometry='geometry', crs=UTM_CRS)
                            schema = create_schema(empty_gdf)
                            try:
                                empty_gdf.to_file(mapillary_traffic_signs_processed_filepath, driver="GeoJSON", schema=schema)
                            except Exception as e_save_empty_rem:
                                logging.error(f"Failed save empty remaining Mply: {e_save_empty_rem}")
            else: # Skip remaining signs
                logging.warning("Skipping remaining traffic signs: issue with bbox/area/features.")
                if not os.path.exists(mapillary_traffic_signs_processed_filepath): # Ensure empty file exists
                        empty_gdf = gpd.GeoDataFrame({'geometry': []}, geometry='geometry', crs=UTM_CRS)
                        schema = create_schema(empty_gdf)
                        try:
                            empty_gdf.to_file(mapillary_traffic_signs_processed_filepath, driver="GeoJSON", schema=schema)
                        except Exception as e_save_empty_rem_skip:
                            logging.error(f"Failed save empty remaining signs (skipped): {e_save_empty_rem_skip}")

        # --- Consolidate accumulated Mapillary signs for the city ---
        all_city_mapillary_signs_processed = gpd.GeoDataFrame(crs=UTM_CRS)
        if city_mapillary_signs_processed_list:
            try:
                valid_parts = [gdf for gdf in city_mapillary_signs_processed_list if isinstance(gdf, gpd.GeoDataFrame) and 'geometry' in gdf.columns and not gdf.empty]
                if valid_parts:
                    all_city_mapillary_signs_processed = pd.concat(valid_parts, ignore_index=True, sort=False)
                    if not isinstance(all_city_mapillary_signs_processed, gpd.GeoDataFrame):
                        all_city_mapillary_signs_processed = gpd.GeoDataFrame(all_city_mapillary_signs_processed, geometry='geometry', crs=UTM_CRS)
                    elif all_city_mapillary_signs_processed.crs != UTM_CRS:
                        all_city_mapillary_signs_processed = all_city_mapillary_signs_processed.to_crs(UTM_CRS)
                    # Initial Deduplication...
                    if not all_city_mapillary_signs_processed.empty and 'mapillary_feature' in all_city_mapillary_signs_processed.columns and 'geometry' in all_city_mapillary_signs_processed.columns:
                        all_city_mapillary_signs_processed['geom_wkt_temp'] = all_city_mapillary_signs_processed.geometry.to_wkt()
                        initial_count = len(all_city_mapillary_signs_processed)
                        all_city_mapillary_signs_processed.drop_duplicates(subset=['mapillary_feature', 'geom_wkt_temp'], keep='first', inplace=True)
                        all_city_mapillary_signs_processed.drop(columns=['geom_wkt_temp'], inplace=True)
                        if initial_count > len(all_city_mapillary_signs_processed):
                            logging.info(f"Removed {initial_count - len(all_city_mapillary_signs_processed)} duplicate Mply signs.")
                    # Spatial Deduplication...
                    if not all_city_mapillary_signs_processed.empty and 'traffic_sign' in all_city_mapillary_signs_processed.columns:
                        signs_to_dedupe = all_city_mapillary_signs_processed.copy()
                        if signs_to_dedupe.crs != UTM_CRS:
                            try:
                                signs_to_dedupe = signs_to_dedupe.to_crs(UTM_CRS)
                            except Exception:
                                signs_to_dedupe = gpd.GeoDataFrame() # Make empty if projection fails
                        if not signs_to_dedupe.empty:
                                signs_buffered = signs_to_dedupe.copy()
                                signs_buffered['geometry'] = signs_buffered.geometry.buffer(BUFFER_DISTANCE)
                                nearby_pairs = gpd.sjoin(signs_to_dedupe, signs_buffered, how='inner', predicate='within', lsuffix='left', rsuffix='right')
                                nearby_pairs = nearby_pairs[nearby_pairs.index != nearby_pairs['index_right']]
                                nearby_pairs['traffic_sign_left'] = nearby_pairs.index.map(signs_to_dedupe['traffic_sign'])
                                nearby_pairs['traffic_sign_right'] = nearby_pairs['index_right'].map(signs_to_dedupe['traffic_sign'])
                                identical_nearby = nearby_pairs[(nearby_pairs['traffic_sign_left'] == nearby_pairs['traffic_sign_right']) & (nearby_pairs['traffic_sign_left'] != '')]
                                indices_to_drop = set()
                                if not identical_nearby.empty:
                                    indices_higher = identical_nearby.apply(lambda row: max(row.name, row['index_right']), axis=1)
                                    indices_to_drop.update(indices_higher.tolist())
                                if indices_to_drop:
                                    count_before = len(all_city_mapillary_signs_processed)
                                    all_city_mapillary_signs_processed.drop(index=list(indices_to_drop), inplace=True)
                                    logging.info(f"Removed {count_before - len(all_city_mapillary_signs_processed)} nearby identical Mply signs.")
                else:
                    logging.warning("No valid GDFs to consolidate Mply signs.")
            except Exception as concat_err:
                logging.error(f"Error consolidating/dedup Mply signs: {concat_err}")
                all_city_mapillary_signs_processed = gpd.GeoDataFrame(crs=UTM_CRS) # Reset
        logging.info(f"Consolidated {len(all_city_mapillary_signs_processed)} unique Mply signs for {city_id}.")

        # --- Network Processing ---
        logging.info("Starting network processing...")
        current_city_combined = gpd.GeoDataFrame(crs=UTM_CRS)
        if 'current_city_combined_list' in locals() and current_city_combined_list:
            valid_parts = [gdf for gdf in current_city_combined_list if isinstance(gdf, gpd.GeoDataFrame) and not gdf.empty]
            if valid_parts:
                try:
                    current_city_combined = pd.concat(valid_parts, ignore_index=True, sort=False)
                    if not isinstance(current_city_combined, gpd.GeoDataFrame):
                        current_city_combined = gpd.GeoDataFrame(current_city_combined, geometry='geometry', crs=UTM_CRS)
                    if current_city_combined.crs != UTM_CRS:
                        current_city_combined = current_city_combined.to_crs(UTM_CRS)
                except Exception as e_concat_city:
                    logging.error(f"Error concatenating combined data for {city_id}: {e_concat_city}")
                    current_city_combined = gpd.GeoDataFrame(crs=UTM_CRS) # Ensure it's empty on failure
        else:
             if not processed_network_loaded: # Only warn if we were actually generating the network
                logging.warning(f"No combined data was loaded or generated for {city_id}. Way tag processing will be limited.")

        try: # Process network
            if 'boundary_polygon' not in locals(): boundary_polygon = area_gdf.geometry.iloc[0]
            if boundary_polygon is None: raise ValueError("Boundary polygon missing for graph download")

            gdf_nodes_proj, gdf_edges, G_projected = download_and_process_osm_graph(boundary_polygon) # Get projected graph
            if G_projected is None or gdf_edges.empty: raise ValueError("Graph download/projection failed")

            # --- Call processing functions ---
            gdf_edges = process_cycleways_and_busways(gdf_edges)
            if gdf_edges.crs != UTM_CRS: gdf_edges = gdf_edges.to_crs(UTM_CRS) # Ensure edges are UTM

            if isinstance(current_city_combined, gpd.GeoDataFrame) and not current_city_combined.empty:
                gdf_edges = process_way_tags(gdf_edges, current_city_combined) # Crossings
            else:
                logging.warning("Cannot process way tags (crossings); input data invalid/empty.")
                for item_value_cross in way_tags.get('crossing', []): gdf_edges[f"crossing_{item_value_cross}_count"] = 0
            gdf_edges = process_traffic_signs(gdf_edges, user_input, current_city_combined, all_city_mapillary_signs_processed, city_id, DATA_DIR, G_projected, boundary_polygon)

            # --- Final Column Selection and Cleanup ---
            final_columns_to_keep = [
                'geometry', 'highway', 'length', 'lanes', 'maxspeed', 'surface',
                'cycle_lane', 'shared_cycle', 'bus_lane',
                'crossing_uncontrolled_count', 'crossing_unmarked_count', 'crossing_traffic_signals_count',
                'highway_traffic_signals_count', 'highway_stop_count', 'highway_give_way_count',
                'traffic_sign_count'
            ] # REMOVED city_id and all internal OSMnx IDs
            actual_columns = [col for col in final_columns_to_keep if col in gdf_edges.columns]
            gdf_edges = gdf_edges[actual_columns].copy()
            if 'surface' in gdf_edges.columns: gdf_edges['surface'] = gdf_edges['surface'].fillna('unknown')
            if 'maxspeed' in gdf_edges.columns: gdf_edges['maxspeed'] = gdf_edges['maxspeed'].fillna('')
            if 'lanes' in gdf_edges.columns: gdf_edges['lanes'] = gdf_edges['lanes'].fillna('').astype(str).replace(['nan', 'None', 'NULL'], '', regex=False)
            count_cols_final = [col for col in gdf_edges.columns if '_count' in col]
            for ccol in count_cols_final:
                if ccol in gdf_edges.columns:
                    gdf_edges[ccol] = gdf_edges[ccol].fillna(0)
                    try:
                        gdf_edges[ccol] = gdf_edges[ccol].astype(np.int64)
                    except Exception:
                        pass
            gdf_edges = convert_lists_to_strings(gdf_edges)
            # --- End Final Column Selection ---

            # Save network
            try:
                logging.debug(f"Columns JUST BEFORE saving network GPKG: {gdf_edges.columns.tolist()}")
                gdf_edges.to_file(processed_network_filepath, driver="GPKG")
                logging.info(f"Saved processed network: {processed_network_filepath}")
            except Exception as save_net_err:
                logging.error(f"!!! Failed to save processed network: {save_net_err}", exc_info=True)

        except Exception as net_proc_err:
            logging.error(f"Error during network processing steps for {city_id}: {net_proc_err}", exc_info=True)

        # --- Calculate and Log City Processing Time ---
        # This runs whether network was loaded or generated (if successful)
        if processed_network_loaded or ('gdf_edges' in locals() and gdf_edges is not None and not gdf_edges.empty):
            end_city_time = time.time(); elapsed_city_time = end_city_time - start_city_time
            logging.info(f"--- Finished city {city_id} in {elapsed_city_time:.2f} sec ---")
            city_times_list.append({'city_id': city_id, 'user_input': user_input, 'processing_time_seconds': round(elapsed_city_time, 2)})
            processed_city_ids.add(city_id) # Add to processed set
            try: pd.DataFrame(city_times_list).to_csv(city_processing_log_file, sep=';', index=False); logging.debug(f"Saved city times.")
            except Exception as e_save_times: logging.warning(f"Could not save city times: {e_save_times}")
        else:
            logging.warning(f"Skipping time log for {city_id}: processing did not complete successfully.")
        # ---------------------------------------------

    # --- Log Overall Time ---
    overall_end_time = time.time()
    logging.info(f"\n=== Total Script Execution Time: {(overall_end_time - overall_start_time):.2f} seconds ===")

    # --- Final Save of City Times ---
    try:
         pd.DataFrame(city_times_list).to_csv(city_processing_log_file, sep=';', index=False)
         logging.info(f"Final city processing times saved to {city_processing_log_file}")
    except Exception as e_save_times_final:
         logging.warning(f"Could not save final city processing times: {e_save_times_final}")

In [27]:
# Create optional filters
filters = {
    'mapillary': None,
    'osm': ['highway=stop', 'highway=traffic_signals', 'highway=give_way', 'crossing=uncontrolled', 'crossing=traffic_signals', 'crossing=unmarked'],
    'country_tag': None,
    'agency_id': None,
}

In [28]:
# Example usage
if __name__ == "__main__":
    user_inputs = [
    "Nuku'alofa, Tonga",
    "Apia, Samoa",
    "Honiara, Solomon Islands",
    "Reykjavik, Iceland",
    "Suva, Fiji",
    "National Capital District, Papua New Guinea",
    "Wellington City, New Zealand",
    "Kigali, Rwanda",
    "Oslo, Norway",
    "Zurich, Switzerland",
    "San José, Costa Rica",
    "Auckland, New Zealand",
    "Sofia, Bulgaria",
    "Belgrade, Serbia",
    "Montevideo, Uruguay",
    "Calgary, Canada",
    "Distrito de Panamá, Panama",
    "Vienna, Austria",
    "Dublin, Ireland",
    "Perth, Australia",
    "Prague, Czech Republic",
    "Bucharest, Romania",
    "Brussels, Belgium",
    "La Paz, Bolivia",
    "Baku, Azerbaijan",
    "Doha, Qatar",
    "Beirut, Lebanon",
    "Amsterdam, Netherlands",
    "Brisbane, Australia",
    "Vancouver, Canada",
    "Ciudad de Túnez, Tunisia",
    "Caracas, Venezuela",
    "Budapest, Hungary",
    "Tashkent, Uzbekistan",
    "Kyiv, Ukraine",
    "Warsaw, Poland",
    "Quito, Ecuador",
    "Dakar, Senegal",
    "Lusaka, Zambia",
    "Dubai, United Arab Emirates",
    "Algiers, Algeria",
    "Casablanca, Morocco",
    "Kampala, Uganda",
    "Medellín, Colombia",
    "Berlin, Germany",
    "Rome, Italy",
    "Montreal, Canada",
    "Amman, Jordan",
    "San Francisco, United States",
    "City of Cape Town, South Africa",
    "Boston, United States",
    "Accra, Ghana",
    "Monterrey, Mexico",
    "Nairobi, Kenya",
    "Melbourne, Australia",
    "Guadalajara, Mexico",
    "Sydney, Australia",
    "Alexandria, Egypt",
    "Colombo, Sri Lanka",
    "Yangon, Myanmar",
    "Singapore, Singapore",
    "Miami, United States",
    "Toronto, Canada",
    "Madrid, Spain",
    "Santiago, Chile",
    "Houston, United States",
    "Riyadh Governorate, Saudi Arabia",
    "Baghdad, Iraq",
    "Dar es Salaam, Tanzania",
    "Kuala Lumpur, Malaysia",
    "Chicago, United States",
    "Tehran, Iran",
    "Bangkok, Thailand",
    "Provincia de Lima, Peru",
    "Paris, France",
    "Bogotá, Colombia",
    "Chennai, India",
    "Ho Chi Minh City, Vietnam",
    "Manila, Philippines",
    "London, United Kingdom",
    "Buenos Aires, Argentina",
    "Istanbul, Turkey",
    "Lagos, Nigeria",
    "Karachi, Pakistan",
    "Moscow, Russia",
    "Los Angeles, United States",
    "New York, United States",
    "Mumbai, India",
    "Beijing, China",
    "Mexico City, Mexico",
    "Cairo, Egypt",
    "Município de São Paulo, Brazil",
    "Dhaka, Bangladesh",
    "Shanghai, China",
    "Delhi, India",
    "Jakarta, Indonesia",
    "Tokyo, Japan",
    "Coimbra, Portugal",
    "Fortaleza, Brazil",
    "Hamburg, Germany",
]
    main(user_inputs)

INFO:root:Loaded existing processing times for 100 cities.
INFO:root:Starting processing...
INFO:root:
=== Processing City: Nuku'alofa, Tonga ===
INFO:root:City 'nukualofa' skipped (already processed according to log).
INFO:root:
=== Processing City: Apia, Samoa ===
INFO:root:City 'apia' skipped (already processed according to log).
INFO:root:
=== Processing City: Honiara, Solomon Islands ===
INFO:root:City 'honiara' skipped (already processed according to log).
INFO:root:
=== Processing City: Reykjavik, Iceland ===
INFO:root:City 'reykjavik' skipped (already processed according to log).
INFO:root:
=== Processing City: Suva, Fiji ===
INFO:root:City 'suva' skipped (already processed according to log).
INFO:root:
=== Processing City: National Capital District, Papua New Guinea ===
INFO:root:City 'national_capital_district' skipped (already processed according to log).
INFO:root:
=== Processing City: Wellington City, New Zealand ===
INFO:root:City 'wellington_city' skipped (already proces

In [29]:
# After the main function has successfully created all individual city networks,
# this code will merge them into a single file.

print("\n=== Starting Final Aggregation of All Processed Networks ===")

# Set this to True to also save a CSV file of the final network
SAVE_AS_CSV = True

# Find all the processed network files using the correct DATA_DIR path
processed_network_files = glob.glob(os.path.join(DATA_DIR, "*", "*_processed_network.gpkg"))

if not processed_network_files:
    print(f"No processed network files found in '{DATA_DIR}'. Please check the path and ensure the main script ran successfully.")
else:
    print(f"Found {len(processed_network_files)} network files to merge from '{DATA_DIR}'.")
    
    # List to hold the individual GeoDataFrames
    all_networks_list = []
    
    for f in processed_network_files:
        try:
            # Extract city_id from the file path to add as a column
            city_id = os.path.basename(os.path.dirname(f))
            print(f"Reading: {city_id} ({os.path.basename(f)})")
            gdf = gpd.read_file(f)
            gdf['city_id'] = city_id # Add a column to identify the city
            all_networks_list.append(gdf)
        except Exception as e:
            print(f"Could not read or process file {f}. Error: {e}")

    if all_networks_list:
        # Concatenate all GeoDataFrames in the list
        final_merged_network = pd.concat(all_networks_list, ignore_index=True)
        
        output_path_gpkg = os.path.join(RESULTS_DIR, "full_network.gpkg")
        output_path_csv = os.path.join(RESULTS_DIR, "full_network.csv")
        
        # --- Save the primary GeoPackage (.gpkg) file ---
        try:
            final_merged_network.to_file(output_path_gpkg, driver="GPKG")
            print(f"\nSuccessfully saved the final merged network to: {output_path_gpkg}")
            print(f"Total Edges in Merged Network: {len(final_merged_network)}")
        except Exception as e:
            print(f"\nFailed to save the final merged GPKG network. Error: {e}")

        # --- Optionally, save the network as a CSV file ---
        if SAVE_AS_CSV:
            print(f"Now saving as CSV to: {output_path_csv}")
            try:
                final_merged_network.to_csv(output_path_csv, index=False, sep=';')
                print(f"Successfully saved CSV version of the network.")
            except Exception as e:
                print(f"Failed to save the CSV network. Error: {e}")
        
    else:
        print("No networks were successfully read, so no final file was created.")


=== Starting Final Aggregation of All Processed Networks ===
Found 100 network files to merge from '/Users/Matheus/Library/CloudStorage/OneDrive-Pessoal/Estudos/2 - Pesquisas/3 - CITTA/1 - RAMCCAV/3 - Assessment of Physical Road Infrastructure/Codigos/data'.
Reading: zurich (zurich_processed_network.gpkg)
Reading: calgary (calgary_processed_network.gpkg)
Reading: new_york (new_york_processed_network.gpkg)
Reading: paris (paris_processed_network.gpkg)
Reading: dakar (dakar_processed_network.gpkg)
Reading: mumbai (mumbai_processed_network.gpkg)
Reading: riyadh_governorate (riyadh_governorate_processed_network.gpkg)
Reading: karachi (karachi_processed_network.gpkg)
Reading: santiago (santiago_processed_network.gpkg)
Reading: ciudad_de_tunez (ciudad_de_tunez_processed_network.gpkg)
Reading: melbourne (melbourne_processed_network.gpkg)
Reading: beijing (beijing_processed_network.gpkg)
Reading: moscow (moscow_processed_network.gpkg)
Reading: municipio_de_sao_paulo (municipio_de_sao_paulo_pr