In [1]:
import pandas as pd
import numpy as np
from pyproj import Transformer
import osmnx as ox
import geopandas as gpd
from typing import Dict, Optional,List, Tuple

from shapely.geometry import Point, Polygon, MultiPolygon
from typing import Union, Optional



In [2]:

def get_osm_bbox(
    df: pd.DataFrame,
    from_epsg: int = 32632,
    buffer_percent: float = 0.01,
    transform_coords: bool = True
) -> Dict[str, float]:
    """
    Calculate a bounding box (bbox) for OpenStreetMap (OSM) queries from a DataFrame containing SMS trips'
    latitude and longitude coordinates, with an optional buffer zone.

    The function processes departure and arrival coordinates to find the minimum and maximum
    extents, adds a buffer zone, and returns the bbox in a format suitable for OSM queries.
    The coordinates are expected to be in the specified EPSG coordinate system.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame containing the following columns:
        - departure_longitude: Longitude coordinates of departure points
        - departure_latitude: Latitude coordinates of departure points
        - arrival_longitude: Longitude coordinates of arrival points
        - arrival_latitude: Latitude coordinates of arrival points
        All coordinates should be in the specified input coordinate system (from_epsg)

    from_epsg : int, optional
        EPSG code of the input coordinate system (default: 32632 for UTM zone 32N)
        Common values:
        - 32632: UTM zone 32N
        - 4326: WGS84 (standard GPS coordinates)

    buffer_percent : float, optional
        Percentage of the coordinate range to add as buffer (default: 0.01 for 1%)
        Must be between 0 and 1

    transform_coords : bool, optional
        Whether to transform coordinates from input EPSG to WGS84 (default: True)
        Set to False if coordinates are already in WGS84

    Returns
    -------
    dict
        Dictionary containing the bounding box coordinates:
        - min_lat: Minimum latitude (southern boundary)
        - max_lat: Maximum latitude (northern boundary)
        - min_lon: Minimum longitude (western boundary)
        - max_lon: Maximum longitude (eastern boundary)

    Raises
    ------
    ValueError
        If required columns are missing from DataFrame
        If buffer_percent is not between 0 and 1
        If coordinates are outside valid ranges
        If DataFrame is empty

    Example
    -------
    >>> df = pd.DataFrame({
    ...     'departure_longitude': [10.0, 10.5],
    ...     'departure_latitude': [50.0, 50.5],
    ...     'arrival_longitude': [11.0, 11.5],
    ...     'arrival_latitude': [51.0, 51.5]
    ... })
    >>> bbox = get_osm_bbox(df, from_epsg=4326)
    >>> print(bbox)
    {
        'min_lat': 49.95,
        'max_lat': 51.55,
        'min_lon': 9.95,
        'max_lon': 11.55
    }
    """
    # Validate input parameters
    if not isinstance(df, pd.DataFrame):
        raise ValueError("Input must be a pandas DataFrame")
    
    if df.empty:
        raise ValueError("DataFrame is empty")

    required_columns = [
        'departure_longitude', 'departure_latitude',
        'arrival_longitude', 'arrival_latitude'
    ]
    
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise ValueError(f"Missing required columns: {missing_columns}")
    
    if not 0 <= buffer_percent <= 1:
        raise ValueError("buffer_percent must be between 0 and 1")

    # Create transformer if needed
    if transform_coords:
        transformer = Transformer.from_crs(f"EPSG:{from_epsg}", "EPSG:4326", always_xy=True)
    
    # Get all x and y coordinates
    x_coords = np.concatenate([
        df['departure_longitude'].values,
        df['arrival_longitude'].values
    ])
    y_coords = np.concatenate([
        df['departure_latitude'].values,
        df['arrival_latitude'].values
    ])
    
    # Check for invalid coordinates
    if transform_coords:
        # For projected coordinates, checks will depend on the specific projection
        pass
    else:
        # For WGS84, check latitude and longitude ranges
        if not (-180 <= x_coords).all() or not (x_coords <= 180).all():
            raise ValueError("Longitude values must be between -180 and 180 degrees")
        if not (-90 <= y_coords).all() or not (y_coords <= 90).all():
            raise ValueError("Latitude values must be between -90 and 90 degrees")
    
    # Calculate bbox
    min_x = np.min(x_coords)
    max_x = np.max(x_coords)
    min_y = np.min(y_coords)
    max_y = np.max(y_coords)
    
    # Add buffer
    x_buffer = (max_x - min_x) * buffer_percent
    y_buffer = (max_y - min_y) * buffer_percent
    
    min_x -= x_buffer
    max_x += x_buffer
    min_y -= y_buffer
    max_y += y_buffer
    
    # Transform coordinates if needed
    if transform_coords:
        min_x, min_y = transformer.transform(min_x, min_y)
        max_x, max_y = transformer.transform(max_x, max_y)
    
    # Ensure transformed coordinates are within valid ranges for WGS84
    min_x = max(-180, min(180, min_x))
    max_x = max(-180, min(180, max_x))
    min_y = max(-90, min(90, min_y))
    max_y = max(-90, min(90, max_y))
    
    # Create bbox dict
    bbox_dict = {
        'min_lat': min_y,
        'max_lat': max_y,
        'min_lon': min_x,
        'max_lon': max_x
    }
    
    return bbox_dict


def validate_bbox(bbox: Dict[str, float]) -> bool:
    """
    Validate a bounding box dictionary to ensure coordinates are within valid ranges.

    Parameters
    ----------
    bbox : dict
        Dictionary containing min_lat, max_lat, min_lon, max_lon keys

    Returns
    -------
    bool
        True if bbox is valid, False otherwise
    """
    try:
        # Check if all required keys exist
        required_keys = ['min_lat', 'max_lat', 'min_lon', 'max_lon']
        if not all(key in bbox for key in required_keys):
            return False
        
        # Check latitude ranges
        if not (-90 <= bbox['min_lat'] <= bbox['max_lat'] <= 90):
            return False
        
        # Check longitude ranges
        if not (-180 <= bbox['min_lon'] <= bbox['max_lon'] <= 180):
            return False
        
        return True
    except (KeyError, TypeError):
        return False

In [4]:
legs_input_path = "SMS_trips.csv"
trips= pd.read_csv(legs_input_path)

In [11]:
bbox= get_osm_bbox(trips)
bbox

{'min_lat': 48.85222597,
 'max_lat': 48.92658703,
 'min_lon': 1.825876142,
 'max_lon': 2.052240458}

## Extract opportunities using OSM

In [7]:
# Constants for OpenStreetMap feature tags
AMENITY_TAGS: List[str] = [
    # Transportation
    "bus_stop",
    "stop_position",
    "platform",
    "station",
    "stop_area",
    "stop_area_group",
    "car_sharing",
    
    # Education
    "college",
    "kindergarten",
    "library",
    "school",
    "research_institute",
    
    # Healthcare
    "clinic",
    "doctors",
    "dentist",
    "pharmacy",
    "veterinary",
    
    # Social and Community
    "social_facility",
    "community_centre",
    "social_centre",
    
    # Entertainment
    "cinema",
    "theatre",
    
    # Commercial
    "market_place",
    
    # Business/Office
    "office",              
    "bank",               
    "insurance",          
    "company",            
    "consulting",         
    "estate_agent",       
    "government",         
    "lawyer",             
    "tax_advisor",        
    "telecommunication",  
    "travel_agent",       
    "coworking_space",    
]

OFFICE_TAGS: Dict[str, List[str]] = {
    "office": [
        "accountant",
        "advertising_agency",
        "architect",
        "association",
        "company",
        "consulting",
        "coworking",
        "educational_institution",
        "employment_agency",
        "engineering",
        "estate_agent",
        "financial",
        "foundation",
        "government",
        "insurance",
        "it",
        "lawyer",
        "ngo",
        "notary",
        "physician",
        "research",
        "software",
        "telecommunication",
        "travel_agent"
    ]
}

SHOP_TAGS: Dict[str, List[str]] = {
    "shop": [
        "supermarket",
        "department_store",
        "mall",
        "convenience",
        "clothes",
        "electronics",
        "hardware",
        "furniture"
    ]
}

INDUSTRIAL_TAGS: Dict[str, List[str]] = {
    "landuse": [
        "industrial",
        "commercial",
        "retail",
        "office",
        "business_park"
    ]
}

def fetch_osm_features(
    bounding_box: Dict[str, float],
    custom_tags: Dict[str, List[str]] = None
) -> gpd.GeoDataFrame:
    """
    Fetch OpenStreetMap features within a specified bounding box using OSMnx.
    
    Parameters
    ----------
    bounding_box : dict
        Dictionary containing the bounding box coordinates:
        - min_lat: Minimum latitude (south)
        - max_lat: Maximum latitude (north)
        - min_lon: Minimum longitude (west)
        - max_lon: Maximum longitude (east)
    custom_tags : dict, optional
        Custom tags to override default tag configuration
        
    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing the retrieved features with their geometries
        
    Example
    -------
    >>> bbox = {
    ...     'min_lat': 48.85222597,
    ...     'max_lat': 48.92658703,
    ...     'min_lon': 1.825876142,
    ...     'max_lon': 2.052240458
    ... }
    >>> gdf = fetch_osm_features(bbox)
    """
    # Configure OSMnx settings
    ox.settings.use_cache = True
    ox.settings.log_console = True
    
    # Convert bounding box to tuple format required by OSMnx
    bbox_tuple = (
        bounding_box['min_lat'],
        bounding_box['max_lat'],
        bounding_box['min_lon'],
        bounding_box['max_lon']
    )
    
    # Define default tags if custom tags not provided
    if custom_tags is None:
        tags = {
            "amenity": AMENITY_TAGS,
            "office": OFFICE_TAGS["office"],
            "shop": SHOP_TAGS["shop"],
            "landuse": INDUSTRIAL_TAGS["landuse"]
        }
    else:
        tags = custom_tags
    
    # Retrieve geographical features
    geo_df = ox.geometries.geometries_from_bbox(*bbox_tuple, tags)
    
    # Convert to GeoDataFrame with specified CRS
    gdf = gpd.GeoDataFrame(
        geo_df,
        geometry=geo_df['geometry'],
        crs="EPSG:4326"
    )
    
    return gdf

def save_features_shapefile(
    gdf: gpd.GeoDataFrame,
    filename: str = "osm_features.shp"
) -> None:
    """
    Save a GeoDataFrame to a shapefile.
    
    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        GeoDataFrame containing the features to save
    filename : str, optional
        Name of the output shapefile (default: "osm_features.shp")
    """
    gdf.to_file(filename)
    print(f"Shapefile has been saved as {filename}")

def main():
    """
    Main function to demonstrate the usage of the OSM feature extraction pipeline.
    """
    # Define the bounding box (example for greater Paris)
    bounding_box = {'min_lat': 48.85222597,
 'max_lat': 48.92658703,
 'min_lon': 1.825876142,
 'max_lon': 2.052240458}
    
    # Fetch features
    gdf = fetch_osm_features(bounding_box)
    
    
    return gdf

if __name__ == "__main__":
    gdf = main()

  geo_df = ox.geometries.geometries_from_bbox(*bbox_tuple, tags)
  return features.features_from_bbox(north, south, east, west, tags=tags)
  return features.features_from_bbox(north, south, east, west, tags=tags)


In [9]:


def process_geodataframe(
    gdf: gpd.GeoDataFrame,
    selected_columns: list = ["amenity", "name", "geometry"],
    output_path: Optional[str] = None
) -> gpd.GeoDataFrame:
    """
    Process a GeoDataFrame by adding IDs, removing duplicates, converting polygons 
    to centroids, and optionally saving to a shapefile.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        Input GeoDataFrame containing spatial features
    selected_columns : list, optional
        List of columns to keep in the final DataFrame
        Default: ["amenity", "name", "geometry"]
    output_path : str, optional
        Path to save the output shapefile
        If None, the file won't be saved

    Returns
    -------
    geopandas.GeoDataFrame
        Processed GeoDataFrame with unique features and centroids

    Notes
    -----
    The function performs the following operations:
    1. Resets the index and adds sequential IDs
    2. Removes duplicate entries based on name and geometry
    3. Converts polygons and multipolygons to centroids
    4. Removes LineString geometries
    5. Adds a population field (set to 1.0)
    """
    # Reset index and create sequential IDs
    df_reset = gdf.reset_index()
    
    # Remove duplicates based on name and geometry
    if 'name' in df_reset.columns:
        df_reset = df_reset.drop_duplicates(subset=['name', 'geometry'])
    else:
        df_reset = df_reset.drop_duplicates(subset=['geometry'])
    
    # Create new sequential IDs after removing duplicates
    df_reset['id'] = range(1, len(df_reset) + 1)
    
    # Select required columns
    columns = ["id"] + selected_columns
    df_reset = df_reset[columns]
    
    # Convert back to GeoDataFrame
    gdf_processed = gpd.GeoDataFrame(df_reset, geometry='geometry')
    
    # Convert polygons/multipolygons to centroids
    gdf_processed['geometry'] = gdf_processed['geometry'].apply(_to_centroid)
    
    # assign each opp a number 1 for computation after
    gdf_processed['pop'] = 1.0
    
    # Remove LineString geometries
    gdf_processed = gdf_processed[
        gdf_processed.geometry.apply(lambda geom: geom.geom_type != 'LineString')
    ]
    
    # Save to file if path is provided
    if output_path:
        try:
            gdf_processed.to_file(output_path)
            print(f"Successfully saved to {output_path}")
        except Exception as e:
            print(f"Error saving file: {str(e)}")
    
    return gdf_processed

def _to_centroid(geometry: Union[Point, Polygon, MultiPolygon]) -> Point:
    """
    Convert polygon or multipolygon geometry to its centroid.
    Points remain unchanged.

    Parameters
    ----------
    geometry : Union[Point, Polygon, MultiPolygon]
        Input geometry to be converted

    Returns
    -------
    Point
        Centroid of the input geometry or original point
    """
    if isinstance(geometry, (Polygon, MultiPolygon)):
        return geometry.centroid
    return geometry

def analyze_duplicates(gdf: gpd.GeoDataFrame) -> pd.DataFrame:
    """
    Analyze duplicate entries in the GeoDataFrame.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        Input GeoDataFrame to analyze

    Returns
    -------
    pandas.DataFrame
        Summary of duplicate entries
    """
    # Check duplicates based on different criteria
    name_dupes = gdf.duplicated(subset=['name'], keep=False).sum() if 'name' in gdf.columns else 0
    geom_dupes = gdf.duplicated(subset=['geometry'], keep=False).sum()
    both_dupes = (gdf.duplicated(subset=['name', 'geometry'], keep=False).sum() 
                 if 'name' in gdf.columns else 0)
    
    return pd.DataFrame({
        'Criteria': ['Name only', 'Geometry only', 'Name and Geometry'],
        'Duplicate Count': [name_dupes, geom_dupes, both_dupes]
    })

# Example usage
if __name__ == "__main__":
    
    
    output_path = "../Cities/Padam_terretory_01/shp files/opps_for_padam_terretory_01.shp"
    
    
    
    # Analyze duplicates before processing
    print("Duplicate Analysis Before Processing:")
    print(analyze_duplicates(gdf))
    
    # Process the GeoDataFrame
    processed_gdf = process_geodataframe(
        gdf,
        selected_columns=["amenity", "name", "geometry"],
        output_path=output_path
    )
    
    # Analyze duplicates after processing
    print("\nDuplicate Analysis After Processing:")
    print(analyze_duplicates(processed_gdf))
    
    # Print summary statistics
    print("\nSummary Statistics:")
    print(f"Original features: {len(gdf)}")
    print(f"Processed features: {len(processed_gdf)}")

Duplicate Analysis Before Processing:
            Criteria  Duplicate Count
0          Name only               81
1      Geometry only                0
2  Name and Geometry                0
Successfully saved to ../Cities/Padam_terretory_01/shp files/opps_for_padam_terretory_01.shp

Duplicate Analysis After Processing:
            Criteria  Duplicate Count
0          Name only               81
1      Geometry only                0
2  Name and Geometry                0

Summary Statistics:
Original features: 251
Processed features: 251


In [11]:
# Basic usage
processed_gdf = process_geodataframe(gdf)

# Advanced usage with custom columns and output path
processed_gdf = process_geodataframe(
    gdf,
    selected_columns=["amenity", "name", "geometry"],
    output_path="output.shp"
)

# Analyze duplicates in your data
duplicate_analysis = analyze_duplicates(gdf)
print(duplicate_analysis)

Successfully saved to output.shp
            Criteria  Duplicate Count
0          Name only               81
1      Geometry only                0
2  Name and Geometry                0


In [12]:
processed_gdf

Unnamed: 0,id,amenity,name,geometry,pop
0,1,pharmacy,Pharmacie de l'Église,POINT (1.97643 48.91973),1.0
1,2,,Franprix,POINT (2.05223 48.92607),1.0
2,3,school,École élémentaire Louis Pasteur,POINT (2.02192 48.86020),1.0
3,4,school,École maternelle Jean de La Fontaine,POINT (2.02285 48.86054),1.0
4,5,bank,,POINT (1.97564 48.92002),1.0
...,...,...,...,...,...
246,247,,,POINT (1.95081 48.91590),1.0
247,248,school,,POINT (2.05174 48.89888),1.0
248,249,school,Groupe Scolaire Roger Gousseau,POINT (1.92272 48.91099),1.0
249,250,kindergarten,,POINT (2.02059 48.92092),1.0
