In [1]:
# Geolocation Verification and Station Splitting

import pandas as pd
import numpy as np
import re
from pathlib import Path
import logging
import matplotlib.pyplot as plt
import folium
from typing import Tuple, List, Dict

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

In [2]:
YEAR = 1965
SIDE = "west"
DATA_DIR = Path('../data')

In [3]:
# Load the OpenRefine processed data
refined_data_path = f"../data/interim/stops_for_openrefine/unmatched_stops_{YEAR}_{SIDE}_refined.csv"
refined_stops = pd.read_csv(refined_data_path)
logger.info(f"Loaded {len(refined_stops)} stations from OpenRefine")

# Load the previously matched stops - this is where the original error was
original_stops = pd.read_csv(f'../data/interim/stops_matched_initial/stops_{YEAR}_{SIDE}.csv')
logger.info(f"Loaded {len(original_stops)} stations from original data")

# Also load line_stops to update later
line_stops_path = f"../data/interim/stops_base/line_stops_{YEAR}_{SIDE}.csv"
line_stops = pd.read_csv(line_stops_path)
logger.info(f"Loaded {len(line_stops)} line-stop relationships")


2025-02-28 01:07:30,243 - INFO - Loaded 528 stations from OpenRefine
2025-02-28 01:07:30,243 - INFO - Loaded 1006 stations from original data
2025-02-28 01:07:30,243 - INFO - Loaded 204 line-stop relationships


In [4]:
# 1. Geolocation Format Verification
def verify_geo_format(df: pd.DataFrame) -> pd.DataFrame:
    """Verify and standardize geolocation format."""
    # Make a copy to avoid modifying the original
    df = df.copy()
    
    # Function to check and format location string
    def format_location(loc_str):
        if pd.isna(loc_str) or loc_str == '':
            return np.nan
            
        # Check if it contains multiple locations (with a hyphen)
        if ' - ' in loc_str:
            # This will be handled separately
            return loc_str
            
        # Remove any extra spaces
        loc_str = re.sub(r'\s+', '', loc_str)
        
        # Check if it's a valid coordinate pair
        pattern = r'^(-?\d+(\.\d+)?),(-?\d+(\.\d+)?)$'
        if re.match(pattern, loc_str):
            # Valid format, ensure consistent decimal places
            lat, lon = map(float, loc_str.split(','))
            return f"{lat:.8f},{lon:.8f}"
        else:
            logger.warning(f"Invalid coordinate format: {loc_str}")
            return np.nan
    
    # Apply formatting to location column
    df['location'] = df['location'].apply(format_location)
    
    # Count invalid formats
    invalid_count = df['location'].isna().sum()
    logger.info(f"Found {invalid_count} stations with invalid coordinate format")
    
    return df

In [5]:
# 2. Geographic Bounds Verification
def verify_geo_bounds(df: pd.DataFrame) -> pd.DataFrame:
    """Verify coordinates are within expected Berlin bounds."""
    # Berlin geographic bounds (approximate)
    BERLIN_BOUNDS = {
        'lat_min': 52.3,
        'lat_max': 52.7,
        'lon_min': 13.1,
        'lon_max': 13.8
    }
    
    df = df.copy()
    
    def check_bounds(loc_str):
        if pd.isna(loc_str) or loc_str == '':
            return False, "Missing coordinates"
            
        # Multiple locations case
        if ' - ' in loc_str:
            return True, "Multiple coordinates"
            
        try:
            lat, lon = map(float, loc_str.split(','))
            
            if (BERLIN_BOUNDS['lat_min'] <= lat <= BERLIN_BOUNDS['lat_max'] and
                BERLIN_BOUNDS['lon_min'] <= lon <= BERLIN_BOUNDS['lon_max']):
                return True, "Within bounds"
            else:
                return False, f"Outside Berlin bounds: {lat},{lon}"
        except:
            return False, "Invalid format"
    
    # Check bounds for all locations
    results = df['location'].apply(check_bounds)
    df['valid_bounds'] = results.apply(lambda x: x[0])
    df['bounds_message'] = results.apply(lambda x: x[1])
    
    # Log locations outside bounds
    outside_bounds = df[~df['valid_bounds']]
    if not outside_bounds.empty:
        logger.warning(f"Found {len(outside_bounds)} stations outside Berlin bounds:")
        for _, row in outside_bounds.iterrows():
            logger.warning(f"  - {row['stop_name']}: {row['bounds_message']}")
    
    return df

In [6]:
# 3. Handle Stations That Need Splitting
def split_combined_stations(df: pd.DataFrame, line_stops_df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Split rows where multiple stations are combined with hyphen.
    
    Args:
        df: DataFrame with stops data
        line_stops_df: DataFrame with line-stop relationships
        
    Returns:
        Tuple of (updated_stops_df, updated_line_stops_df)
    """
    df = df.copy()
    line_stops_df = line_stops_df.copy()
    
    # Find rows with combined stations
    combined_mask = df['location'].apply(lambda x: isinstance(x, str) and ' - ' in x)
    combined_stations = df[combined_mask].copy()
    
    if combined_stations.empty:
        logger.info("No combined stations found")
        return df, line_stops_df
        
    logger.info(f"Found {len(combined_stations)} combined stations to split")
    
    # Get the next available stop_id - Make sure all stop_ids are strings for consistency
    line_stops_df['stop_id'] = line_stops_df['stop_id'].astype(str)
    df['stop_id'] = df['stop_id'].astype(str)
    
    next_stop_id = int(df['stop_id'].str.replace(r'^\D*', '', regex=True).astype(int).max()) + 1
    
    # Process each combined station
    for idx, row in combined_stations.iterrows():
        # Split station names and locations
        stop_names = row['stop_name'].split(' - ')
        locations = row['location'].split(' - ')
        
        if len(stop_names) != len(locations):
            logger.warning(f"Mismatch between names and locations for {row['stop_name']}")
            continue
            
        # Create new entries for each split station
        original_stop_id = row['stop_id']
        
        # Update the first station in place
        df.at[idx, 'stop_name'] = stop_names[0]
        df.at[idx, 'location'] = locations[0]
        
        # Create new rows for additional stations
        for i in range(1, len(stop_names)):
            new_stop_id = f"{YEAR}{next_stop_id}"
            next_stop_id += 1
            
            # Create new row with same attributes but different name/location
            new_row = row.copy()
            new_row['stop_id'] = new_stop_id
            new_row['stop_name'] = stop_names[i]
            new_row['location'] = locations[i]
            
            # Add to dataframe
            df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
            
            # Update line_stops references
            # Find all references to the original stop_id
            line_refs = line_stops_df[line_stops_df['stop_id'] == original_stop_id]
            
            # Create additional line_stops entries for the new stop_id
            for _, line_ref in line_refs.iterrows():
                new_line_ref = line_ref.copy()
                new_line_ref['stop_id'] = new_stop_id  # This is now a string
                line_stops_df = pd.concat([line_stops_df, pd.DataFrame([new_line_ref])], ignore_index=True)
                
    return df, line_stops_df

In [7]:
# 4. Visualize stations on a map
def visualize_stations(df: pd.DataFrame, output_path: str):
    """Create a folium map with all stations."""
    # Filter to only valid locations - properly handle empty strings
    valid_df = df[(df['location'].notna()) & (df['location'] != '')].copy()
    
    # Extract coordinates with better error handling
    def extract_lat(loc_str):
        try:
            if not isinstance(loc_str, str) or loc_str == '':
                return np.nan
            parts = loc_str.split(',')
            if len(parts) != 2:
                return np.nan
            return float(parts[0].strip())
        except (ValueError, IndexError):
            return np.nan
    
    def extract_lon(loc_str):
        try:
            if not isinstance(loc_str, str) or loc_str == '':
                return np.nan
            parts = loc_str.split(',')
            if len(parts) != 2:
                return np.nan
            return float(parts[1].strip())
        except (ValueError, IndexError):
            return np.nan
    
    # Apply coordinate extraction
    valid_df['lat'] = valid_df['location'].apply(extract_lat)
    valid_df['lon'] = valid_df['location'].apply(extract_lon)
    
    # Filter out any rows with invalid coordinates
    valid_df = valid_df[(valid_df['lat'].notna()) & (valid_df['lon'].notna())]
    
    logger.info(f"Creating map with {len(valid_df)} stations")
    
    # Create map centered on Berlin
    m = folium.Map(location=[52.52, 13.40], zoom_start=12)
    
    # Define colors for different transport types
    type_colors = {
        'bus': 'blue',
        'strassenbahn': 'red',
        'u-bahn': 'green',
        's-bahn': 'purple'
    }
    
    # Add markers for each station
    for _, row in valid_df.iterrows():
        popup_text = f"{row['stop_name']} ({row['type']})<br>ID: {row['stop_id']}"
        color = type_colors.get(row['type'].lower(), 'gray')
        
        folium.Marker(
            [row['lat'], row['lon']],
            popup=popup_text,
            icon=folium.Icon(color=color)
        ).add_to(m)
    
    # Save map
    m.save(output_path)
    logger.info(f"Saved map to {output_path}")
    
    return m

In [8]:
# Run all verification steps
try:
    
    # Step 1: Split combined stations
    refined_stops, line_stops = split_combined_stations(refined_stops, line_stops)

    # Merge refined data with original stops based on stop_name, type, and line_name
    merged_stops = original_stops.copy()
    
    for idx, row in refined_stops.iterrows():
        stop_name = row['stop_name']
        stop_type = row['type']
        line_name = row['line_name']
        
        # Check if this stop exists in the original stops
        match = merged_stops[(merged_stops['stop_name'] == stop_name) & 
                             (merged_stops['type'] == stop_type) & 
                             (merged_stops['line_name'] == line_name)]
        
        if not match.empty:
            # Update location and identifier if match is found
            merged_idx = match.index[0]
            merged_stops.at[merged_idx, 'location'] = row['location']
            if 'identifier' in row and not pd.isna(row['identifier']):
                merged_stops.at[merged_idx, 'identifier'] = row['identifier']
        else:
            # This is a new stop, add to merged_stops
            merged_stops = pd.concat([merged_stops, pd.DataFrame([row])], ignore_index=True)
    
    # Save updated data
    verified_dir = DATA_DIR / 'interim' / 'stops_verified'
    verified_dir.mkdir(parents=True, exist_ok=True)

    # Step 2: Format verification
    merged_stops = verify_geo_format(merged_stops)
    
    # Step 3: Bounds verification
    merged_stops = verify_geo_bounds(merged_stops)

    # Step 4: Create visualization
    map_dir = DATA_DIR / 'visualizations'
    map_dir.mkdir(parents=True, exist_ok=True)
    visualize_stations(merged_stops, str(map_dir / f'stations_{YEAR}_{SIDE}.html'))
    
    merged_stops.to_csv(verified_dir / f'stops_{YEAR}_{SIDE}.csv', index=False)
    line_stops.to_csv(verified_dir / f'line_stops_{YEAR}_{SIDE}.csv', index=False)
    logger.info(f"Saved verified stops and line_stops")
    
    # Summary statistics
    valid_locations = merged_stops['location'].notna().sum()
    total_stops = len(merged_stops)
    print(f"\nVerification complete:")
    print(f"Total stations: {total_stops}")
    print(f"Valid locations: {valid_locations} ({valid_locations/total_stops*100:.1f}%)")
    print(f"Split stations: {len(merged_stops) - len(original_stops)}")
    
except Exception as e:
    logger.error(f"Error in verification: {e}")
    import traceback
    traceback.print_exc()
    raise

2025-02-28 01:07:30,289 - INFO - Found 4 combined stations to split
2025-02-28 01:07:30,573 - INFO - Found 0 stations with invalid coordinate format
2025-02-28 01:07:30,577 - INFO - Creating map with 1024 stations
2025-02-28 01:07:31,369 - INFO - Saved map to ..\data\visualizations\stations_1965_west.html
2025-02-28 01:07:31,375 - INFO - Saved verified stops and line_stops



Verification complete:
Total stations: 1024
Valid locations: 1024 (100.0%)
Split stations: 18
