In [None]:
# Berlin Transport Data Processing
## Setup and Imports

import sys
from pathlib import Path
import pandas as pd
import logging

# Make the sibling src/ directory (one level above the working directory) importable
src_path = str(Path.cwd().parent.joinpath('src'))
if sys.path.count(src_path) == 0:
    sys.path.append(src_path)

# Import processing modules
from data_loader import DataLoader, format_line_list
from processor import TransportDataProcessor
from geolocation import StationMatcher

# Configure logging: INFO and above, each record prefixed with timestamp and level
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)


In [None]:
# Configuration: year, side and data root used throughout the notebook
YEAR = 1965
SIDE = "west"  # or "east"
DATA_DIR = Path('../data')

# Loader instance used to read the raw input below
loader = DataLoader()

## Load Input Data

# Raw transcribed data is read from <DATA_DIR>/raw/<YEAR>_<SIDE>.csv
raw_data_path = DATA_DIR.joinpath('raw', f'{YEAR}_{SIDE}.csv')
raw_df = loader.load_raw_data(str(raw_data_path))

# Record how many rows came back
logger.info(f"Loaded raw data: {len(raw_df)} lines")

In [None]:
# Display a sample of the loaded data to verify the load.
# display() is used instead of print() so the frame gets the notebook's rich
# table rendering, consistent with how the other cells show DataFrames.
print("\nSample of loaded data:")
display(raw_df[['line_name', 'type', 'stops']].head())

In [None]:
# Load existing stations
existing_stations_path = DATA_DIR.joinpath('processed', 'existing_stations.csv')
existing_stations_df = pd.read_csv(existing_stations_path)

# Pass every 'in_lines' value through format_line_list and store the result
# back under the same column name
existing_stations_df = existing_stations_df.assign(
    in_lines=existing_stations_df['in_lines'].apply(format_line_list)
)

logger.info(f"Loaded existing stations: {len(existing_stations_df)} stations")

In [None]:
# Process the raw data

# The processor is set up for the configured year and side
processor = TransportDataProcessor(YEAR, SIDE)

try:
    # Both inputs are handed over as in-memory DataFrames
    results = processor.process_raw_data(raw_df, existing_stations_df)
    logger.info("Initial processing complete")

    # For each returned table: report its shape, then show its first two rows
    for name, df in results.items():
        print(f"\n{name} table shape: {df.shape}", f"Sample of {name}:", sep="\n")
        display(df.head(2))  # display() for rich notebook output

except Exception as e:
    # Log the failure, then re-raise so execution stops at this cell
    logger.error(f"Error in initial processing: {e}")
    raise

In [None]:
# Write every result table to <DATA_DIR>/interim/stops_base/<name>_<YEAR>_<SIDE>.csv
for name, df in results.items():
    output_path = DATA_DIR.joinpath('interim', 'stops_base', f'{name}_{YEAR}_{SIDE}.csv')
    # Create the target folder on demand; no error if it already exists
    output_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)  # the DataFrame index is not written
    logger.info(f"Saved {name} table to {output_path}")

In [None]:
# Station Matching Process

# Reuse the tables already in memory; otherwise reload them from
# <DATA_DIR>/interim/stops_base. The directory is derived from DATA_DIR (rather
# than a second hard-coded '../data/...' literal) so this fallback keeps reading
# from the same place the save cell writes to if DATA_DIR is ever changed.
if 'results' not in locals():
    base_dir = DATA_DIR / 'interim' / 'stops_base'
    results = {
        table: pd.read_csv(base_dir / f'{table}_{YEAR}_{SIDE}.csv')
        for table in ('stops', 'lines', 'line_stops')
    }

# Run matching process against the existing stations table
matcher = StationMatcher(existing_stations_df)

# Process stops table with location matching
matched_stops = matcher.add_location_data(results['stops'])

In [None]:
# Analysis of matching results
# A stop counts as matched when its 'location' value is not missing.
# The mask is computed once and reused for the counts and both samples.
has_location = matched_stops['location'].notna()
total_stops = len(matched_stops)
matched = int(has_location.sum())
unmatched = total_stops - matched

# Guard the percentages so an empty stops table reports 0.0% instead of
# dividing by zero.
matched_pct = matched / total_stops * 100 if total_stops else 0.0
unmatched_pct = unmatched / total_stops * 100 if total_stops else 0.0

print("\nMatching Statistics:")
print(f"Total stations: {total_stops}")
print(f"Matched: {matched} ({matched_pct:.1f}%)")
print(f"Unmatched: {unmatched} ({unmatched_pct:.1f}%)")

# Display sample of matched stations
print("\nSample of matched stations:")
display(matched_stops[has_location].head(3))

# Display sample of unmatched stations
print("\nSample of unmatched stations:")
display(matched_stops[~has_location].head(3))

In [None]:
# Validate matches
from geolocation import validate_matches

validate_matches(matched_stops)

# Save results. Output directories are derived from DATA_DIR (rather than
# separate hard-coded '../data/...' literals) so they follow the configured
# data root like the earlier save cell does.
matched_dir = DATA_DIR / 'interim' / 'stops_matched'
matched_dir.mkdir(parents=True, exist_ok=True)

# Save all stops (both matched and unmatched)
matched_path = matched_dir / f'stops_{YEAR}_{SIDE}.csv'
matched_stops.to_csv(matched_path, index=False)

# Save unmatched stops (rows with a missing 'location') separately for OpenRefine
unmatched_stops = matched_stops[matched_stops['location'].isna()]
openrefine_dir = DATA_DIR / 'interim' / 'stops_for_openrefine'
openrefine_dir.mkdir(parents=True, exist_ok=True)
openrefine_path = openrefine_dir / f'unmatched_stops_{YEAR}_{SIDE}.csv'
unmatched_stops.to_csv(openrefine_path, index=False)

print(f"\nSaved {len(matched_stops)} total stops")
print(f"Exported {len(unmatched_stops)} unmatched stops for manual processing")