In [1]:
# Berlin Transport Data Processing
## Setup and Imports

import sys
from pathlib import Path
import pandas as pd
import logging

# Make the project's src directory importable.
# Guarded so that re-running this cell does not keep appending duplicate
# entries to sys.path.
src_dir = str(Path('../src').resolve())
if src_dir not in sys.path:
    sys.path.append(src_dir)

# Import processing modules
from utils.data_loader import DataLoader, format_line_list
from processor import TransportDataProcessor
from utils.geolocation import StationMatcher

# Logging setup: INFO level and above, each record rendered as
# "<timestamp> - <level> - <message>"
log_format = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=log_format, level=logging.INFO)

# Logger named after the current module, used by the cells below
logger = logging.getLogger(__name__)

In [2]:
# Configuration: which snapshot (year and side) to process and where the data lives
YEAR = 1965
SIDE = "west"  # or "east"
DATA_DIR = Path('../data')

# Loader instance used to read the input files
loader = DataLoader()

## Load Input Data

# The raw transcribed table for a snapshot is stored as raw/<YEAR>_<SIDE>.csv
raw_data_path = DATA_DIR.joinpath('raw', f'{YEAR}_{SIDE}.csv')
# The path is handed to the loader as a plain string
raw_df = loader.load_raw_data(str(raw_data_path))

logger.info(f"Loaded raw data: {len(raw_df)} lines")

2025-02-28 20:49:11,522 - INFO - Loaded raw data: 102 lines


In [3]:
# Sanity check: show the first rows of the raw table, restricted to the
# columns that identify a line and list its stops
preview_columns = ['line_name', 'type', 'stops']
print("\nSample of loaded data:")
print(raw_df.head()[preview_columns])


Sample of loaded data:
  line_name  type                                              stops
0        15  tram  Marienfelde, Daimlerstrasse - Großbeerenstrass...
1        47  tram  Gradestrasse Ecke Tempelhofer Weg - U-Bhf. Bla...
2       47P  tram  Groß-Ziethener-Chaussee Ecke Waltersdorferchau...
3        53  tram  Richard-Wagner-Platz - Luisenplatz - Klausener...
4        54  tram  Richard-Wagner-Platz - Luisenplatz - Klausener...


In [4]:
# Load the existing stations table
existing_stations_path = DATA_DIR / 'processed' / 'existing_stations.csv'
existing_stations_df = pd.read_csv(existing_stations_path)

# Run every 'in_lines' entry through format_line_list and store the result
# back in the same column
existing_stations_df = existing_stations_df.assign(
    in_lines=existing_stations_df['in_lines'].apply(format_line_list)
)

logger.info(f"Loaded existing stations: {len(existing_stations_df)} stations")

2025-02-28 20:49:11,656 - INFO - Loaded existing stations: 1024 stations


In [5]:
# Process the raw data into the base tables

# Processor for the configured year and side
processor = TransportDataProcessor(YEAR, SIDE)

# Only the processing call is guarded: a failure is logged and then re-raised
# so the notebook stops here. The display code below is kept outside the try
# block so that a display problem is not reported as a processing error.
try:
    # Pass the DataFrames directly
    results = processor.process_raw_data(raw_df, existing_stations_df)
except Exception as e:
    logger.error(f"Error in initial processing: {e}")
    raise

logger.info("Initial processing complete")

# Show the shape and the first two rows of every table returned by the processor
for name, df in results.items():
    print(f"\n{name} table shape: {df.shape}")
    print(f"Sample of {name}:")
    display(df.head(2))  # Using display for better notebook output

2025-02-28 20:49:11,700 - INFO - Using provided DataFrame
2025-02-28 20:49:11,721 - INFO - Created tables: lines (102 rows), stops (1006 rows), 
2025-02-28 20:49:11,722 - INFO - Initial processing complete



lines table shape: (102, 9)
Sample of lines:


Unnamed: 0,line_id,year,line_name,type,start_stop,length (time),length (km),east_west,frequency (7:30)
0,19651,1965,15,tram,"Marienfelde, Daimlerstrasse<> Schulenburgpark",36.0,,west,10.0
1,19652,1965,47,tram,Gradestrasse Ecke Tempelhofer Weg<> Groß-Zieth...,21.0,,west,10.0



stops table shape: (1006, 6)
Sample of stops:


Unnamed: 0,stop_name,type,line_name,stop_id,location,identifier
0,"Marienfelde, Daimlerstrasse",tram,15,19650,,
1,Großbeerenstrasse Ecke Daimlerstrasse,tram,15,19651,,


In [6]:
# Save results
# Every table goes to the same directory, so build and create it once
# instead of on each loop iteration.
output_dir = DATA_DIR / 'interim' / 'stops_base'
output_dir.mkdir(parents=True, exist_ok=True)

# One CSV per table, named <table>_<YEAR>_<SIDE>.csv; the index is not written
for name, df in results.items():
    output_path = output_dir / f'{name}_{YEAR}_{SIDE}.csv'
    df.to_csv(output_path, index=False)
    logger.info(f"Saved {name} table to {output_path}")

2025-02-28 20:49:11,763 - INFO - Saved lines table to ..\data\interim\stops_base\lines_1965_west.csv
2025-02-28 20:49:11,765 - INFO - Saved stops table to ..\data\interim\stops_base\stops_1965_west.csv


In [7]:
# Station Matching Process

# Load the basic tables if not already in memory.
# The directory is derived from DATA_DIR so it always points at the location
# the save step wrote to, even if DATA_DIR is changed in the configuration cell.
if 'results' not in locals():
    base_dir = DATA_DIR / 'interim' / 'stops_base'
    results = {
        'stops': pd.read_csv(base_dir / f'stops_{YEAR}_{SIDE}.csv'),
        'lines': pd.read_csv(base_dir / f'lines_{YEAR}_{SIDE}.csv'),
    }

# Build the matcher from the existing stations table
matcher = StationMatcher(existing_stations_df)

# Process stops table with location matching
matched_stops = matcher.add_location_data(results['stops'])

2025-02-28 20:49:14,402 - INFO - Successfully matched 1006 out of 1006 stations


In [8]:
# Analysis of matching results
# A stop counts as matched when its 'location' value is not null.
has_location = matched_stops['location'].notna()

total_stops = len(matched_stops)
matched = has_location.sum()
unmatched = total_stops - matched

# Guard against an empty stops table: report 0.0% instead of dividing by zero
matched_pct = matched / total_stops * 100 if total_stops else 0.0
unmatched_pct = unmatched / total_stops * 100 if total_stops else 0.0

print("\nMatching Statistics:")
print(f"Total stations: {total_stops}")
print(f"Matched: {matched} ({matched_pct:.1f}%)")
print(f"Unmatched: {unmatched} ({unmatched_pct:.1f}%)")

# Display sample of matched stations
print("\nSample of matched stations:")
display(matched_stops[has_location].head(3))

# Display sample of stations that still lack a location
print("\nSample of unmatched stations:")
display(matched_stops[~has_location].head(3))


Matchinga Statistics:
Total stations: 1006
Matched: 1006 (100.0%)
Unmatched: 0 (0.0%)

Sample of matched stations:


Unnamed: 0,stop_name,type,line_name,stop_id,location,identifier
0,"Marienfelde, Daimlerstrasse",tram,15,19650,"52.42393712,13.38022295",
1,Großbeerenstrasse Ecke Daimlerstrasse,tram,15,19651,"52.42636276,13.37438168",
2,Körtingstrasse Ecke Großbeerenstrasse,tram,15,19652,"52.43481353,13.37831564",



Sample of unmatched stations:


Unnamed: 0,stop_name,type,line_name,stop_id,location,identifier


In [9]:
# Validate matches
# NOTE(review): this import is kept here so the cell works on its own;
# consider moving it to the import cell at the top of the notebook.
from utils.geolocation import validate_matches

validate_matches(matched_stops)

# Save results
# Output directories are derived from DATA_DIR (like the base-table save step)
# so that changing DATA_DIR in the configuration cell relocates all outputs.
interim_dir = DATA_DIR / 'interim'

matched_dir = interim_dir / 'stops_matched_initial'
matched_dir.mkdir(parents=True, exist_ok=True)

# Save all stops (both matched and unmatched)
matched_path = matched_dir / f'stops_{YEAR}_{SIDE}.csv'
matched_stops.to_csv(matched_path, index=False)

# Save unmatched stops (rows whose 'location' is null) separately for OpenRefine
unmatched_stops = matched_stops[matched_stops['location'].isna()]
openrefine_dir = interim_dir / 'stops_for_openrefine'
openrefine_dir.mkdir(parents=True, exist_ok=True)
openrefine_path = openrefine_dir / f'unmatched_stops_{YEAR}_{SIDE}.csv'
unmatched_stops.to_csv(openrefine_path, index=False)

print(f"\nSaved {len(matched_stops)} total stops")
print(f"Exported {len(unmatched_stops)} unmatched stops for manual processing")


Matching Statistics:
Total stations: 1006
Matched: 1006 (100.0%)
Unmatched: 0 (0.0%)

Sample of unmatched stations:
Series([], Name: stop_name, dtype: object)

Saved 1006 total stops
Exported 0 unmatched stops for manual processing
