In [None]:
# Berlin Transport Data Processing
## Setup and Imports

import logging
import sys
from pathlib import Path

import pandas as pd

# Add the src directory to the Python path
sys.path.append(str(Path('../src').resolve()))

# Import processing modules
from utils.data_loader import DataLoader, format_line_list
from processor import TransportDataProcessor
from geolocation import StationMatcher, validate_matches

# Logging setup: INFO and above, each record prefixed with timestamp and severity
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Logger used by the cells of this notebook
logger = logging.getLogger(__name__)

In [2]:
# Run parameters: YEAR and SIDE select the input file; DATA_DIR is the data root
YEAR: int = 1965
SIDE: str = "west"  # alternatively "east"
DATA_DIR: Path = Path('../data')

# Loader instance used below to read the raw input
loader = DataLoader()

## Load Input Data

# Read the raw transcribed CSV for the configured year and side
raw_data_path = (DATA_DIR / 'raw').joinpath(f'{YEAR}_{SIDE}.csv')
raw_df = loader.load_raw_data(str(raw_data_path))

# Report how many rows were loaded
logger.info(f"Loaded raw data: {len(raw_df)} lines")

2025-02-28 01:07:20,073 - INFO - Loaded raw data: 102 lines


In [3]:
# Sanity check: preview the key columns of the first few loaded rows
print("\nSample of loaded data:")
print(raw_df.loc[:, ['line_name', 'type', 'stops']].head())


Sample of loaded data:
  line_name  type                                              stops
0        15  tram  Marienfelde, Daimlerstrasse - Großbeerenstrass...
1        47  tram  Gradestrasse Ecke Tempelhofer Weg - U-Bhf. Bla...
2       47P  tram  Groß-Ziethener-Chaussee Ecke Waltersdorferchau...
3        53  tram  Richard-Wagner-Platz - Luisenplatz - Klausener...
4        54  tram  Richard-Wagner-Platz - Luisenplatz - Klausener...


In [4]:
# Load existing stations and normalise their 'in_lines' column in one pass
existing_stations_path = DATA_DIR / 'processed' / 'existing_stations.csv'
existing_stations_df = pd.read_csv(existing_stations_path).assign(
    # Each 'in_lines' value is passed through format_line_list
    in_lines=lambda stations: stations['in_lines'].apply(format_line_list)
)

logger.info(f"Loaded existing stations: {len(existing_stations_df)} stations")

2025-02-28 01:07:20,106 - INFO - Loaded existing stations: 3224 stations


In [5]:
# Build the base tables from the raw data

# The processor is configured with the year and side being handled
processor = TransportDataProcessor(YEAR, SIDE)

try:
    # The raw DataFrame is handed over directly, together with the existing stations
    results = processor.process_raw_data(raw_df, existing_stations_df)
    logger.info("Initial processing complete")

    # Show the shape and the first two rows of every resulting table
    for table_name, table_df in results.items():
        print(f"\n{table_name} table shape: {table_df.shape}")
        print(f"Sample of {table_name}:")
        display(table_df.head(2))  # display() gives rich notebook rendering

except Exception as err:
    # Log the failure, then re-raise so the cell still stops with the traceback
    logger.error(f"Error in initial processing: {err}")
    raise

2025-02-28 01:07:20,123 - INFO - Using provided DataFrame
2025-02-28 01:07:20,195 - INFO - Created tables: lines (102 rows), stops (1006 rows), line_stops (204 rows)
2025-02-28 01:07:20,195 - INFO - Initial processing complete



lines table shape: (102, 8)
Sample of lines:


Unnamed: 0,line_id,year,line_name,type,start_stop,length (time),east_west,frequency (7:30)
0,19651,1965,15,strassenbahn,"Marienfelde, Daimlerstrasse<> Schulenburgpark",36.0,west,10.0
1,19652,1965,47,strassenbahn,Gradestrasse Ecke Tempelhofer Weg<> Groß-Zieth...,21.0,west,10.0



stops table shape: (1006, 6)
Sample of stops:


Unnamed: 0,stop_name,type,line_name,stop_id,location,identifier
0,"Marienfelde, Daimlerstrasse",strassenbahn,15,19650,,
1,Großbeerenstrasse Ecke Daimlerstrasse,strassenbahn,15,19651,,



line_stops table shape: (204, 3)
Sample of line_stops:


Unnamed: 0,line_id,stop_id,stop_order
0,19651,19650,0
1,19651,196514,1


In [6]:
# Save results
# Every base table goes to the same directory, so resolve and create it once
# instead of on every loop iteration.
output_dir = DATA_DIR / 'interim' / 'stops_base'
output_dir.mkdir(parents=True, exist_ok=True)

for name, df in results.items():
    # One CSV per table, named <table>_<YEAR>_<SIDE>.csv
    output_path = output_dir / f'{name}_{YEAR}_{SIDE}.csv'
    # index=False keeps the DataFrame index out of the file
    df.to_csv(output_path, index=False)
    logger.info(f"Saved {name} table to {output_path}")

2025-02-28 01:07:20,223 - INFO - Saved lines table to ..\data\interim\stops_base\lines_1965_west.csv
2025-02-28 01:07:20,227 - INFO - Saved stops table to ..\data\interim\stops_base\stops_1965_west.csv
2025-02-28 01:07:20,229 - INFO - Saved line_stops table to ..\data\interim\stops_base\line_stops_1965_west.csv


In [7]:
# Station Matching Process

# Fall back to the saved base tables when `results` is not already in memory.
# The directory is derived from DATA_DIR (the same root the save step writes
# under) rather than a separate '../data' literal, so changing DATA_DIR cannot
# make the save and reload locations disagree.
if 'results' not in locals():
    base_dir = DATA_DIR / 'interim' / 'stops_base'
    results = {
        table: pd.read_csv(base_dir / f'{table}_{YEAR}_{SIDE}.csv')
        for table in ('stops', 'lines', 'line_stops')
    }

# Run matching process against the existing stations
matcher = StationMatcher(existing_stations_df)

# Process stops table with location matching
matched_stops = matcher.add_location_data(results['stops'])

2025-02-28 01:07:20,250 - INFO - No match found for station: Marienfelde, Daimlerstrasse
2025-02-28 01:07:20,256 - INFO - No match found for station: Großbeerenstrasse Ecke Daimlerstrasse
2025-02-28 01:07:20,261 - INFO - No match found for station: Mariendorferdamm Ecke Alt-Mariendorf
2025-02-28 01:07:20,263 - INFO - No match found for station: Imbrosweg Ecke Rixdorferstrasse
2025-02-28 01:07:20,263 - INFO - No match found for station: Mariendamm Friedhof
2025-02-28 01:07:20,263 - INFO - No match found for station: Industriestrasse Ecke Gottlieb-Dunkel-Strasse
2025-02-28 01:07:20,263 - INFO - No match found for station: Frauenklinik, Mariendorferweg
2025-02-28 01:07:20,263 - INFO - No match found for station: Hermannstrasse/Britzer Damm
2025-02-28 01:07:20,273 - INFO - No match found for station: Richardstrasse Ecke Braunschweigerstrasse
2025-02-28 01:07:20,280 - INFO - No match found for station: U-Bhf. Blaschkoallee
2025-02-28 01:07:20,280 - INFO - No match found for station: Buschkr

In [8]:
# Analysis of matching results
# A stop counts as matched when its 'location' value is not missing.
has_location = matched_stops['location'].notna()
total_stops = len(matched_stops)
matched = int(has_location.sum())
unmatched = total_stops - matched

# Guard the percentages so an empty stops table does not divide by zero
matched_pct = matched / total_stops * 100 if total_stops else 0.0
unmatched_pct = unmatched / total_stops * 100 if total_stops else 0.0

print("\nMatching Statistics:")
print(f"Total stations: {total_stops}")
print(f"Matched: {matched} ({matched_pct:.1f}%)")
print(f"Unmatched: {unmatched} ({unmatched_pct:.1f}%)")

# Display sample of matched stations
print("\nSample of matched stations:")
display(matched_stops[has_location].head(3))

# Display sample of unmatched stations
print("\nSample of unmatched stations:")
display(matched_stops[~has_location].head(3))


Matchinga Statistics:
Total stations: 1006
Matched: 518 (51.5%)
Unmatched: 488 (48.5%)

Sample of matched stations:


Unnamed: 0,stop_name,type,line_name,stop_id,location,identifier
2,Körtingstrasse Ecke Großbeerenstrasse,strassenbahn,15,19652,"52.434813531058246, 13.378315641328395",
7,Germaniastrasse Ecke Gottlieb-Dunkel-Strasse,strassenbahn,15,19657,"52.46053987437175, 13.418058888486561",
10,U-Bhf. Neukölln,strassenbahn,15,196510,"52.46965567921364, 13.441614939772098",



Sample of unmatched stations:


Unnamed: 0,stop_name,type,line_name,stop_id,location,identifier
0,"Marienfelde, Daimlerstrasse",strassenbahn,15,19650,,
1,Großbeerenstrasse Ecke Daimlerstrasse,strassenbahn,15,19651,,
3,Mariendorferdamm Ecke Alt-Mariendorf,strassenbahn,15,19653,,


In [None]:
# Validate matches
# validate_matches is imported with the other project modules at the top of the notebook
validate_matches(matched_stops)

# Save results
# Output directories are derived from DATA_DIR instead of repeating the
# '../data' literal, so they follow the configured data root.
interim_dir = DATA_DIR / 'interim'
matched_dir = interim_dir / 'stops_matched_initial'
matched_dir.mkdir(parents=True, exist_ok=True)

# Save all stops (both matched and unmatched)
matched_path = matched_dir / f'stops_{YEAR}_{SIDE}.csv'
matched_stops.to_csv(matched_path, index=False)

# Save unmatched stops (missing 'location') separately for OpenRefine
unmatched_stops = matched_stops[matched_stops['location'].isna()]
openrefine_dir = interim_dir / 'stops_for_openrefine'
openrefine_dir.mkdir(parents=True, exist_ok=True)
openrefine_path = openrefine_dir / f'unmatched_stops_{YEAR}_{SIDE}.csv'
unmatched_stops.to_csv(openrefine_path, index=False)

print(f"\nSaved {len(matched_stops)} total stops")
print(f"Exported {len(unmatched_stops)} unmatched stops for manual processing")


Matching Statistics:
Total stations: 1006
Matched: 518 (51.5%)
Unmatched: 488 (48.5%)

Sample of unmatched stations:
0              Marienfelde, Daimlerstrasse
1    Großbeerenstrasse Ecke Daimlerstrasse
3     Mariendorferdamm Ecke Alt-Mariendorf
4          Imbrosweg Ecke Rixdorferstrasse
5                      Mariendamm Friedhof
Name: stop_name, dtype: object

Saved 1006 total stops
Exported 488 unmatched stops for manual processing
