# Berlin Transport Network - Data Verification

This notebook verifies the processed Berlin transport network data to ensure data quality and consistency. It performs the following checks:

1. **Transport Types**: Verifies that all transport types are in the allowed set (autobus, omnibus, tram, u-bahn, s-bahn, ferry/fähre, strassenbahn)
2. **Stop Uniqueness**: Ensures each combination of stop_name, line_name, and year is unique
3. **Stop Connections**: Checks that each stop has at least one connection and no more than two connections
4. **Referential Integrity**: Verifies that all line_stops references point to valid lines and stops
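5. **Geographic Data**: Ensures all stops have geographic coordinates
6. **Station Distances** (when distance results are available): Reviews the distances between connected stations to flag stations that are too close together or too far apart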

The verification logic is implemented in the `src.verification` module.

In [None]:
import sys
import pandas as pd
import logging
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Configure logging: INFO and above, each record prefixed with timestamp and level
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Import verification module
# The parent directory is added to the import path so that `src` resolves.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate '..' entry to sys.path each time.
if '..' not in sys.path:
    sys.path.append('..')
from src import verification

In [None]:
# Configuration: selects which processed dataset the cells below verify
BASE_DIR = Path('..') / 'data'  # data root, one level above the working directory
SIDE = "east"
YEAR = 1971

In [None]:
# Run verification
# Executes the checks for the configured BASE_DIR / YEAR / SIDE. The returned
# `results` object is indexed by check name in the cells below
# (e.g. results['transport_types']['valid'], results['overall']).
results = verification.run_verification(BASE_DIR, YEAR, SIDE)

In [None]:
# Generate and display report
# NOTE(review): the report is emitted with print(), so it is presumably a
# plain-text string — confirm against verification.generate_verification_report.
report = verification.generate_verification_report(results, YEAR, SIDE)
print(report)

In [None]:
# Compact pass/fail overview: one line per check, driven by results[<key>]['valid']
check_labels = {
    'transport_types': "Transport Types",
    'stop_uniqueness': "Stop Uniqueness",
    'stop_connections': "Stop Connections",
    'referential_integrity': "Referential Integrity",
    'geographic_data': "Geographic Data",
}

print("Verification Checks Results:")
for check_key, check_label in check_labels.items():
    status = '✅ PASSED' if results[check_key]['valid'] else '❌ FAILED'
    print(f"- {check_label}: {status}")

### 1. Transport Types Analysis

Analyze the distribution of transport types and compare the types found in the data against the allowed set.

In [None]:
# Re-load the processed tables so they can be inspected directly in this notebook
data = verification.load_processed_data(BASE_DIR, YEAR, SIDE)
lines_df = data['lines']

# Print the module's VALID_TRANSPORT_TYPES constant as the reference set
print(f"Valid transport types: {verification.VALID_TRANSPORT_TYPES}")
print()

# Number of lines per transport type (value_counts orders by frequency)
transport_counts = lines_df['type'].value_counts()
print("Transport types found in the data:")
for type_name, n_lines in transport_counts.items():
    print(f"- {type_name}: {n_lines} lines")

# Bar chart of the same counts, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=transport_counts.index, y=transport_counts.values, ax=ax)
ax.set_title("Transport Type Distribution")
ax.set_xlabel("Transport Type")
ax.set_ylabel("Count")
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

### 2. Stop Connections Analysis

Analyze the number of connections per stop.

In [None]:
# Tables taken from the `data` mapping loaded in the transport-type cell
stops_df = data['stops']
line_stops_df = data['line_stops']

# One row per stop_id with the number of line_stops rows that reference it
connection_counts = (
    line_stops_df['stop_id']
    .value_counts()
    .rename_axis('stop_id')
    .reset_index(name='connection_count')
)

# Bucket the stops by how many connections they have
per_stop = connection_counts['connection_count']
one_connection = int((per_stop == 1).sum())
two_connections = int((per_stop == 2).sum())
more_connections = int((per_stop > 2).sum())

# Stops listed in stops_df that never appear in line_stops_df
all_stop_ids = set(stops_df['stop_id'])
connected_stop_ids = set(line_stops_df['stop_id'])
disconnected_count = len(all_stop_ids.difference(connected_stop_ids))

print("Connection counts:")
print(f"- 1 connection: {one_connection} stops")
print(f"- 2 connections: {two_connections} stops")
print(f"- 3+ connections: {more_connections} stops")
print(f"- Disconnected stops: {disconnected_count} stops")

# Same four buckets as a Series, in display order, for plotting
connection_distribution = pd.Series({
    "1 connection": one_connection,
    "2 connections": two_connections,
    "3+ connections": more_connections,
    "Disconnected": disconnected_count
})

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=connection_distribution.index, y=connection_distribution.values, ax=ax)
ax.set_title("Stop Connection Distribution")
ax.set_xlabel("Connection Count")
ax.set_ylabel("Number of Stops")
fig.tight_layout()
plt.show()

### 3. Examine Issues (if any)

If verification failed, examine the specific issues in more detail.

In [None]:
def _show_issue_sample(heading, issues, description, limit=10):
    """Print a heading, display the first rows of an issue table, and say how many were shown.

    Args:
        heading: Line printed above the table (e.g. "DUPLICATE STOPS:").
        issues: Table of offending rows; must support ``.head()`` and ``len()``.
        description: Plural noun phrase used in the summary line.
        limit: Maximum number of rows to display.
    """
    print(heading)
    display(issues.head(limit))
    # Report the number of rows actually displayed, which is less than
    # `limit` when the table is short (previously always claimed "10").
    print(f"Showing {min(limit, len(issues))} of {len(issues)} {description}")
    print()


# Check for duplicate stops
if not results['stop_uniqueness']['valid']:
    _show_issue_sample(
        "DUPLICATE STOPS:",
        results['stop_uniqueness']['duplicates'],
        "duplicate stops",
    )

# Check for disconnected stops
if results['stop_connections']['disconnected_count'] > 0:
    _show_issue_sample(
        "DISCONNECTED STOPS:",
        results['stop_connections']['disconnected'],
        "disconnected stops",
    )

# Check for stops with too many connections
if results['stop_connections']['too_many_connections_count'] > 0:
    _show_issue_sample(
        "STOPS WITH TOO MANY CONNECTIONS:",
        results['stop_connections']['too_many_connections'],
        "stops with too many connections",
    )

# Check for invalid line references
if results['referential_integrity']['invalid_line_refs_count'] > 0:
    _show_issue_sample(
        "INVALID LINE REFERENCES:",
        results['referential_integrity']['invalid_line_refs'],
        "invalid line references",
    )

# Check for invalid stop references
if results['referential_integrity']['invalid_stop_refs_count'] > 0:
    _show_issue_sample(
        "INVALID STOP REFERENCES:",
        results['referential_integrity']['invalid_stop_refs'],
        "invalid stop references",
    )

# Check for missing geographic data
if results['geographic_data']['missing_geo_count'] > 0:
    _show_issue_sample(
        "STOPS WITH MISSING GEOGRAPHIC DATA:",
        results['geographic_data']['missing_geo'],
        "stops with missing geographic data",
    )

# If all checks passed
if results['overall']:
    print("✅ All verification checks passed! The data is valid and ready for analysis.")

In [None]:
# ### 4. Station Distance Analysis
#
# Analyze the distances between connected stations to identify potential
# geolocation issues. The whole analysis is skipped when the verification
# results carry no 'station_distances' entry.

# Check if we have the enriched data with distances
if 'station_distances' in results:
    # Get distance check results (empty frames when a key is absent)
    distance_results = results['station_distances']
    too_close = distance_results.get('too_close', pd.DataFrame())
    too_far = distance_results.get('too_far', pd.DataFrame())

    # Load the line_stops with distances
    processed_dir = BASE_DIR / 'processed' / f"{YEAR}_{SIDE}"
    line_stops_with_dist = pd.read_csv(processed_dir / "line_stops_with_dist.csv")

    # Keep only rows with a distance. The explicit copy makes this an
    # independent frame, so adding 'transport_type' below does not trigger
    # pandas' SettingWithCopyWarning.
    valid_distances = line_stops_with_dist[
        line_stops_with_dist['distance_meters'].notna()
    ].copy()

    if not valid_distances.empty:
        distances = valid_distances['distance_meters']
        print("Distance Statistics:")
        print(f"- Total connections with valid distances: {len(valid_distances)}")
        print(f"- Average distance between stations: {distances.mean():.1f} meters")
        print(f"- Minimum distance: {distances.min():.1f} meters")
        print(f"- Maximum distance: {distances.max():.1f} meters")
        print()

        # Get transport type for each connection if not already included
        if 'transport_type' not in valid_distances.columns:
            # Read stops.csv under its own name so the `stops_df` defined by
            # the earlier connection-analysis cell is not overwritten.
            stops_with_type = pd.read_csv(processed_dir / "stops.csv")

            # Map each connection's stop_id to that stop's type
            stops_type_dict = stops_with_type.set_index('stop_id')['type'].to_dict()
            valid_distances['transport_type'] = valid_distances['stop_id'].map(stops_type_dict)

        # Group by transport type and calculate statistics
        print("Average distances by transport type:")
        transport_stats = valid_distances.groupby('transport_type')['distance_meters'].agg(
            ['count', 'mean', 'std', 'min', 'max']
        )

        # Format and display the results
        for transport_type, row in transport_stats.iterrows():
            print(f"- {transport_type}:")
            print(f"  • Average distance: {row['mean']:.1f} meters")
            print(f"  • Standard deviation: {row['std']:.1f} meters")
            print(f"  • Range: {row['min']:.1f} - {row['max']:.1f} meters")
            # iterrows() yields float rows; cast so the count prints as "12", not "12.0"
            print(f"  • Number of connections: {int(row['count'])}")
            print()

        # Visualize distance distribution on a 2x2 grid of axes
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))

        # Plot histogram of distances
        ax_hist = axes[0, 0]
        sns.histplot(distances, bins=20, ax=ax_hist)
        ax_hist.set_title("Distribution of Station Distances")
        ax_hist.set_xlabel("Distance (meters)")
        ax_hist.set_ylabel("Count")

        # Plot distances by transport type - boxplot
        ax_box = axes[0, 1]
        sns.boxplot(x='transport_type', y='distance_meters', data=valid_distances, ax=ax_box)
        ax_box.set_title("Distances by Transport Type (Boxplot)")
        ax_box.set_xlabel("Transport Type")
        ax_box.set_ylabel("Distance (meters)")
        ax_box.tick_params(axis='x', labelrotation=45)

        # Plot distances by transport type - violin plot
        ax_violin = axes[1, 0]
        sns.violinplot(x='transport_type', y='distance_meters', data=valid_distances, ax=ax_violin)
        ax_violin.set_title("Distances by Transport Type (Violin Plot)")
        ax_violin.set_xlabel("Transport Type")
        ax_violin.set_ylabel("Distance (meters)")
        ax_violin.tick_params(axis='x', labelrotation=45)

        # Plot average distances by transport type - bar plot
        # (reuses the per-type means already computed in transport_stats)
        ax_bar = axes[1, 1]
        transport_means = transport_stats['mean'].rename('distance_meters').reset_index()
        sns.barplot(x='transport_type', y='distance_meters', data=transport_means, ax=ax_bar)
        ax_bar.set_title("Average Distance by Transport Type")
        ax_bar.set_xlabel("Transport Type")
        ax_bar.set_ylabel("Average Distance (meters)")
        ax_bar.tick_params(axis='x', labelrotation=45)

        fig.tight_layout()
        plt.show()

        # Show problematic connections
        if not too_close.empty or not too_far.empty:
            print("\nProblematic Connections:")

            if not too_close.empty:
                print("\nStations that are too close (<200m):")
                display(too_close[['stop_id', 'stop_name', 'line_id', 'distance_meters', 'transport_type']].head(10))
                if len(too_close) > 10:
                    print(f"(Showing 10 of {len(too_close)} connections that are too close)")

            if not too_far.empty:
                print("\nStations that are too far apart:")
                display(too_far[['stop_id', 'stop_name', 'line_id', 'distance_meters', 'transport_type', 'max_allowed_distance']].head(10))
                if len(too_far) > 10:
                    print(f"(Showing 10 of {len(too_far)} connections that are too far)")
    else:
        print("No valid distances between stations were calculated.")
else:
    print("Station distance verification results not available.")