In [1]:
import sys
import pickle
from route_planner_omar import * 
from datetime import timedelta, datetime

MODULES_PATH = "../../modules"
if MODULES_PATH not in sys.path:
    sys.path.append(MODULES_PATH)
    
from hive_wrapper import *
from database import *
from utils import *
from route_planner import *
from validation_utils import *
from visualization import *

%reload_ext autoreload
%autoreload 2

In [2]:
%reload_ext sparkmagic.magics

Cleaning up livy sessions on exit is enabled


In [3]:
username, _, _ = get_env_vars()

In [4]:
setup_spark(username)

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
5614,application_1680948035106_5110,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


# Validation

## Simple examples 
We are first going to check the viability of our route planner based on a few frequent trips that a Zürich citizen might take :

**Start** : Zürich HB   
**End** : Zürich Auzelg   
**Deadline Date** : 29/05/2023  16:30:00  
**Path picking condition** : Path with the latest departure time

In [13]:
# TOY EXAMPLE : zürich hb -> zürich, auzelg
def plan_route(deadline_hr, source_id, source_name, target_id, target_name):
    
    deadline_datetime = datetime.strptime(deadline_hr, "%H:%M:%S")

    rp = RoutePlanner(graph, source_id, target_id, deadline=deadline_datetime)
    routes = rp.find_paths(1)

    stop_names_rp = {source_name, target_name}
    for route in routes:
        for edge in route:
            print(edge)
            stop_names_rp.add(edge[2]["stop_name"])
            stop_names_rp.add(edge[2]["next_stop_name"])
        print()

    print("---------------")
    #print("Route planner stops", stop_names_rp, "\n")
    return stop_names_rp

deadline_hr = "16:30:00"
source_id = "8503000"
target_id = "8591049"
source_rp = "zürich hb"
target_rp = "zürich, auzelg"
plan_route(deadline_hr, source_id, source_rp, target_id, target_rp)

set()
('8503000:0:33', '8503000', {'trip_id': None, 'trip_headsign': None, 'route_name': None, 'transport_type': 'walk', 'transport_subtype': 'walk', 'stop_name': 'zürich hb', 'departure_time': '16:10:00', 'next_stop_name': 'zürich hb', 'next_arrival_time': None, 'next_departure_time': None, 'is_walkable': True, 'duration_s': 120.0})
('8503006:0:2', '8503000:0:33', {'trip_id': '303.TA.91-14-C-j23-1.104.R', 'trip_headsign': 'Hinwil', 'route_name': 's14', 'transport_type': 'train', 'transport_subtype': 's', 'stop_name': 'zürich hb', 'departure_time': '16:12:00', 'next_stop_name': 'zürich oerlikon', 'next_arrival_time': '16:16:00', 'next_departure_time': '16:18:00', 'is_walkable': False, 'duration_s': 240.0})
('8503129:0:3', '8503006:0:2', {'trip_id': '303.TA.91-14-C-j23-1.104.R', 'trip_headsign': 'Hinwil', 'route_name': 's14', 'transport_type': 'train', 'transport_subtype': 's', 'stop_name': 'zürich oerlikon', 'departure_time': '16:18:00', 'next_stop_name': 'wallisellen', 'next_arrival_t

{'wallisellen',
 'wallisellen, bahnhof',
 'wallisellen, belair',
 'wallisellen, herti',
 'zürich hb',
 'zürich oerlikon',
 'zürich, auzelg'}

You can see that fundamentally this first example corresponds to the following SBB query : 
![Zürich HB -> Zürich Aulzeg](./validation_images/toy_example.png)

Our route planner algorithm had the assumptions of having a deadline hour to choose, and pick the latest arrival time to give to the user. In this example, we see that the deadline hour is 16:30:00, so our algorithm will pick the path arriving in source station at 16:12:00 :
![Path picked](./validation_images/toy_example_hb_auzelg.png)

**Start** : Zürich HB  
**End** : Kloten, Kasern OST  
**Deadline Date** : 29/05/2023  16:00:00  
**Path picking condition** : Path with the latest departure time

In [21]:
deadline_hr = "14:00:00"
source_id = "8503000"
target_id = "8573233"
source_rp = "zürich hb"
target_rp = "kloten, kaserne ost"
plan_route(deadline_hr, source_id, source_rp, target_id, target_rp)

set()
('8503000:0:41/42', '8503000', {'trip_id': None, 'trip_headsign': None, 'route_name': None, 'transport_type': 'walk', 'transport_subtype': 'walk', 'stop_name': 'zürich hb', 'departure_time': '13:17:00', 'next_stop_name': 'zürich hb', 'next_arrival_time': None, 'next_departure_time': None, 'is_walkable': True, 'duration_s': 120.0})
('8503020:0:3', '8503000:0:41/42', {'trip_id': '129.TA.91-7-C-j23-1.20.H', 'trip_headsign': 'Winterthur', 'route_name': 's7', 'transport_type': 'train', 'transport_subtype': 's', 'stop_name': 'zürich hb', 'departure_time': '13:19:00', 'next_stop_name': 'zürich hardbrücke', 'next_arrival_time': '13:21:00', 'next_departure_time': '13:21:00', 'is_walkable': False, 'duration_s': 120.0})
('8503006:0:8', '8503020:0:3', {'trip_id': '129.TA.91-7-C-j23-1.20.H', 'trip_headsign': 'Winterthur', 'route_name': 's7', 'transport_type': 'train', 'transport_subtype': 's', 'stop_name': 'zürich hardbrücke', 'departure_time': '13:21:00', 'next_stop_name': 'zürich oerlikon',

{'kloten',
 'kloten balsberg',
 'kloten, bahnhof',
 'kloten, kaserne ost',
 'kloten, kirchgasse',
 'kloten, zum wilden mann',
 'opfikon',
 'zürich hardbrücke',
 'zürich hb',
 'zürich oerlikon'}

![Staffeln AG -> Illnau, Horben](./validation_images/zurich_kloten.png)
![Staffeln AG -> Illnau, Horben](./validation_images/zurich_kloten_2.png)

In this example, we see a small difference in the end of 

## Distance based validation
We first compute the pairwise of all stops in the Zürich area, and order them by their Haversine distance which is the angular distance between two points on the surface of a sphere. As explained earlier, we are going to randomly choose several stop pairs from different distance quartiles, extreme points and compare the path outputed by our planner with the official SBB planner, with the assumptions presented earlier.

In [38]:
%%spark
import numpy as np

from pyspark.sql import functions as F
from pyspark.sql.window import Window
import pickle

username = "ouerghem"
stops_df = spark.read.orc(f"/user/{username}/preprocessed/pp_stops")

# Constants
EARTH_RADIUS = 6371 * 1e3  # in meters

def haversine_dist(x_lat, x_lon, y_lat, y_lon, in_radian=False):
    """
    Calculate the Haversine distance between two points given their latitudes and longitudes.
    """
    # Convert to radians if needed
    if not in_radian:
        x_lon = F.radians(x_lon)
        x_lat = F.radians(x_lat)
        y_lon = F.radians(y_lon)
        y_lat = F.radians(y_lat)

    sin2_lat = F.pow(F.sin(0.5 * (y_lat - x_lat)), 2)
    sin2_lon = F.pow(F.sin(0.5 * (y_lon - x_lon)), 2)
    cos_lat_prod = F.cos(y_lat) * F.cos(x_lat)

    # Distance in meters
    dist = 2 * EARTH_RADIUS * F.asin(F.sqrt(sin2_lat + cos_lat_prod * sin2_lon))
    return dist
stops_df = stops_df.filter(F.col("stop_name") != "hausen am albis, post")
distance_df = stops_df.alias("a").crossJoin(stops_df.alias("b")) \
    .select(
        F.col("a.stop_id").alias("stop_id_a"),
        F.col("a.stop_name").alias("stop_name_a"),
        F.col("a.stop_lat").alias("stop_lat_a"),
        F.col("a.stop_lon").alias("stop_lon_a"),
        F.col("b.stop_id").alias("stop_id_b"),
        F.col("b.stop_name").alias("stop_name_b"),
        F.col("b.stop_lat").alias("stop_lat_b"),
        F.col("b.stop_lon").alias("stop_lon_b")
    ) \
    .withColumn("distance", haversine_dist(
        F.col("stop_lat_a"),
        F.col("stop_lon_a"),
        F.col("stop_lat_b"),
        F.col("stop_lon_b")
    ))

window_spec = Window.partitionBy("stop_id_a").orderBy(F.desc("distance"))
farthest_stops_df = distance_df.withColumn("rank", F.row_number().over(window_spec)) \
    .filter(F.col("rank") == 1) \
    .select(
        F.col("stop_id_a"),
        F.col("stop_name_a"),
        F.col("stop_id_b"),
        F.col("stop_name_b"),
        F.col("distance")
    ) \
    .orderBy(F.desc("distance"))

quantiles = [0.25, 0.5, 0.75]  

quantile_values = distance_df.approxQuantile("distance", quantiles, 0.25)
quantile_dataframes = []
for i, quantile in enumerate(quantiles):
    lower_bound = quantile_values[i-1] if i > 0 else float("-inf")
    upper_bound = quantile_values[i]
    quantile_df = distance_df.filter((F.col("distance") > lower_bound) & (F.col("distance") <= upper_bound))
    quantile_dataframes.append(quantile_df)
    
quantile_stop_list = []
for i, quantile in enumerate(quantiles):
    print(f"Quantile {quantile}: {quantile_values[i]}")
    quantile_stop_list.append(quantile_dataframes[i].rdd.map(tuple).take(20))
    print("----------------------")
    
farthest_stops_list = farthest_stops_df.rdd.map(tuple).take(40)
median_distance_stops = quantile_stop_list[1]
median_distance_stops

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Quantile 0.25: 0.0
----------------------
Quantile 0.5: 31648.71460268367
----------------------
Quantile 0.75: 31848.710981768763
----------------------
[('8503056', 'ringlikon', 47.3602952403085, 8.47742828687451, '8503061', 'langwies zh', 47.2960526930082, 8.69438939432526, 17844.102198839013), ('8503056', 'ringlikon', 47.3602952403085, 8.47742828687451, '8503147:0:2', 'stettbach', 47.3972125517017, 8.59614065168743, 9836.060680752431), ('8503056', 'ringlikon', 47.3602952403085, 8.47742828687451, '8503303:0:2', 'illnau', 47.4088195240113, 8.72273124154318, 19240.228583997417), ('8503056', 'ringlikon', 47.3602952403085, 8.47742828687451, '8573205:0:N', 'zürich flughafen, bahnhof', 47.4501618688915, 8.56374740253758, 11918.666352882656), ('8503056', 'ringlikon', 47.3602952403085, 8.47742828687451, '8573260', 'höri, fuhrstrasse', 47.509538772054, 8.51530125925827, 16837.83763362294), ('8503056', 'ringlikon', 47.3602952403085, 8.47742828687451, '8576131', 'volketswil, eichstrasse', 47.3

### Median distanced stops

In [39]:
median_stops = [('8503056', 'ringlikon', '8503061', 'langwies zh', 17844.102198839013), 
                ('8503056', 'ringlikon', '8503147:0:2', 'stettbach', 9836.060680752431), 
                ('8503056', 'ringlikon', '8503303:0:2', 'illnau', 19240.228583997417), 
                ('8503056', 'ringlikon', '8573205:0:N', 'zürich flughafen, bahnhof', 11918.666352882656), 
                ('8503056', 'ringlikon', '8573260', 'höri, fuhrstrasse', 16837.83763362294), 
                ('8503056', 'ringlikon', '8576131', 'volketswil, eichstrasse', 15892.450993160162)]

**Start** : Ringlikon   
**End** : Langwies ZH     
**Deadline Date** : 29/05/2023  8:30:00  
**Path picking condition** : Path with the latest departure time

In [37]:
deadline_hr = "8:30:00"
source_id, source_name, target_id, target_name, _ = median_stops[0]
plan_route(deadline_hr, source_id, source_rp, target_id, target_rp)

set()
('8503056:0:1', '8503056', {'trip_id': None, 'trip_headsign': None, 'route_name': None, 'transport_type': 'walk', 'transport_subtype': 'walk', 'stop_name': 'ringlikon', 'departure_time': '06:58:00', 'next_stop_name': 'ringlikon', 'next_arrival_time': None, 'next_departure_time': None, 'is_walkable': True, 'duration_s': 120.0})
('8503055:0:1', '8503056:0:1', {'trip_id': '74.TA.91-10-A-j23-1.7.H', 'trip_headsign': 'Zürich HB SZU', 'route_name': 's10', 'transport_type': 'bus', 'transport_subtype': 'b', 'stop_name': 'ringlikon', 'departure_time': '07:00:00', 'next_stop_name': 'uitikon waldegg', 'next_arrival_time': '07:02:00', 'next_departure_time': '07:03:00', 'is_walkable': False, 'duration_s': 120.0})
('8503054:0:1', '8503055:0:1', {'trip_id': '74.TA.91-10-A-j23-1.7.H', 'trip_headsign': 'Zürich HB SZU', 'route_name': 's10', 'transport_type': 'bus', 'transport_subtype': 'b', 'stop_name': 'uitikon waldegg', 'departure_time': '07:03:00', 'next_stop_name': 'zürich triemli', 'next_arri

{'egg',
 'forch',
 'hinteregg',
 'langwies zh',
 'neuhaus bei hinteregg',
 'ringlikon',
 'scheuren',
 'uitikon waldegg',
 'zürich binz',
 'zürich friesenberg',
 'zürich hb',
 'zürich schweighof',
 'zürich selnau',
 'zürich selnau, bahnhof',
 'zürich stadelhofen, bahnhof',
 'zürich triemli',
 'zürich, auzelg',
 'zürich, balgrist',
 'zürich, bellevue',
 'zürich, bürkliplatz',
 'zürich, hegibachplatz',
 'zürich, kantonalbank',
 'zürich, kreuzplatz',
 'zürich, paradeplatz',
 'zürich, rehalp',
 'zürich, stockerstrasse'}

**Start** :   
**End** :   
**Deadline Date** : 29/05/2023  16:00:00  
**Path picking condition** : Path with the latest departure time

In [None]:
deadline_hr = "16:00:00"
source_id, source_name, target_id, target_name, _ = median_stops[1]
plan_route(deadline_hr, source_id, source_rp, target_id, target_rp)

### Farthest distanced stops

In [18]:
farthest_stops = [('8572644', 'staffeln ag, gass', '8575929', 'illnau, horben', 31805.94865060654),
('8580840', 'staffeln ag, schulhaus', '8575929', 'illnau, horben', 31754.29007937755),
('8580846', 'bremgarten ag, kaserne', '8575929', 'illnau, horben', 31672.301820298693),
('8503783', 'neerach, post', '8590499', 'au zh, seeguet', 31665.621728303966),
('8573559', 'horgenberg, vorderklausen', '8573272', 'steinmaur, heitlig', 31659.006961176536),
('8503302:0:2', 'fehraltorf', '8572567', 'künten, gried', 31648.71460268367)]

Based on the previous cell, we found that the two stops with the largest distance separation are Staffeln AG, Gass and Illnau, Horben. We are thus going to check the following trip :  
**Start** : Staffeln AG, Gass  
**End** : Illnau, Horben  
**Deadline Date** : 29/05/2023  16:00:00  
**Path picking condition** : Path with the latest departure time

In [17]:
deadline_hr = "16:00:00"
source_id, source_name, target_id, target_name, _ = farthest_stops[0]
plan_route(deadline_hr, source_id, source_rp, target_id, target_rp)

set()
('8572645', '8572644', {'trip_id': None, 'trip_headsign': None, 'route_name': None, 'transport_type': 'walk', 'transport_subtype': 'walk', 'stop_name': 'staffeln ag, gass', 'departure_time': '13:02:26', 'next_stop_name': 'hermetschwil, kloster', 'next_arrival_time': None, 'next_departure_time': None, 'is_walkable': True, 'duration_s': 454.0})
('8572643', '8572645', {'trip_id': '55.TA.96-170-3-j23-1.4.R', 'trip_headsign': 'Bremgarten AG, Obertorplatz', 'route_name': '339', 'transport_type': 'bus', 'transport_subtype': 'b', 'stop_name': 'hermetschwil, kloster', 'departure_time': '13:10:00', 'next_stop_name': 'bremgarten ag, west', 'next_arrival_time': '13:12:00', 'next_departure_time': '13:13:00', 'is_walkable': False, 'duration_s': 120.0})
('8502272:0:1/2', '8572643', {'trip_id': None, 'trip_headsign': None, 'route_name': None, 'transport_type': 'walk', 'transport_subtype': 'walk', 'stop_name': 'bremgarten ag, west', 'departure_time': '13:42:41', 'next_stop_name': 'bremgarten west

{'bassersdorf',
 'berikon, kirche',
 'berikon, kreisschule',
 'berikon, mattenhof',
 'berikon, stalden',
 'berikon-widen',
 'berikon-widen, bahnhof',
 'bibenlos-sonnenhof',
 'birmensdorf zh',
 'birmensdorf zh, altenberg',
 'birmensdorf zh, bahnhof süd',
 'birmensdorf zh, zentrum',
 'bremgarten',
 'bremgarten ag, west',
 'bremgarten isenlauf',
 'bremgarten obertor',
 'bremgarten west',
 'effretikon',
 'effretikon, bahnhof',
 'effretikon, eselriet',
 'effretikon, lindenwiese',
 'effretikon, müselacher',
 'effretikon, wattspitz',
 'effretikon, zentrum',
 'hermetschwil, kloster',
 'illnau, bahnhof',
 'illnau, chrummenacher',
 'illnau, dorfplatz',
 'illnau, horben',
 'illnau, kirche',
 'illnau, steinacher',
 'illnau, weisslingerstrasse',
 'illnau, wingert',
 'kloten, kaserne ost',
 'oberwil-lieli, bündtenmättli',
 'oberwil-lieli, dreispitz',
 'oberwil-lieli, englisächer',
 'oberwil-lieli, im moos',
 'oberwil-lieli, lieli dorf',
 'staffeln ag, gass',
 'urdorf',
 'urdorf weihermatt',
 'widen 

In this example, we get a pretty similar path with a small difference in the start of the travel 

![Staffeln AG -> Illnau, Horben](./validation_images/staffeln_ag.png)
![Staffeln AG -> Illnau, Horben](./validation_images/staffeln_ag_2.png)