In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col 
import networkx as nx
import pandas as pd
from itertools import islice
import itertools
import plotly.express as px
import plotly.graph_objects as go
import ipywidgets as widgets
import pandas as pd
from path_utils import *
from time_utils import *
from probability_computing import *
from validation import *

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None) 
spark = SparkSession.builder.appName("Router").master("local").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
import os
username=os.getenv('USER', 'anonymous')
hadoop_fs=os.getenv('HADOOP_DEFAULT_FS', 'hdfs://iccluster067.iccluster.epfl.ch:8020')
print(f"local username={username}\nhadoop_fs={hadoop_fs}")

local username=ahominal
hadoop_fs=hdfs://iccluster067.iccluster.epfl.ch:8020


## Algorithm

### Data Loading

- Load everything to PandasDF in local (to diplay on graphs)

In [3]:
#Graph data
edge_path = f"/user/{username}/graph/all_edges"
all_edge = spark.read.orc(edge_path)
df_all_edge = all_edge.toPandas()

#Transform into seconds 
df_all_edge["start_time"]=df_all_edge["start_time"].apply(lambda x: None if x is None else get_sec(x))
df_all_edge["end_time"]=df_all_edge["end_time"].apply(lambda x: None if x is None else get_sec(x))
df_all_edge = df_all_edge[~df_all_edge["expected_travel_time"].isnull()]

node_path = f"/user/{username}/graph/nodes_area"
all_nodes = spark.read.orc(node_path)
df_all_nodes = all_nodes.toPandas()

24/06/04 19:41:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/06/04 19:41:09 WARN DomainSocketFactory: The short-circuit local reads feature cannot be used because libhadoop cannot be loaded.
                                                                                

In [4]:
df_all_edge.loc[0]

expected_travel_time                    60.0
start_stop_id                        8592050
start_time                           59580.0
trip_id                 1.TA.91-m2-j24-1.1.H
end_stop_id                          8591818
end_time                             59640.0
is_walking                                 0
Name: 0, dtype: object

In [5]:
#Identify negative expected_travel_time
neg_mask = df_all_edge['expected_travel_time'] < 0
# Make the expected_travel_time positive
df_all_edge.loc[neg_mask, 'expected_travel_time'] *= -1
df_all_edge.loc[neg_mask, ['start_time', 'end_time']] = df_all_edge.loc[neg_mask, ['end_time', 'start_time']].values

In [6]:
#Delay data
all_delays_path = f"/user/{username}/delay/all_delays"
all_delays = spark.read.orc(all_delays_path)
total_delays = all_delays.count()

avg_delay_path = f"/user/{username}/delay/avg_delay"
avg_delay = spark.read.orc(avg_delay_path)
df_avg_delay = avg_delay.toPandas()

                                                                                

In [7]:
df_avg_delay.loc[0]

stop_id         8592111
hour                 22
avg_delay      59.30157
std_delay    104.179788
Name: 0, dtype: object

## Results visualisation

#### First create basic component widgets

In [8]:
#Start graph visualisation
from IPython.display import display
from ipywidgets import widgets
import time_utils
import path_utils
import plotly.express as px
import plotly.graph_objects as go
import ipywidgets as widgets
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col 

In [9]:
#Define briefly the graph
style = {'description_width': 'initial'}
initial_pos = {'description_width':'initial'}

#Define stops
stop_names = sorted(df_all_nodes['stop_name'].unique())

start = widgets.Dropdown(options=stop_names, description='From:')
start.value = stop_names[0]

end = widgets.Dropdown(options=stop_names, description='To:')
end.value = stop_names[-1]

#Define run and map buttons
button = widgets.Button(description="Run")
data_output = widgets.Output()
map_output = widgets.Output()

# Define inputs
hours = widgets.BoundedFloatText(min=0, max=23, value=12, step=1, description='Hour:', style=initial_pos)
minutes = widgets.BoundedFloatText(min=0, max=59, value=0, step=1, description='Minute:', style=initial_pos)
number_routes = widgets.BoundedIntText(min=0, max=7, value=3, step=1, description='Number of paths to show (0-6):', style=initial_pos)
max_trip_len = widgets.BoundedIntText(min=1, max=3, value=2, step=1, description='Max duration (1-3 h):', style=initial_pos)
interval = widgets.BoundedFloatText(min=0, max=1, value=0.5, step=0.05, description='Confidence Interval :', style=initial_pos)

use_validation = widgets.Checkbox(value=False, description='Show validation',style=style)

In [25]:
def create_map(path, stop_name):
    """
    Creates a map visualization of a given path and stop name.

    Parameters:
    path (pandas.DataFrame): DataFrame containing the path information, including start and end stop names, latitudes, and longitudes.
    stop_name (str): The name of the stop to be used as the center of the map.

    Returns:
    go.Figure: A Plotly Figure object representing the map visualization.
    """

    # Center given the initial stop
    center_lat = df_all_nodes[df_all_nodes['stop_name'] == stop_name].iloc[0]['stop_lat']
    center_lon = df_all_nodes[df_all_nodes['stop_name'] == stop_name].iloc[0]['stop_lon']

    # Get relevant data
    starts = path[['start_stop_name', 'start_lat', 'start_lon']]
    ends = path[['end_stop_name', 'end_lat', 'end_lon']]
    starts.columns = ['stop_name', 'stop_lat', 'stop_lon']
    ends.columns = ['stop_name', 'stop_lat', 'stop_lon']
    map = pd.concat([starts, ends])
    fig = go.Figure()

    # Add lines to the plot
    for _, row in path.iterrows():
        fig.add_trace(
            go.Scattermapbox(mode="lines", lon=[row['start_lon'], row['end_lon']], lat=[row['start_lat'], row['end_lat']], 
                marker={'size': 8}, text='Walking' if row['walking'] else 'Transport', hoverinfo='text'))
    # Add stops 
    fig.add_trace(
        go.Scattermapbox(lat=map["stop_lat"], lon=map["stop_lon"], mode='markers', 
                         marker=dict(size=10, color='blue'), text=map["stop_name"], hoverinfo='text'))
    
    fig.update_layout(
        mapbox_style="open-street-map",
        hovermode='closest',
        mapbox=dict(bearing=0, center=dict(lat=center_lat, lon=center_lon), pitch=0, zoom=11),
        showlegend=False
    )
    
    return fig
    
def print_map(path, center_name):
    with map_output:
        map_output.clear_output()
        display(create_map(path, center_name))

def print_data(path):
    with data_output:
        data_output.clear_output()
        display(path)

#### Define here function to create the map given a path and the path in itself

In [26]:
def output_path_data(df, df_all_nodes):
    """
    Merge the start and stop information from the given dataframes.

    Args:
        df (pandas.DataFrame): The dataframe containing the start and end stop IDs.
        df_all_nodes (pandas.DataFrame): The dataframe containing all the stop information.

    Returns:
        pandas.DataFrame: The merged dataframe with start and end stop information.

    """
    # Merge the start stop information
    start_stop_info = df.merge(
        df_all_nodes[['stop_id', 'stop_name', 'stop_lat', 'stop_lon']],
        left_on='start_stop_id',
        right_on='stop_id',
        how='left'
    ).rename(columns={'stop_name': 'start_stop_name', 'stop_lat': 'start_lat', 'stop_lon': 'start_lon'}).drop(columns=['stop_id'])
    full_info = start_stop_info.merge(
        df_all_nodes[['stop_id', 'stop_name', 'stop_lat', 'stop_lon']],
        left_on='end_stop_id',
        right_on='stop_id',
        how='left'
    ).rename(columns={'stop_name': 'end_stop_name', 'stop_lat': 'end_lat', 'stop_lon': 'end_lon'}).drop(columns=['stop_id'])
    full_info['walking'] = full_info['trip_id'] == 'None'
    
    return full_info

In [27]:

def calculate(button):
    """
    Calculate the paths and display the results on the output widgets.

    Parameters:
    - button: The button widget that triggers the calculation.

    Returns:
    - None
    """

    # Retrieve inputs
    time = f"{int(hours.value):02d}:{int(minutes.value):02d}:00" 
    max_length = int(max_trip_len.value) * 3600
    target = df_all_nodes[df_all_nodes['stop_name'] == end.value]['stop_id'].values[0]
    source = df_all_nodes[df_all_nodes['stop_name'] == start.value]['stop_id'].values[0]
    use_validate = use_validation.value
    confidence_interval = interval.value

    # Calculate the paths and the delays corresponding to each delay
    paths = get_best_paths(df_all_edge, source, target, time, max_length, int(number_routes.value), confidence_interval, df_avg_delay)
    paths.sort(key=lambda path: path[0]["start_time"], reverse=True)
    all_paths = [pd.DataFrame(path) for path in paths]
    paths_proba = [calculate_connection_probability(path, df_avg_delay) for path in paths]
    paths_validate = [historic_frequency(path, all_delays, total_delays) for path in paths] if use_validate else []
    print(paths_validate)
    # No path = return directly
    if len(all_paths) == 0:
        with data_output:
            data_output.clear_output()
            display(widgets.Label("No paths found."))
        with map_output:
            map_output.clear_output()
            display(widgets.Label("No paths found."))
        return
    
    # Create tab widgets for displaying maps and tables
    data_tab = widgets.Tab()
    map_tab = widgets.Tab()

    # Populate the tabs with outputs
    for i, path in enumerate(all_paths):
        # Output probabilities
        path_cleaned = output_path_data(path, df_all_nodes)
        proba_exp = f"Probability found (exponential): {int(paths_proba[i][0] * 100)}%"
        proba_norm = f"Probability found (normal): {int(paths_proba[i][1] * 100)}%"
        proba_validate = f" and in historical data, {int(paths_validate[i] * 100)}% of paths were successful." if use_validate else ""
        proba_output = widgets.Label(f"{proba_exp}, {proba_validate}")
        proba_norm_output = widgets.Label(f"{proba_norm}")

        # Data output
        data_output_widget = widgets.Output()
        path_cleaned_copy = path_cleaned.copy()
        with data_output_widget:
            display(proba_output)
            display(proba_norm_output)
            path_cleaned = path_cleaned[['start_stop_name', 'end_stop_name', 'start_time', 'end_time', 'walking']]
            path_cleaned.columns = ['Start Stop', 'End Stop', 'Start Time', 'End Time', 'Is Walking']
            display(path_cleaned)
        data_tab.children += (data_output_widget,)
        data_tab.set_title(i, f'Path {i + 1}')

        # Map output
        map_output_widget = widgets.Output()
        with map_output_widget:
            display(create_map(path_cleaned_copy, start.value))
        map_tab.children += (map_output_widget,)
        map_tab.set_title(i, f'Map {i + 1}')
    
    # Display the tabs
    with data_output:
        data_output.clear_output()
        display(data_tab)
    with map_output:
        map_output.clear_output()
        display(map_tab)

button.on_click(calculate)

In [28]:
#Display nicely everything
#Create the input widgets box with some padding, margin, and background color
input_widgets = widgets.HBox([hours, minutes],
                             layout=widgets.Layout(margin='10px 0', padding='10px', border='solid 1px gray', border_radius='5px', background_color='#f0f0f0', justify_content='center'))

input_widgets2 = widgets.HBox([max_trip_len, interval],
                             layout=widgets.Layout(margin='10px 0', padding='10px', border='solid 1px gray', border_radius='5px', background_color='#f0f0f0', justify_content='center'))

nodes_widgets = widgets.HBox([start, end],
                             layout=widgets.Layout(margin='10px 0', padding='10px', border='solid 1px gray', border_radius='5px', background_color='#f0f0f0', justify_content='center'))

custom_widgets = widgets.HBox([number_routes, use_validation],
                             layout=widgets.Layout(margin='10px 0', padding='10px', border='solid 1px gray', border_radius='5px', background_color='#f0f0f0', justify_content='center'))

# Group input, nodes, and custom widgets into one vertical box with a button below
all_widgets = widgets.VBox([input_widgets, input_widgets2, nodes_widgets, custom_widgets, button],
                           layout=widgets.Layout(margin='20px 0', padding='10px', border='solid 1px gray', border_radius='5px', background_color='#e0e0e0', align_items='center'))

tab = widgets.Tab([map_output, data_output])
tab.set_title(0, 'Map')
tab.set_title(1, 'Planning')

# Organize the entire dashboard
dashboard = widgets.VBox([all_widgets, tab],
                         layout=widgets.Layout(border='solid 2px black', border_radius='10px', align_items='center', background_color='#d0d0d0'))

# Display the dashboard
display(dashboard)

VBox(children=(VBox(children=(HBox(children=(BoundedFloatText(value=12.0, description='Hour:', max=23.0, step=…