In [279]:
import os
import sys
import zipfile

import pandas as pd

In [228]:
# Location of the GTFS feed (a zip archive). Set the GTFS_PATH environment
# variable to point the notebook at another feed; without it the original
# local path is used.
GTFS_PATH = os.environ.get(
    'GTFS_PATH', '/home/moritz/dev/uni/multi_modal_data/Helsinki/GTFS.zip'
)

# Read data 

In [229]:
# Names of the GTFS tables (members of the zip archive) this notebook reads.
STOPS_FILE = "stops.txt"
ROUTES_FILE = "routes.txt"
TRIPS_FILE = "trips.txt"
STOP_TIMES_FILE = "stop_times.txt"

# The order matters: the loaded frames are unpacked positionally further down.
EXPECTED_FILES = [
    STOPS_FILE,
    ROUTES_FILE,
    TRIPS_FILE,
    STOP_TIMES_FILE,
]

In [230]:
# Load every expected GTFS table into a DataFrame.
# `dfs` ends up holding the frames in the same order as EXPECTED_FILES.
dfs = ()

with zipfile.ZipFile(GTFS_PATH, 'r') as zip_ref:
    contained = set(zip_ref.namelist())
    missing = [file for file in EXPECTED_FILES if file not in contained]
    if missing:
        # Name the missing members so a broken feed is easy to diagnose.
        raise FileNotFoundError(
            'Not all expected files are in the zip file: ' + ', '.join(missing)
        )

    frames = []
    for file in EXPECTED_FILES:
        with zip_ref.open(file) as f:
            frames.append(pd.read_csv(f))

    dfs = tuple(frames)
	

## Preprocessing
Our algorithms expect every route to have a single fixed path, i.e. every trip of a route visits the same sequence of stops.
GTFS does not guarantee this: trips of the same route can run in opposite directions and serve different stops.  
In this preprocessing step we split such routes so that every resulting route has exactly one path.  

After this step, every new `route_id` has the following format:  
`<old_route_id>_<direction>_<path_id>`


In [231]:
# Unpack positionally: `dfs` was filled in EXPECTED_FILES order
# (stops, routes, trips, stop_times).
stops_df, routes_df, trips_df, stop_times_df = dfs

In [232]:
# Split routes that run in opposite directions: every (route_id, direction_id)
# pair becomes its own route, named "<route_id>_<direction_id>".
# route_id is cast to str as well, so that route ids which read_csv parsed as
# numbers do not break the string concatenation.
trips_df["extended_route_id"] = (
    trips_df["route_id"].astype(str) + "_" + trips_df["direction_id"].astype(str)
)

In [233]:
# Attach the stop times to every trip (join on trip_id).
trips_stop_times_df = trips_df.merge(stop_times_df, on="trip_id")

# One row per (extended_route_id, trip_id). The "stop_id" column holds the
# trip's stop ids in stop_sequence order, rendered as the string form of a list.
paths_df = (
    trips_stop_times_df.sort_values(
        by=["extended_route_id", "trip_id", "stop_sequence"]
    )
    .groupby(["extended_route_id", "trip_id"])["stop_id"]
    .apply(lambda stop_ids: str(list(stop_ids)))
    .reset_index()
)


In [234]:
# Walk over every (route, trip) row and give each distinct stop sequence of a
# route its own path id; the new route id is "<extended_route_id>_<path_id>".
# Path ids are handed out per route in order of first appearance: 'A', 'B', ...
# The ids are tracked per route, so the result does not depend on the rows of
# one route being adjacent in paths_df.
# NOTE: ids come from chr(ord('A') + n), so a route with more than 26 distinct
# paths continues with the characters that follow 'Z'.
path_ids_by_route = {}  # extended_route_id -> {path string -> path id}
new_route_ids = []

for extended_route_id, path in zip(
    paths_df["extended_route_id"], paths_df["stop_id"]
):
    route_path_ids = path_ids_by_route.setdefault(extended_route_id, {})
    if path not in route_path_ids:
        route_path_ids[path] = chr(ord('A') + len(route_path_ids))
    new_route_ids.append(extended_route_id + "_" + route_path_ids[path])

# Assign the whole column at once instead of writing cell by cell.
paths_df["new_route_id"] = new_route_ids


In [235]:
# Bring "new_route_id" back onto the trips table. The path string itself
# (column "stop_id" of paths_df) is dropped before joining.
trips_df = pd.merge(
    trips_df,
    paths_df.drop(columns=["stop_id"]),
    on=["extended_route_id", "trip_id"],
)


In [236]:
# Keep the feed's original id in "old_route_id" and let "route_id" carry the
# new, path-specific id from here on.
trips_df = trips_df.assign(
    old_route_id=trips_df["route_id"],
    route_id=trips_df["new_route_id"],
)

## Convert the DataFrames into convenient lookup structures

In [237]:
# Empty lookup tables; the cells below fill (or rebuild) them.
# The "l.NN" notes are kept from the original; they presumably point at lines of
# the algorithm's pseudocode -- TODO confirm.

# Keyed by stop
routes_by_stop = {}  # l.12

# Keyed by route
stops_by_route = {}  # atm we don't need the stop sequence
stops_by_route_dict = {}  # l. 14
stop_times_by_trips_by_route = {}
trip_by_route = {}  # trips ordered by first departure time

# Keyed by trip
stop_times_by_trip = {}  # atm not needed in this form
times_by_stop_by_trip = {}  # l. 24

In [238]:
# Build stop_times_by_trip: trip_id -> list of stop-time records (dicts with
# the columns below), ordered by stop_sequence.
stop_time_columns = ["arrival_time", "departure_time", "stop_id", "stop_sequence"]

# A single sort by (trip_id, stop_sequence) replaces the per-group
# groupby().apply(sort_values), which is slow and depends on how pandas treats
# the grouping column inside apply.
stop_times_by_trip_df = (
    stop_times_df.sort_values(["trip_id", "stop_sequence"])
    .set_index("trip_id")[stop_time_columns]
)

# Start from an empty dict so that re-running this cell does not append every
# record a second time.
stop_times_by_trip = {}
for trip_id, record in zip(
    stop_times_by_trip_df.index, stop_times_by_trip_df.to_dict("records")
):
    stop_times_by_trip.setdefault(trip_id, []).append(record)

In [239]:
# Distinct ids of each kind. Note that route ids are taken from trips_df,
# i.e. they are the new, path-specific route ids.
stop_id_set = {*stops_df["stop_id"]}
route_id_set = {*trips_df["route_id"]}
trip_id_set = {*trips_df["trip_id"]}

In [240]:
# route_id -> list of the trip ids that belong to that route.
trip_ids_by_route = {
    route_id: trip_ids.tolist()
    for route_id, trip_ids in trips_df.groupby("route_id")["trip_id"]
}


In [241]:
for route_id, trip_ids in trip_ids_by_route.items():
    # Register the stop times of every trip under its route.
    stop_times_of_route = {
        trip_id: stop_times_by_trip[trip_id] for trip_id in trip_ids
    }
    if stop_times_of_route:
        stop_times_by_trips_by_route.setdefault(route_id, {}).update(
            stop_times_of_route
        )

    # Distinct (stop_id, stop_sequence) pairs of the route in order of first
    # appearance; dict.fromkeys keeps insertion order and drops repeats.
    stops_by_route[route_id] = list(
        dict.fromkeys(
            (stop_time['stop_id'], stop_time['stop_sequence'])
            for trip_stop_times in stop_times_of_route.values()
            for stop_time in trip_stop_times
        )
    )

In [242]:
# Invert stops_by_route: stop_id -> list of the routes that serve the stop.
routes_by_stop = {}
for route_id, stops in stops_by_route.items():
    for stop, _ in stops:
        routes = routes_by_stop.setdefault(stop, [])
        # A stop can occur more than once on a route. All entries of one route
        # are added back to back, so checking the tail is enough to keep each
        # route listed only once per stop.
        if not routes or routes[-1] != route_id:
            routes.append(route_id)


def get_routes_serving_stop(stop_id):
    """Return the ids of the routes that serve `stop_id`.

    Looks the stop up in `routes_by_stop`. A stop that no route serves yields
    an empty list instead of raising KeyError, so callers can simply iterate.
    """
    return routes_by_stop.get(stop_id, [])

In [275]:
# Fast access to the stop_sequence when route and stop are known:
# route_id -> {stop_id -> stop_sequence}.
stops_by_route_dict = {
    route_id: dict(stop_pairs) for route_id, stop_pairs in stops_by_route.items()
}

def get_idx_of_stop_in_route(stop_id, route_id):
    """Return the value stored for `stop_id` under `route_id` in `stops_by_route_dict`.

    As built in this notebook, that value is the stop's stop_sequence.
    Raises KeyError if the route is unknown or the stop is not on the route.
    """
    stop_sequence_by_stop = stops_by_route_dict[route_id]
    return stop_sequence_by_stop[stop_id]

In [289]:
def str_time_to_seconds(str_time):
    """Convert an "HH:MM:SS" time string into a number of seconds.

    No range check is applied, so hour values of 24 and above are converted
    as-is (e.g. "25:00:00" -> 90000).

    Raises ValueError if the string does not consist of exactly three
    integer fields separated by ':'.
    """
    # The original read an undefined name here instead of the parameter.
    hours, minutes, seconds = map(int, str_time.split(':'))
    return hours * 3600 + minutes * 60 + seconds

In [290]:
def _times_by_stop(stop_times):
    """Map each stop_id of one trip to its (arrival, departure) pair in seconds."""
    return {
        stop_time["stop_id"]: (
            str_time_to_seconds(stop_time["arrival_time"]),
            str_time_to_seconds(stop_time["departure_time"]),
        )
        for stop_time in stop_times
    }


# trip_id -> {stop_id -> (arrival seconds, departure seconds)}
times_by_stop_by_trip = {
    trip_id: _times_by_stop(trip_stop_times)
    for trip_id, trip_stop_times in stop_times_by_trip.items()
}


def get_arrival_time(trip_id: str, stop_id: str) -> int:
    """Return the arrival time (in seconds) of `trip_id` at `stop_id`.

    `trip_id` may be None, meaning "no trip boarded yet". In that case
    sys.maxsize is returned, mirroring get_departure_time, because the search
    loop asks for the arrival time before it checks whether a trip is set.

    Raises KeyError for an unknown trip or a stop the trip does not serve.
    """
    if trip_id is None:
        return sys.maxsize
    return times_by_stop_by_trip[trip_id][stop_id][0]


def get_departure_time(trip_id: str, stop_id: str) -> int:
    """Return the departure time (in seconds) of `trip_id` at `stop_id`.

    A `trip_id` of None yields sys.maxsize, i.e. a time later than any
    real departure.
    """
    has_trip = trip_id is not None
    return times_by_stop_by_trip[trip_id][stop_id][1] if has_trip else sys.maxsize


In [None]:
# Placeholder for a route -> trips lookup. Nothing in the visible notebook
# fills it yet; the same empty dict is also created in an earlier cell.
trip_by_route = {} # trips ordered by first departure time

# Algorithm

In [244]:
import sys

In [245]:
# Tunables for the search below.
MAX_TRANSFERS = 5  # number of rounds the search loop runs
DEFAULT_TRANSFER_TIME = 60  # seconds

# Query endpoints (stop ids).
start_stop_id, end_stop_id = 1240134, 1230103

In [246]:
# Initialise the labels of the search.
# START_TIME is the departure time at the start stop, in seconds (the same unit
# str_time_to_seconds produces). 0 means "from the start of the day"; change it
# to query a specific time.
START_TIME = 0

# tau_i[k][stop]: arrival time at `stop` found in round k;
# tau_best[stop]: best arrival time found in any round.
# sys.maxsize stands for "not reached".
tau_i = {0: dict.fromkeys(stop_id_set, sys.maxsize)}
tau_best = dict.fromkeys(stop_id_set, sys.maxsize)

# Without a finite label at the start stop no trip could ever be boarded,
# because boarding requires tau_i[k - 1][stop] + transfer time < departure.
tau_i[0][start_stop_id] = START_TIME
tau_best[start_stop_id] = START_TIME

# Only the start stop is marked for the first round.
marked_stops = {start_stop_id}

In [None]:
def earliest_trip(route_id: str, stop_id: str, arrival_time: int, change_time: int):
    """Return the id of the earliest trip of `route_id` that can be boarded at `stop_id`.

    A trip can be boarded when its departure at `stop_id` is not earlier than
    `arrival_time + change_time` (all values in seconds).

    Parameters:
        route_id: route to search; its trips are read from
            `stop_times_by_trips_by_route`.
        stop_id: stop at which the trip is boarded.
        arrival_time: time at which the traveller is at `stop_id`.
        change_time: minimum time needed before boarding.

    Returns:
        The trip id with the smallest qualifying departure time, or None when
        the route is unknown or no trip qualifies. If several trips share that
        departure time, the first one in iteration order is returned.
    """
    earliest_departure = arrival_time + change_time
    best_trip_id = None
    best_departure = sys.maxsize

    for trip_id in stop_times_by_trips_by_route.get(route_id, {}):
        times = times_by_stop_by_trip.get(trip_id, {}).get(stop_id)
        if times is None:
            # The trip does not serve this stop.
            continue
        departure = times[1]
        if earliest_departure <= departure < best_departure:
            best_trip_id, best_departure = trip_id, departure

    return best_trip_id


In [247]:
# Round-based search. In round k every route that passes a stop improved in the
# previous round is scanned once, starting at the earliest such stop.
# NOTE: walking between different stops is not modelled here; changing trips is
# only possible at a stop itself and costs DEFAULT_TRANSFER_TIME.
for k in range(1, MAX_TRANSFERS + 1):
    # Start the round from the labels of the previous one, so tau_i[k] exists
    # and is never worse than tau_i[k - 1].
    tau_i[k] = dict(tau_i[k - 1])

    # Q: route_id -> marked stop with the smallest stop_sequence on that route.
    Q = {}
    for stop_id in marked_stops:
        for route_id in get_routes_serving_stop(stop_id):
            if route_id not in Q:
                Q[route_id] = stop_id
                continue

            # if our stop is closer to the start than the existing one, we replace it
            idx = get_idx_of_stop_in_route(stop_id, route_id)
            existing_idx = get_idx_of_stop_in_route(Q[route_id], route_id)
            if idx < existing_idx:
                Q[route_id] = stop_id

    # Unmark everything only after the loop: removing elements from a set while
    # iterating over it raises RuntimeError.
    marked_stops.clear()

    for route_id, hop_on_stop_id in Q.items():
        hop_on_idx = get_idx_of_stop_in_route(hop_on_stop_id, route_id)
        trip_id = None  # trip currently ridden; None until one is boarded

        for stop_id, stop_idx in stops_by_route[route_id]:
            # Stops before the hop-on stop cannot be reached by this scan.
            if stop_idx < hop_on_idx:
                continue

            # Riding a trip: see whether it improves the label of this stop.
            # Arrivals later than the best arrival at the target are skipped.
            if trip_id is not None:
                arrival_time = get_arrival_time(trip_id, stop_id)
                if arrival_time < min(
                    tau_best.get(stop_id, sys.maxsize),
                    tau_best.get(end_stop_id, sys.maxsize),
                ):
                    tau_i[k][stop_id] = arrival_time
                    tau_best[stop_id] = arrival_time
                    marked_stops.add(stop_id)

            # Board here (or switch to an earlier trip) if the previous round
            # reached this stop early enough.
            previous_arrival = tau_i[k - 1].get(stop_id, sys.maxsize)
            if previous_arrival + DEFAULT_TRANSFER_TIME < get_departure_time(
                trip_id, stop_id
            ):
                candidate = earliest_trip(
                    route_id, stop_id, previous_arrival, DEFAULT_TRANSFER_TIME
                )
                # Keep the current trip when no boardable trip is found.
                if candidate is not None:
                    trip_id = candidate

    # No label improved in this round, so later rounds cannot improve anything.
    if not marked_stops:
        break
