In [66]:
import json

# Read the dataset inside a context manager so the file handle is closed
# deterministically instead of being left for the garbage collector.
with open("../mini_json_dataset.json", "r") as dataset_file:
    raw_data = json.load(dataset_file)

# Displayed as the cell output: the top-level sections of the dataset.
raw_data.keys()

dict_keys(['stops', 'trips', 'vehicles', 'duties'])

# Fact: only service trips have trip_ids


In [67]:
# An event must carry a "trip_id" if and only if it is a service trip.
# The check is asserted so the cell actually fails when the fact does not
# hold; the previous version computed the booleans and threw them away.
for vehicle in raw_data["vehicles"]:
    assert all(
        (event["vehicle_event_type"] == "service_trip") == ("trip_id" in event)
        for event in vehicle["vehicle_events"]
    ), f"vehicle {vehicle['vehicle_id']} breaks the service_trip <-> trip_id rule"

# Fact: no service trip starts or ends at a depot (a stop is never a 'normal' stop and a depot simultaneously)

In [68]:
# Gather the depot ids first, then verify that no trip uses any of them as
# its origin or its destination.
depot_ids = [stop["stop_id"] for stop in raw_data["stops"] if stop["is_depot"]]
for depot_id in depot_ids:
    for trip in raw_data["trips"]:
        endpoints = (trip["origin_stop_id"], trip["destination_stop_id"])
        assert depot_id not in endpoints

# Fact: no two stops have the same geographical location
### This doesn't help me that much in knowing whether taxis/deadheads are necessary between any two stops.
### The distance might be small and the driver might get to the next one by walking, but I'll assume that's never the case, as I haven't been asked to do much error-handling.

In [69]:
# Sorting by (latitude, longitude) places stops with identical coordinates
# next to each other, so comparing neighbours is enough to detect duplicates.
sorted_stops = sorted(raw_data["stops"], key=lambda x: (x["latitude"], x["longitude"]))

# Two stops share a location only when BOTH coordinates match, so the pair is
# compared as a whole. Requiring latitude AND longitude to differ would wrongly
# reject distinct stops that merely share one coordinate.
assert all(
    (current["latitude"], current["longitude"])
    != (following["latitude"], following["longitude"])
    for current, following in zip(sorted_stops, sorted_stops[1:])
)

# Note:
Breaks of a duty can happen within the duty data structure (between vehicles, during a "split") or within the vehicle data structure (between trips, called "layover")

In [79]:
from json import dump

# Skeleton of the reduced dataset: one (initially empty) list per section,
# each filled in further down.
unittests_json = {
    section: [] for section in ("duties", "vehicles", "trips", "stops")
}


def filter_objects_by_id(
    object_list: list[dict], object_name: str, ids: "list[str] | set[str]"
) -> list[dict]:
    """Keep only the objects whose id is one of ``ids``.

    The id of each object is read from the key ``f"{object_name}_id"``
    (e.g. ``"trip_id"`` when ``object_name`` is ``"trip"``).

    Args:
        object_list: A list of objects (dicts), each holding its id under
            ``f"{object_name}_id"``.
        object_name: The name of the object type in object_list.
        ids: The ids to keep; any list or set of ids is accepted.

    Returns:
        A list of the objects whose id is in ids, in their original order.

    Raises:
        KeyError: If an object has no ``f"{object_name}_id"`` key.
    """
    id_key = f"{object_name}_id"
    # Build a set once so every membership test is a hash lookup, whatever
    # container the caller passed in.
    wanted_ids = set(ids)
    return [obj for obj in object_list if obj[id_key] in wanted_ids]


def get_vehicles_in_duty(duty: dict) -> set[str]:
    """Return the ids of the vehicles referenced by a duty.

    Only duty events whose ``duty_event_type`` is ``"vehicle_event"`` are
    inspected; every other event type is ignored.

    Args:
        duty: A duty with a ``"duty_events"`` list.

    Returns:
        The set of ``vehicle_id`` values found in the duty's vehicle events.
    """
    vehicle_ids: set[str] = set()
    for duty_event in duty["duty_events"]:
        if duty_event["duty_event_type"] != "vehicle_event":
            continue
        vehicle_ids.add(duty_event["vehicle_id"])
    return vehicle_ids


# It's good to have a reduced test dataset for the tests to be quick to run and
# to be able to debug them from start to end in a reasonable amount of time
# when they fail. Below, I select from the entire dataset a subset that encompasses
# a complex-enough scenario (duties that have overlap in vehicles, and that have
# multiple vehicles), in order to cover most edge cases.

selected_duties: list[dict] = []

multi_vehicle_duties = [
    duty for duty in raw_data["duties"] if len(get_vehicles_in_duty(duty)) > 1
]

# Pick the first multi-vehicle duty that shares at least one vehicle with
# another duty, together with that other duty.
for multi_vehicle_duty in multi_vehicle_duties:
    vehicles_in_duty = get_vehicles_in_duty(multi_vehicle_duty)
    for other_duty in raw_data["duties"]:
        if other_duty == multi_vehicle_duty:
            continue
        vehicles_in_other_duty = get_vehicles_in_duty(other_duty)
        if vehicles_in_duty & vehicles_in_other_duty:
            selected_duties = [multi_vehicle_duty, other_duty]
            break
    if selected_duties:
        break

# Fail loudly instead of silently writing a fixture that lacks the
# overlapping-vehicles scenario the tests are meant to cover.
assert selected_duties, "no multi-vehicle duty shares a vehicle with another duty"

# Add one single-vehicle duty. Skip duties that were already selected
# (`other_duty` may itself be single-vehicle) so the fixture never contains
# the same duty twice.
single_vehicle_duty = next(
    (
        duty
        for duty in raw_data["duties"]
        if len(get_vehicles_in_duty(duty)) == 1 and duty not in selected_duties
    ),
    None,
)
assert single_vehicle_duty is not None, "no additional single-vehicle duty found"
selected_duties.append(single_vehicle_duty)

# The vehicles of each duty are recomputed here rather than cached: this code
# is run only once and takes milliseconds, so optimizing it is unnecessary.
relevant_vehicles = set().union(
    *(get_vehicles_in_duty(selected_duty) for selected_duty in selected_duties)
)

unittests_json["duties"] = selected_duties

# Stops referenced directly by the selected duties: the origin of every duty
# event that is not a vehicle_event.
relevant_stops = set()
for selected_duty in unittests_json["duties"]:
    for duty_event in selected_duty["duty_events"]:
        if duty_event["duty_event_type"] != "vehicle_event":
            relevant_stops.add(duty_event["origin_stop_id"])

# Keep only the vehicles used by the selected duties.
unittests_json["vehicles"] = [
    candidate
    for candidate in raw_data["vehicles"]
    if candidate["vehicle_id"] in relevant_vehicles
]

# Walk the vehicle events once: service trips contribute their trip id,
# every other event contributes both of its stops.
relevant_trips = set()
for kept_vehicle in unittests_json["vehicles"]:
    for vehicle_event in kept_vehicle["vehicle_events"]:
        if vehicle_event["vehicle_event_type"] == "service_trip":
            relevant_trips.add(vehicle_event["trip_id"])
        else:
            relevant_stops.add(vehicle_event["origin_stop_id"])
            relevant_stops.add(vehicle_event["destination_stop_id"])

unittests_json["trips"] = filter_objects_by_id(
    raw_data["trips"], "trip", relevant_trips
)

# Service trips keep their stops on the trip object rather than on the event.
for kept_trip in unittests_json["trips"]:
    relevant_stops.update(
        (kept_trip["origin_stop_id"], kept_trip["destination_stop_id"])
    )

unittests_json["stops"] = filter_objects_by_id(
    raw_data["stops"], "stop", relevant_stops
)

# Write the reduced dataset inside a context manager so the file is flushed
# and closed as soon as the dump finishes.
with open("../tests/unittests_json.json", "w") as unittests_file:
    dump(unittests_json, unittests_file, indent=2)

# Fact: the only types of duty events found in the data are sign_on, taxi, and vehicle_event (no sign_off events appear).
## Layovers and "splits" are not represented in the data structures, but inferred from the data.


In [71]:
# Every distinct duty_event_type that appears in any duty.
event_types = {
    duty_event["duty_event_type"]
    for duty in raw_data["duties"]
    for duty_event in duty["duty_events"]
}
event_types

{'sign_on', 'taxi', 'vehicle_event'}

# Fact: the only types of vehicle events are attendance, deadhead, depot_pull_in, depot_pull_out, pre_trip, service_trip. 
## Layovers are not represented in the data structures, but inferred from the data. As splits happen between vehicles, they aren't a vehicle event type.


In [72]:
# Every distinct vehicle_event_type that appears in any vehicle.
event_types = {
    vehicle_event["vehicle_event_type"]
    for vehicle in raw_data["vehicles"]
    for vehicle_event in vehicle["vehicle_events"]
}
event_types

{'attendance',
 'deadhead',
 'depot_pull_in',
 'depot_pull_out',
 'pre_trip',
 'service_trip'}

# Fact: two duties don't ever claim the same vehicle event

In [73]:
# One (sequence, vehicle_id) pair per vehicle event claimed by a duty. If no
# pair repeats, the list and its de-duplicated set have the same size.
vehicle_events_list = [
    (duty_event["vehicle_event_sequence"], duty_event["vehicle_id"])
    for duty in raw_data["duties"]
    for duty_event in duty["duty_events"]
    if duty_event["duty_event_type"] == "vehicle_event"
]
len(set(vehicle_events_list)) == len(vehicle_events_list)

True

In [None]:
# Fact: within a duty, vehicle events from the same vehicle are sequential and don't ever skip a number (useful for easily detecting breaks)

In [83]:
for duty in raw_data["duties"]:
    # Group the event sequence numbers of this duty by vehicle, preserving
    # the order in which they appear in the duty.
    sequences_by_vehicle: dict = {}
    for duty_event in duty["duty_events"]:
        if duty_event["duty_event_type"] != "vehicle_event":
            continue
        sequences_by_vehicle.setdefault(duty_event["vehicle_id"], []).append(
            duty_event["vehicle_event_sequence"]
        )

    # For each vehicle the numbers must form an ascending run with no gaps.
    for sequences in sequences_by_vehicle.values():
        first, last = min(sequences), max(sequences)
        assert sequences == list(range(first, last + 1))