In [49]:
import json
from collections.abc import Iterable

# Load the full dataset. The context manager guarantees the file handle is
# closed, and the explicit encoding keeps the read independent of the
# platform's default text encoding.
with open("../mini_json_dataset.json", "r", encoding="utf-8") as dataset_file:
    raw_data = json.load(dataset_file)

# Show the top-level sections of the dataset.
raw_data.keys()

dict_keys(['stops', 'trips', 'vehicles', 'duties'])

In [50]:
# An event must carry a "trip_id" exactly when it is a service trip. The check
# is asserted so that a violation actually stops the notebook instead of being
# silently discarded.
for vehicle in raw_data["vehicles"]:
    assert all(
        (event["vehicle_event_type"] == "service_trip") == ("trip_id" in event)
        for event in vehicle["vehicle_events"]
    ), f"vehicle {vehicle['vehicle_id']!r} has a trip_id/service_trip mismatch"
# Fact: only service trips have trip_ids (and every service trip has one)

In [51]:
# Walk over the depot stops only and verify that none of them appears as an
# endpoint of any trip.
for depot in (s for s in raw_data["stops"] if s["is_depot"]):
    depot_id = depot["stop_id"]
    assert not any(
        depot_id in (trip["origin_stop_id"], trip["destination_stop_id"])
        for trip in raw_data["trips"]
    )
# Fact: no service trips end or start at the depot (a depot is not a 'normal' spot and a depot simultaneously)

In [52]:
sorted_stops = sorted(raw_data["stops"], key=lambda x: (x["latitude"], x["longitude"]))
# After sorting by (latitude, longitude), stops that share a location end up
# next to each other, so comparing each stop with its successor is enough.
# Two stops coincide only when BOTH coordinates match; sharing just a latitude
# or just a longitude is perfectly valid and must not trip the assertion.
assert all(
    (current["latitude"], current["longitude"])
    != (following["latitude"], following["longitude"])
    for current, following in zip(sorted_stops, sorted_stops[1:])
)
# Fact: no two stops have the same geographical location
# This doesn't help me that much in knowing whether taxis/deadheads are necessary between any two stops.
# The distance might be small and the driver might get to the next stop by walking, but I'll assume
# that's never the case, as I haven't been asked to do much error-handling.

# Note:
Breaks within a duty can occur either in the duty data structure (between vehicles, called a "split") or in the vehicle data structure (between trips, called a "layover").

In [54]:
from json import dump

# Skeleton of the reduced dataset: one independent, initially empty list per
# top-level section, in the order the sections are written out.
unittests_json: dict[str, list] = {
    section: [] for section in ("duties", "vehicles", "trips", "stops")
}


def filter_objects_by_id(
    object_list: list[dict], object_name: str, ids: Iterable[str]
) -> list[dict]:
    """Filters the objects in object_list by their id

    Args:
        object_list: A list of objects (dicts), each holding its id under the
            key "<object_name>_id"
        object_name: The name of the object type in object_list, used to build
            the id key (e.g. "trip" -> "trip_id")
        ids: The ids to keep; any iterable of hashable ids (list, set,
            generator, ...) is accepted

    Returns:
        A list of the objects whose id is in ids, in their original order

    Raises:
        KeyError: If an object has no "<object_name>_id" key
    """
    id_key = f"{object_name}_id"
    # Materialise the ids once: membership tests become O(1) and one-shot
    # iterables (e.g. generators) are not exhausted by the first lookup.
    wanted_ids = set(ids)
    return [obj for obj in object_list if obj[id_key] in wanted_ids]


def get_vehicles_in_duty(duty: dict) -> set[str]:
    """Collects the ids of the vehicles referenced by a duty

    Args:
        duty: A duty object with a "duty_events" list

    Returns:
        The set of "vehicle_id" values found on the duty's events whose
        "duty_event_type" is "vehicle_event"
    """
    vehicle_ids: set[str] = set()
    for duty_event in duty["duty_events"]:
        # Only vehicle events are read for a vehicle id; other event types are skipped.
        if duty_event["duty_event_type"] != "vehicle_event":
            continue
        vehicle_ids.add(duty_event["vehicle_id"])
    return vehicle_ids


# It's good to have a reduced test dataset so that the tests are quick to run
# and can be debugged from start to end in a reasonable amount of time when
# they fail. Below, I select from the entire dataset a subset that encompasses
# a complex-enough scenario (duties that overlap in vehicles, and duties that
# have multiple vehicles), in order to cover most edge cases.

selected_duties: list[dict] = []

multi_vehicle_duties = [
    duty for duty in raw_data["duties"] if len(get_vehicles_in_duty(duty)) > 1
]

# Find the first multi-vehicle duty that shares at least one vehicle with
# another duty, and keep that pair.
for multi_vehicle_duty in multi_vehicle_duties:
    vehicles_in_duty = get_vehicles_in_duty(multi_vehicle_duty)
    for other_duty in raw_data["duties"]:
        if other_duty == multi_vehicle_duty:
            continue
        vehicles_in_other_duty = get_vehicles_in_duty(other_duty)
        if vehicles_in_duty.intersection(vehicles_in_other_duty):
            selected_duties = [multi_vehicle_duty, other_duty]
            break
    if selected_duties:
        break

# Without this pair the reduced dataset would silently lose the scenario it is
# meant to cover, so fail loudly instead.
assert selected_duties, "no multi-vehicle duty shares a vehicle with another duty"

# Add one single-vehicle duty. The overlapping duty picked above may itself be
# a single-vehicle duty, so skip anything already selected to avoid writing
# the same duty twice.
single_vehicle_duty = next(
    (
        duty
        for duty in raw_data["duties"]
        if len(get_vehicles_in_duty(duty)) == 1 and duty not in selected_duties
    ),
    None,
)
assert single_vehicle_duty is not None, "no additional single-vehicle duty found"
selected_duties.append(single_vehicle_duty)

# Although I could have avoided recomputing the vehicles in each duty, this code
# is run only once and it runs in milliseconds, so optimizing here is unnecessary.
relevant_vehicles = set()
for duty in selected_duties:
    relevant_vehicles.update(get_vehicles_in_duty(duty))

unittests_json["duties"] = selected_duties
unittests_json["vehicles"] = [
    vehicle
    for vehicle in raw_data["vehicles"]
    if vehicle["vehicle_id"] in relevant_vehicles
]

# Only service trips are read for a "trip_id".
relevant_trips = {
    event["trip_id"]
    for vehicle in unittests_json["vehicles"]
    for event in vehicle["vehicle_events"]
    if event["vehicle_event_type"] == "service_trip"
}

unittests_json["trips"] = filter_objects_by_id(
    raw_data["trips"], "trip", relevant_trips
)

# NOTE(review): stops are collected from trip endpoints only. If other vehicle
# or duty events reference stops that no selected trip touches (e.g. depots),
# those stops will be missing from the reduced dataset - TODO confirm.
relevant_stops = {
    stop_id
    for trip in unittests_json["trips"]
    for stop_id in [trip["origin_stop_id"], trip["destination_stop_id"]]
}
unittests_json["stops"] = filter_objects_by_id(
    raw_data["stops"], "stop", relevant_stops
)

# The context manager guarantees the output is flushed and the handle closed.
with open("../tests/unittests_json.json", "w", encoding="utf-8") as output_file:
    dump(unittests_json, output_file, indent=2)