# BART Pre-COVID schedule

This is a quick, prototype-quality script that extracts stop times from BART's published PDF timetables (the schedule effective 2020-02-20, before COVID-related service changes).

In [1]:
import collections
import json
import os
import re
import pdfplumber
import requests

# Agency identifier: written as "agency_id" in the output and sent as the
# `agency` query parameter of the local schedules API.
AGENCY_ID = 44
# Written as "log_time" on every generated schedule.
SCHEDULE_TIME = "2020-02-20"
# One entry per BART line: two line ids (one per timetable page/direction)
# and the timetable PDF URL for each service day (None when there is none).
ScheduleInfo = collections.namedtuple(
    "ScheduleInfo",
    ["line_id1", "line_id2", "weekday_url", "sat_url", "sun_url"],
)

In [2]:
# Timetable sources for every BART line. `line_id1` is used for the first
# page of each PDF ("to_route") and `line_id2` for the second
# ("return_route"); a `sun_url` of None means no Sunday PDF is listed.
BART_LINES = [
    # Yellow Line
    ScheduleInfo(
        line_id1=490,
        line_id2=492,
        weekday_url="https://www.bart.gov/sites/default/files/docs/February%202020%20BART%20Public%20Timetable%202-20-20%20Formatted%20Weekday%20Yellow%20Line.pdf",
        sat_url="https://www.bart.gov/sites/default/files/docs/February%202020%20BART%20Public%20Timetable%202-20-20%20Formatted%20Saturday%20Yellow%20Line.pdf",
        sun_url="https://www.bart.gov/sites/default/files/docs/February%202020%20BART%20Public%20Timetable%202-20-20%20Formatted%20Sunday%20Yellow%20Line.pdf",
    ),
    # Green Line
    ScheduleInfo(
        line_id1=491,
        line_id2=495,
        weekday_url="https://www.bart.gov/sites/default/files/docs/February%202020%20BART%20Public%20Timetable%202-20-20%20Formatted%20Weekday%20Green%20Line.pdf",
        sat_url="https://www.bart.gov/sites/default/files/docs/February%202020%20BART%20Public%20Timetable%202-20-20%20Formatted%20Saturday%20Green%20Line.pdf",
        sun_url=None,
    ),
    # Orange Line
    ScheduleInfo(
        line_id1=493,
        line_id2=494,
        weekday_url="https://www.bart.gov/sites/default/files/docs/February%202020%20BART%20Public%20Timetable%202-20-20%20Formatted%20Weekday%20Orange%20Line.pdf",
        sat_url="https://www.bart.gov/sites/default/files/docs/February%202020%20BART%20Public%20Timetable%202-20-20%20Formatted%20Saturday%20Orange%20Line.pdf",
        sun_url="https://www.bart.gov/sites/default/files/docs/February%202020%20BART%20Public%20Timetable%202-20-20%20Formatted%20Sunday%20Orange%20Line.pdf",
    ),
    # Red Line
    ScheduleInfo(
        line_id1=496,
        line_id2=497,
        weekday_url="https://www.bart.gov/sites/default/files/docs/February%202020%20BART%20Public%20Timetable%202-20-20%20Formatted%20Weekday%20Red%20Line.pdf",
        sat_url="https://www.bart.gov/sites/default/files/docs/February%202020%20BART%20Public%20Timetable%202-20-20%20Formatted%20Saturday%20Red%20Line.pdf",
        sun_url=None,
    ),
    # Blue Line
    ScheduleInfo(
        line_id1=498,
        line_id2=499,
        weekday_url="https://www.bart.gov/sites/default/files/docs/February%202020%20BART%20Public%20Timetable%202-20-20%20Formatted%20Weekday%20Blue%20Line.pdf",
        sat_url="https://www.bart.gov/sites/default/files/docs/February%202020%20BART%20Public%20Timetable%202-20-20%20Formatted%20Saturday%20Blue%20Line.pdf",
        sun_url="https://www.bart.gov/sites/default/files/docs/February%202020%20BART%20Public%20Timetable%202-20-20%20Formatted%20Sunday%20Blue%20Line.pdf",
    ),
    # Purple Line (note the "_0" suffix on the Sunday file name)
    ScheduleInfo(
        line_id1=500,
        line_id2=501,
        weekday_url="https://www.bart.gov/sites/default/files/docs/February%202020%20BART%20Public%20Timetable%202-20-20%20Formatted%20Weekday%20Purple%20Line.pdf",
        sat_url="https://www.bart.gov/sites/default/files/docs/February%202020%20BART%20Public%20Timetable%202-20-20%20Formatted%20Saturday%20Purple%20Line.pdf",
        sun_url="https://www.bart.gov/sites/default/files/docs/February%202020%20BART%20Public%20Timetable%202-20-20%20Formatted%20Sunday%20Purple%20Line_0.pdf",
    ),
    # Beige Line
    ScheduleInfo(
        line_id1=502,
        line_id2=503,
        weekday_url="https://www.bart.gov/sites/default/files/docs/February%202020%20BART%20Public%20Timetable%202-20-20%20Formatted%20Weekday%20Beige%20Line.pdf",
        sat_url="https://www.bart.gov/sites/default/files/docs/February%202020%20BART%20Public%20Timetable%202-20-20%20Formatted%20Saturday%20Beige%20Line.pdf",
        sun_url="https://www.bart.gov/sites/default/files/docs/February%202020%20BART%20Public%20Timetable%202-20-20%20Formatted%20Sunday%20Beige%20Line.pdf",
    ),
]

In [3]:
def get_schedule_info(schedule_url):
    """Download a BART timetable PDF and extract the stop times from it.

    The PDF is expected to have exactly two pages with exactly one table
    each: page 1 fills the "to_route" entry and page 2 the "return_route"
    entry of the result.

    Args:
        schedule_url: URL of the timetable PDF to download.

    Returns:
        A dict of the form
        ``{"to_route": {...}, "return_route": {...}}`` where each inner dict
        has a "route_name" string (the words found just above the table) and
        a "schedule" defaultdict mapping a table column index to a list of
        ``(stoptime, row_number)`` tuples; ``row_number`` is the 1-based
        index of the table row (after the header row) the cell came from.

    Raises:
        requests.HTTPError: if the download returns an error status.
        ValueError: if the PDF does not have two pages, or a page does not
            contain exactly one table.
    """
    tmp_path = ".tmp-bart.pdf"
    schedule_data = {"to_route": {}, "return_route": {}}
    resp = requests.get(schedule_url)
    resp.raise_for_status()
    try:
        with open(tmp_path, "wb") as f:
            f.write(resp.content)
        with pdfplumber.open(tmp_path) as pdf:
            # Explicit raises instead of `assert`, which is stripped under -O.
            if len(pdf.pages) != 2:
                raise ValueError(
                    "Expected 2 pages in {}, found {}".format(schedule_url, len(pdf.pages))
                )
            for page_num, page in enumerate(pdf.pages):
                route_key = "to_route" if page_num == 0 else "return_route"
                current_route = schedule_data[route_key]
                words = page.extract_words()
                tables = page.find_tables()
                if len(tables) != 1:
                    raise ValueError(
                        "Expected 1 table on page {} of {}, found {}".format(
                            page_num + 1, schedule_url, len(tables)
                        )
                    )
                table = tables[0]
                route_name = " ".join([
                    word["text"] for word in words
                    # finds the words that appear just above the table (i.e. within 75px of top)
                    if word["bottom"] < table.bbox[1] < word["bottom"] + 75
                ]).strip()
                current_route["route_name"] = route_name
                schedule = collections.defaultdict(list)
                current_route["schedule"] = schedule
                # unfortunately, I can't figure out how to get the station names
                # The first extracted row is skipped (header row).
                for trip_num, trip in enumerate(table.extract()[1:], start=1):
                    # Distinct name from the page index so the two loops
                    # cannot clobber each other.
                    for stop_index, stop in enumerate(trip):
                        # handles nonexistent cells at bottom of page 1
                        # https://www.bart.gov/sites/default/files/docs/February%202020%20BART%20Public%20Timetable%202-20-20%20Formatted%20Sunday%20Blue%20Line.pdf
                        if stop is None:
                            continue
                        stoptime = _get_stoptime(stop, schedule[stop_index])
                        if stoptime != "":
                            # combination of arrival/departure + stop sequence
                            schedule[stop_index].append((stoptime, trip_num))
    finally:
        # Always clean up the scratch file, even when parsing fails.
        try:
            os.remove(tmp_path)
        except FileNotFoundError:
            pass
    return schedule_data
            
def _get_stoptime(stop, prev_stops):
    time_parts = re.compile(r"^\s*([0-9]{1,2}):([0-9]{2})\s+(AM|PM)\s*$")   
    time_match = time_parts.search(stop)
    if not time_match:
        return stop
    *time, am_pm = time_match.groups()
    hour, minute = map(int, time)
    if am_pm == "AM":
        # convert e.g. 12:15 AM to 24-hour clock (makes if statements much cleaner)
        hour = hour if hour != 12 else 0
        if prev_stops != []:
            # stops are in list of tuples where first element is the time
            first_stop = re.search(r"([0-9]{1,2}):([0-9]{2})", prev_stops[0][0])
            if not first_stop:
                return "{}:{:02d}".format(hour, minute)
            first_hour, first_minute = map(int, first_stop.groups())
            # GTFS leaves trips after midnight on service day as > 24:00:00
            if (hour, minute) < (first_hour, first_minute):
                hour += 24
    else:
        # convert to 24-hour
        hour = hour if hour == 12 else hour + 12
    return "{}:{:02d}".format(hour, minute)
            
# Inline sanity checks for _get_stoptime: (cell text, earlier stops) pairs
# compared against the expected converted times, in the same order.
assert [
    _get_stoptime(cell, earlier)
    for cell, earlier in (
        ("12:15 AM", []),
        ("12:15 AM", [("03:13", 1)]),
        ("1:15 PM", []),
    )
] == ["0:15", "24:15", "13:15"]

In [4]:
# One stop event in the output; parse_route turns each instance into a dict
# with _asdict() before returning it.
ArrivalDeparture = collections.namedtuple(
    "ArrivalDeparture",
    ["arrival_time", "departure_time", "stop_sequence", "line_id", "station_id"],
)

def add_schedules():
    """Build the weekday, Saturday and Sunday schedules for every BART line.

    For each entry in BART_LINES and each service day that has a timetable
    URL, the PDF is parsed once and both of its directions are converted to
    arrival rows (``line_id1`` for "to_route", ``line_id2`` for
    "return_route").

    Returns:
        A list of three dicts, in the order weekday, saturday, sunday. Each
        has the keys "agency_id", "log_time", "schedule_type", "pre_covid"
        and "arrivals" (the accumulated rows for that service day).
    """
    # (schedule_type, ScheduleInfo field holding that day's PDF URL)
    service_days = (
        ("weekday", "weekday_url"),
        ("saturday", "sat_url"),
        ("sunday", "sun_url"),
    )
    details_by_day = {
        day: {
            "agency_id": AGENCY_ID,
            "log_time": SCHEDULE_TIME,
            "schedule_type": day,
            "pre_covid": True,
            "arrivals": [],
        }
        for day, _ in service_days
    }
    for line in BART_LINES:
        print("Parsing {}".format(line))
        for day, url_field in service_days:
            pdf_url = getattr(line, url_field)
            if pdf_url is None:
                # No timetable for this line on this service day.
                continue
            parsed = get_schedule_info(pdf_url)
            arrivals = details_by_day[day]["arrivals"]
            arrivals.extend(parse_route(parsed, line.line_id1, day, "to_route"))
            arrivals.extend(parse_route(parsed, line.line_id2, day, "return_route"))
    return [details_by_day[day] for day, _ in service_days]

    
def parse_route(schedule_json, line_id, schedule_type="weekday", route_type="to_route"):
    """Match extracted timetable columns to stations and build arrival rows.

    Fetches the schedules for ``line_id`` from the local schedules API, keeps
    those of the requested ``schedule_type`` that have stops, and asks the
    user (via ``input``) to confirm that each schedule's stop list matches
    the timetable. If the user says the stops are in reverse order, the
    timetable columns are mapped to the stops back to front.

    Args:
        schedule_json: Result of ``get_schedule_info``.
        line_id: Line id to query and to record on every row.
        schedule_type: "weekday", "saturday" or "sunday".
        route_type: "to_route" or "return_route" key of ``schedule_json``.

    Returns:
        A list of ``ArrivalDeparture._asdict()`` mappings, one per stop time;
        arrival and departure time are set to the same value.

    Raises:
        requests.HTTPError: if the API request returns an error status.
        ValueError: if the user's answer is not yes/no, or the stops match
            neither in forward nor in reverse order.
    """
    results = []
    url = "http://localhost:3000/api/sf/schedules?agency={}&line={}".format(AGENCY_ID, line_id)
    resp = requests.get(url)
    resp.raise_for_status()
    route_sched = schedule_json[route_type]["schedule"]
    schedules = [
        schedule for schedule in resp.json()["schedules"]
        if schedule["schedule_type"] == schedule_type and schedule["stops"] != []
    ]
    for schedule in schedules:
        stops = schedule["stops"]
        stop_names = [stop["station_name"] for stop in stops]
        y_or_n = input("Are these the stops for {}?: {}".format(
            schedule_json[route_type]["route_name"],
            ", ".join(stop_names)
        )).lower().strip()
        # Start every schedule from the timetable's own column order so a
        # reversal chosen for one schedule does not leak into the next.
        stop_times = route_sched
        if y_or_n in {"n", "no"}:
            rev_vals = input("Are the stops in reverse order?").lower().strip()
            if rev_vals not in {"y", "yes"}:
                raise ValueError("Could not find relevant stops")
            ordered = sorted(route_sched.items())
            sched_keys = [key for key, _ in ordered]
            # Take the VALUES here: pairing reversed keys with the keys
            # themselves would leave ints where the time lists belong.
            sched_vals = [times for _, times in ordered]
            stop_times = dict(zip(reversed(sched_keys), sched_vals))
        elif y_or_n not in {"y", "yes"}:
            raise ValueError("You must supply a yes or no value")
        for stop_num, arrivals in stop_times.items():
            # TODO: Don't make this *this* hacky
            # This addresses change in SFO/Millbrae
            if len(stops) == stop_num and stop_num == 27:
                station_id = 8732
            else:
                station_id = stops[stop_num]["station_id"]
            for arrival, stop_seq in arrivals:
                stop_data = ArrivalDeparture(arrival, arrival, stop_seq, line_id, station_id)
                results.append(stop_data._asdict())
    return results

In [5]:
# Build all three service-day schedules and save them as one JSON document.
data = add_schedules()
with open("../assets/data/initial/bart-precovid.json", "w") as out_file:
    out_file.write(json.dumps(data))

Parsing ScheduleInfo(line_id1=490, line_id2=492, weekday_url='https://www.bart.gov/sites/default/files/docs/February%202020%20BART%20Public%20Timetable%202-20-20%20Formatted%20Weekday%20Yellow%20Line.pdf', sat_url='https://www.bart.gov/sites/default/files/docs/February%202020%20BART%20Public%20Timetable%202-20-20%20Formatted%20Saturday%20Yellow%20Line.pdf', sun_url='https://www.bart.gov/sites/default/files/docs/February%202020%20BART%20Public%20Timetable%202-20-20%20Formatted%20Sunday%20Yellow%20Line.pdf')
Are these the stops for Antioch to SFO (Yellow Line)?: Antioch, Pittsburg Center, Pittsburg / Bay Point, North Concord / Martinez, Concord, Pleasant Hill / Contra Costa Centre, Walnut Creek, Lafayette, Orinda, Rockridge, MacArthur, 19th Street Oakland, 12th Street / Oakland City Center, West Oakland, Embarcadero, Montgomery Street, Powell Street, Civic Center / UN Plaza, 16th Street / Mission, 24th Street / Mission, Glen Park, Balboa Park, Daly City, Colma, South San Francisco, San B

Parsing ScheduleInfo(line_id1=496, line_id2=497, weekday_url='https://www.bart.gov/sites/default/files/docs/February%202020%20BART%20Public%20Timetable%202-20-20%20Formatted%20Weekday%20Red%20Line.pdf', sat_url='https://www.bart.gov/sites/default/files/docs/February%202020%20BART%20Public%20Timetable%202-20-20%20Formatted%20Saturday%20Red%20Line.pdf', sun_url=None)
Are these the stops for Richmond to Millbrae (Red Line)?: Richmond, El Cerrito Del Norte, El Cerrito Plaza, North Berkeley, Downtown Berkeley, Ashby, MacArthur, 19th Street Oakland, 12th Street / Oakland City Center, West Oakland, Embarcadero, Montgomery Street, Powell Street, Civic Center / UN Plaza, 16th Street / Mission, 24th Street / Mission, Glen Park, Balboa Park, Daly City, Colma, South San Francisco, San Bruno, Millbraeyes
Are these the stops for Millbrae to Richmond (Red Line)?: Millbrae, San Bruno, South San Francisco, Colma, Daly City, Balboa Park, Glen Park, 24th Street / Mission, 16th Street / Mission, Civic Cen