# Web scraping

This notebook scrapes TransitFeeds (or openmobilitydata) for agency feed information to find all of the gtfs-realtime info. This is necessary because their API is currently not open to enrollment. For a similar reason, some of their data is outdated, so manual curation is necessary.

The scraped data helps to manually find all the current feeds, terms of service, whether an API key is necessary, and any restrictions on access. As it turns out, most are similar, providing each of the 3 realtime feed types with their own link. Some agencies provide a custom, non gtfs-realtime, API. This could potentially be used in the future for more accurate predictions.

There are other characteristics that might be worth exploring for an agency to optimize API polling. Things like operational hours, protobuf version, extensions and incrementality might all prove to be relevant; but initially they do not seem to affect the development of a general API poller service.


In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup


In [None]:
def get_feeds(soup: BeautifulSoup, parent_index=None, timeout=30):
    """Collect the feeds listed on a provider page.

    Every ``<a class="list-group-item">`` element in ``soup`` is treated as one
    feed. For feeds whose type label is ``"GTFS-RealTime"`` the feed's own page
    is fetched as well so its "Info" and "Download" links can be recorded.

    Args:
        soup: Parsed provider page.
        parent_index: Value stored in the ``"index"`` column of every row, so
            feeds can be traced back to the provider they came from.
        timeout: Seconds to wait for each feed-page request before giving up
            (without it a stalled connection would block the scrape forever).

    Returns:
        A ``(has_realtime_feed, feeds)`` tuple: whether at least one
        GTFS-RealTime feed was seen, and a DataFrame with one row per feed.
        The DataFrame always has the columns ``index``, ``name``,
        ``feed_type``, ``url``, ``info_url`` and ``download_url``, even when
        no feed was found.

    Raises:
        requests.exceptions.RequestException: If a feed page cannot be
            fetched (including when ``timeout`` expires).
    """
    columns = ["index", "name", "feed_type", "url", "info_url", "download_url"]
    feeds = []
    has_realtime_feed = False
    for feed in soup.find_all("a", "list-group-item"):
        # materialize once: stripped_strings is a generator
        strings = list(feed.stripped_strings)
        feed_name = strings[0] if strings else ""
        # only an exact "deprecated" string (e.g. a label) marks the feed
        if "deprecated" in strings:
            feed_name = f"DEPRECATED - {feed_name}"
        type_elem = feed.find("span")
        feed_type = type_elem.text if type_elem is not None else None
        feed_url = f"https://openmobilitydata.org{feed.get('href')}"
        feed_info_url = None
        feed_download_url = None
        if feed_type == "GTFS-RealTime":
            has_realtime_feed = True
            response = requests.get(feed_url, timeout=timeout)
            # separate name so the ``soup`` argument is not overwritten
            # NOTE(review): no parser is named, so bs4 picks whichever is installed
            feed_page = BeautifulSoup(
                response.content.decode("utf-8", errors="replace")
            )
            for info in feed_page.find_all("a", "list-group-item"):
                if "Info" in info.text:
                    feed_info_url = info.get("href")
                if "Download" in info.text:
                    feed_download_url = info.get("href")
        feeds.append(
            {
                "index": parent_index,
                "name": feed_name,
                "feed_type": feed_type,
                "url": feed_url,
                "info_url": feed_info_url,
                "download_url": feed_download_url,
            }
        )
    return has_realtime_feed, pd.DataFrame(feeds, columns=columns)


In [None]:
def get_providers(soup: BeautifulSoup, timeout=30):
    """Collect the providers listed on one location listing page.

    Each ``<a class="btn-success">`` element marks a table row holding a
    provider cell and a location cell. The provider's own page is fetched and
    handed to ``get_feeds`` to gather its feeds.

    Args:
        soup: Parsed listing page.
        timeout: Seconds to wait for each provider-page request before giving
            up (without it a stalled connection would block the scrape).

    Returns:
        A DataFrame indexed by the last path segment of the provider URL, with
        the columns ``name``, ``location``, ``location_url``, ``url``,
        ``reason_to_skip`` (``"no realtime"`` when the provider lists no
        GTFS-RealTime feed, otherwise ``None``) and ``feeds`` (the provider's
        feed DataFrame).

    Raises:
        requests.exceptions.RequestException: If a provider page cannot be
            fetched (including when ``timeout`` expires).
        ValueError: If a row does not contain exactly three ``<td>`` cells.
    """
    providers = {}
    for elem in soup.find_all("a", "btn-success"):
        provider_elem, location_elem, _ = elem.parent.parent.find_all("td")
        url = f"https://openmobilitydata.org{provider_elem.find('a').get('href')}"
        index = url.split("/")[-1]
        name = provider_elem.text
        # handle the case where country-wide providers have no location url
        location_link = location_elem.find("a")
        if location_link:
            location_url = f"https://openmobilitydata.org{location_link.get('href')}"
            location = location_link.text
        else:
            location = location_elem.text.strip()
            location_url = None
        # get gtfs-realtime feed links
        response = requests.get(url, timeout=timeout)
        # separate name so the ``soup`` argument is not overwritten
        provider_page = BeautifulSoup(
            response.content.decode("utf-8", errors="replace")
        )
        has_realtime_feed, feeds = get_feeds(provider_page, index)
        reason_to_skip = None if has_realtime_feed else "no realtime"
        providers[index] = {
            "name": name,
            "location": location,
            "location_url": location_url,
            "url": url,
            "reason_to_skip": reason_to_skip,
            # get_feeds already returns a DataFrame; no need to wrap it again
            "feeds": feeds,
        }
    return pd.DataFrame.from_dict(providers, orient="index")


In [None]:
# Scrape every listing page for the american and canadian providers.
location_urls = [
    "https://openmobilitydata.org/l/31-united-states",
    "https://openmobilitydata.org/l/32-canada",
]

provider_frames = []
for omd_url in location_urls:
    # get number of pages: the text of the last pagination item is read as the count
    response = requests.get(omd_url, timeout=30)
    soup = BeautifulSoup(response.content.decode("utf-8", errors="replace"))
    num_pages = int(soup.find("ul", "pagination").find_all("li")[-1].text)

    for page_num in range(1, num_pages + 1):
        response = requests.get(f"{omd_url}?p={page_num}", timeout=30)
        soup = BeautifulSoup(response.content.decode("utf-8", errors="replace"))
        provider_frames.append(get_providers(soup))

# DataFrame.append was removed in pandas 2.0, so the per-page frames are
# concatenated once; the provider index of each frame is kept as-is.
df_providers = pd.concat(provider_frames)


In [None]:
# Flatten the per-provider feed tables into one frame holding only the
# realtime feeds; providers with an empty feed table are skipped.
realtime_frames = []
for provider_feeds in df_providers["feeds"]:
    if provider_feeds.empty:
        continue
    is_realtime = provider_feeds["feed_type"] == "GTFS-RealTime"
    realtime_frames.append(provider_feeds[is_realtime])
df_feeds = pd.concat(realtime_frames, ignore_index=True)


In [None]:
# Show the distinct (index, info_url) pairs so each info page is listed once.
df_feeds[["index", "info_url"]].drop_duplicates()
# Optional helper, left disabled: open each link in a web browser
# import webbrowser
# for _, (name, url) in df_feeds[["index", "info_url"]].drop_duplicates().iterrows():
#     if url and not url.endswith(".proto"):
#         webbrowser.open(url)
#     else:
#         print(f"no url for {name}")


# Manual curation

At this point, I manually went through to try to filter out agencies that didn't provide vehicle positions, or were rail-only (no traffic signals). The API for updating feed sources has been down for a couple years, so there was more hand-tailoring than I was hoping for.

This meant I needed to often just google "agency gtfs realtime" to see what I found. https://www.transit.land/feeds is another source, but after scraping transitfeeds.com, I decided it wasn't worth it to cross reference this site.


In [None]:
# Manually curated reasons for skipping a provider, keyed by the provider's
# index in df_providers.
# NOTE(review): "university-of-colorado-boulder" used to be listed three times
# ("only alerts", "only rail", "only alerts"). A dict keeps only the last value
# of a repeated key, so the effective value was "only alerts" and the
# "only rail" entry never took effect; it was probably meant for a different
# agency and should be re-added under the right key.
# The reason spelled "only_alerts" is now written "only alerts" like the rest,
# so the same reason is not split across two spellings.
agency_feed_issues = {
    "arlington-transit": "only trip_update",
    "bart": "only rail",
    "napa-vine": "only alerts",
    "nassau-inter-county-express": "only alerts",
    "nj-transit": "no realtime",
    "nyc-ferry": "only ferry",
    "orange-county-transportation-authority": "no realtime",
    "pasadena-transit": "only trip_update",
    "people-mover": "only trip_update",
    "pierce-transit": "only oba",
    "rtc-southern-nevada": "only trip_update",
    "sound-transit": "only oba",
    "the-wave": "only alerts",
    "thousand-oaks-transit": "only alerts",
    "turlock-transit": "only alerts",
    "university-of-colorado-boulder": "only alerts",
    "wata": "only alerts",
}
# record the curated reason on the matching provider row
for index, issue in agency_feed_issues.items():
    df_providers.at[index, "reason_to_skip"] = issue


## Making config files

These config files are under the assumption that they will functionally be what I get from the CDF database. The eventual process might be different, so any other conditions/restrictions may need to be added later.

The following is non-exhaustive, but it was sufficient to give a good sample of files for testing


In [None]:
import json
from dataclasses import dataclass
from pathlib import Path
from typing import Optional

# Folder the agency JSON config files are written to (path is relative to the
# current working directory).
config_folder = Path("../tsp_gtfs_realtime/config/agencies")
# create the folder, and any missing parents, unless it already exists
config_folder.mkdir(parents=True, exist_ok=True)

In [None]:
@dataclass
class Connecticut_Transit:
    """GTFS-realtime feed settings for Connecticut Transit.

    Every field carries its value as a default, so ``Connecticut_Transit()``
    yields the complete configuration.
    """

    # "CT" is the Connecticut state code (was mistyped as "CN")
    location: str = "Newington, CT, US"
    terms_of_service: str = "https://www.cttransit.com/about/developers/terms-of-use"
    # None: no polling rate recorded for this agency
    max_polling_rate: Optional[float] = None
    developer_resources: str = "https://www.cttransit.com/about/developers"
    # presumably flags that the agency also offers a custom, non gtfs-realtime API
    additional_api: bool = True
    # one endpoint per realtime feed type
    vehicle_positions_url: str = (
        "https://s3.amazonaws.com/cttransit-realtime-prod/vehiclepositions.pb"
    )
    trip_updates_url: str = (
        "https://s3.amazonaws.com/cttransit-realtime-prod/tripupdates.pb"
    )
    alerts_url: str = "https://s3.amazonaws.com/cttransit-realtime-prod/alerts.pb"


# Dump the default field values as indented JSON into the agency config folder.
with (config_folder / "connecticut_transit.json").open("w") as fp:
    json.dump(vars(Connecticut_Transit()), fp, indent=2)


In [None]:
@dataclass
class Community_Transit:
    """GTFS-realtime feed settings for Community Transit.

    Every field carries its value as a default, so ``Community_Transit()``
    yields the complete configuration.
    """

    location: str = "Everett, WA, US"
    terms_of_service: str = "https://www.communitytransit.org/OpenData"
    # None: no polling rate recorded for this agency
    max_polling_rate: Optional[float] = None
    developer_resources: str = "https://www.communitytransit.org/OpenData"
    # one endpoint per realtime feed type
    vehicle_positions_url: str = (
        "https://s3.amazonaws.com/commtrans-realtime-prod/vehiclepositions.pb"
    )
    trip_updates_url: str = (
        "https://s3.amazonaws.com/commtrans-realtime-prod/tripupdates.pb"
    )
    alerts_url: str = "https://s3.amazonaws.com/commtrans-realtime-prod/alerts.pb"


# Dump the default field values as indented JSON into the agency config folder.
with (config_folder / "community_transit.json").open("w") as fp:
    json.dump(vars(Community_Transit()), fp, indent=2)


In [None]:
@dataclass
class Big_Blue_Bus:
    """GTFS-realtime feed settings for Big Blue Bus.

    Every field carries its value as a default, so ``Big_Blue_Bus()`` yields
    the complete configuration.
    """

    location: str = "Santa Monica, CA, US"
    terms_of_service: str = "http://gtfs.bigbluebus.com/terms-and-conditions"
    # None: no polling rate recorded for this agency
    max_polling_rate: Optional[float] = None
    developer_resources: str = "http://gtfs.bigbluebus.com/"
    # one endpoint per realtime feed type
    vehicle_positions_url: str = "http://gtfs.bigbluebus.com/vehiclepositions.bin"
    trip_updates_url: str = "http://gtfs.bigbluebus.com/tripupdates.bin"
    alerts_url: str = "http://gtfs.bigbluebus.com/alerts.bin"


# Dump the default field values as indented JSON into the agency config folder.
with (config_folder / "big_blue_bus.json").open("w") as fp:
    json.dump(vars(Big_Blue_Bus()), fp, indent=2)


In [None]:
## This agency seems to be currently down. The given domain, tmgtfsprd.sorttrpcloud.com,
## gives a 403, and the old domain, as per transitfeeds, gives an empty protobuf
# @dataclass
# class Southwest_Ohio_Regional_Transit_Authority:
#     location: str = "Cincinnati, OH, US"
#     terms_of_service: str = "https://www.go-metro.com/about-metro/developer-data"
#     max_polling_rate: float = None
#     developer_resources: str = "https://www.go-metro.com/about-metro/developer-data"
#     additional_api: bool = True
#     vehicle_positions_url: str = "http://developer.go-metro.com/TMGTFSRealTimeWebService/vehicle/VehiclePositions.pb"
#     trip_updates_url: str = "https://tmgtfsprd.sorttrpcloud.com/TMGTFSRealTimeWebService/tripupdate/tripupdates.pb"
#     alerts_url: str = (
#         "https://tmgtfsprd.sorttrpcloud.com/TMGTFSRealTimeWebService/alert/alerts.pb"
#     )
#     trapeze_url: str = "https://tmgtfsprd.sorttrpcloud.com/TMGTFSRealTimeWebService/gtfs-realtime/trapezerealtimefeed.pb"
#     note: str = (
#         "trapeze feed seems to be a combination of each of the three entity types"
#     )


# with open(config_folder.joinpath("southwest_ohio_regional_transit_authority.json"), "w") as fp:
#     json.dump(vars(Southwest_Ohio_Regional_Transit_Authority()), fp, indent=2)


In [None]:
@dataclass
class Central_Ohio_Transit_Authority:
    """GTFS-realtime feed settings for the Central Ohio Transit Authority.

    Every field carries its value as a default, so
    ``Central_Ohio_Transit_Authority()`` yields the complete configuration.
    """

    location: str = "Columbus, OH, US"
    terms_of_service: str = "https://www.cota.com/data/"
    # None: no polling rate recorded for this agency
    max_polling_rate: Optional[float] = None
    developer_resources: str = "https://www.cota.com/data/"
    # one endpoint per realtime feed type
    vehicle_positions_url: str = (
        "http://realtime.cota.com/TMGTFSRealTimeWebService/Vehicle/VehiclePositions.pb"
    )
    trip_updates_url: str = (
        "http://realtime.cota.com/TMGTFSRealTimeWebService/TripUpdate/TripUpdates.pb"
    )
    alerts_url: str = (
        "http://realtime.cota.com/TMGTFSRealTimeWebService/Alert/Alerts.pb"
    )


# Dump the default field values as indented JSON into the agency config folder.
with (config_folder / "central_ohio_transit_authority.json").open("w") as fp:
    json.dump(vars(Central_Ohio_Transit_Authority()), fp, indent=2)


In [None]:
@dataclass
class Capital_Metro:
    """GTFS-realtime feed settings for Capital Metro.

    Every field carries its value as a default, so ``Capital_Metro()`` yields
    the complete configuration.
    """

    location: str = "Austin, TX, US"
    terms_of_service: str = "https://www.capmetro.org/metrolabs/"
    # None: no polling rate recorded for this agency
    max_polling_rate: Optional[float] = None
    developer_resources: str = "https://www.capmetro.org/metrolabs/"
    vehicle_positions_url: str = (
        "https://data.texas.gov/download/eiei-9rpf/application%2Foctet-stream"
    )
    trip_updates_url: str = (
        "https://data.texas.gov/download/rmk2-acnw/application%2Foctet-stream"
    )
    # None: no alerts endpoint recorded for this agency
    alerts_url: Optional[str] = None


# Dump the default field values as indented JSON into the agency config folder.
with (config_folder / "capital_metro.json").open("w") as fp:
    json.dump(vars(Capital_Metro()), fp, indent=2)


In [None]:
@dataclass
class Capital_Area_Transportation_Authority:
    """GTFS-realtime feed settings for the Capital Area Transportation Authority.

    Every field carries its value as a default, so
    ``Capital_Area_Transportation_Authority()`` yields the complete
    configuration.
    """

    location: str = "Lansing, MI, US"
    terms_of_service: str = (
        "https://www.cata.org/Portals/0/CATAGTFSLicenseAgreement20150323.pdf"
    )
    # NOTE(review): the unit of this value is not stated here; confirm it
    max_polling_rate: Optional[float] = 30
    developer_resources: str = (
        "https://www.cata.org/About/Doing-Business-with-CATA/Developer-Resources"
    )
    # plain strings: these URLs have no placeholders, so no f-prefix is needed
    vehicle_positions_url: str = (
        "http://developers.cata.org/gtfsrt/vehicle/vehiclepositions.pb"
    )
    trip_updates_url: str = (
        "http://developers.cata.org/gtfsrt/tripupdate/tripupdates.pb"
    )
    alerts_url: str = "http://developers.cata.org/gtfsrt/alert/alerts.pb"
    note: str = "This feed might no longer be updating, but it is available"


# Dump the default field values as indented JSON into the agency config folder.
with (config_folder / "capital_area_transportation_authority.json").open("w") as fp:
    json.dump(vars(Capital_Area_Transportation_Authority()), fp, indent=2)


In [None]:
@dataclass
class Metro_Transit_Halifax:
    """GTFS-realtime feed settings for Metro Transit Halifax.

    Every field carries its value as a default, so ``Metro_Transit_Halifax()``
    yields the complete configuration.
    """

    location: str = "Halifax, NS, CA"
    terms_of_service: str = "https://www.halifax.ca/home/open-data/open-data-licence"
    # None: no polling rate recorded for this agency
    max_polling_rate: Optional[float] = None
    developer_resources: str = "https://www.halifax.ca/transportation/halifax-transit/transit-technology/general-transit-feed-gtfs"
    # one endpoint per realtime feed type
    vehicle_positions_url: str = (
        "https://gtfs.halifax.ca/realtime/Vehicle/VehiclePositions.pb"
    )
    trip_updates_url: str = "https://gtfs.halifax.ca/realtime/TripUpdate/TripUpdates.pb"
    alerts_url: str = "https://gtfs.halifax.ca/realtime/Alert/Alerts.pb"
    # extra feed; see ``note`` for what it appears to contain
    trapeze_url: str = (
        "https://gtfs.halifax.ca/realtime/GTFS-RealTime/TrapezeRealTimeFeed.pb"
    )
    note: str = (
        "trapeze feed seems to be a combination of each of the three entity types"
    )


# Dump the default field values as indented JSON into the agency config folder.
with (config_folder / "metro_transit_halifax.json").open("w") as fp:
    json.dump(vars(Metro_Transit_Halifax()), fp, indent=2)


In [None]:
@dataclass
class Edmonton_Transit_System:
    """GTFS-realtime feed settings for the Edmonton Transit System.

    Every field carries its value as a default, so
    ``Edmonton_Transit_System()`` yields the complete configuration.
    """

    location: str = "Edmonton, AB, CA"
    terms_of_service: str = "https://data.edmonton.ca/stories/s/City-of-Edmonton-Open-Data-Terms-of-Use/msh8-if28/"
    # None: no polling rate recorded for this agency
    max_polling_rate: Optional[float] = None
    developer_resources: str = "https://data.edmonton.ca/browse?tags=gtfs-realtime"
    # presumably flags that the agency also offers a custom, non gtfs-realtime API
    additional_api: bool = True
    # one endpoint per realtime feed type
    vehicle_positions_url: str = (
        "http://gtfs.edmonton.ca/TMGTFSRealTimeWebService/Vehicle/VehiclePositions.pb"
    )
    trip_updates_url: str = (
        "http://gtfs.edmonton.ca/TMGTFSRealTimeWebService/TripUpdate/TripUpdates.pb"
    )
    alerts_url: str = "http://gtfs.edmonton.ca/TMGTFSRealTimeWebService/Alert/Alerts.pb"
    # extra feed; see ``note`` for what it appears to contain
    trapeze_url: str = "http://gtfs.edmonton.ca/TMGTFSRealTimeWebService/GTFS-RealTime/TrapezeRealTimeFeed.pb"
    note: str = (
        "trapeze feed seems to be a combination of each of the three entity types"
    )


# Dump the default field values as indented JSON into the agency config folder.
with (config_folder / "edmonton_transit_system.json").open("w") as fp:
    json.dump(vars(Edmonton_Transit_System()), fp, indent=2)


In [None]:
@dataclass
class Calgary_Transit:
    """GTFS-realtime feed settings for Calgary Transit.

    Every field carries its value as a default, so ``Calgary_Transit()``
    yields the complete configuration.
    """

    location: str = "Calgary, AB, CA"
    terms_of_service: str = "https://data.calgary.ca/d/Open-Data-Terms/u45n-7awa"
    # None: no polling rate recorded for this agency
    max_polling_rate: Optional[float] = None
    developer_resources: str = "https://data.calgary.ca/Transportation-Transit/Calgary-Transit-Realtime-Service-Alerts-GTFS-RT/jhgn-ynqj"
    # one endpoint per realtime feed type
    vehicle_positions_url: str = (
        "https://data.calgary.ca/download/am7c-qe3u/application%2Foctet-stream"
    )
    trip_updates_url: str = (
        "https://data.calgary.ca/download/gs4m-mdc2/application%2Foctet-stream"
    )
    alerts_url: str = (
        "https://data.calgary.ca/download/jhgn-ynqj/application%2Foctet-stream"
    )


# Dump the default field values as indented JSON into the agency config folder.
with (config_folder / "calgary_transit.json").open("w") as fp:
    json.dump(vars(Calgary_Transit()), fp, indent=2)


In [None]:
@dataclass
class Burlington_Transit:
    """GTFS-realtime feed settings for Burlington Transit.

    Every field carries its value as a default, so ``Burlington_Transit()``
    yields the complete configuration.
    """

    # NOTE(review): location says Hamilton although every URL below is a
    # burlington.ca one; confirm this is intended
    location: str = "Hamilton, ON, CA"
    terms_of_service: str = "https://www.burlington.ca/en/services-for-you/resources/initiative%20projects/open_data/termsofuse.pdf"
    # None: no polling rate recorded for this agency
    max_polling_rate: Optional[float] = None
    developer_resources: str = "https://navburl-burlington.opendata.arcgis.com/documents/Burlington::transit-schedule-data-gtfs/explore"
    # one endpoint per realtime feed type
    vehicle_positions_url: str = (
        "https://opendata.burlington.ca/gtfs-rt/GTFS_VehiclePositions.pb"
    )
    trip_updates_url: str = "https://opendata.burlington.ca/gtfs-rt/GTFS_TripUpdates.pb"
    alerts_url: str = "https://opendata.burlington.ca/gtfs-rt/GTFS_ServiceAlerts.pb"


# Dump the default field values as indented JSON into the agency config folder.
with (config_folder / "burlington_transit.json").open("w") as fp:
    json.dump(vars(Burlington_Transit()), fp, indent=2)


In [None]:
@dataclass
class Barrie_Transit:
    """GTFS-realtime feed settings for Barrie Transit.

    Every field carries its value as a default, so ``Barrie_Transit()`` yields
    the complete configuration.
    """

    location: str = "Barrie, ON, CA"
    terms_of_service: str = "https://www.barrie.ca/Living/Getting%20Around/BarrieTransit/Pages/Barrie-GTFS.aspx"
    # None: no polling rate recorded for this agency
    max_polling_rate: Optional[float] = None
    developer_resources: str = "http://www.myridebarrie.ca/gtfs/"
    # one endpoint per realtime feed type
    vehicle_positions_url: str = (
        "http://www.myridebarrie.ca/gtfs/GTFS_VehiclePositions.pb"
    )
    trip_updates_url: str = "http://www.myridebarrie.ca/gtfs/GTFS_TripUpdates.pb"
    alerts_url: str = "http://www.myridebarrie.ca/gtfs/GTFS_ServiceAlerts.pb"


# Dump the default field values as indented JSON into the agency config folder.
with (config_folder / "barrie_transit.json").open("w") as fp:
    json.dump(vars(Barrie_Transit()), fp, indent=2)


In [None]:
@dataclass
class Duluth_Transit:
    """GTFS-realtime feed settings for Duluth Transit.

    Every field carries its value as a default, so ``Duluth_Transit()`` yields
    the complete configuration.
    """

    location: str = "Duluth, MN, US"
    terms_of_service: str = (
        "https://www.duluthtransit.com/home/doing-business/developer-resources/"
    )
    # None: no polling rate recorded for this agency
    max_polling_rate: Optional[float] = None
    developer_resources: str = (
        "https://www.duluthtransit.com/home/doing-business/developer-resources/"
    )
    # presumably flags that the agency also offers a custom, non gtfs-realtime API
    additional_api: bool = True
    # one endpoint per realtime feed type
    vehicle_positions_url: str = (
        "https://duluthtransit.com/gtfsrt/Vehicle/VehiclePositions.pb"
    )
    trip_updates_url: str = "https://duluthtransit.com/gtfsrt/TripUpdate/TripUpdates.pb"
    alerts_url: str = "https://duluthtransit.com/gtfsrt/Alert/Alerts.pb"
    # extra feed; see ``note`` for what it appears to contain
    trapeze_url: str = (
        "https://duluthtransit.com/gtfsrt/GTFS-RealTime/TrapezeRealTimeFeed.pb"
    )
    note: str = (
        "trapeze feed seems to be a combination of each of the three entity types"
    )


# Dump the default field values as indented JSON into the agency config folder.
with (config_folder / "duluth_transit.json").open("w") as fp:
    json.dump(vars(Duluth_Transit()), fp, indent=2)


In [None]:
@dataclass
class Metro_Transit_Madison:
    """GTFS-realtime feed settings for Metro Transit (Madison).

    Every field carries its value as a default, so ``Metro_Transit_Madison()``
    yields the complete configuration.
    """

    location: str = "Madison, WI, US"
    terms_of_service: str = (
        "http://transitdata.cityofmadison.com/MetroTransitDataTermsOfUse.pdf"
    )
    # None: no polling rate recorded for this agency
    max_polling_rate: Optional[float] = None
    developer_resources: str = (
        "https://www.cityofmadison.com/metro/business/information-for-developers"
    )
    # one endpoint per realtime feed type
    vehicle_positions_url: str = (
        "http://transitdata.cityofmadison.com/Vehicle/VehiclePositions.pb"
    )
    trip_updates_url: str = (
        "http://transitdata.cityofmadison.com/TripUpdate/TripUpdates.pb"
    )
    alerts_url: str = "http://transitdata.cityofmadison.com/Alert/Alerts.pb"


# Dump the default field values as indented JSON into the agency config folder.
with (config_folder / "metro_transit_madison.json").open("w") as fp:
    json.dump(vars(Metro_Transit_Madison()), fp, indent=2)


In [None]:
@dataclass
class Metro_Transit_Twin_Cities:
    """GTFS-realtime feed settings for Metro Transit (Twin Cities).

    Every field carries its value as a default, so
    ``Metro_Transit_Twin_Cities()`` yields the complete configuration.
    """

    location: str = "Minneapolis, MN, US"
    # empty string: no terms-of-service link recorded for this agency
    terms_of_service: str = ""
    # None: no polling rate recorded for this agency
    max_polling_rate: Optional[float] = None
    developer_resources: str = "https://svc.metrotransit.org/"
    # one endpoint per realtime feed type
    vehicle_positions_url: str = (
        "https://svc.metrotransit.org/mtgtfs/vehiclepositions.pb"
    )
    trip_updates_url: str = "https://svc.metrotransit.org/mtgtfs/tripupdates.pb"
    alerts_url: str = "https://svc.metrotransit.org/mtgtfs/alerts.pb"


# Dump the default field values as indented JSON into the agency config folder.
with (config_folder / "metro_transit_twin_cities.json").open("w") as fp:
    json.dump(vars(Metro_Transit_Twin_Cities()), fp, indent=2)


In [None]:
@dataclass
class Metro_Transit:
    """GTFS-realtime feed settings for Metro Transit (Saint Louis).

    Every field carries its value as a default, so ``Metro_Transit()`` yields
    the complete configuration.
    """

    location: str = "Saint Louis, MO, US"
    terms_of_service: str = "https://www.metrostlouis.org/developer-resources/"
    # None: no polling rate recorded for this agency
    max_polling_rate: Optional[float] = None
    developer_resources: str = "https://www.metrostlouis.org/developer-resources/"
    # plain strings: these URLs have no placeholders, so no f-prefix is needed
    vehicle_positions_url: str = (
        "https://www.metrostlouis.org/RealTimeData/StlRealTimeVehicles.pb"
    )
    trip_updates_url: str = (
        "https://www.metrostlouis.org/RealTimeData/StlRealTimeTrips.pb"
    )
    alerts_url: str = "https://www.metrostlouis.org/RealTimeData/StlRealTimeAlerts.pb"


# Dump the default field values as indented JSON into the agency config folder.
with (config_folder / "metro_transit.json").open("w") as fp:
    json.dump(vars(Metro_Transit()), fp, indent=2)


In [None]:
@dataclass
class AC_Transit:
    """GTFS-realtime feed settings for AC Transit.

    Every field carries its value as a default, so ``AC_Transit()`` yields the
    complete configuration.

    The URL defaults are built once, while the class body runs, from the
    ``API_KEY`` default. Passing a different ``API_KEY`` to the constructor
    does not change them.
    """

    location: str = "Oakland, CA, US"
    terms_of_service: str = "https://www.actransit.org/data-terms-and-conditions/"
    # None: no polling rate recorded for this agency
    max_polling_rate: Optional[float] = None
    developer_resources: str = "http://api.actransit.org/transit/Help"
    # presumably flags that the agency also offers a custom, non gtfs-realtime API
    additional_api: bool = True
    # placeholder token; it is interpolated into the URL defaults below
    API_KEY: str = "REDACTED"
    vehicle_positions_url: str = (
        f"https://api.actransit.org/transit/gtfsrt/vehicles?token={API_KEY}"
    )
    # trip updates and alerts previously repeated the vehicles endpoint
    # (copy-paste slip); each feed type now points at its own endpoint
    trip_updates_url: str = (
        f"https://api.actransit.org/transit/gtfsrt/tripupdates?token={API_KEY}"
    )
    alerts_url: str = (
        f"https://api.actransit.org/transit/gtfsrt/alerts?token={API_KEY}"
    )


# Dump the default field values as indented JSON into the agency config folder.
with (config_folder / "ac_transit.json").open("w") as fp:
    json.dump(vars(AC_Transit()), fp, indent=2)


In [None]:
@dataclass
class MTA_NYC:
    """GTFS-realtime feed settings for MTA (New York City).

    Every field carries its value as a default, so ``MTA_NYC()`` yields the
    complete configuration.

    The URL defaults are built once, while the class body runs, from the
    ``API_KEY`` default. Passing a different ``API_KEY`` to the constructor
    does not change them.
    """

    # "City, ST, US" format, matching the other agency configs
    location: str = "New York, NY, US"
    terms_of_service: str = "http://web.mta.info/developers/developer-data-terms.html"
    # None: no polling rate recorded for this agency
    max_polling_rate: Optional[float] = None
    developer_resources: str = "http://bt.mta.info/wiki/Developers/GTFSRt"
    # presumably flags that the agency also offers a custom, non gtfs-realtime API
    additional_api: bool = True
    # placeholder key; it is interpolated into the URL defaults below
    API_KEY: str = "REDACTED"
    vehicle_positions_url: str = (
        f"http://gtfsrt.prod.obanyc.com/vehiclePositions?key={API_KEY}"
    )
    trip_updates_url: str = f"http://gtfsrt.prod.obanyc.com/tripUpdates?key={API_KEY}"
    alerts_url: str = f"http://gtfsrt.prod.obanyc.com/alerts?key={API_KEY}"
    note: str = "Get API_KEY from https://register.developer.obanyc.com/"


# Dump the default field values as indented JSON into the agency config folder.
with (config_folder / "mta_nyc.json").open("w") as fp:
    json.dump(vars(MTA_NYC()), fp, indent=2)


In [None]:
# @dataclass
# class NAME:
#     location: str = ""
#     terms_of_service: str = ""
#     max_polling_rate: float = None
#     developer_resources: str = ""
#     additional_api: bool = False
#     API_KEY: str = ""
#     vehicle_positions_url: str = ""
#     trip_updates_url: str = ""
#     alerts_url: str = ""
#     trapeze_url: str = ""
#     note: str = ""

# with open(config_folder.joinpath("name.json"), "w") as fp:
#     json.dump(vars(NAME()), fp, indent=2)
