In [89]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [2]:
# Roster page on ttmg.org that the following cells scrape for depot information.
BASE_URL = "https://www.ttmg.org/insidersguide/new-york-mta-bus-roster/new-york-mta-bus-roster-depot/"
# requests never times out by default, so an unresponsive server would hang the
# cell forever; raise_for_status() keeps us from parsing an HTTP error page as
# if it were the roster.
response = requests.get(BASE_URL, timeout=30)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, "html5lib")

In [3]:
# Collect the stripped text of every "mw-headline" span on the roster page.
headline_spans = soup.findAll("span", {"class": "mw-headline"})
divisions = [span.getText().strip() for span in headline_spans]
print(divisions)

['Bronx Division', 'Manhattan Division', 'Queens Division', 'Brooklyn Division', 'Staten Island Division']


In [4]:
# One independent, empty frame per borough. A chained assignment
# (a = b = ... = pd.DataFrame()) would bind all five names to the *same*
# object, so a change made through one name would appear under every other.
bronx_df, manhattan_df, queens_df, brooklyn_df, si_df = (pd.DataFrame() for _ in range(5))
# The unpacking requires exactly five tables with this inline style and raises
# ValueError otherwise. The tables are assumed to appear in the same order as
# the division headlines on the page -- TODO confirm if the page layout changes.
bronx, manhattan, queens, brooklyn, si = soup.findAll("table", {"style": "border-spacing: 2px; border: 3px solid white; font-size: 90%; width: 100%"})
boroughs = {"Bronx": bronx, "Manhattan": manhattan, "Queens": queens, "Brooklyn": brooklyn, "Staten Island": si}

In [5]:
import re

def save_depot_information(boroughs):
    """Collect borough, address and fleet size for every depot.

    Args:
        boroughs: Mapping of borough name to the parsed roster table of that
            borough. Each table only needs BeautifulSoup's ``findAll``; each
            returned row needs ``find("td")`` whose ``getText()`` yields the
            depot name on its second line, the address on its third and a
            line ending in the bus count on its fourth.

    Returns:
        A DataFrame indexed by depot name (index named "Depot") with the
        columns "Borough", "Address" and "Total buses". "Total buses" holds
        integers. If a depot name occurs more than once, the last occurrence
        wins.

    Raises:
        IndexError: If a depot header cell has fewer than four text lines.
        ValueError: If the last word of the bus-count line is not an integer.
    """
    columns = ["Borough", "Address", "Total buses"]
    header_attrs = {"style": "background:#000000; color:#FFFFFF", "align": "left"}

    # Gather all rows first and build the frame once: enlarging a DataFrame
    # one row at a time re-allocates it on every insert. A dict keyed by depot
    # name keeps the "later entry overwrites earlier one" behaviour and the
    # first-seen ordering that row-wise assignment had.
    records = {}
    for borough_name, borough in boroughs.items():
        for facility in borough.findAll("tr", header_attrs):
            information = facility.find("td").getText().split("\n")
            name = information[1]
            # Drop a leading parenthesised annotation such as "(...), ".
            address = re.sub(r"\([^)]*\), ", "", information[2])
            # The count is whatever follows the last space of the line.
            count_line = information[3]
            total_buses = int(count_line[count_line.rfind(" ") + 1:])
            records[name] = (borough_name, address, total_buses)

    df = pd.DataFrame(list(records.values()), index=list(records.keys()), columns=columns)
    df.index.name = "Depot"
    return df

In [6]:
# Build the depot table from the per-borough roster tables and preview the
# first rows.
depots_df = save_depot_information(boroughs)
depots_df.head()

Unnamed: 0_level_0,Borough,Address,Total buses
Depot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Eastchester Depot (EC),Bronx,"3320 Tillotson Av, Bronx, NY 10475",142
Gun Hill Depot (GH),Bronx,"1910 Bartow Av, Bronx, NY 10469",284
Kingsbridge Depot (KB),Bronx,"4065 10th Av, New York, NY 10034",272
West Farms Depot (WF),Bronx,"Bronx Division Headquarters, 1100 E 177th St, ...",321
Yonkers Depot (YO),Bronx,"59 Babcock Pl, Yonkers, NY 10701",84


In [7]:
# Output path for the depot table, relative to the notebook's working
# directory. Kept in a named constant so the location is easy to find and tune.
DEPOTS_FILENAME = "../../data/depot_data/depots.csv"
depots_df.to_csv(DEPOTS_FILENAME)

In [8]:
# Index page whose navigation list links to the individual depot roster pages.
DEPOT_URL = "https://sites.google.com/site/mtanewyorkcitybusroster/new-york-city-bus-roster"
# requests never times out by default, so an unresponsive server would hang the
# cell forever; raise_for_status() keeps us from parsing an HTTP error page.
depot_response = requests.get(DEPOT_URL, timeout=30)
depot_response.raise_for_status()
depot_html = depot_response.text
depot_soup = BeautifulSoup(depot_html, "html5lib")

In [9]:
# Locate the expandable navigation list and pull the target of every link in it
# (anchors without an href are ignored).
depot_list = depot_soup.find("ul", {"role": "navigation", "class": "has-expander"})
anchors = depot_list.findAll("a")
all_urls = [
    anchor["href"]
    for anchor in anchors
    if anchor.has_attr("href")
]
print(len(all_urls))

32


In [10]:
# Keep only links that point below the roster section, i.e. the depot pages.
depot_page_pattern = re.compile(r"/site/mtanewyorkcitybusroster/new-york-city-bus-roster/.*")
depot_urls = list(filter(depot_page_pattern.match, all_urls))
# The first remaining link is the 126 Depot page, which is dropped because
# that depot is closed.
depot_urls.pop(0)
print(len(depot_urls))

27


In [165]:
def process_depot_urls(depot_urls):
    """Scrape every depot page and list each bus with make, model, year and depot.

    Args:
        depot_urls: Iterable of site-relative paths; each one is appended to
            ``https://sites.google.com`` and fetched.

    Returns:
        A DataFrame indexed by bus number (index named "ID") with the columns
        "Make", "Model", "Year" and "Depot". "Year" is kept as the scraped
        text. If a bus number occurs more than once, the last occurrence wins.

    Raises:
        requests.HTTPError: If a depot page answers with an error status.
        requests.Timeout: If a depot page does not answer within 30 seconds.
    """
    columns = ["Make", "Model", "Year", "Depot"]
    whitespace = re.compile(r"\s+")

    # Collect everything in a dict and build the frame once at the end:
    # enlarging a DataFrame row by row re-allocates it for every single bus.
    # Keying by bus number keeps the "later entry overwrites earlier one"
    # behaviour and first-seen ordering of row-wise assignment.
    records = {}
    for depot_url in tqdm(depot_urls):
        response = requests.get(f"https://sites.google.com{depot_url}", timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html5lib")
        depot_name = get_depot_name(soup)
        for table in get_tables(soup):
            for row in table.findAll("tr"):
                row_info = process_row(row)
                # First kept cell: make/model; third from last: years;
                # second from last: bus numbers.
                make_model, years, ids = row_info[0], row_info[-3], row_info[-2]
                make, model = make_model.split("\n", 1)
                make, model = whitespace.sub(" ", make), whitespace.sub(" ", model)
                # Years and bus-number lines are paired up line by line.
                for year, bus_ids in zip(years.split("\n"), ids.split("\n")):
                    for ran in bus_ids.split(","):
                        ran = whitespace.sub("", ran)
                        try:
                            nums = nums_from_range(ran)
                        except ValueError:
                            # Not a number or "a-b" range (e.g. free text):
                            # skip it. Only ValueError is caught so that
                            # Ctrl-C and real bugs still surface.
                            continue
                        for num in nums:
                            records[num] = (make, model, year, depot_name)

    df = pd.DataFrame(list(records.values()), index=list(records.keys()), columns=columns)
    df.index.name = "ID"
    return df

def get_depot_name(soup):
    """Return the text of the ``sites-page-title`` span of a parsed depot page."""
    title_span = soup.find("span", {"id": "sites-page-title"})
    return title_span.getText()

def get_tables(soup):
    """Return the bordered tables inside the main canvas div of a depot page."""
    canvas_attrs = {"id": "sites-canvas-main", "class": "sites-canvas-main"}
    table_attrs = {"border": "1", "bordercolor": "#888", "cellspacing": "0"}
    canvas = soup.find("div", canvas_attrs)
    return canvas.findAll("table", table_attrs)

def process_row(row):
    """Return the text of each ``td`` in ``row`` whose style starts with "text-".

    Cells without a ``style`` attribute, or whose style does not begin with
    "text-", are left out. Text fragments inside a cell are joined with
    newlines.
    """
    return [
        cell.getText(separator="\n")
        for cell in row.findAll("td")
        if cell.has_attr("style") and re.match("text-.*", cell["style"])
    ]

def nums_from_range(ran):
    """Expand ``"a-b"`` into ``[a, ..., b]`` (inclusive) or ``"a"`` into ``[a]``.

    Raises:
        ValueError: If a part is not an integer or there is more than one "-".
    """
    parts = ran.split("-")
    if len(parts) != 1:
        first, last = parts
        return list(range(int(first), int(last) + 1))
    return [int(parts[0])]

In [161]:
# Dry run of the scraping loop on a single depot page: print every bus that
# would be recorded instead of storing it.
# NOTE: this rebinds the notebook-level `html` and `soup` that were created for
# the roster page, so cells that read them must not be re-run after this one.
depot_url = depot_urls[14]
html = requests.get(f"https://sites.google.com{depot_url}", timeout=30).text
soup = BeautifulSoup(html, "html5lib")
depot_name = get_depot_name(soup)
tables = get_tables(soup)
for table in tables:
    for row in table.findAll("tr"):
        row_info = process_row(row)
        # First kept cell: make/model; third from last: years; second from
        # last: bus numbers.
        make_model, years, ids = row_info[0], row_info[-3], row_info[-2]
        make, model = make_model.split("\n", 1)
        make, model = re.sub(r"\s+", " ", make), re.sub(r"\s+", " ", model)
        # Distinct names for the cell text (`years`) and the per-line value
        # (`year`) so the loop no longer overwrites the variable it iterates.
        for year, bus_id in zip(years.split("\n"), ids.split("\n")):
            for ran in bus_id.split(","):
                ran = re.sub(r"\s+", "", ran)
                try:
                    nums = nums_from_range(ran)
                except ValueError:
                    # Skip entries that are not a number or "a-b" range; only
                    # ValueError is caught so Ctrl-C and real bugs surface.
                    continue
                for num in nums:
                    print(num, depot_name, make, model, int(year))

1200 Kingsbridge Bus Depot (MaBSTOA) Volvo Group North Ameirca Nova Bus LFS Articulated 2009
1202 Kingsbridge Bus Depot (MaBSTOA) Volvo Group North Ameirca Nova Bus LFS Articulated 2010
1204 Kingsbridge Bus Depot (MaBSTOA) Volvo Group North Ameirca Nova Bus LFS Articulated 2010
1206 Kingsbridge Bus Depot (MaBSTOA) Volvo Group North Ameirca Nova Bus LFS Articulated 2010
1207 Kingsbridge Bus Depot (MaBSTOA) Volvo Group North Ameirca Nova Bus LFS Articulated 2010
1208 Kingsbridge Bus Depot (MaBSTOA) Volvo Group North Ameirca Nova Bus LFS Articulated 2010
1209 Kingsbridge Bus Depot (MaBSTOA) Volvo Group North Ameirca Nova Bus LFS Articulated 2010
1210 Kingsbridge Bus Depot (MaBSTOA) Volvo Group North Ameirca Nova Bus LFS Articulated 2010
1211 Kingsbridge Bus Depot (MaBSTOA) Volvo Group North Ameirca Nova Bus LFS Articulated 2010
1212 Kingsbridge Bus Depot (MaBSTOA) Volvo Group North Ameirca Nova Bus LFS Articulated 2010
1213 Kingsbridge Bus Depot (MaBSTOA) Volvo Group North Ameirca Nova Bu

In [166]:
# Scrape all depot pages (one HTTP request per URL, with a progress bar) and
# preview the first rows of the resulting per-bus table.
buses_df = process_depot_urls(depot_urls)
buses_df.head()




  0%|          | 0/27 [00:00<?, ?it/s][A[A[A


  4%|▎         | 1/27 [00:00<00:23,  1.12it/s][A[A[A


  7%|▋         | 2/27 [00:02<00:25,  1.04s/it][A[A[A


 11%|█         | 3/27 [00:03<00:23,  1.02it/s][A[A[A


 15%|█▍        | 4/27 [00:03<00:21,  1.06it/s][A[A[A


 19%|█▊        | 5/27 [00:04<00:20,  1.05it/s][A[A[A


 22%|██▏       | 6/27 [00:05<00:19,  1.08it/s][A[A[A


 26%|██▌       | 7/27 [00:06<00:17,  1.13it/s][A[A[A


 30%|██▉       | 8/27 [00:07<00:17,  1.09it/s][A[A[A


 33%|███▎      | 9/27 [00:08<00:16,  1.10it/s][A[A[A


 37%|███▋      | 10/27 [00:09<00:15,  1.10it/s][A[A[A


 41%|████      | 11/27 [00:10<00:15,  1.06it/s][A[A[A


 44%|████▍     | 12/27 [00:11<00:13,  1.09it/s][A[A[A


 48%|████▊     | 13/27 [00:12<00:13,  1.08it/s][A[A[A


 52%|█████▏    | 14/27 [00:13<00:11,  1.12it/s][A[A[A


 56%|█████▌    | 15/27 [00:13<00:10,  1.12it/s][A[A[A


 59%|█████▉    | 16/27 [00:14<00:10,  1.08it/s][A[A[A


 63%|██████▎

Unnamed: 0_level_0,Make,Model,Year,Depot
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3516,Daimler Buses,Orion VII 2nd Generation+ Hybrid Low Floor,2006,Baisley Park Depot (MTA Bus)
3517,Daimler Buses,Orion VII 2nd Generation+ Hybrid Low Floor,2006,Baisley Park Depot (MTA Bus)
3518,Daimler Buses,Orion VII 2nd Generation+ Hybrid Low Floor,2006,Baisley Park Depot (MTA Bus)
3519,Daimler Buses,Orion VII 2nd Generation+ Hybrid Low Floor,2006,Baisley Park Depot (MTA Bus)
3520,Daimler Buses,Orion VII 2nd Generation+ Hybrid Low Floor,2006,Baisley Park Depot (MTA Bus)


In [168]:
# Persist the per-bus table as CSV; the path is relative to the notebook's
# working directory.
BUSES_ID_FILENAME = "../../data/depot_data/buses_id_data.csv"
buses_df.to_csv(BUSES_ID_FILENAME)