### Imports

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np

In [None]:
# Read the GIS stop export and reduce it to one row per stop name.
bus_stops = pd.read_csv('bus_stops_gis.csv')
bus_stops.head()  # preview call; its result is not assigned or used below
# Flag every repeat of a Stop_name after its first occurrence, drop those
# rows, and renumber the remaining rows from 0.
repeated_name = bus_stops.duplicated(subset=['Stop_name'], keep='first')
bus_stop_dedup = bus_stops.loc[~repeated_name].reset_index(drop=True)

### Get routes and fill in stop lat/long

In [None]:
# Get all bus services
# The timeout keeps a stalled connection from hanging the cell, and
# raise_for_status() turns an HTTP error response into an exception instead
# of letting an error page be parsed as if it were the route index.
page = requests.get('https://bustime.mta.info/routes/', timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')
routes = soup.find('ul', class_='routeList')
if routes is None:
    # soup.find returns None when nothing matches; fail with a clear message
    # rather than an AttributeError on the next line.
    raise RuntimeError("No <ul class='routeList'> element found on https://bustime.mta.info/routes/")
# Text of every link in the list, then only the part before the first
# non-breaking space, which is used below as the service identifier.
svc = [x.text for x in routes.find_all('a')]
svcs = [x.split('\xa0')[0] for x in svc]

In [None]:
# Build all_routes: one row per stop of every direction of every service in
# svcs, pairing each stop (origin_*) with the stop that follows it on the same
# direction (dest_*). The last stop of a direction has no destination.
route_columns = ['route_id','direction','seq_id','origin_stop_id','origin_stop_name','origin_lat','origin_lon','dest_stop_id','dest_stop_name','dest_lat','dest_lon']
origin_columns = ['origin_stop_id','origin_stop_name','origin_lat','origin_lon']
dest_columns = ['dest_stop_id','dest_stop_name','dest_lat','dest_lon']

# Coordinate lookups, built once outside the loop. bus_stop_dedup is unique by
# Stop_name only, so the ID lookup is de-duplicated by Stop_ID as well;
# otherwise a repeated Stop_ID would multiply rows in the left merge below.
coords_by_name = bus_stop_dedup[['Stop_name', 'stop_lat', 'stop_lon']]
coords_by_id = bus_stop_dedup.drop_duplicates(subset=['Stop_ID'], keep='first')[['Stop_ID', 'stop_lat', 'stop_lon']]

# Per-direction frames are collected here and concatenated once at the end
# instead of growing all_routes inside the loop.
route_frames = []

for svc in svcs:
    try:
        page = requests.get(f'https://bustime.mta.info/m/?q={svc}', timeout=30)
        page.raise_for_status()
    except requests.RequestException as err:
        # One failing service should not discard everything scraped so far.
        print(f'Skipping {svc}: {err}')
        continue

    soup = BeautifulSoup(page.content, 'html.parser')
    routes = soup.find_all('div', class_='directionForRoute')
    directions = soup.find_all('a', class_='direction-link')
    if len(routes) != len(directions):
        print(f'Warning for {svc}: {len(routes)} stop lists but {len(directions)} direction links')

    # A service with no direction blocks simply contributes no rows.
    for route, direction_link in zip(routes, directions):
        stop_links = route.find_all('a')

        # Destination label: the text after the first 'to', up to the next
        # non-breaking space. maxsplit=1 keeps names that themselves contain
        # 'to' from being cut short.
        direction = direction_link.text.split('to', 1)[1].split('\xa0')[1]

        # create temp df for route data and fill with route stops, id and direction
        df_route_temp = pd.DataFrame(columns = ['route_id','direction','origin_stop_name', 'origin_stop_id'])
        df_route_temp['origin_stop_name'] = [stop.text for stop in stop_links]
        df_route_temp['route_id'] = svc
        df_route_temp['direction'] = direction
        # The stop id is whatever follows 'q=' in each stop link's href.
        df_route_temp['origin_stop_id'] = [stop['href'].split('q=')[1] for stop in stop_links]
        df_route_temp['origin_stop_id'] = df_route_temp['origin_stop_id'].astype('int64')

        # Look coordinates up by stop name first, then by stop id; the id
        # match only fills coordinates the name match left empty.
        df_join = df_route_temp.merge(coords_by_name, left_on='origin_stop_name', right_on='Stop_name', how='left')
        df_join = df_join.merge(coords_by_id, left_on='origin_stop_id', right_on='Stop_ID', how='left', suffixes=('_name', '_id'))
        df_join['origin_lat'] = df_join['stop_lat_name'].fillna(df_join['stop_lat_id'])
        df_join['origin_lon'] = df_join['stop_lon_name'].fillna(df_join['stop_lon_id'])

        # get destination stop by taking stop from row n+1
        for origin_col, dest_col in zip(origin_columns, dest_columns):
            df_join[dest_col] = df_join[origin_col].shift(-1)

        # create seq_id for stops on current route (1-based)
        df_join['seq_id'] = range(1, len(df_join) + 1)

        route_frames.append(df_join[route_columns])

    print(f'Done for {svc}')

if route_frames:
    all_routes = pd.concat(route_frames, ignore_index=True)
else:
    # Nothing scraped: keep an empty frame with the expected columns.
    all_routes = pd.DataFrame(columns=route_columns)

# The last stop of each direction has no destination; store 0 so the column
# can be held as int64.
all_routes['dest_stop_id'] = all_routes['dest_stop_id'].fillna(0).astype('int64')

In [None]:
# Save the route table to disk; the DataFrame index is not written.
all_routes.to_csv(path_or_buf='bus_routes.csv', index=False)

In [None]:
# change id dtype to int64
# Re-read the saved routes, coerce both stop-id columns to int64 (a missing
# dest_stop_id becomes 0), and write the corrected frame back. The cell only
# needs bus_routes.csv on disk, so it can run without repeating the scrape.
df = pd.read_csv('bus_routes.csv')
df['origin_stop_id'] = df['origin_stop_id'].astype('int64')
df['dest_stop_id'] = df['dest_stop_id'].fillna(0).astype('int64')
df.to_csv('bus_routes.csv', index=False)

### Get list of bus services with direction

In [None]:
# One row per distinct (service, direction) pair found in all_routes.
bus_svcs = all_routes[['route_id', 'direction']].rename(columns={'route_id': 'bus_svc'})
# drop_duplicates returns a new frame, so the result must be assigned back;
# otherwise the CSV would contain one row per stop.
bus_svcs = bus_svcs.drop_duplicates(keep='first')
bus_svcs.to_csv('bus_svc.csv', index=False)

### Get bus stops

In [None]:
# Derive the stop table from the saved routes: keep only the origin-stop
# columns, drop exact duplicate rows (first occurrence wins), renumber the
# rows from 0, and write the result without the index.
df = (
    pd.read_csv('bus_routes.csv')
    .loc[:, ['origin_stop_id', 'origin_stop_name', 'origin_lat', 'origin_lon']]
    .drop_duplicates(keep='first', ignore_index=True)
)
df.to_csv('bus_stops.csv', index=False)