In [18]:
import pandas as pd
import matplotlib.pyplot as plt
import altair as alt
import numpy as np

In [3]:
# Speed-restriction records, produced by speed_restrictions_preprocessing.ipynb
speed_restr = pd.read_csv('speed_restrictions.csv')

# Daily gated-station entries (AM rush, PM rush, full day), one CSV per year,
# produced by data_processing.ipynb
gse2018, gse2020, gse2023 = (
    pd.read_csv(csv_path)
    for csv_path in ('2018_entries_agg.csv', '2020_entries_agg.csv', '2023_entries_agg.csv')
)
gse2020

Unnamed: 0.1,Unnamed: 0,date,stop_id,am_rush,pm_rush,full_day
0,0,2020-01-01,place-alfcl,444,722,2449
1,1,2020-01-01,place-andrw,354,576,1668
2,2,2020-01-01,place-aport,920,1200,3883
3,3,2020-01-01,place-aqucl,104,1007,2015
4,4,2020-01-01,place-armnl,99,807,1592
...,...,...,...,...,...,...
22529,22529,2020-12-31,place-welln,259,239,816
22530,22530,2020-12-31,place-wimnl,90,64,260
22531,22531,2020-12-31,place-wlsta,300,59,576
22532,22532,2020-12-31,place-wondl,378,166,935


In [4]:
# Parse every date column into pandas datetime values
speed_restr['start_date'] = pd.to_datetime(speed_restr['start_date'])
speed_restr['end_date'] = pd.to_datetime(speed_restr['end_date'])

gse2018['date'] = pd.to_datetime(gse2018['date'])

gse2020['date'] = pd.to_datetime(gse2020['date'])

gse2023['date'] = pd.to_datetime(gse2023['date'])

In [5]:
# Keep only the 2020 entries inside the COVID window (2020-03-15 through 2020-07-01, inclusive)
gse_COVID = (
    gse2020
    .loc[
        (gse2020['date'] >= pd.to_datetime('2020-03-15'))
        & (gse2020['date'] <= pd.to_datetime('2020-07-01'))
    ]
    .reset_index(drop=True)          # renumber rows without keeping the old index as a column
    .drop(columns=['Unnamed: 0'])    # leftover index column from the CSV
)
gse_COVID

Unnamed: 0,date,stop_id,am_rush,pm_rush,full_day
0,2020-03-15,place-alfcl,204,291,961
1,2020-03-15,place-andrw,325,470,1359
2,2020-03-15,place-aport,815,852,3075
3,2020-03-15,place-aqucl,82,483,1109
4,2020-03-15,place-armnl,78,380,818
...,...,...,...,...,...
6751,2020-07-01,place-welln,523,340,1330
6752,2020-07-01,place-wimnl,269,176,707
6753,2020-07-01,place-wlsta,384,130,840
6754,2020-07-01,place-wondl,905,409,2079


In [6]:
# Add day-of-week columns to each DataFrame (Monday=0 ... Sunday=6)
speed_restr['start_day_of_week'] = speed_restr['start_date'].dt.weekday
speed_restr['end_day_of_week'] = speed_restr['end_date'].dt.weekday

gse2018['day_of_week'] = gse2018['date'].dt.weekday

gse_COVID['day_of_week'] = gse_COVID['date'].dt.weekday

gse2023['day_of_week'] = gse2023['date'].dt.weekday

gse_COVID

Unnamed: 0,date,stop_id,am_rush,pm_rush,full_day,day_of_week
0,2020-03-15,place-alfcl,204,291,961,6
1,2020-03-15,place-andrw,325,470,1359,6
2,2020-03-15,place-aport,815,852,3075,6
3,2020-03-15,place-aqucl,82,483,1109,6
4,2020-03-15,place-armnl,78,380,818,6
...,...,...,...,...,...,...
6751,2020-07-01,place-welln,523,340,1330,2
6752,2020-07-01,place-wimnl,269,176,707,2
6753,2020-07-01,place-wlsta,384,130,840,2
6754,2020-07-01,place-wondl,905,409,2079,2


In [7]:
# Mean of the numeric columns per (weekday, station) over the COVID window
gse_COVID_avg_per_wd = (
    gse_COVID
    .groupby(['day_of_week', 'stop_id'])
    .mean(numeric_only=True)
)
# Flatten the group keys back into columns; value columns get an "_avg" suffix
gse_COVID_avg = gse_COVID_avg_per_wd.add_suffix('_avg').reset_index()
gse_COVID_avg

Unnamed: 0,day_of_week,stop_id,am_rush_avg,pm_rush_avg,full_day_avg
0,0,place-alfcl,307.125000,248.125000,862.437500
1,0,place-andrw,358.437500,364.250000,1148.750000
2,0,place-aport,467.866667,385.533333,1427.133333
3,0,place-aqucl,61.785714,256.285714,600.142857
4,0,place-armnl,71.250000,197.562500,441.875000
...,...,...,...,...,...
443,6,place-welln,104.937500,141.875000,448.687500
444,6,place-wimnl,70.937500,56.125000,226.562500
445,6,place-wlsta,82.625000,63.125000,249.875000
446,6,place-wondl,224.285714,186.071429,748.500000


In [8]:
# Function to return average ridership per day of week and station between two dates (inclusive)
# for any gated-station-entries (GSE) DataFrame
def avg_per_day_between(start, end, gse_df):
    """Average the numeric columns of ``gse_df`` per weekday and station within a date window.

    Parameters
    ----------
    start, end
        Inclusive window bounds. Anything comparable with the ``date`` column
        works (a pandas Timestamp or a date string such as '2023-05-14').
    gse_df : pandas.DataFrame
        Must contain ``date``, ``day_of_week`` and ``stop_id`` columns. A leftover
        ``Unnamed: 0`` column from the CSV round-trip is discarded if present.

    Returns
    -------
    pandas.DataFrame
        One row per (day_of_week, stop_id) with the mean of every remaining
        numeric column, each renamed with an ``_avg`` suffix.
    """
    in_window = (gse_df.date >= start) & (gse_df.date <= end)

    # errors='ignore' keeps this usable on frames that were loaded without the
    # stray CSV index column; previously that raised KeyError.
    timeframe = gse_df.loc[in_window].drop(columns=['Unnamed: 0'], errors='ignore')

    return (
        timeframe
        .groupby(['day_of_week', 'stop_id'])
        .mean(numeric_only=True)
        .add_suffix('_avg')
        .reset_index()
    )

# Example: 2023 weekday averages per station between 2023-05-14 and 2023-09-21
avg_per_day_between(start='2023-05-14', end='2023-09-21', gse_df=gse2023)

Unnamed: 0,day_of_week,stop_id,am_rush_avg,pm_rush_avg,full_day_avg
0,0,place-alfcl,1908.263158,974.736842,3736.368421
1,0,place-andrw,1067.052632,832.105263,2821.210526
2,0,place-aport,2278.473684,2217.526316,7544.157895
3,0,place-aqucl,270.000000,1907.894737,3814.578947
4,0,place-armnl,374.631579,1487.736842,2920.473684
...,...,...,...,...,...
443,6,place-welln,367.631579,445.526316,1494.000000
444,6,place-wimnl,326.368421,305.263158,1037.052632
445,6,place-wlsta,231.411765,115.588235,579.470588
446,6,place-wondl,873.736842,809.157895,3028.105263


In [9]:
# Show only the speed restrictions whose Location_Type is 'Station'
speed_restr.loc[speed_restr['Location_Type'] == 'Station']

Unnamed: 0.1,Unnamed: 0,ID,Loc_GTFS_Stop_ID,Restriction_Status,Restriction_Reason,Track_Direction,Line,Branch,Track_Name,Location_Type,...,Restriction_Distance_Feet,Line_Restricted_Track_Pct,Systemwide_Restricted_Track_Pct,SR_Restriction_Distance_Span,Restriction_Path,start_date,end_date,Restriction_Length_Days,start_day_of_week,end_day_of_week
0,0,20,place-aport,Active Restriction,,WB,Blue Line,Blue Line,BL WB,Station,...,101.0,0.001533,0.000140,Multi-Segment,End,2023-10-17,2023-10-19,2 days,1,3
4,4,44,place-rugg,Active Restriction,Track,NB,Orange Line,Orange Line,OL NB,Station,...,400.0,0.003361,0.000555,Multi-Segment,Start,2023-10-17,2023-12-21,65 days,1,3
6,6,45,place-bbsta,Active Restriction,Track,NB,Orange Line,Orange Line,OL NB,Station,...,300.0,0.002521,0.000416,Multi-Segment,Start,2023-10-20,2023-12-31,72 days,4,6
8,8,46,place-bbsta,Active Restriction,Track,SB,Orange Line,Orange Line,OL SB,Station,...,301.0,0.002529,0.000418,Multi-Segment,End,2023-10-20,2023-12-20,61 days,4,2
10,10,47,place-rcmnl,Active Restriction,Track,SB,Orange Line,Orange Line,OL SB,Station,...,101.0,0.000849,0.000140,Multi-Segment,End,2023-10-20,2023-12-31,72 days,4,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
690,690,572956,place-capst,Restriction Opened,Track,IB,Red Line,Red Line - Mattapan,Mattapan Line SB,Station,...,100.0,0.000399,0.000139,Multi-Segment,Start,2023-08-31,2023-10-29,59 days,3,6
712,712,576807,place-kencl,Active Restriction,Track,EB,Green Line,Green Line - C,GL Kenmore-Cleveland Cir. EB,Station,...,170.0,0.000596,0.000236,Single Segment,Start|End,2023-09-15,2023-12-31,107 days,4,6
715,715,577270,place-knncl,Active Restriction,Track,NB,Red Line,Red Line - Braintree,RL Alewife-Braintree NB,Station,...,167.0,0.000667,0.000232,Multi-Segment,Start,2023-09-18,2023-09-28,10 days,0,3
718,718,577616,place-wimnl,Active Restriction,Track,WB,Blue Line,Blue Line,BL WB,Station,...,300.0,0.004553,0.000416,Multi-Segment,Start,2023-09-20,2023-10-19,29 days,2,3


In [10]:
# Stops table read from the mbta_gtfs folder; stop_name_from_stop_id looks up
# its stop_id and stop_name columns.
stops_df = pd.read_csv('mbta_gtfs/stops.txt')

In [11]:
# Helper function to filter GSE data by one or more stop ids
# If multiple IDs are given, the numeric columns are averaged across the matching stations per weekday
def filter_and_avg_by_stop_ids(stop_ids, dataframe):
    """Average the numeric columns of ``dataframe`` per weekday over the given stations.

    Parameters
    ----------
    stop_ids : str or list-like of str
        A single stop id or a collection of stop ids to keep.
    dataframe : pandas.DataFrame
        Must contain ``stop_id`` and ``day_of_week`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per ``day_of_week`` holding the mean of each numeric column over
        the rows whose ``stop_id`` matched. Non-numeric columns (including
        ``stop_id``) are dropped by the aggregation.
    """
    if isinstance(stop_ids, str):
        # Series.isin() rejects a bare string, so wrap a single id in a list
        stop_ids = [stop_ids]

    matching_rows = dataframe[dataframe.stop_id.isin(stop_ids)]
    return matching_rows.groupby(['day_of_week']).mean(numeric_only=True).reset_index()

# Smoke-test both helpers on a short 2018 window for two stations
test_df = avg_per_day_between(start='2018-01-15', end='2018-02-01', gse_df=gse2018)
filter_and_avg_by_stop_ids(stop_ids=['place-aport', 'place-wimnl'], dataframe=test_df)

# Helper function: look up a stop's name in stops_df by its stop_id
def stop_name_from_stop_id(stop_id):
    """Return the ``stop_name`` of the first ``stops_df`` row whose ``stop_id`` matches.

    Raises IndexError when no row matches.
    """
    is_match = stops_df['stop_id'] == stop_id
    matching_names = stops_df.loc[is_match, 'stop_name']
    return matching_names.values[0]

In [21]:
# Function to create visualization for a speed restriction.
# Plots the average GSE for 2018 and 2023 per weekday during the same timeframe of the speed restriction,
# along with the COVID average per weekday

def _split_stop_ids(raw_stop_ids):
    """Return the sorted, de-duplicated stop ids found in an iterable of raw cell values.

    Each value may hold a single stop id or several joined with '|'
    (for example 'place-a | place-b'). Non-string values such as NaN and
    empty fragments are skipped.
    """
    unique_ids = set()
    for raw in raw_stop_ids:
        if not isinstance(raw, str):
            # NaN / None: this restriction row carries no stop id
            continue
        unique_ids.update(part.strip() for part in raw.split('|') if part.strip())
    # Sorted so the chart title is deterministic from run to run
    return sorted(unique_ids)


def speed_restr_vis(restriction_id):
    """Build an Altair chart of weekday gated-station entries for one speed restriction.

    Three series are drawn for the station(s) attached to the restriction:
    the 2018 average and the 2023 average over the restriction's month/day
    window, and the COVID-window average (``gse_COVID_avg``). Each series is
    shown in three side-by-side panels (AM rush, PM rush, full day) that share
    one y-axis range.

    Parameters
    ----------
    restriction_id : int or str
        Value matched (after ``int()``) against ``speed_restr.ID``.

    Returns
    -------
    A concatenated Altair chart.

    Raises
    ------
    IndexError
        If no row of ``speed_restr`` has this ID.
    ValueError
        If the matching rows contain no usable stop id.
    """
    restr_rows = speed_restr[speed_restr.ID == int(restriction_id)]
    restr_start = restr_rows.start_date.iloc[0]
    restr_end = restr_rows.end_date.iloc[0]

    # Always split every row's value. The previous single-row branch called
    # list() on the string, which exploded one stop id into single characters
    # and never split a 'a | b' value.
    restr_stop_id_list = _split_stop_ids(restr_rows.Loc_GTFS_Stop_ID)
    if not restr_stop_id_list:
        raise ValueError(f'Speed restriction {restriction_id} has no GTFS stop id to plot')

    # Get averages per day between the restriction start and end dates for their respective years
    # NOTE(review): replace(year=...) assumes the restriction starts and ends in the same
    # calendar year and never touches Feb 29 -- confirm against the data.
    peak_data = avg_per_day_between(restr_start.replace(year=2018), restr_end.replace(year=2018), gse2018)
    speed_restr_data = avg_per_day_between(restr_start.replace(year=2023), restr_end.replace(year=2023), gse2023)

    # Filter and average by stop_id(s)
    peak_data_filtered = filter_and_avg_by_stop_ids(restr_stop_id_list, peak_data)
    speed_restr_data_filtered = filter_and_avg_by_stop_ids(restr_stop_id_list, speed_restr_data)
    covid_data_filtered = filter_and_avg_by_stop_ids(restr_stop_id_list, gse_COVID_avg)

    # Combine into one dataframe for altair to plot
    # Add column for name of dataset
    peak_data_filtered['dataset'] = '2018 Average'
    speed_restr_data_filtered['dataset'] = 'Average During Restriction'
    covid_data_filtered['dataset'] = 'Average During COVID Lockdown'

    # Concat dataframes
    plot_df = pd.concat([peak_data_filtered, speed_restr_data_filtered, covid_data_filtered])
    # pandas weekdays are Monday=0..Sunday=6; the axis label formatter
    # (dayAbbrevFormat) expects Sunday=0..Saturday=6, so rotate by one day.
    plot_df['day_of_week'] = (plot_df['day_of_week'].astype(int) + 1) % 7

    # Format title for plots
    stop_names = [stop_name_from_stop_id(stop_id) for stop_id in restr_stop_id_list]
    formatted_title = f'Average Gated Station Entries for {", ".join(stop_names)} between {restr_start.month}/{restr_start.day} and {restr_end.month}/{restr_end.day}'

    # Shared y-axis ceiling: largest full-day average plus headroom,
    # rounded down to a multiple of 100
    max_ge_full_day = np.max(plot_df.full_day_avg)
    max_ge_full_day += 500
    max_ge_full_day -= max_ge_full_day % 100

    # Plot the three datasets
    chart = alt.Chart(plot_df).mark_line().encode(
        x=alt.X('day_of_week', axis=alt.Axis(labelExpr='dayAbbrevFormat(datum.value)', title='Day of the Week')),
    )
    layered = alt.vconcat(
        alt.hconcat(
            chart.mark_line().encode(
                y=alt.Y('am_rush_avg:Q', title='AM Rush-hr Avg Gated Entries', scale=alt.Scale(domain=[0, max_ge_full_day])),
                color='dataset:N',
            ),
            chart.mark_line().encode(
                y=alt.Y('pm_rush_avg:Q', title='PM Rush-hr Avg Gated Entries', scale=alt.Scale(domain=[0, max_ge_full_day])),
                color=alt.Color('dataset:N', title='Dataset Timeframe'),
            ),
            chart.mark_line().encode(
                y=alt.Y('full_day_avg:Q', title='Full Day Avg Gated Entries', scale=alt.Scale(domain=[0, max_ge_full_day])),
                color='dataset:N',
            ),
        ),
    ).properties(
        title=formatted_title
    ).configure_title(
        align='center',
        anchor='middle'
    )
    return layered

# Render the comparison chart for restriction ID 45
speed_restr_vis(restriction_id='45')