# Data Visualization Notebook
This notebook contains the source code for each of our three Python-generated visualizations.

In [1]:
import altair as alt
import geopandas as gp
import pandas as pd
import folium as fol
import matplotlib.pyplot as plt
import folium.plugins as plug
from folium.plugins import MarkerCluster
from shapely.geometry import Point, LineString, MultiLineString
from shapely import Geometry
import numpy as np
import pyproj as prj

### Gated Station Entries by Year by Time Period

In [2]:
# read the pre-formatted entries table
entries_by_time = pd.read_csv('entries_by_time.csv')

# the same station names serve as both the dropdown's options and its labels
station_names = list(entries_by_time['station_name'].unique())
input_dropdown = alt.binding_select(
    options=station_names,
    labels=station_names,
    name="Station: "
)

# point selection on station_name, driven by the dropdown and initialised to 'All'
station_select = alt.selection_point(fields=['station_name'], bind=input_dropdown, value='All')

# large, centered chart title
title = alt.TitleParams('Gated Station Entries by Year by Time Period', anchor='middle', fontSize=20)

# everything the line and the scatter layers have in common:
# encodings, the dropdown selection, the selection filter and the plot size
base = alt.Chart(entries_by_time, title=title).encode(
    x=alt.X('year:N', title='Year', axis=alt.Axis(labelAngle=0)),     # year on the x-axis
    y=alt.Y('value:Q', title='Average Daily Entries'),                # average entries on the y-axis
    color=alt.Color('period:N', title='Time Period')                  # one colour per time period
).add_params(
    station_select
).transform_filter(
    station_select
).properties(
    width=700,
    height=350
)

# the two layers differ only in their mark
line = base.mark_line()
scatter = base.mark_circle()

# layer the line chart and the scatter plot into one plot
gse_by_year = alt.layer(line, scatter)

# write the layered chart to gse_by_year.html
gse_by_year.save('gse_by_year.html')

### Individual Speed Restrictions Line Plots

In [3]:
# load in preprocessed data
plot_df = pd.read_csv('speed_restriction_549463.csv')

# y-axis ceiling shared by all three panels:
# (largest full-day average + 500), rounded down to a multiple of 100
max_ge_full_day = (np.max(plot_df.full_day_avg) + 500) - (np.max(plot_df.full_day_avg) + 500) % 100

# creates title for the plot
title = alt.TitleParams('Average Gated Station Entries for Copley, Boylston, Arlington between 6/14 and 12/5', anchor='middle', fontSize=16)

# base chart holding the x encoding that the three panels share
chart = alt.Chart(plot_df).mark_line().encode(
    x=alt.X('day_of_week', axis=alt.Axis(labelExpr='dayAbbrevFormat(datum.value)', title='Day of the Week')),
)


def _entries_panel(column, panel_title, color):
    """Build one 190x190 line panel on top of the shared base ``chart``.

    Args:
        column: Altair shorthand for the y field, e.g. ``'am_rush_avg:Q'``.
        panel_title: Title shown above this panel.
        color: Colour encoding (shorthand string or ``alt.Color``).

    Returns:
        The base chart with the y and colour encodings added; the y-axis is
        fixed to ``[0, max_ge_full_day]`` so all panels are comparable.
    """
    return chart.mark_line().encode(
        y=alt.Y(column,
                title='Avg. Station Entries',
                scale=alt.Scale(domain=[0, max_ge_full_day])),
        color=color
    ).properties(
        title=panel_title,
        width=190,
        height=190
    )


# creates three line charts and concatenates them together.
# The composed chart is bound to a name so that it (and not the bare base
# `chart`, which has only an x encoding) is what gets written to disk.
speed_restr_chart = alt.vconcat(
    alt.hconcat(
        # AM rush hour data
        _entries_panel('am_rush_avg:Q', 'AM Rush', 'dataset:N'),
        # PM rush hour data (this panel carries the legend title)
        _entries_panel('pm_rush_avg:Q', 'PM Rush', alt.Color('dataset:N', title='Dataset Timeframe')),
        # full day data
        _entries_panel('full_day_avg:Q', 'Full Day', 'dataset:N'),
    ),
).properties(
    title=title     # adds title to plot
).configure_title(
    align='center',
    anchor='middle'
)

# saves the three-panel chart as speed_restr_line_plots.html
speed_restr_chart.save('speed_restr_line_plots.html')

### Interactive Speed Restrictions Map

#### Data Processing for Geo Data

In [8]:
def to_lat_long(x_in, y_in):
    """Run a coordinate pair through the module-level ``transformer``.

    Args:
        x_in: First coordinate passed to ``transformer.transform``.
        y_in: Second coordinate passed to ``transformer.transform``.

    Returns:
        Whatever ``transformer.transform(x_in, y_in)`` returns. The marker
        loop in this notebook unpacks it as ``(longitude, latitude)``.
    """
    converted = transformer.transform(x_in, y_in)
    return converted

def find_line_from_points(point1, point2):
    """Pick the ``arc_gdf`` geometry closest to both points combined.

    Args:
        point1: First geometry to measure against.
        point2: Second geometry to measure against.

    Returns:
        The ``geometry`` of the ``arc_gdf`` row whose summed distance to
        ``point1`` and ``point2`` is smallest.
    """
    def combined_distance(line):
        return line.distance(point1) + line.distance(point2)

    distance_totals = arc_gdf.geometry.apply(combined_distance)
    # argmin is positional, hence the iloc lookup (arc_gdf has had rows dropped)
    nearest_pos = np.argmin(distance_totals)
    nearest_row = arc_gdf.iloc[nearest_pos]
    return nearest_row['geometry']

def get_line_location_from_stop_id(stop_id:str):
    """Return the point midway along the track between two stations.

    Args:
        stop_id: Two stop IDs joined by ``' | '``. Any other number of parts
            raises ``ValueError`` when the split result is unpacked.

    Returns:
        The point interpolated on the line chosen by ``find_line_from_points``
        at the mean of the two stations' distances along that line.
    """
    station_1_id, station_2_id = stop_id.split(' | ')
    station_1_point = get_location_from_stop_id(station_1_id)
    station_2_point = get_location_from_stop_id(station_2_id)

    line = find_line_from_points(station_1_point, station_2_point)
    station_1_dist = line.line_locate_point(station_1_point)
    station_2_dist = line.line_locate_point(station_2_point)

    # Midpoint between the stations = average of their along-line distances.
    # (The previous `smaller + (smaller - larger)` landed *before* the first
    # station and could go negative, which shapely measures from the line's end.)
    mid_dist = (station_1_dist + station_2_dist) / 2

    return line.line_interpolate_point(mid_dist)

def get_location_from_stop_id(stop_id):
    """Look up the geometry for a stop ID, or for a pair of stop IDs.

    An ID containing ``'|'`` is handed to ``get_line_location_from_stop_id``.
    Otherwise the stop's name is matched (lower-cased) against
    ``node_gdf['STATION_lower']``. If the name is not found as-is, exactly one
    adjustment is tried: drop apostrophes, else strip the last three
    characters of a name containing ``'Ave'``, else map
    ``'Northeastern University'`` to ``'northeastern'``.

    Args:
        stop_id: A single stop ID, or several joined with ``'|'``.

    Returns:
        The first matching ``node_gdf`` geometry.

    Raises:
        IndexError: If no station matches even after the adjustment.
    """
    if '|' in stop_id:
        return get_line_location_from_stop_id(stop_id)

    station = stop_name_from_stop_id(stop_id)
    known_stations = set(node_gdf['STATION_lower'].array)
    if station.lower() not in known_stations:
        if "'" in station:
            station = station.replace("'", '')
        elif 'Ave' in station:
            station = station[: -3]
        elif station == 'Northeastern University':
            station = 'northeastern'

    is_match = node_gdf['STATION_lower'] == station.lower()
    matching_geometries = node_gdf.geometry[is_match]
    return matching_geometries.values[0]

def stop_name_from_stop_id(stop_id):
    """Return the ``stop_name`` of the first ``stops_df`` row with this ``stop_id``.

    Raises:
        IndexError: If no row in ``stops_df`` has the given ``stop_id``.
    """
    matching_names = stops_df.loc[stops_df['stop_id'] == stop_id, 'stop_name']
    return matching_names.values[0]

def format_speed(str):
    """Normalise a speed label to a zero-padded ``'NN mph'`` string.

    The last three characters of the input are dropped and the remainder is
    parsed as an integer, so ``'5 mph'`` becomes ``'05 mph'``.

    Args:
        str: Speed text whose final three characters are not part of the
            number (presumably a unit suffix such as ``mph`` - confirm
            against the CSV).

    Returns:
        The speed padded to at least two digits, followed by ``' mph'``.
    """
    mph = int(str[:-3])
    return '{:02} mph'.format(mph)

# Rail line geometries
arc_gdf = gp.read_file('geodata/MBTA_ARC.shp')

# Remove Silver Line
arc_gdf.drop(arc_gdf[arc_gdf.LINE == 'SILVER'].index, inplace=True)

# Station nodes, also without the Silver Line
node_gdf = gp.read_file('geodata/MBTA_NODE.shp')
node_gdf.drop(node_gdf[node_gdf.LINE == 'SILVER'].index, inplace=True)
# Lower-cased station names, used for case-insensitive matching in get_location_from_stop_id
node_gdf['STATION_lower'] = node_gdf.STATION.apply(str.lower)

stop_names = set(node_gdf.STATION.array)

# Build a transformer from the shapefile's own projection to lat/long.
# The .prj file is read inside a `with` block so the handle is closed promptly.
with open('geodata/MBTA_NODE.prj') as prj_file:
    input_prj = prj.Proj(prj_file.read())
transformer = prj.Transformer.from_proj(input_prj, input_prj.to_latlong())

# Stop table used to map stop IDs to stop names
stops_df = pd.read_csv('stops.txt')

# Speed restrictions, with the speed column normalised to 'NN mph'
sr_df = pd.read_csv('speed_restrictions.csv')
sr_df.Restriction_Speed_MPH = sr_df.Restriction_Speed_MPH.apply(format_speed)

#### Map Creation

In [None]:
# Start the interactive map from the rail lines, using each row's LINE value as its colour.
line_colors = arc_gdf['LINE']
base_map = arc_gdf.explore(color=line_colors, tooltip='LINE', popup=['LINE', 'ROUTE'])

# Draw the stations in black onto that same map (passed in through `m`).
station_popup_fields = ['STATION', 'LINE', 'ROUTE']
e = node_gdf.explore(m=base_map, color='black', tooltip='STATION', popup=station_popup_fields)

In [20]:
speed_restriction_markers = {}

# Feature group that owns every per-stop marker cluster
cluster_feature_group = fol.FeatureGroup('restriction_clusters', overlay=True)

# Options applied to each cluster ('singleMarkerMode' is intentionally left off)
cluster_options = {
    'showCoverageOnHover' : False,
    'maxClusterRadius' : 160,
}

# One MarkerCluster per distinct stop ID, each attached to the feature group
cluster_dict = {
    stop_id: MarkerCluster(options=cluster_options).add_to(cluster_feature_group)
    for stop_id in set(sr_df.Loc_GTFS_Stop_ID.array)
}

# One marker per speed restriction row.
# NOTE: this cell is not idempotent - re-running it adds the markers to
# base_map a second time; rebuild base_map first when re-running.
for _, restriction in sr_df.iterrows():
    # Attributes for tags (row fetched once instead of once per field)
    speed = restriction['Restriction_Speed_MPH']
    distance = restriction['Restriction_Distance_Feet']
    pct = restriction['Line_Restricted_Track_Pct']
    length = restriction['Restriction_Length_Days']
    start_date = restriction['start_date']
    end_date = restriction['end_date']
    loc_type = restriction['Location_Type']
    line = str(restriction['Line'])
    stop_id = restriction['Loc_GTFS_Stop_ID']

    # Tooltip text: a single station name, or "Between A and B" for a segment
    if ' | ' in stop_id:
        stations = [stop_name_from_stop_id(station_id) for station_id in stop_id.split(' | ')]
        name = f'Between {stations[0]} and {stations[1]}'
    else:
        name = stop_name_from_stop_id(stop_id)

    # Determine Location for marker (projected coordinates -> long/lat)
    location_point = get_location_from_stop_id(stop_id)
    location = location_point.coords.xy[0][0], location_point.coords.xy[1][0]
    converted_long, converted_lat = to_lat_long(location[0], location[1])

    # Popup content is rendered as HTML, where '\n' is not a line break,
    # so the fields are separated with <br> to put each on its own line.
    popup_html = '<br>'.join([
        f'Restriction Speed: {speed}',
        f'Restriction Distance: {distance} ft',
        f'Percent of Line Restricted: {pct : 2.2%}',
        f'Restriction Length: {length} ({start_date} to {end_date})',
    ])

    # Create Marker; `tags` are what the TagFilterButton filters on
    marker = fol.Marker(
        location=[converted_lat, converted_long],
        popup=fol.Popup(popup_html),
        tags=[speed, loc_type],
        tooltip=name,
        icon=fol.Icon(
            color=line.split()[0].lower(),   # first word of the line name, lower-cased
            icon='triangle-exclamation',
            prefix='fa'
            )
    )

    # NOTE(review): the per-stop cluster is looked up but the marker is added
    # straight to base_map, so the clusters stay empty. Presumably deliberate
    # (tag filtering may not work on clustered markers) - confirm before
    # switching this to marker.add_to(cluster).
    cluster = cluster_dict[stop_id]
    marker.add_to(base_map)

In [21]:
# Distinct speed labels and location types; together they form the filter tags.
speeds = list(set(sr_df.Restriction_Speed_MPH.values))
loc_types = list(set(sr_df.Location_Type.values))

# Sorted so the filter menu lists the tags in a stable order.
filters = sorted([*speeds, *loc_types])

In [22]:
# Attach the cluster feature group to the map.
cluster_feature_group.add_to(base_map)

# Fullscreen toggle in the bottom-right corner.
fullscreen = plug.Fullscreen(position='bottomright')
fullscreen.add_to(base_map)

# Tag filter button offering every speed / location-type tag.
filter = plug.TagFilterButton(filters, clear_text='Clear Filters')
filter.add_to(base_map)

In [23]:
# Bare last expression: the notebook renders the finished map as this cell's output.
base_map