Organize events occurring along a train line for MBTAdelay.
Estimate the hours during which each event takes place and fill a
dataframe to be added to the general MBTAdelay dataframe.

Currently only for Green Line D Branch/Fenway Park

(C) Mark Mace 2019

In [1]:
# BASIC IMPORT
import numpy as np
import os
import matplotlib.pyplot as plt
import csv
import pandas as pd

# FOR DATES AND TIMES #
import time
import datetime
from dateutil import tz
from datetime import timedelta
import arrow
from time import strptime

# GENERAL PURPOSE DATE-TIME FUNCTION
from date_time_functions import *

In [2]:
## SPECIFIC DATE-TIME FUNCTIONS
# e.g. "Apr 11 2016" -> "2016-04-11"
def convert_to_numdate(x):
    """Convert a date string such as ``"Apr 11 2016"`` to ``"2016-04-11"``.

    The input is read by fixed position: characters 0-2 are the abbreviated
    month name, characters 4-5 hold the day of month (a trailing space after
    a single-digit day is tolerated by ``int``), and the last four characters
    are the year.

    Parameters
    ----------
    x : str
        Date text laid out as described above.

    Returns
    -------
    str
        The date as ``YYYY-MM-DD`` with zero-padded month and day.
    """
    day_number = int(x[4:6])
    month_number = strptime(str(x[:3]), '%b').tm_mon
    year_text = x[-4:]
    return "{}-{:02d}-{:02d}".format(year_text, month_number, day_number)

# FILL IN DATE WITH ZEROS
def fixdate_fillzeros(x):
    """Convert an ``M/D/YYYY`` date string to zero-padded ``YYYY-MM-DD``.

    Parameters
    ----------
    x : str
        Date with exactly three ``/``-separated fields: month, day, year.

    Returns
    -------
    str
        The fields reordered as year-month-day, with month and day
        left-padded with zeros to two characters.
    """
    month, day, year = x.split("/")
    return "-".join((year, month.zfill(2), day.zfill(2)))

# CONVERT HH:MM TO HOURS
def to_hours(x):
    """Convert an ``H:MM`` duration string to a fractional number of hours.

    Parameters
    ----------
    x : str
        Duration with exactly one ``:`` separating hours from minutes.

    Returns
    -------
    float
        Hours plus minutes / 60.
    """
    hours_text, minutes_text = x.split(':')
    whole_hours = float(hours_text)
    fractional_hours = float(minutes_text) / 60.0
    return whole_hours + fractional_hours



In [3]:
# LOAD THE RAW RED SOX SCHEDULES, ONE CSV PER SEASON
rs_raw_2016 = pd.read_csv("EVENT_SCHED/raw_2016.csv")
rs_raw_2017 = pd.read_csv("EVENT_SCHED/raw_2017.csv")
rs_raw_2018 = pd.read_csv("EVENT_SCHED/raw_2018.csv")
rs_raw_2019 = pd.read_csv("EVENT_SCHED/raw_2019.csv")

# FOR EACH SEASON: KEEP ROWS WHOSE 'At' IS NOT "@" (HOME GAMES), DROP THE
# COLUMNS THAT ARE NOT NEEDED, AND TAG THE ROWS WITH THE SEASON YEAR.
# THE YEAR STRING CARRIES A LEADING SPACE BECAUSE IT IS LATER APPENDED
# DIRECTLY TO THE 'Date' TEXT.
rs_2016 = (
    rs_raw_2016.loc[rs_raw_2016['At'] != "@"]
    .drop(columns=['At', 'Bs', 'Gm'])
    .assign(year=' 2016')
)

# THE 2017 FILE HAS AN EXTRA 'OrigScheduled' COLUMN THAT IS ALSO DROPPED
rs_2017 = (
    rs_raw_2017.loc[rs_raw_2017['At'] != "@"]
    .drop(columns=['At', 'Bs', 'Gm', 'OrigScheduled'])
    .assign(year=' 2017')
)

rs_2018 = (
    rs_raw_2018.loc[rs_raw_2018['At'] != "@"]
    .drop(columns=['At', 'Bs', 'Gm'])
    .assign(year=' 2018')
)

rs_2019 = (
    rs_raw_2019.loc[rs_raw_2019['At'] != "@"]
    .drop(columns=['At', 'Bs', 'Gm'])
    .assign(year=' 2019')
)


In [4]:
# ESTIMATE EVENT HOURS WHEN EXACT START/END TIMES ARE NOT SUPPLIED

# LOCAL-TIME HOUR ESTIMATES
rs_d_start_hour = 12  # assumed start hour of a day game
rs_n_start_hour = 19  # assumed start hour of a night game
co_n_start_hour = 18  # assumed start hour of a concert
co_n_end_hour = 23    # assumed end hour of a concert


def add_time(df):
    """Return the clock hours that one event row is estimated to occupy.

    Parameters
    ----------
    df : mapping-like row
        Must provide ``'event'`` (1 = Red Sox game, 2 = concert). For events
        1 and 2 it must also provide ``'DN'`` (``'D'`` day / ``'N'`` night);
        for games it must provide ``'length'`` as an ``H:MM`` string.

    Returns
    -------
    numpy.ndarray or list
        For a game: every hour from the estimated start hour through
        ``ceil(start + 1 + length_in_hours)`` inclusive. For a night concert:
        every hour from ``co_n_start_hour`` through ``co_n_end_hour``
        inclusive. For anything else a message is printed and an empty list
        is returned.
    """
    event_kind = df['event']

    if event_kind == 1:  # REDSOX
        day_or_night = df['DN']
        if day_or_night == 'D':
            first_hour = rs_d_start_hour
        elif day_or_night == 'N':
            first_hour = rs_n_start_hour
        else:
            print("NO GAME!")
            return []  # NO GAME ?
        # One extra hour on top of the game length, rounded up to a whole hour.
        last_hour = np.ceil(first_hour + 1 + to_hours(df['length']))
        # +1 because arange excludes its stop value.
        return np.arange(first_hour, last_hour + 1)

    if event_kind == 2:  # CONCERT
        if df['DN'] == 'N':
            return np.arange(co_n_start_hour, co_n_end_hour + 1)
        print("NO DAY CONCERTS")
        return []  # NO DAY CONCERTS

    print("NOTHING")
    return []  # NOTHING
            

In [5]:
# STACK ALL SEASONS OF HOME GAMES INTO ONE DATAFRAME
all_rs = pd.concat([rs_2016, rs_2017, rs_2018, rs_2019])

# BUILD A NUMERIC 'date' COLUMN: APPEND THE YEAR TO THE 'Date' TEXT, THEN
# CONVERT THE RESULT WITH convert_to_numdate
full_date = all_rs['Date'].str.cat(all_rs['year'])
all_rs['date'] = full_date.apply(convert_to_numdate)

# THE SEPARATE 'Date' AND 'year' COLUMNS ARE NO LONGER NEEDED;
# 'Time' IS RENAMED TO 'length'
all_rs = all_rs.drop(columns=['Date', 'year']).rename(columns={'Time': 'length'})

# EVENT CODE 1 = BASEBALL GAME
all_rs['event'] = 1


In [6]:
# LOAD THE CONCERT LIST AND BRING IT INTO THE SAME FORMAT AS THE GAMES
concerts = pd.read_csv("EVENT_SCHED/concerts.csv")

# M/D/YYYY -> YYYY-MM-DD
concerts['date'] = concerts['date'].apply(fixdate_fillzeros)

# EVERY CONCERT IS MARKED AS A NIGHT EVENT ('N', MATCHING THE BASEBALL 'DN'
# COLUMN) AND GIVEN EVENT CODE 2
concerts = concerts.assign(DN='N', event=2)


In [7]:
# MERGE GAMES AND CONCERTS INTO ONE DATE-ORDERED TABLE
# (keep only the four shared columns, sort by date, renumber rows from 0)
all_events = (
    pd.concat([all_rs, concerts], sort=False)[['date', 'event', 'DN', 'length']]
    .sort_values(by=['date'])
    .reset_index(drop=True)
)

# PLACEHOLDER HOUR, OVERWRITTEN ONCE THE EVENT HOURS ARE COMPUTED
all_events['time'] = 0.0


In [28]:
# ADD TIME COMPONENT -- OVERWRITING 0
#
# Each event row is expanded into one row per estimated clock hour.

# DETERMINE HOURS OF EVENTS BASED ON ESTIMATED START AND END TIMES
# (one sequence of hours per event row, as returned by add_time)
time_addition = all_events.apply(add_time, axis=1)

# FLATTEN TO A SINGLE LIST OF HOURS
# Iterating the Series walks the rows in order, so this does not depend on
# the index labels being 0..n-1.
all_hours = [hour for hours in time_addition for hour in hours]

# ROW POSITION OF THE EVENT THAT EACH ENTRY OF all_hours BELONGS TO
# (event k is repeated once per hour it spans)
event_hour_map = [
    position
    for position, hours in enumerate(time_addition)
    for _ in hours
]

# COPY DATAFRAME REPEATING ROWS
# .copy() makes the expanded frame independent of all_events, so the column
# assignment below is not a write into a selection of another frame.
all_events_wt = all_events.iloc[event_hour_map].copy()

# CHECK LENGTHS ARE THE SAME
if len(all_hours) != len(all_events_wt):
    print("!! WARNING -- DIFFERENT TIME ASSIGNMENT LENGTHS !!")

# APPLY HOURS
all_events_wt['time'] = all_hours


In [30]:
# FINAL EVENT SCHEDULE WITH DATES AND TIMES
# Renumber the expanded rows from 0.
all_events_wt = all_events_wt.reset_index(drop=True)

# "period" is the local date and whole hour as text: "<date> <hour>:00".
# NOTE(review): add_time can emit hours of 24 or more for long night games,
# which yields strings such as "<date> 24:00" -- confirm that
# conv_east_to_unixts_hm accepts them.
hour_text = all_events_wt["time"].astype(int).map(str) + ":00"
all_events_wt["period"] = all_events_wt["date"].map(str) + " " + hour_text

# Convert each period string with conv_east_to_unixts_hm (imported from
# date_time_functions; by its name, Eastern time -> unix timestamp).
all_events_wt["u_datetime"] = all_events_wt["period"].apply(conv_east_to_unixts_hm)

# Write only the columns listed in `header`, in this order, without the index.
header = ["u_datetime", "event", "period", "DN"]
all_events_wt.to_csv('EVENT_SCHED/all_events.csv', columns=header, index=False)
