In [262]:
# imports a library 'pandas', names it as 'pd'
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime as dt
import datetime
from bisect import bisect
import pickle
import csv
filename = '/home/williamcottrell72/github/sf18_ds11/class_lectures/week01-benson/02-git_viz/turnstiles.pkl'
from IPython.display import Image

# enables inline plots, without it plots don't show up in the notebook
%matplotlib inline


# fix_time() is for formatting the week_nums appropriately.  

def fix_time(num):
    if len(str(num)) == 2:
        return str(num)
    else:
        return '0'+str(num)
    
# User inputs a month number (6=June, etc) and the number of years back.  The get_weeks_nums()
#function will return a list of week numbers formatted in such a way that they can be input
#into the mta url for web-scraping.

def get_week_nums(month,yrs_back):
    week_list=[]
    ref_date=datetime.date(2018,6,30)
    weeks_back=yrs_back*52
    for i in range(weeks_back):
        week_shift=datetime.timedelta(-7*i)
        new=ref_date+week_shift
        yr=str(new.year)[-2:]
        mt=fix_time(new.month)
        day=fix_time(new.day)
        string=yr+mt+day
        if int(mt)==month:
            week_list.append(int(string))
    return week_list

# scrape_pkl scrapes the data, returns a dataframe, and saves the result in a file.

def scrape_pkl(week_nums,filename):
    try:
        with open(filename,'rb') as pklfile:
            df = pickle.load(pklfile)
    except:

        df = scrape(week_nums)

        with open(filename,'wb') as pklfile:
            df = pickle.dump(df, pklfile)
    return df

#
#def fn(row):
#    return (row['datetime'].second==0)&(row['datetime'].minute==0)

"""
Cleaning proceeds as follows:  
1) Strip whitespace in column names 
2) add datetime column,
3) drop New Jersey (PTH), drop duplicates, 
4) group by station and datetime, take the sum over exits.
"""

def clean_df(df):
    cols={x:x.strip() for x in df.columns}
    df_small=df.rename(columns=cols)
    df_small['datetime']=pd.to_datetime(df_small['DATE']+' '+df_small['TIME'],infer_datetime_format=True)
    df_small_clean=df_small[df_small.DIVISION!='PTH'].drop_duplicates(subset=['C/A','UNIT','SCP','STATION','LINENAME','datetime'])
    df_small_clean2=df_small_clean.groupby(['STATION','datetime'],as_index=False)[['EXITS']].sum()
    return df_small_clean2

"""
station_activity(df,station) returns a dataframe for the specified station.  Columns are 
sorted in order of datetime.
"""

def station_activity(df,station):
    df_station=df[df['STATION']==station]
    df_sort=df_station.sort_values(by=['datetime'])
    return df_sort

"""
Next we make a dictionary whose keys are stations and values are a list of number of people
exiting per four hour period.
"""

def construct_dct(df):
    stations=df['STATION'].unique()
    station_diffs_dct={}
    for st in stations:
        sa=station_activity(df,st)
        sa['diffs']=pd.DataFrame(sa['EXITS'].diff())
        st_diffs=sa.drop(['EXITS'],axis=1).dropna()
        st_diffs_clean=st_diffs[np.abs(st_diffs.diffs)<30*10**3]    
        st_diffs_clean['weekday']=st_diffs_clean['datetime'].dt.weekday
        st_diffs_clean['hour']=st_diffs_clean['datetime'].dt.hour
        st_diffs_clean2=st_diffs_clean.groupby(['weekday','hour'])['diffs'].mean()
        station_diffs_dct[st]=st_diffs_clean2
    return station_diffs_dct

"""
Since different stations are on different schedules we need a helper function, 
find_key(), which allows us to find the appropriate time-interval to associate 
with a user-entered hour.
"""



def find_key(num,hours_list):
    sort=sorted(hours_list)
    pos=bisect(sort,num)
    if pos < len(sort):
        return sort[pos]
    else:
        return sort[0]
    
"""
Now, for each day of the week and hour, we construct a list of stations in descending 
order of the number of people leaving.
"""    
    
    
def activity_by_time(day,hour,dct):
    stations=list(set(dct.keys()))
    exits=[]
    for st in stations:
        try:
            """Below we are assuming that the hour list appearing is consistent for
            a given station. (I take the first element of the list as being represent-
            ative.) Should check this"""
            hours_st=dct[st][0].keys().values
            sh=sorted(hours_st)
            h_key=find_key(hour,sh)
            leaving=dct[st][day][h_key]/4
            """
            Below I apply np.abs to account for the possibility that the turnstile was
            reversed for some period of time.
            """             
            exits.append([st,np.abs(leaving)])
        except(KeyError,IndexError,AttributeError):
            pass
    sort_exits=sorted(exits,key=lambda x: x[1])[::-1]    
    return sort_exits


def main(month,day,hour,yrs_back=3):
    week_nums=get_week_nums(month,yrs_back)
    df=scrape_pkl(week_nums,filename)
    df_c=clean_df(df)
    dct=construct_dct(df_c)
    
    return (activity_by_time(day,hour,dct),dct,df)

In the example below we run 'main(6,4,5)' which gives us the activity for month 6 = June, day 4 = Friday, hour 5 = 5am.  The 'yrs_back' variable is set to 3 by default.  We also return the full dataframe from the last 3 years (df) and the cleaned dictionary with station data.

In [263]:
activity,dct,df = main(6,4,5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


So far we have constructed a function which shows the top stations for a given hour of the week.  We might also want to know the stations with lots of people exiting on average.  We can use the dct and df output from the previous function and take averages. We will focus on afternoon averages for specificity.

In [270]:
def make_big_df(dct):
    df_cum=pd.DataFrame(activity_by_time(0,12,dct)[:20])
    for i in range(1,12):
        for day in range(1,7):
            df2=pd.DataFrame(activity_by_time(day,12+i,dct)[:20])
            df_cum=pd.merge(df_cum,df2,on=0,how='outer')
    return df_cum

def clean_df_cum(dct):
    df_cum=make_big_df(dct)
    df_fill=df_cum.fillna(0)
    df_fill.insert(1,'sums',df_fill.sum(axis=1)/84)
    fin_result=df_fill[[0,'sums']].sort_values(['sums'],ascending=False).head(20)
    return fin_result.values

clean_df_cum(dct)

array([['TIMES SQ-42 ST', 3151.29935515873],
       ['14 ST-UNION SQ', 2986.3298611111113],
       ['34 ST-HERALD SQ', 2375.5850694444443],
       ['FLUSHING-MAIN', 2036.685267857143],
       ['ATL AV-BARCLAY', 1687.7100694444446],
       ['JKSN HT-ROOSVLT', 1686.421875],
       ['59 ST COLUMBUS', 1648.171875],
       ['BEDFORD AV', 1446.298859126984],
       ['59 ST', 1341.6450892857142],
       ['W 4 ST-WASH SQ', 1333.3482142857142],
       ['50 ST', 1105.7150297619048],
       ['145 ST', 1080.6830357142858],
       ['JAMAICA CENTER', 1060.3177083333333],
       ['7 AV', 862.4196428571429],
       ["B'WAY-LAFAYETTE", 745.077380952381],
       ['CHAMBERS ST', 720.2983630952381],
       ['KEW GARDENS', 716.6532738095239],
       ['8 AV', 685.797619047619],
       ['42 ST-BRYANT PK', 634.0885416666666],
       ['JUNCTION BLVD', 590.2388392857143]], dtype=object)

In [267]:
activity_by_time(5,18,dct)[:10]

[['TIMES SQ-42 ST', 4581.5625],
 ['34 ST-HERALD SQ', 4291.375],
 ['14 ST-UNION SQ', 3825.25],
 ['FLUSHING-MAIN', 2370.4375],
 ['59 ST COLUMBUS', 2202.1875],
 ['ATL AV-BARCLAY', 2042.875],
 ['W 4 ST-WASH SQ', 2026.4375],
 ['BEDFORD AV', 1990.0],
 ['JKSN HT-ROOSVLT', 1879.125],
 ['59 ST', 1597.9375]]