In [303]:
# imports a library 'pandas', names it as 'pd'
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime as dt
import datetime
from bisect import bisect

from IPython.display import Image

# enables inline plots, without it plots don't show up in the notebook
%matplotlib inline

We want to write a small function which returns the list of weeks in an appropriate format.  The function will take 'month' and 'yrs_back' as an input and then return a list of weeks going 'yrs_back' number of years back.  For instance, get_weeks_nums(3,2) gives all the weeks in march for two years back, starting from the most recent dataset.

In [304]:
def fix_time(num):
    if len(str(num)) == 2:
        return str(num)
    else:
        return '0'+str(num)

def get_week_nums(month,yrs_back):
    week_list=[]
    ref_date=datetime.date(2018,6,30)
    weeks_back=yrs_back*52
    for i in range(weeks_back):
        week_shift=datetime.timedelta(-7*i)
        new=ref_date+week_shift
        yr=str(new.year)[-2:]
        mt=fix_time(new.month)
        day=fix_time(new.day)
        string=yr+mt+day
        if int(mt)==month:
            week_list.append(int(string))
    return week_list

In [305]:
def scrape(week_nums):
    url = "http://web.mta.info/developers/data/nyct/turnstile/turnstile_{}.txt"
    dfs = []
    for week_num in week_nums:
        file_url = url.format(week_num)
        dfs.append(pd.read_csv(file_url))
    return pd.concat(dfs)

#I am going to focus on June for the last 3 years.
week_nums = get_week_nums(6,3)
df = scrape(week_nums)

In [2]:
#df = pd.read_csv('http://web.mta.info/developers/data/nyct/turnstile/turnstile_180630.txt')

In [306]:
cols={x:x.strip() for x in df.columns}
df_ren=df.rename(columns=cols)
#df_small=df_ren[:1000]
df_small=df_ren

Next we want to extract a time stampe from a given row in the dataframe.  The following is an example of how to do this for a particular row.  In the next frame we do this for each row.

In [52]:
pd.to_datetime(df_small.iloc[1][6]+' '+df_small.iloc[1][7],infer_datetime_format=True)

Timestamp('2018-06-23 04:00:00')

In [307]:
def get_datetime(x):
    return pd.to_datetime(x[6]+' '+x[7],infer_datetime_format=True)

In [308]:
df_small['datetime']=df_small.apply(get_datetime,axis=1)
#df_small.iloc[:100000].apply(get_datetime,axis=1)

Next we do some cleaning.  The function 'fn' allows us to get rid of elements which don't have zero secons.  These appear to be spurious.

In [309]:
def fn(row):
    return row['datetime'].second==0

df_small_clean=df_small[df_small.apply(fn,axis=1)].drop_duplicates(subset=['C/A','UNIT','SCP','STATION','LINENAME','datetime'])

At this point we should group by station and datetime **then** resample to make sure that we only have proper intervales of 4.

In [310]:
df_small_clean2=df_small_clean.groupby(['STATION','datetime'])[['EXITS']].sum()
df_small_clean3=df_small_clean2.reset_index()

Now we need to get diffs for each station, which means we need to find a list of stations, go by each one, sort by datetime, resample do a diff operation, etc.  The following takes in a dataframe like df_small_clean3 and a station and then gives back a dataframe for that particular station with the indices sorted by datetime.

In [311]:
def station_activity(df,station):
    df_station=df[df['STATION']==station]
    df_sort=df_station.sort_values(by=['datetime'])
    return df_sort

In [85]:
station_activity(df_small_clean3,'59 ST').head()

Unnamed: 0,STATION,datetime,EXITS
58360,59 ST,2016-05-28 00:00:00,955460529
58361,59 ST,2016-05-28 04:00:00,955461609
58362,59 ST,2016-05-28 08:00:00,955464213
58363,59 ST,2016-05-28 12:00:00,955471806
58364,59 ST,2016-05-28 16:00:00,955480958


OK, now for each station, apply a re-sample and then a diff map and drop the first element.  First, make a dataframe for each station, combine these in a list.

In writing the following definitions we are requring the seconds to be zero.  This is just a simple way to get rid of the wierd entries.

Note that in the next iteration we could instead do the following:  Resample both datasets hourly.  Simply take averages within each time window and then treat both datasets uniformally.  Still nead to drop wierd times.  

Next, go by station and for each station, apply the diff operation, obtaining a list of dataframes containing the diff values.

In [312]:
stations=df_small_clean3['STATION'].unique()
station_diffs_dct={}
for st in stations:
    sa=station_activity(df_small_clean3,st)
    sa['diffs']=pd.DataFrame(sa['EXITS'].diff())
    st_diffs=sa.drop(['EXITS'],axis=1).dropna()
    st_diffs_clean=st_diffs[np.abs(st_diffs.diffs)<10**6]
    station_diffs_dct[st]=st_diffs_clean

In [313]:
station_diffs_dct['59 ST'].head()

Unnamed: 0,STATION,datetime,diffs
51860,59 ST,2016-05-28 04:00:00,1080.0
51861,59 ST,2016-05-28 08:00:00,2604.0
51862,59 ST,2016-05-28 12:00:00,7593.0
51863,59 ST,2016-05-28 16:00:00,9152.0
51864,59 ST,2016-05-28 20:00:00,7698.0


Need to add a column for weekday and an hour, dropna, group by weekday and hour. Note that many of the cleaning steps below could be incorporated above when the dictionary is constructed.

In [314]:
dct_clean={}
for st in stations:
    st_df=station_diffs_dct[st].dropna()
    st_df_filt=st_df[np.abs(st_df.diffs)<20000]
    st_df_filt['weekday']=st_df_filt['datetime'].apply(lambda x: x.weekday())
    st_df_filt['hour']=st_df_filt['datetime'].apply(lambda x: x.hour)
    st_df_clean2=st_df_filt.groupby(['weekday','hour'])['diffs'].mean()
    dct_clean[st]=st_df_clean2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [280]:
station_diffs_dct['103 ST'].head()

Unnamed: 0,STATION,datetime,diffs
557,103 ST,2016-05-28 05:00:00,651.0
558,103 ST,2016-05-28 09:00:00,1091.0
559,103 ST,2016-05-28 13:00:00,2288.0
560,103 ST,2016-05-28 17:00:00,3323.0
561,103 ST,2016-05-28 21:00:00,3563.0


In [315]:
def find_key(num,hours_list):
    sort=sorted(hours_list)
    pos=bisect(sort,num)
    if pos < len(sort):
        return sort[pos]
    else:
        return sort[0]

In [316]:
def activity_by_time(day,hour):
    #We must convert a 24hour of the day to one of 0,4,8...
    exits=[]
    for st in stations:
        try:
            hours_st=[x.hour for x in station_diffs_dct[st]['datetime'][:6]]
            sh=sorted(hours_st)
            h_key=find_key(hour,sh)
            leaving=dct_clean[st][day][h_key]/4
            exits.append([st,leaving])
        except(KeyError,IndexError):
            pass
    sort_exits=sorted(exits,key=lambda x: x[1])[::-1]    
    return sort_exits

When inputting the time remember that Monday is zero, Tuesday is 1, etc.  The hour may be input as an integer and it internally converts to the appropriate time interval for the given station using the find_key function.

In [321]:
activity_by_time(5,10)[:10]

[['14 ST-UNION SQ', 4025.0416666666665],
 ['34 ST-HERALD SQ', 3583.6666666666665],
 ['TIMES SQ-42 ST', 2917.2916666666665],
 ['59 ST COLUMBUS', 2058.769230769231],
 ['ATL AV-BARCLAY', 2041.7692307692307],
 ['59 ST', 1827.9615384615386],
 ['W 4 ST-WASH SQ', 1719.6153846153845],
 ['FLUSHING-MAIN', 1680.1153846153845],
 ['72 ST', 1350.55],
 ['8 AV', 1300.326923076923]]

In [318]:
dct_clean['40 ST LOWERY ST'][3][20]

3251.3846153846152