In [1]:
# imports a library 'pandas', names it as 'pd'
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime as dt
import datetime
from bisect import bisect
import pickle
import csv
#filename = '/home/williamcottrell72/github/sf18_ds11/class_lectures/week01-benson/02-git_viz/turnstiles_v2.pkl'
from IPython.display import Image

In [2]:
def fix_time(num):
    if len(str(num)) == 2:
        return str(num)
    else:
        return '0'+str(num)

In [3]:
def get_week_nums(month,yrs_back):
    week_list=[]
    ref_date=datetime.date(2018,6,30)
    weeks_back=yrs_back*52
    for i in range(weeks_back):
        week_shift=datetime.timedelta(-7*i)
        new=ref_date+week_shift
        yr=str(new.year)[-2:]
        mt=fix_time(new.month)
        day=fix_time(new.day)
        string=yr+mt+day
        if (int(mt)==month):
            week_list.append(int(string))
    return week_list

In [4]:
def scrape(week_nums):
    url = "http://web.mta.info/developers/data/nyct/turnstile/turnstile_{}.txt"
    dfs = []
    for week_num in week_nums:
        file_url = url.format(week_num)
        dfs.append(pd.read_csv(file_url))
    return pd.concat(dfs)

In [5]:
def scrape_pkl(week_nums,filename):
    try:
        with open(filename,'rb') as pklfile:
            df = pickle.load(pklfile)
    except:
        df = scrape(week_nums)

        with open(filename,'wb') as pklfile:
            df = pickle.dump(df, pklfile)
    return df

In [6]:
def clean_df(df):
    cols={x:x.strip() for x in df.columns}
    df_small=df.rename(columns=cols)
    df_small['datetime']=pd.to_datetime(df_small['DATE']+' '+df_small['TIME'],infer_datetime_format=True)
    df_small_clean=df_small[df_small.DIVISION!='PTH'].drop_duplicates(subset=['C/A','UNIT','SCP','STATION','LINENAME','datetime'])
    df_small_clean2=df_small_clean.groupby(['STATION','datetime'],as_index=False)[['EXITS']].sum()
    return df_small_clean2

In [7]:
def station_activity(df,station):
    df_station=df[df['STATION']==station]
    df_sort=df_station.sort_values(by=['datetime'])
    return df_sort

In [8]:
def construct_dct(df,mt):
    stations=df['STATION'].unique()
    station_diffs_dct={}
    for st in stations:
        sa=station_activity(df,st)
        sa['diffs']=sa['EXITS'].diff()
        #sa2=month_filter(sa,mt)
        st_diffs=sa.drop(['EXITS'],axis=1).dropna()
        st_diffs_clean=st_diffs[np.abs(st_diffs.diffs)<10*10**4]        
        st_diffs_clean['weekday']=st_diffs_clean['datetime'].dt.weekday
        st_diffs_clean['hour']=st_diffs_clean['datetime'].dt.hour
        st_diffs_clean2=st_diffs_clean.groupby(['weekday','hour'])['diffs'].mean()
        station_diffs_dct[st]=st_diffs_clean2
    return station_diffs_dct

In [9]:
def find_key(num,hours_list):
    sort=sorted(hours_list)
    pos=bisect(sort,num)
    if pos < len(sort):
        return sort[pos]
    else:
        return sort[0]

In [10]:
def activity_by_time(day,hour,dct):
    stations=list(set(dct.keys()))
    exits=[]
    for st in stations:
        try:
            """Below we are assuming that the hour list appearing is consistent for
            a given station. (I take the first element of the list as being represent-
            ative.) Should check this"""
            hours_st=dct[st][0].keys().values
            sh=sorted(hours_st)
            h_key=find_key(hour,sh)
            leaving=dct[st][day][h_key]/4
            """
            Below I apply np.abs to account for the possibility that the turnstile was
            reversed for some period of time.
            """             
            exits.append([st,np.abs(leaving)])
        except(KeyError,IndexError,AttributeError):
            pass
    sort_exits=sorted(exits,key=lambda x: x[1])[::-1]    
    return sort_exits

In [11]:
    week_nums=get_week_nums(7,3)

In [12]:
df=scrape(week_nums)

In [13]:
df_c=clean_df(df)

In [14]:
dct=construct_dct(df_c,7)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [15]:
activity_by_time(4,3,dct)

[['VERNON-JACKSON', 1174.95],
 ['ROOSEVELT AVE', 676.5833333333334],
 ['JKSN HT-ROOSVLT', 562.325],
 ['231 ST', 469.2307692307692],
 ['145 ST', 429.375],
 ['JAMAICA CENTER', 376.5],
 ['MAIN ST', 355.1666666666667],
 ['FLUSHING-MAIN', 349.0],
 ['42 ST-TIMES SQ', 347.8333333333333],
 ['TIMES SQ-42 ST', 339.975],
 ['103 ST-CORONA', 296.9230769230769],
 ['BEDFORD AVE', 290.8333333333333],
 ['JUNCTION BLVD', 269.7692307692308],
 ['34 ST-HERALD SQ', 268.71153846153845],
 ['MYRTLE-WYCKOFF', 247.1],
 ['CROWN HTS-UTICA', 229.57692307692307],
 ['BARCLAYS CENTER', 228.75],
 ['BEDFORD AV', 223.95],
 ['82 ST-JACKSON H', 214.41666666666666],
 ['DITMARS BL-31 S', 211.33333333333334],
 ['167 ST', 200.67307692307693],
 ['59 ST', 199.78846153846155],
 ['ASTORIA DITMARS', 195.775],
 ['SUTPHIN-ARCHER', 192.675],
 ['NOSTRAND AV', 190.83333333333334],
 ['NOSTRAND AVE', 186.66666666666666],
 ['1 AVE', 186.41666666666666],
 ['59 ST-COLUMBUS', 183.33333333333334],
 ['168 ST-BROADWAY', 176.0],
 ['ATL AV-BARCLAY