In [241]:
# imports a library 'pandas', names it as 'pd'
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime as dt
import datetime
from bisect import bisect
import pickle
import csv
filename = '/home/williamcottrell72/github/sf18_ds11/class_lectures/week01-benson/02-git_viz/turnstiles.pkl'
from IPython.display import Image

# enables inline plots, without it plots don't show up in the notebook
%matplotlib inline


def fix_time(num):
    if len(str(num)) == 2:
        return str(num)
    else:
        return '0'+str(num)

def get_week_nums(month,yrs_back):
    week_list=[]
    ref_date=datetime.date(2018,6,30)
    weeks_back=yrs_back*52
    for i in range(weeks_back):
        week_shift=datetime.timedelta(-7*i)
        new=ref_date+week_shift
        yr=str(new.year)[-2:]
        mt=fix_time(new.month)
        day=fix_time(new.day)
        string=yr+mt+day
        if int(mt)==month:
            week_list.append(int(string))
    return week_list

def scrape_pkl(week_nums,filename):
    try:
        with open(filename,'rb') as pklfile:
            df = pickle.load(pklfile)

    except:

        df = scrape(week_nums)

        with open(filename,'wb') as pklfile:
            df = pickle.dump(df, pklfile)
    return df



def fn(row):
    return (row['datetime'].second==0)&(row['datetime'].minute==0)

def clean_df(df):
    cols={x:x.strip() for x in df.columns}
    df_small=df.rename(columns=cols)
    df_small['datetime']=pd.to_datetime(df_small['DATE']+' '+df_small['TIME'],infer_datetime_format=True)
    df_small_clean=df_small[df_small.apply(fn,axis=1)].drop_duplicates(subset=['C/A','UNIT','SCP','STATION','LINENAME','datetime'])
    df_small_clean2=df_small_clean.groupby(['STATION','datetime'],as_index=False)[['EXITS']].sum()
    return df_small_clean2

def station_activity(df,station):
    df_station=df[df['STATION']==station]
    df_sort=df_station.sort_values(by=['datetime'])
    return df_sort

def construct_dct(df):
    stations=df['STATION'].unique()
    station_diffs_dct={}
    for st in stations:
        sa=station_activity(df,st)
        sa['diffs']=pd.DataFrame(sa['EXITS'].diff())
        st_diffs=sa.drop(['EXITS'],axis=1).dropna()
        st_diffs_clean=st_diffs[np.abs(st_diffs.diffs)<30*10**3]    
        st_diffs_clean['weekday']=st_diffs_clean['datetime'].dt.weekday
        st_diffs_clean['hour']=st_diffs_clean['datetime'].dt.hour
        st_diffs_clean2=st_diffs_clean.groupby(['weekday','hour'])['diffs'].mean()
        station_diffs_dct[st]=st_diffs_clean2
    return station_diffs_dct

def find_key(num,hours_list):
    sort=sorted(hours_list)
    pos=bisect(sort,num)
    if pos < len(sort):
        return sort[pos]
    else:
        return sort[0]
    
def activity_by_time(day,hour,dct):
    stations=list(set(dct.keys()))
    exits=[]
    for st in stations:
        try:
            """Below we are assuming that the hour list appearing is consistent for
            a given station. (I take the first element of the list as being represent-
            ative.) Should check this"""
            hours_st=dct[st][0].keys().values
            sh=sorted(hours_st)
            h_key=find_key(hour,sh)
            leaving=dct[st][day][h_key]/4
            """
            Below I apply np.abs to account for the possibility that the turnstile was
            reversed for some period of time.
            """             
            exits.append([st,np.abs(leaving)])
        except(KeyError,IndexError,AttributeError):
            pass
    sort_exits=sorted(exits,key=lambda x: x[1])[::-1]    
    return sort_exits


def main(month,day,hour,yrs_back=3):
    week_nums=get_week_nums(month,yrs_back)
    df=scrape_pkl(week_nums,filename)
    df_c=clean_df(df)
    dct=construct_dct(df_c)
    
    return (activity_by_time(day,hour,dct),dct,df)

In [242]:
activity,dct,df = main(6,4,5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [243]:
cols={x:x.strip() for x in df.columns}
df_small=df.rename(columns=cols)
df_tmp=df_small[df_small.STATION=='CORTLANDT ST'].groupby(['DATE','TIME','STATION'],as_index=False)['EXITS'].sum()

In [245]:
df_tmp3.head()

Unnamed: 0,EXITS
1,49.0
2,236.0
4,1163.0
5,908.0
6,318.0


In [247]:
df_tmp2=pd.DataFrame(df_tmp['EXITS'].diff()).dropna()
df_tmp3=df_tmp2[np.abs(df_tmp2.EXITS) < 3*10**3]
np.mean([np.abs(x)[0] for x in df_tmp3.values])/4

219.72752808988764

In [248]:
df_tmp['diffs']=df_tmp['EXITS'].diff().dropna()

In [249]:
df_tmp[(np.abs(df_tmp.diffs)<10**6) & (np.abs(df_tmp.diffs)>10000)]

Unnamed: 0,DATE,TIME,STATION,EXITS,diffs
52,07/09/2017,17:00:00,CORTLANDT ST,922961559,-636970.0
53,07/09/2017,21:00:00,CORTLANDT ST,923603014,641455.0
57,07/10/2017,13:00:00,CORTLANDT ST,922972564,-635641.0
58,07/10/2017,17:00:00,CORTLANDT ST,923614932,642368.0
66,07/11/2017,21:00:00,CORTLANDT ST,922994426,-636184.0
73,07/12/2017,21:00:00,CORTLANDT ST,923649947,643390.0
76,07/13/2017,09:00:00,CORTLANDT ST,923015179,-635595.0
77,07/13/2017,13:00:00,CORTLANDT ST,923659208,644029.0
78,07/13/2017,17:00:00,CORTLANDT ST,923022533,-636675.0


In [143]:
pd.merge(new_df1,new_df2,on=0,how='outer').head()

Unnamed: 0,0,1_x,1_y
0,82 ST-JACKSON H,38335.125,156.3125
1,49 ST,27808.0625,67.8125
2,VAN SICLEN AVE,11242.375,
3,JKSN HT-ROOSVLT,562.6875,514.3125
4,JAMAICA CENTER,345.416667,355.3125


In [252]:
df=pd.DataFrame(activity_by_time(0,12,dct)[:20])
for i in range(1,12):
    for day in range(1,7):
        df2=pd.DataFrame(activity_by_time(day,12+i,dct)[:20])
        df=pd.merge(df,df2,on=0,how='outer')

In [253]:
df_fill=df.fillna(0)

In [254]:
#df_fill['sums']=df_fill.sum(axis=1)
df_fill.insert(1,'sums',df_fill.sum(axis=1)/84)

In [255]:
fin_result=df_fill[[0,'sums']].sort_values(['sums'],ascending=False).head(20)

In [256]:
fin_result.values

array([['TIMES SQ-42 ST', 3148.2170138888882],
       ['14 ST-UNION SQ', 2986.3298611111113],
       ['34 ST-HERALD SQ', 2375.5850694444443],
       ['FLUSHING-MAIN', 2041.5840773809523],
       ['ATL AV-BARCLAY', 1703.2532242063494],
       ['JKSN HT-ROOSVLT', 1686.421875],
       ['59 ST COLUMBUS', 1647.1599702380952],
       ['BEDFORD AV', 1446.298859126984],
       ['59 ST', 1339.8407738095239],
       ['W 4 ST-WASH SQ', 1333.3482142857142],
       ['50 ST', 1105.9104662698414],
       ['JAMAICA CENTER', 1059.8236607142858],
       ['145 ST', 1057.563988095238],
       ['7 AV', 949.905505952381],
       ["B'WAY-LAFAYETTE", 746.5009920634922],
       ['8 AV', 738.1622023809524],
       ['CHAMBERS ST', 721.9866071428571],
       ['42 ST-BRYANT PK', 634.0885416666666],
       ['JUNCTION BLVD', 590.2388392857143],
       ['47-50 STS ROCK', 579.7120535714286]], dtype=object)

In [129]:
sorted(final_result, key = lambda x: x[1])[::-1]

[['82 ST-JACKSON H', 38335.125],
 ['49 ST', 27808.0625],
 ['VAN SICLEN AVE', 22484.75],
 ['34 ST-HERALD SQ', 2839.6875],
 ['TIMES SQ-42 ST', 2455.375],
 ['47-50 STS ROCK', 2101.6875],
 ['59 ST', 1854.0625],
 ['LEXINGTON AV/53', 1704.3125],
 ['59 ST COLUMBUS', 1652.375],
 ['42 ST-BRYANT PK', 1193.875],
 ['CHAMBERS ST', 1151.9375],
 ['50 ST', 1124.1875],
 ['JKSN HT-ROOSVLT', 562.6875],
 ['JAMAICA CENTER', 345.4166666666667],
 ['FLUSHING-MAIN', 324.1875],
 ['103 ST-CORONA', 289.8125],
 ['145 ST', 269.3333333333333],
 ['BROAD ST', 0],
 ['36 ST', 0],
 ['7 AV', 0],
 ['4AV-9 ST', 0],
 ['GREENPOINT AV', 0],
 ['KOSCIUSZKO ST', 0],
 ['137 ST CITY COL', 0],
 ['14 ST-UNION SQ', 0],
 ['5 AV/53 ST', 0],
 ['JAMAICA 179 ST', 0],
 ['169 ST', 0],
 ['JFK JAMAICA CT1', 0],
 ['RIT-MANHATTAN', 0]]

In [147]:
list(range(1,12))

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]