In [70]:
# imports a library 'pandas', names it as 'pd'
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime as dt
import datetime
from bisect import bisect
import pickle
import csv
filename = '/home/williamcottrell72/github/sf18_ds11/class_lectures/week01-benson/02-git_viz/turnstiles.pkl'
from IPython.display import Image

# enables inline plots, without it plots don't show up in the notebook
%matplotlib inline


def fix_time(num):
    if len(str(num)) == 2:
        return str(num)
    else:
        return '0'+str(num)

def get_week_nums(month,yrs_back):
    week_list=[]
    ref_date=datetime.date(2018,6,30)
    weeks_back=yrs_back*52
    for i in range(weeks_back):
        week_shift=datetime.timedelta(-7*i)
        new=ref_date+week_shift
        yr=str(new.year)[-2:]
        mt=fix_time(new.month)
        day=fix_time(new.day)
        string=yr+mt+day
        if int(mt)==month:
            week_list.append(int(string))
    return week_list

def scrape_pkl(week_nums,filename):
    try:
        with open(filename,'rb') as pklfile:
            df = pickle.load(pklfile)

    except:

        df = scrape(week_nums)

        with open(filename,'wb') as pklfile:
            df = pickle.dump(df, pklfile)
    return df



def fn(row):
    return (row['datetime'].second==0)&(row['datetime'].minute==0)

def clean_df(df):
    cols={x:x.strip() for x in df.columns}
    df_small=df.rename(columns=cols)
    df_small['datetime']=pd.to_datetime(df_small['DATE']+' '+df_small['TIME'],infer_datetime_format=True)
    df_small_clean=df_small[df_small.apply(fn,axis=1)].drop_duplicates(subset=['C/A','UNIT','SCP','STATION','LINENAME','datetime'])
    df_small_clean2=df_small_clean.groupby(['STATION','datetime'],as_index=False)[['EXITS']].sum()
    return df_small_clean2

def station_activity(df,station):
    df_station=df[df['STATION']==station]
    df_sort=df_station.sort_values(by=['datetime'])
    return df_sort

def construct_dct(df):
    stations=df['STATION'].unique()
    station_diffs_dct={}
    for st in stations:
        sa=station_activity(df,st)
        sa['diffs']=pd.DataFrame(sa['EXITS'].diff())
        st_diffs=sa.drop(['EXITS'],axis=1).dropna()
        st_diffs_clean=st_diffs[np.abs(st_diffs.diffs)<10**6]    
        st_diffs_clean['weekday']=st_diffs_clean['datetime'].dt.weekday
        st_diffs_clean['hour']=st_diffs_clean['datetime'].dt.hour
        st_diffs_clean2=st_diffs_clean.groupby(['weekday','hour'])['diffs'].mean()
        station_diffs_dct[st]=st_diffs_clean2
    return station_diffs_dct

def find_key(num,hours_list):
    sort=sorted(hours_list)
    pos=bisect(sort,num)
    if pos < len(sort):
        return sort[pos]
    else:
        return sort[0]
    
def activity_by_time(day,hour,dct):
    stations=list(set(dct.keys()))
    exits=[]
    for st in stations:
        try:
            """Below we are assuming that the hour list appearing is consistent for
            a given station. (I take the first element of the list as being represent-
            ative.) Should check this"""
            hours_st=dct[st][0].keys().values
            sh=sorted(hours_st)
            h_key=find_key(hour,sh)
            leaving=dct[st][day][h_key]/4
            """
            Below I apply np.abs to account for the possibility that the turnstile was
            reversed for some period of time.
            """             
            exits.append([st,np.abs(leaving)])
        except(KeyError,IndexError,AttributeError):
            pass
    sort_exits=sorted(exits,key=lambda x: x[1])[::-1]    
    return sort_exits


def main(month,day,hour,yrs_back=3):
    week_nums=get_week_nums(month,yrs_back)
    df=scrape_pkl(week_nums,filename)
    df_c=clean_df(df)
    dct=construct_dct(df_c)
    
    return (activity_by_time(day,hour,dct),dct)

In [71]:
activity,dct = main(6,4,5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [73]:
#

[['JFK JAMAICA CT1', 11461.5],
 ['14 ST-UNION SQ', 4026.25],
 ['RIT-MANHATTAN', 2734.1875],
 ['34 ST-HERALD SQ', 2545.1875],
 ['TIMES SQ-42 ST', 2124.75],
 ['47-50 STS ROCK', 2101.6875],
 ['7 AV', 1959.6666666666667],
 ['59 ST', 1854.0625],
 ['LEXINGTON AV/53', 1704.3125],
 ['59 ST COLUMBUS', 1652.375],
 ['72 ST-2 AVE', 1603.1875],
 ['W 4 ST-WASH SQ', 1496.875],
 ['33 ST', 1258.75],
 ['42 ST-BRYANT PK', 1193.875],
 ['WHITEHALL S-FRY', 1153.4375],
 ['CHAMBERS ST', 1151.9375],
 ['50 ST', 1124.1875],
 ["B'WAY-LAFAYETTE", 1117.8125],
 ['68ST-HUNTER CO', 1100.875],
 ['57 ST', 981.1875],
 ['8 AV', 946.0625],
 ['CORTLANDT ST', 915.9375],
 ['BOWLING GREEN', 890.0],
 ['FLUSHING-MAIN', 794.625],
 ['QUEENS PLAZA', 775.9166666666666],
 ['103 ST', 768.5625],
 ['DELANCEY/ESSEX', 745.25],
 ['1 AV', 728.3125],
 ['66 ST-LINCOLN', 718.0],
 ['51 ST', 711.4375],
 ['49 ST', 701.0],
 ['JAY ST-METROTEC', 674.8125],
 ['GRAND ST', 655.1875],
 ['34 ST-HUDSON YD', 629.25],
 ['168 ST', 609.125],
 ['JAMAICA CENTER