In [9]:
import pandas as pd
import numpy as np
from datetime import datetime
import os
import csv

In [10]:
# Make sure both output directories exist before anything is written to them.
# The existence flags are captured up front, then each missing directory is
# created and announced.
is_exists = os.path.exists("../data/turnstile/station_rideship")
is_exists_2 = os.path.exists("../data/turnstile/date_rideship")

for already_there, target_dir, notice in (
    (is_exists, "../data/turnstile/station_rideship", "Create stations dir"),
    (is_exists_2, "../data/turnstile/date_rideship", "Create date dir"),
):
    if already_there:
        continue
    os.makedirs(target_dir)
    print(notice)

Create stations dir
Create date dir


In [11]:
# Root of the turnstile data tree; every other location hangs off it.
data_path = '../data/turnstile'

# turnstile_path: input files read by the ingestion loop
# station_dir:    one CSV per station
# date_sum:       one daily-total CSV per station
turnstile_path, station_dir, date_sum = (
    os.path.join(data_path, leaf)
    for leaf in ('raw', 'station_rideship', 'date_rideship')
)

In [16]:
def flush_stations(stations, out_dir=None):
    """Append each station's buffered rows to that station's CSV file.

    Parameters
    ----------
    stations : dict
        Maps a station name to a list of rows (each row a sequence of
        fields) to append.
    out_dir : str, optional
        Directory that receives the per-station files. Defaults to the
        module-level ``station_dir``.

    Notes
    -----
    The file name is the station name with every non-alphanumeric character
    removed, plus ``.csv``. Files are opened in append mode, so calling this
    repeatedly with the same rows writes them repeatedly.
    """
    if out_dir is None:
        out_dir = station_dir

    for station_name, rows in stations.items():
        # Strip everything except letters and digits to get a safe file name.
        name = ''.join(ch for ch in station_name if ch.isalnum())
        file_path = os.path.join(out_dir, name + '.csv')

        # newline='' hands line-ending control to the csv module; without it
        # Windows writes '\r\r\n' and the file shows a blank row after each
        # record. Plain 'a' is enough because the file is never read here.
        with open(file_path, mode='a', newline='') as f:
            csv.writer(f).writerows(rows)

In [17]:
# Split every raw turnstile file into per-station rows and hand them to
# flush_stations. Files are visited in sorted name order so the order of
# rows written out is the same on every run.
for file_name in sorted(os.listdir(turnstile_path)):

    file_path = os.path.join(turnstile_path, file_name)

    # Skip hidden files (e.g. .DS_Store) and sub-directories such as
    # .ipynb_checkpoints; neither can be parsed as a turnstile CSV.
    if file_name.startswith('.') or not os.path.isfile(file_path):
        continue

    stations = {}

    # newline='' is what the csv module expects from the file it reads.
    with open(file_path, newline='') as f:

        reader = csv.DictReader(f)

        for row in reader:
            # Keep only rows whose DESC is one of these two values.
            if row['DESC'] not in ('RECOVER AUD', 'REGULAR'):
                continue

            res = [
                row['STATION'],
                # DIVISION + UNIT + SCP concatenated into a single key column
                row['DIVISION'] + row['UNIT'] + row['SCP'],
                row['DATE'],
                row['TIME'],
                row['ENTRIES']
            ]

            stations.setdefault(row['STATION'], []).append(res)

    # Write this file's rows out before moving on, so only one raw file's
    # worth of rows is held in memory at a time.
    flush_stations(stations)

print("Station Sum Complete !")

Station Sum Complete !


In [18]:
def _interpolate_numeric(frame):
    """Interpolate only the numeric columns of ``frame`` and return it."""
    numeric_cols = frame.select_dtypes(include='number').columns
    if len(numeric_cols):
        frame[numeric_cols] = frame[numeric_cols].interpolate()
    return frame


def cal_rideship(df, max_hourly=10000):
    """Turn cumulative ``entries`` readings into hourly ``rideship`` counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by a DatetimeIndex with a cumulative ``entries``
        column. Any other columns are carried along.
    max_hourly : int or float, default 10000
        Hourly increases above this value are treated as invalid.

    Returns
    -------
    pandas.DataFrame
        One row per hour with the original columns plus ``rideship``; rows
        that still contain a missing value (at least the first hour, which
        has no predecessor) are dropped.
    """
    # A Timedelta rule avoids the 'H' alias, which newer pandas deprecates.
    hourly = df.resample(pd.Timedelta(hours=1)).max()

    # Hours without a reading: interpolate the numeric columns, then carry
    # every remaining column (e.g. string labels) forward. Interpolating the
    # numeric columns only avoids calling interpolate() on object columns.
    hourly = _interpolate_numeric(hourly).ffill()

    # Hour-over-hour increase of the cumulative counter.
    hourly['rideship'] = hourly['entries'].diff()

    # Negative or implausibly large jumps are blanked out and re-estimated
    # from the neighbouring hours below.
    invalid = (hourly['rideship'] < 0) | (hourly['rideship'] > max_hourly)
    hourly.loc[invalid, 'rideship'] = np.nan

    return _interpolate_numeric(hourly).dropna()

In [19]:
# Column layout of the header-less per-station CSV files.
col_names = ['station', 'distinction', 'date', 'time', 'entries']

# For every station file: compute hourly rideship per turnstile, sum it per
# station and hour, roll that up to daily totals and write one CSV per station.
for file_name in sorted(os.listdir(station_dir)):

    # Only the station CSVs are inputs; ignore stray files and directories.
    if not file_name.endswith('.csv'):
        continue

    data = pd.read_csv(os.path.join(station_dir, file_name), names = col_names)

    # Merge the separate date and time strings into one timestamp index.
    data['date_time'] = pd.to_datetime(data['date']+ ' ' +data['time'], format = '%m/%d/%Y %H:%M:%S')
    data = data.drop(columns = ['date', 'time']).set_index('date_time')

    # Iterate the groups explicitly rather than using groupby.apply: each
    # group frame keeps its 'station' and 'distinction' columns, which the
    # aggregation below needs, and newer pandas deprecates apply operating
    # on the grouping columns.
    rideship = pd.concat(
        [cal_rideship(turnstile) for _, turnstile in data.groupby(['station', 'distinction'])]
    )

    # Sum across turnstiles per station and hour, then per station and day.
    hour_rideship = rideship.groupby(['station', 'date_time'])['rideship'].sum().reset_index()
    date_rideship = hour_rideship.groupby(['station', pd.Grouper(key = 'date_time', freq = 'D')]).sum().reset_index()
    date_rideship.to_csv(os.path.join(date_sum, file_name), index = False, header = False)

    print('Processed %s'  % file_name)

print("Preprocess Complete !!")

Processed 103ST.csv
Processed 103STCORONA.csv
Processed 104ST.csv
Processed 110ST.csv
Processed 111ST.csv
Processed 116ST.csv
Processed 116STCOLUMBIA.csv
Processed 121ST.csv
Processed 125ST.csv
Processed 135ST.csv
Processed 137STCITYCOL.csv
Processed 138GRANDCONC.csv
Processed 145ST.csv
Processed 149GRANDCONC.csv
Processed 14ST.csv
Processed 14STUNIONSQ.csv
Processed 14THSTREET.csv
Processed 155ST.csv
Processed 157ST.csv
Processed 15STPROSPECT.csv
Processed 161YANKEESTAD.csv
Processed 163STAMSTERDM.csv
Processed 167ST.csv
Processed 168ST.csv
Processed 169ST.csv
Processed 170ST.csv
Processed 174175STS.csv
Processed 174ST.csv
Processed 175ST.csv
Processed 176ST.csv
Processed 181ST.csv
Processed 182183STS.csv
Processed 183ST.csv
Processed 18AV.csv
Processed 18ST.csv
Processed 190ST.csv
Processed 191ST.csv
Processed 1AV.csv
Processed 207ST.csv
Processed 20AV.csv
Processed 215ST.csv
Processed 219ST.csv
Processed 21ST.csv
Processed 21STQNSBRIDGE.csv
Processed 225ST.csv
Processed 231ST.csv
Pr

Processed STERLINGST.csv
Processed STGEORGE.csv
Processed STLAWRENCEAV.csv
Processed SUTPHINARCHER.csv
Processed SUTPHINBLVD.csv
Processed SUTTERAV.csv
Processed SUTTERAVRUTLD.csv
Processed THIRTYST.csv
Processed THIRTYTHIRDST.csv
Processed TIMESSQ42ST.csv
Processed TOMPKINSVILLE.csv
Processed TREMONTAV.csv
Processed TWENTYTHIRDST.csv
Processed UNIONST.csv
Processed UTICAAV.csv
Processed VANSICLENAV.csv
Processed VANSICLENAVE.csv
Processed VCORTLANDTPK.csv
Processed VERNONJACKSON.csv
Processed W4STWASHSQ.csv
Processed W8STAQUARIUM.csv
Processed WAKEFIELD241.csv
Processed WALLST.csv
Processed WESTCHESTERSQ.csv
Processed WESTFARMSSQ.csv
Processed WHITEHALLSFRY.csv
Processed WHITLOCKAV.csv
Processed WILSONAV.csv
Processed WINTHROPST.csv
Processed WOODHAVENBLVD.csv
Processed WOODLAWN.csv
Processed WORLDTRADECTR.csv
Processed WTCCORTLANDT.csv
Processed YORKST.csv
Processed ZEREGAAV.csv
Preprocess Complete !!
