# NYC MTA Turnstile Dataset Cleanup & EDA

### I) Setting Up

In [214]:
from IPython.display import display
import warnings
import calendar
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [215]:
# Turnstile data: http://web.mta.info/developers/data/nyct/turnstile/turnstile_180922.txt
# Field descriptions: http://web.mta.info/developers/resources/nyct/turnstile/ts_Field_Description.txt

# Demographic statistics by ZIP code (not loaded yet): https://catalog.data.gov/dataset/demographic-statistics-by-zip-code-acfc9
# gender_df = pd.read_csv('gender_by_zipcode.csv')

In [216]:
# Process and dump a single file
def delta_processor(filename, max_delta=10000):
    """Load one turnstile file and derive per-reading entry/exit deltas.

    ``ENTRIES`` and ``EXITS`` are treated as running counters: the delta of a
    reading is its difference from the previous reading of the *same*
    turnstile. Negative differences are clipped to 0.

    Rows are differenced in file order, so each turnstile's readings are
    assumed to be listed chronologically -- TODO confirm for new sources.

    Parameters
    ----------
    filename : str, path-like or file-like
        Anything ``pd.read_csv`` accepts. Required columns (surrounding
        whitespace in the header is stripped): ``STATION``, ``SCP``, ``DATE``,
        ``TIME``, ``ENTRIES``, ``EXITS``. ``C/A`` and ``UNIT`` are used as part
        of the turnstile identity when present.
    max_delta : int or float, default 10000
        Readings whose entry or exit delta is not strictly below this value
        are treated as spurious and dropped.

    Returns
    -------
    pandas.DataFrame
        The input rows, original index preserved, with four added columns:
        ``DATETIME``, ``DAY`` (English weekday name), ``ENTRIES_DELTA`` and
        ``EXITS_DELTA``. The first reading of every turnstile on every date
        and all readings failing the ``max_delta`` check are removed.

    Raises
    ------
    KeyError
        If a required column is missing.
    """
    df = pd.read_csv(filename)

    # Cleanup column names - remove spaces
    df = df.rename(columns=str.strip)

    # Cleaning datetime
    print("Cleaning up date time")
    df['DATETIME'] = pd.to_datetime(df['DATE'] + ' ' + df['TIME'])
    df['DAY'] = df['DATETIME'].dt.day_name()

    # STATION + SCP alone can be shared by several physical turnstiles, so the
    # control area and remote unit are added to the identity when available.
    turnstile_keys = [col for col in ('C/A', 'UNIT') if col in df.columns]
    turnstile_keys += ['SCP', 'STATION']

    # diff entry and exit per turnstile so a delta never spans two devices
    print('Calculate running delta for entry/exit time')
    deltas = (
        df.groupby(turnstile_keys, sort=False)[['ENTRIES', 'EXITS']]
        .diff()
        .clip(lower=0)   # a counter that went backwards yields 0
        .fillna(0)       # first reading of a turnstile has no predecessor
    )
    df['ENTRIES_DELTA'] = deltas['ENTRIES']
    df['EXITS_DELTA'] = deltas['EXITS']

    # Drop the first reading of each day for each turnstile, then ignore
    # outliers whose delta is implausibly large.
    print('Removing spurious records')
    # duplicated() is False only for the first row of each turnstile/date.
    not_first_of_day = df.duplicated(subset=turnstile_keys + ['DATE'], keep='first')
    plausible = (df['ENTRIES_DELTA'] < max_delta) & (df['EXITS_DELTA'] < max_delta)
    df = df[not_first_of_day & plausible]

    #TODO: Write to CSV
    # 'abc.csv' -> 'abc_delta_processed.csv'
    return df
    

In [None]:
# Load the raw turnstile export and attach per-reading entry/exit deltas.
turnstile_df = delta_processor(filename='turnstile_180922.txt')

Cleaning up date time
Calculate running delta for entry/exit time
Removing spurious records


## Analysis:

In [None]:
# Mean entry/exit delta per reading, grouped by day of the week.
# The columns are selected with a list: tuple-style selection
# (groupby(...)['A', 'B']) raises on pandas >= 2.0.
turnstile_df.groupby(['DAY'])[['ENTRIES_DELTA', 'EXITS_DELTA']].mean()

In [None]:
# Mean entry/exit delta per reading, grouped by the hour of the reading's timestamp.
# The columns are selected with a list: tuple-style selection
# (groupby(...)['A', 'B']) raises on pandas >= 2.0.
turnstile_df.groupby(turnstile_df['DATETIME'].dt.hour)[['ENTRIES_DELTA', 'EXITS_DELTA']].mean()