# NYC MTA Turnstile Dataset Cleanup & EDA

### I) Setting Up

In [214]:
from IPython.display import display
import warnings
import calendar
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [215]:
# Source - http://web.mta.info/developers/data/nyct/turnstile/turnstile_180922.txt
# Source description - http://web.mta.info/developers/resources/nyct/turnstile/ts_Field_Description.txt

# Source: https://catalog.data.gov/dataset/demographic-statistics-by-zip-code-acfc9
# gender_df = pd.read_csv('gender_by_zipcode.csv')

In [317]:
# Process and dump a single file
def delta_processor(filename):
    df = pd.read_csv(filename)
    
    #Cleanup column names - remove spaces
    df.rename(columns=lambda x: x.strip(), inplace=True)

    # Cleaning datetime
    print("Cleaning up date time")
    df.loc[:,'DATETIME'] = pd.to_datetime(df['DATE'] + ' ' + df['TIME'])
    df['DAY'] = df['DATETIME']
    df['DAY'] = df['DAY'].apply(lambda x : calendar.day_name[x.weekday()])
    
    # diff entry and exit and update dataframe
    print('Calculate running delta for entry/exit time')
    entries_diff = df['ENTRIES'].diff()
    entries_diff = entries_diff.clip_lower(0)
    df['ENTRIES_DELTA'] = entries_diff
    exits_diff = df['EXITS'].diff()
    exits_diff = exits_diff.clip_lower(0)
    df['EXITS_DELTA'] = exits_diff
    df.fillna(0, inplace=True)
    
    # First entry/exit delta for each day in each station and SCP will be incorrect. Drop these results
    # Next, there are chances for spurious deltas if data is not correct, ignore such outliers
    print('Removing spurious records')
    df = df.groupby(['STATION', 'SCP', 'DATE'], group_keys=False).apply(lambda group: group.iloc[1:])
    df = df[df['ENTRIES_DELTA'] < 10000]
    df = df[df['EXITS_DELTA'] < 10000]
    
    #TODO: Write to CSV
    # 'abc.csv' -> 'abc_delta_processed.csv'
    return df 
    

In [318]:
turnstile_df = delta_processor('turnstile_180922.txt')

Cleaning up date time
Calculate running delta for entry/exit time
Removing spurious records


In [370]:
turnstile_df[[ 'UNIT','DIVISION', 'STATION', 'DATE', 'TIME','DATETIME', 'DAY', 'ENTRIES_DELTA', 'EXITS_DELTA']].head()

Unnamed: 0,UNIT,DIVISION,STATION,DATE,TIME,DATETIME,DAY,ENTRIES_DELTA,EXITS_DELTA
30341,R248,BMT,1 AV,09/15/2018,04:00:00,2018-09-15 04:00:00,Saturday,118.0,326.0
30342,R248,BMT,1 AV,09/15/2018,08:00:00,2018-09-15 08:00:00,Saturday,63.0,344.0
30343,R248,BMT,1 AV,09/15/2018,12:00:00,2018-09-15 12:00:00,Saturday,542.0,720.0
30344,R248,BMT,1 AV,09/15/2018,16:00:00,2018-09-15 16:00:00,Saturday,770.0,961.0
30345,R248,BMT,1 AV,09/15/2018,20:00:00,2018-09-15 20:00:00,Saturday,774.0,1178.0


In [371]:
zips = pd.read_csv("station_zips.csv")

In [372]:
def fix_names(name):
    temp_name = name.upper()
    replacements = [["1TH", "1"],["1ST", "1"], ["2ND", "2"],["3RD", "3"],["4TH", "4"],["5TH", "5"], 
                    ["6TH", "6"],["7TH", "7"],["8TH", "8"],["9TH", "9"],["0TH", "0"],["COLUMBIA UNIVERSITY", "COLUMBIA"],
                    ["AVENUE", "AV"], ["SQUARE", "SQ"], ["CENTER","CTR"], ["138 ST-3 AVE","3 AV 138 ST"],
                    ["149 ST-3 AV", "3 AV-149 ST"], ["137 ST-CITY COLLEGE", "137 ST CITY COL"], 
                    ["PROSPECT PARK-15 ST","15 ST-PROSPECT"], ['YANKEE STADIUM-161 ST', "161/YANKEE STAD"],
                    ['RAWSON ST-33 ST', "33 ST-RAWSON ST"], ['WHITEHALL ST', "WHITEHALL S-FRY"], 
                    ['WESTCHESTER SQ-EAST TREMONT AV',"WESTCHESTER SQ" ], ['WAKEFIELD-241 ST', "WAKEFIELD/241"],
                    ['WEST 8 ST', "W 8 ST-AQUARIUM"], ['WEST 4 ST', 'W 4 ST-WASH SQ'], 
                    ['VERNON BLVD-JACKSON AV',"VERNON-JACKSON" ],["5 AV-53 ST","5 AV/53 ST"],["5 AV","5 AVE"],
                    ["59 ST-COLUMBUS CIRCLE","59 ST COLUMBUS"],["WOODSIDE AV-61 ST","61 ST WOODSIDE"],
                    ["63 DRIVE-REGO PARK","63 DR-REGO PARK"],["66 ST-LINCOLN CTR","66 ST-LINCOLN"],
                    ["68 ST-HUNTER COLLEGE","68ST-HUNTER CO"],["AQUEDUCT-NORTH CONDUIT AV","AQUEDUCT N.COND"],
                    ["AQUEDUCT RACETRACK","AQUEDUCT RACETR"],["ASTOR PLACE","ASTOR PL"],
                    ["ASTORIA BLVD-HOYT AV","ASTORIA BLVD"],["DITMARS BLVD","ASTORIA DITMARS"],
                    ["ATLANTIC AV-BARCLAYS CTR","ATL AV-BARCLAY"],["BROADWAY-LAFAYETTE ST","B'WAY-LAFAYETTE"],
                    ["BAY PARKWAY","BAY PKWY"],["BEDFORD PARK BLVD","BEDFORD PK BLVD"],
                    ["BEDFORD-NOSTRAND AVS","BEDFORD-NOSTRAN"],["BEVERLY RD","BEVERLEY ROAD"],
                    ["BOTANIC GARDENS","BOTANIC GARDEN"],["BRIARWOOD-VAN WYCK BLVD","BRIARWOOD"],
                    ["BROADWAY JUNCTION-EAST NEW YORK","BROADWAY JCT"],["BROOKLYN BRIDGE-CITY HALL","BROOKLYN BRIDGE"],
                    ["CANARSIE - ROCKAWAY PARKWAY","CANARSIE-ROCKAW"],["CATHEDRAL PARKWAY-110 ST","CATHEDRAL PKWY"],
                    ["CLINTON-WASHINGTON AVS","CLINTON-WASH AV"],["DELANCEY ST","DELANCEY/ESSEX"],
                    ["EAST 143 ST-ST MARY'S ST","E 143/ST MARY'S"],["EAST 149 ST","E 149 ST"],
                    ["EAST 180 ST","E 180 ST"],["EASTCHESTER-DYRE AV","EASTCHSTER/DYRE"],
                    ["EASTERN PARKWAY-BROOKLYN MUSEUM","EASTN PKWY-MUSM"],["HARLEM-148 ST","HARLEM 148 ST"],
                    ["FAR ROCKAWAY-MOTT AV","FAR ROCKAWAY"],["FLATBUSH AV-BROOKLYN COLLEGE","FLATBUSH AV-B.C"],
                    ["FLUSHING-MAIN ST","FLUSHING-MAIN"],["FOREST AV","FOREST AVE"],
                    ["FOREST HILLS-71 AV","FOREST HILLS 71"],["FORT HAMILTON PARKWAY","FT HAMILTON PKY"],
                    ["GRAND ARMY PLAZA","GRAND ARMY PLAZ"],["GRAND AV-NEWTOWN","GRAND-NEWTOWN"],
                    ["HOWARD BEACH","HOWARD BCH JFK"],["HOYT & SCHERMERHORN","HOYT-SCHER"],
                    ["HUNTERS POINT","HUNTERS PT AV"],["INWOOD - 207 ST","INWOOD-207 ST"],
                    ["JAMAICA-179 ST","JAMAICA 179 ST"],["JACKSON HEIGHTS-ROOSEVELT AVE","JKSN HT-ROOSVLT"],
                    ["JAMAICA-VAN WYCK","JAMAICA VAN WK"],["KEW GARDENS-UNION TURNPIKE","KEW GARDENS"],
                    ["KINGS HIGHWAY","KINGS HWY"],["KNICKERBOCKER AV","KNICKERBOCKER"],["KOSCIUSKO ST","KOSCIUSZKO ST"],
                    ["NORWOOD-205 ST","NORWOOD 205 ST"],["OCEAN PARKWAY","OCEAN PKWY"],["PELHAM PARKWAY","PELHAM PKWY"],
                    ["QUEENSBORO PLAZA","QUEENSBORO PLZ"],["PARKCHESTER-EAST 177 ST","PARKCHESTER"],
                    ["LEXINGTON AV-53 ST","LEXINGTON AV/53"],["LEXINGTON AV","LEXINGTON AV/63"],
                    ["MARBLE HILL-225 ST","MARBLE HILL-225"],["METS - WILLETS POINT","METS-WILLETS PT"],
                    ["MORRISON AV-SOUNDVIEW AV","MORISN AV/SNDVW"],["MOSHOLU PARKWAY","MOSHOLU PKWY"],
                    ["MYRTLE-WILLOUGHBY AVS","MYRTLE-WILLOUGH"],["MYRTLE AV","MYRTLE-WYCKOFF"],
                    ["NEPTUNE AV-VAN SICLEN","NEPTUNE AV"],["238 ST-NEREID AV","NEREID AV"],
                    ["ROCKAWAY PARK-BEACH 116","ROCKAWAY PARK B"],["ROOSEVELT ISLAND","ROOSEVELT ISLND"],
                    ["SENECA AV","SENECA AVE"],["SUTPHIN BLVD-ARCHER AV - JFK","SUTPHIN-ARCHER"],
                    ["SUTTER AV","SUTTER AV-RUTLD"],["VAN CORTLANDT PARK-242 ST","V.CORTLANDT PK"],
                    ["VAN SICLEN AV","VAN SICLEN AVE"], ["AV H",'AVENUE H'], ["AV I",'AVENUE I'], ["AV J",'AVENUE J'],
                    ["AV M",'AVENUE M'], ["AV N",'AVENUE N'], ["AV P",'AVENUE P'], ["AV U",'AVENUE U'], 
                    ["AV X",'AVENUE X']]
    
       
    for x in replacements:
        temp_name = temp_name.replace(x[0], x[1])
    return temp_name

In [373]:
zips["STATION"] = zips["STATION"].apply(fix_names)

In [374]:
zipped_station = pd.merge(turnstile_df[[ 'UNIT','DIVISION', 'STATION', 'DATE', 'TIME','DATETIME', 'DAY', 
                                        'ENTRIES_DELTA', 'EXITS_DELTA']], zips[["DIVISION","STATION","zip"]],
                          on=["DIVISION","STATION"], how="left")


In [477]:
manual_zip_df = pd.read_csv("name_zip_mapping.csv")

In [478]:
def fix_manual_names(cols):
    station_name= cols[0]
    zipcode=cols[1]
    if np.isnan(zipcode):
        values = manual_zip_df[manual_zip_df["name"]==station_name]["zip"].values
        if len(values)>0:
            return values[0]
        else:
            return np.nan
    else:
        return zipcode

In [479]:
zipped_station["zip"] = zipped_station[["STATION", "zip"]].apply(fix_manual_names,  axis=1)

In [483]:
read_dictionary = np.load('zipcode_to_income_dict.npy').item()

In [489]:
def add_income(zip):
    if zip in read_dictionary.keys():
        return read_dictionary[zip]

In [490]:
zipped_station["income"] = zipped_station["zip"].apply(add_income)

In [493]:
zipped_station.sample(30)

Unnamed: 0,UNIT,DIVISION,STATION,DATE,TIME,DATETIME,DAY,ENTRIES_DELTA,EXITS_DELTA,zip,income
175794,R304,IRT,RECTOR ST,09/15/2018,20:00:00,2018-09-15 20:00:00,Saturday,47.0,398.0,10006.0,100788.0
160878,R133,IRT,MOSHOLU PKWY,09/19/2018,05:00:00,2018-09-19 05:00:00,Wednesday,29.0,2.0,10467.0,17484.0
120433,R543,PTH,EXCHANGE PLACE,09/20/2018,12:53:11,2018-09-20 12:53:11,Thursday,394.0,12.0,7302.0,
28162,R111,IND,23 ST,09/17/2018,08:00:00,2018-09-17 08:00:00,Monday,82.0,101.0,10010.0,83240.0
27606,R111,IND,23 ST,09/18/2018,00:00:00,2018-09-18 00:00:00,Tuesday,315.0,4.0,10010.0,83240.0
40511,R091,BMT,36 AV,09/16/2018,04:00:00,2018-09-16 04:00:00,Sunday,0.0,0.0,11106.0,33951.0
22568,R174,IND,181 ST,09/21/2018,09:00:00,2018-09-21 09:00:00,Friday,947.0,43.0,10033.0,24421.0
85625,R269,IND,BEDFORD-NOSTRAN,09/20/2018,04:00:00,2018-09-20 04:00:00,Thursday,0.0,0.0,11205.0,27341.0
142293,R217,IND,HOYT-SCHER,09/15/2018,16:00:00,2018-09-15 16:00:00,Saturday,293.0,390.0,11201.0,71656.0
55771,R084,IND,59 ST COLUMBUS,09/18/2018,04:00:00,2018-09-18 04:00:00,Tuesday,68.0,22.0,10023.0,111473.0


## Analysis:

In [None]:
# Print mean number of entries/exits by day
turnstile_df.groupby(['DAY'])['ENTRIES_DELTA', 'EXITS_DELTA'].mean()

In [None]:
# Print mean number of entries/exits by hour
turnstile_df.groupby(turnstile_df['DATETIME'].dt.hour)['ENTRIES_DELTA', 'EXITS_DELTA'].mean()