In [4]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime
import seaborn as sns
from matplotlib.ticker import ScalarFormatter

%config InlineBackend.figure_format = 'svg'
%matplotlib inline 

# Download and load MTA turnstile data

In [2]:
def all_saturdays(start_date, end_date):
    """ 
    Takes two dates and returns a list of saturdays between the first date and a week after the second date
  
    Parameters: 
    start_date (str): date formatted in M/D/YYYY
    end_date (str): date in M/D/YYYY
  
    Returns: 
    saturday_list (list): a list of Saturday dates
    """

    modified_end_date = datetime.datetime.strptime(end_date,'%m/%d/%Y')+ datetime.timedelta(weeks = 1)
    saturday_list = pd.date_range(start =start_date, end =modified_end_date,
                              freq='W-SAT').strftime('%m/%d/%Y').tolist()
    return saturday_list

In [3]:
def import_mta(date):
    """reads in MTA turnstile data published online for a given date"""
    formatted_date = date[-2:]+date[:2]+date[3:5]
    base_url = "http://web.mta.info/developers/data/nyct/turnstile/turnstile_{}.txt"
    date_url = base_url.format(formatted_date)
    date_data = pd.read_csv(date_url)
    return date_data

In [4]:
def mta_to_df(saturday_list):
    """imports MTA turnstile data for a list of dates and then concatenates them"""
    dict_of_dfs = {}
    for saturday in saturday_list:
        dict_of_dfs[saturday] = pd.DataFrame(import_mta(saturday))
    mta =pd.DataFrame()
    for val in dict_of_dfs.values():
        mta = pd.concat([mta, val])
    return mta

In [5]:
#import MTA data from the files published on Saturdays that will contain start_date-end_date
start_date ='3/1/2019'
end_date = '6/1/2019'
saturday_list = all_saturdays(start_date,end_date)
mta = mta_to_df(saturday_list)

In [None]:
mta.tail()

# In case of later breakage

In [None]:
#save imported data to a csv
mta.to_csv(r'mta.csv')

In [10]:
#keep commented unless you need to reload data (avoids having to pull everything from the web again)
# mta = pd.read_csv('mta.csv')

# Cleaning

## column names

In [11]:
mta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3062159 entries, 0 to 3062158
Data columns (total 12 columns):
Unnamed: 0                                                              int64
C/A                                                                     object
UNIT                                                                    object
SCP                                                                     object
STATION                                                                 object
LINENAME                                                                object
DIVISION                                                                object
DATE                                                                    object
TIME                                                                    object
DESC                                                                    object
ENTRIES                                                                 int64
EXITS               

In [12]:
mta.columns

Index(['Unnamed: 0', 'C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION',
       'DATE', 'TIME', 'DESC', 'ENTRIES',
       'EXITS                                                               '],
      dtype='object')

In [13]:
#cleans up column names
#takes leading and trailing spaces out of column names, makes them all lowercase, and replaces "/" with "_"
mta.columns = mta.columns.str.strip().str.lower().str.replace('/',"_")

In [14]:
mta.columns

Index(['unnamed: 0', 'c_a', 'unit', 'scp', 'station', 'linename', 'division',
       'date', 'time', 'desc', 'entries', 'exits'],
      dtype='object')

## create datetime column from date and time

In [15]:
mta["datetime"] = pd.to_datetime(mta.date + " " + mta.time, format="%m/%d/%Y %H:%M:%S")

In [16]:
mta.head()

Unnamed: 0,unnamed: 0,c_a,unit,scp,station,linename,division,date,time,desc,entries,exits,datetime
0,0,A002,R051,02-00-00,59 ST,NQR456W,BMT,02/23/2019,03:00:00,REGULAR,6955483,2359112,2019-02-23 03:00:00
1,1,A002,R051,02-00-00,59 ST,NQR456W,BMT,02/23/2019,07:00:00,REGULAR,6955494,2359125,2019-02-23 07:00:00
2,2,A002,R051,02-00-00,59 ST,NQR456W,BMT,02/23/2019,11:00:00,REGULAR,6955554,2359199,2019-02-23 11:00:00
3,3,A002,R051,02-00-00,59 ST,NQR456W,BMT,02/23/2019,15:00:00,REGULAR,6955714,2359248,2019-02-23 15:00:00
4,4,A002,R051,02-00-00,59 ST,NQR456W,BMT,02/23/2019,19:00:00,REGULAR,6956004,2359292,2019-02-23 19:00:00


In [22]:
mta.drop(columns = ['time'], inplace = True)

## filter to previously specified dates

In [17]:
mta.date.value_counts().sort_index()

02/23/2019    28827
02/24/2019    29109
02/25/2019    28842
02/26/2019    29777
02/27/2019    28903
              ...  
06/03/2019    30022
06/04/2019    29513
06/05/2019    29484
06/06/2019    29213
06/07/2019    28745
Name: date, Length: 105, dtype: int64

In [18]:
mta.date = pd.to_datetime(mta.date,format="%m/%d/%Y")

In [19]:
#keep rows from start_date to end_date
mta = mta[(mta.date >= start_date)]
mta = mta[(mta.date <= end_date)]

In [21]:
#check the above filtering
mta.date.value_counts().sort_index()

2019-03-01    28691
2019-03-02    29234
2019-03-03    28836
2019-03-04    29067
2019-03-05    29234
              ...  
2019-05-28    28873
2019-05-29    29095
2019-05-30    29190
2019-05-31    29651
2019-06-01    29066
Name: date, Length: 93, dtype: int64

In [23]:
mta.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2711716 entries, 36 to 3062131
Data columns (total 12 columns):
unnamed: 0    int64
c_a           object
unit          object
scp           object
station       object
linename      object
division      object
date          datetime64[ns]
desc          object
entries       int64
exits         int64
datetime      datetime64[ns]
dtypes: datetime64[ns](2), int64(3), object(7)
memory usage: 269.0+ MB


In [24]:
mta.head(10)

Unnamed: 0,unnamed: 0,c_a,unit,scp,station,linename,division,date,desc,entries,exits,datetime
36,36,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-01,REGULAR,6962094,2361672,2019-03-01 03:00:00
37,37,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-01,REGULAR,6962111,2361714,2019-03-01 07:00:00
38,38,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-01,REGULAR,6962243,2362017,2019-03-01 11:00:00
39,39,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-01,REGULAR,6962502,2362082,2019-03-01 15:00:00
40,40,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-01,REGULAR,6963303,2362149,2019-03-01 19:00:00
41,41,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-01,REGULAR,6963578,2362196,2019-03-01 23:00:00
78,78,A002,R051,02-00-01,59 ST,NQR456W,BMT,2019-03-01,REGULAR,6209663,1392692,2019-03-01 03:00:00
79,79,A002,R051,02-00-01,59 ST,NQR456W,BMT,2019-03-01,REGULAR,6209673,1392711,2019-03-01 07:00:00
80,80,A002,R051,02-00-01,59 ST,NQR456W,BMT,2019-03-01,REGULAR,6209797,1392838,2019-03-01 11:00:00
81,81,A002,R051,02-00-01,59 ST,NQR456W,BMT,2019-03-01,REGULAR,6209986,1392887,2019-03-01 15:00:00


## duplicates

In [25]:
#check for duplicates
(mta
 .groupby(["c_a", "unit", "scp", "station", "datetime"])
 .entries.count()
 .reset_index()
 .sort_values("entries", ascending=False)).head(50)

Unnamed: 0,c_a,unit,scp,station,datetime,entries
716178,N071,R013,00-06-00,34 ST-PENN STA,2019-04-20 08:00:00,2
2215449,R283,R221,00-00-01,167 ST,2019-04-16 04:00:00,2
1325112,N519,R461,00-03-01,B'WAY-LAFAYETTE,2019-03-26 13:00:00,2
1325111,N519,R461,00-03-01,B'WAY-LAFAYETTE,2019-03-26 09:00:00,2
535277,JFK03,R536,00-00-02,JFK JAMAICA CT1,2019-04-07 01:00:00,2
637616,N045,R187,01-00-00,81 ST-MUSEUM,2019-06-01 17:00:00,2
1325110,N519,R461,00-03-01,B'WAY-LAFAYETTE,2019-03-26 05:00:00,2
1325108,N519,R461,00-03-01,B'WAY-LAFAYETTE,2019-03-25 21:00:00,2
1325107,N519,R461,00-03-01,B'WAY-LAFAYETTE,2019-03-25 17:00:00,2
895018,N135,R385,01-03-01,ROCKAWAY BLVD,2019-03-31 13:00:00,2


In [26]:
# look at a specific duplicate
# mask = ((mta["c_a"] == "G009") & 
#     (mta["unit"] == "R151") & 
#     (mta["scp"] == "02-00-04") & 
#     (mta["station"] == "CONEY IS-STILLW") &
#     (mta["datetime"] == "2019-05-16 17:00:00"))
# mta[mask].head(10)

In [27]:
#drop one row from each pair of duplicate rows
mta.sort_values(["c_a", "unit", "scp", "station", "datetime"], inplace=True, ascending=False)
mta.drop_duplicates(subset=["c_a", "unit", "scp", "station", "datetime"], inplace=True)

In [None]:
#check the dropping
(mta
 .groupby(["c_a", "unit", "scp", "station", "datetime"])
 .entries.count()
 .reset_index()
 .sort_values("entries", ascending=False)).head(50)

In [29]:
mta.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2711648 entries, 3062131 to 36
Data columns (total 12 columns):
unnamed: 0    int64
c_a           object
unit          object
scp           object
station       object
linename      object
division      object
date          datetime64[ns]
desc          object
entries       int64
exits         int64
datetime      datetime64[ns]
dtypes: datetime64[ns](2), int64(3), object(7)
memory usage: 268.9+ MB


# Subway Locations

In [5]:
#read in data that has location information about the stations
locations = pd.read_csv('http://web.mta.info/developers/data/nyct/subway/Stations.csv')

In [6]:
stored_for_errors = locations.copy()

In [7]:
locations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 496 entries, 0 to 495
Data columns (total 13 columns):
Station ID               496 non-null int64
Complex ID               496 non-null int64
GTFS Stop ID             496 non-null object
Division                 496 non-null object
Line                     496 non-null object
Stop Name                496 non-null object
Borough                  496 non-null object
Daytime Routes           496 non-null object
Structure                496 non-null object
GTFS Latitude            496 non-null float64
GTFS Longitude           496 non-null float64
North Direction Label    478 non-null object
South Direction Label    481 non-null object
dtypes: float64(2), int64(2), object(9)
memory usage: 50.5+ KB


In [8]:
#cleans up column names
#takes leading and trailing spaces out of column names, makes them all lowercase, 
#and replaces spaces between words and "/" with "_"
locations.columns = locations.columns.str.strip().str.lower().str.replace('/',"_").str.replace(' ', '_')

In [9]:
#the locations.stop_name column corresponds to the mta.station column
#want them to be similarly formatted prior to merging
locations['stop_name'] = locations.stop_name.str.upper().str.strip()
locations

Unnamed: 0,station_id,complex_id,gtfs_stop_id,division,line,stop_name,borough,daytime_routes,structure,gtfs_latitude,gtfs_longitude,north_direction_label,south_direction_label
0,1,1,R01,BMT,Astoria,ASTORIA - DITMARS BLVD,Q,N W,Elevated,40.775036,-73.912034,,Manhattan
1,2,2,R03,BMT,Astoria,ASTORIA BLVD,Q,N W,Elevated,40.770258,-73.917843,Ditmars Blvd,Manhattan
2,3,3,R04,BMT,Astoria,30 AV,Q,N W,Elevated,40.766779,-73.921479,Astoria - Ditmars Blvd,Manhattan
3,4,4,R05,BMT,Astoria,BROADWAY,Q,N W,Elevated,40.761820,-73.925508,Astoria - Ditmars Blvd,Manhattan
4,5,5,R06,BMT,Astoria,36 AV,Q,N W,Elevated,40.756804,-73.929575,Astoria - Ditmars Blvd,Manhattan
...,...,...,...,...,...,...,...,...,...,...,...,...,...
491,517,517,S15,SIR,Staten Island,PRINCE'S BAY,SI,SIR,Open Cut,40.525507,-74.200064,St George,Tottenville
492,518,518,S14,SIR,Staten Island,PLEASANT PLAINS,SI,SIR,Embankment,40.522410,-74.217847,St George,Tottenville
493,519,519,S13,SIR,Staten Island,RICHMOND VALLEY,SI,SIR,Open Cut,40.519631,-74.229141,St George,Tottenville
494,522,522,S09,SIR,Staten Island,TOTTENVILLE,SI,SIR,At Grade,40.512764,-74.251961,St George,


In [239]:
sorted(mta.station.unique())

['1 AV',
 '103 ST',
 '103 ST-CORONA',
 '104 ST',
 '110 ST',
 '111 ST',
 '116 ST',
 '116 ST-COLUMBIA',
 '121 ST',
 '125 ST',
 '135 ST',
 '137 ST CITY COL',
 '138/GRAND CONC',
 '14 ST',
 '14 ST-UNION SQ',
 '145 ST',
 '149/GRAND CONC',
 '14TH STREET',
 '15 ST-PROSPECT',
 '155 ST',
 '157 ST',
 '161/YANKEE STAD',
 '163 ST-AMSTERDM',
 '167 ST',
 '168 ST',
 '169 ST',
 '170 ST',
 '174 ST',
 '174-175 STS',
 '175 ST',
 '176 ST',
 '18 AV',
 '18 ST',
 '181 ST',
 '182-183 STS',
 '183 ST',
 '190 ST',
 '191 ST',
 '2 AV',
 '20 AV',
 '207 ST',
 '21 ST',
 '21 ST-QNSBRIDGE',
 '215 ST',
 '219 ST',
 '225 ST',
 '23 ST',
 '231 ST',
 '233 ST',
 '238 ST',
 '25 AV',
 '25 ST',
 '28 ST',
 '3 AV',
 '3 AV 138 ST',
 '3 AV-149 ST',
 '30 AV',
 '33 ST',
 '33 ST-RAWSON ST',
 '34 ST-HERALD SQ',
 '34 ST-HUDSON YD',
 '34 ST-PENN STA',
 '36 AV',
 '36 ST',
 '39 AV',
 '4 AV-9 ST',
 '40 ST LOWERY ST',
 '42 ST-BRYANT PK',
 '42 ST-PORT AUTH',
 '45 ST',
 '46 ST',
 '46 ST BLISS ST',
 '47-50 STS ROCK',
 '49 ST',
 '5 AV/53 ST',
 '5 

In [240]:
sorted(locations.stop_name.unique())

In [241]:
#cleaned up unique values that were similar in the mta df
mta['station'].replace({"4AV-9 ST":"4 AV-9 ST",
                       'TWENTY THIRD ST':'23 ST',
                       'THIRTY THIRD ST':'33 ST'},
                                        inplace = True)

In [242]:
#replace the string naming of location.stop_name to match that of mta.station (patterns)
locations['stop_name'].replace({" - ":"-",
                                "CENTER":"CTR",
                                "SQ-E TREMONT AV":"SQ",
                                " UNIVERSITY":"",
                                "PLAZA":"PZ",
                                "COLLEGE":"COL",
                                "STATION":"STA",
                                     },
                                         regex=True, inplace=True)

In [247]:
#replace the string naming of location.stop_name to match that of mta.station (specific stations)
locations['stop_name'].replace({"103 ST-CORONA PZ":"103 ST-CORONA",
                                "137 ST-CITY COL":"137 ST CITY COL",
                                "138 ST-GRAND CONCOURSE":"138/GRAND CONC",
                                "149 ST-GRAND CONCOURSE":"149/GRAND CONC",
                                "15 ST-PROSPECT PK":"15 ST-PROSPECT",
                                "161 ST-YANKEE STADIUM":"161/YANKEE STAD",
                                "163 ST-AMSTERDAM AV":"163 ST-AMSTERDM",
                                "21 ST-QUEENSBRIDGE":"21 ST-QNSBRIDGE",
                                "3 AV-138 ST":"3 AV 138 ST",
                                "40 ST":"40 ST LOWERY ST",
                                "42 ST-PORT AUTHORITY BUS TERMINAL":"42 ST-PORT AUTH",
                                "5 AV":"5 AVE",
                                "59 ST-COLUMBUS CIRCLE":"59 ST COLUMBUS",
                                "66 ST-LINCOLN CTR":"66 ST-LINCOLN",
                                "68 ST-HUNTER COL":"68ST-HUNTER CO",
                                "75 ST":"75 ST-ELDERTS",
                                "81 ST-MUSEUM OF NATURAL HISTORY":"81 ST-MUSEUM",
                                "82 ST-JACKSON HTS":"82 ST-JACKSON H",
                                "85 ST-FOREST PKWY":"85 ST-FOREST PK",
                                "90 ST-ELMHURST AV":"90 ST-ELMHURST",
                                "9 ST":"9TH STREET",
                                "AQUEDUCT-N CONDUIT AV":"AQUEDUCT N.COND",
                                "AQUEDUCT RACETRACK":"AQUEDUCT RACETR",
                                "ASTORIA-DITMARS BLVD":"ASTORIA DITMARS",
                                 'ATLANTIC AV-BARCLAYS CTR':'ATL AV-BARCLAY',
                                 'BEDFORD-NOSTRAND AVS':'BEDFORD-NOSTRAN',
                                 'BEVERLEY RD':'BEVERLEY ROAD',
                                 'BRIARWOOD-VAN WYCK BLVD':'BRIARWOOD',
                                 'BROADWAY-LAFAYETTE ST':"B'WAY-LAFAYETTE",
                                 '15 ST-PROSPECT PARK':'15 ST-PROSPECT',
                                 '47-50 STS CTR':'47-50 STS ROCK',
                                 'BEDFORD PARK BLVD':'BEDFORD PK BLVD',
                                 'BROOKLYN BRIDGE-CITY HALL':'BROOKLYN BRIDGE',
                                 'BUSHWICK AV-ABERDEEN ST':'BUSHWICK AV',
                                 'CANARSIE-ROCKAWAY PKWY':'CANARSIE-ROCKAW',
                                 'CENTRAL PARK NORTH (110 ST)':'CENTRAL PK N110',
                                 'CHRISTOPHER ST-SHERIDAN SQ':'CHRISTOPHER ST',
                                 'CLINTON-WASHINGTON AVS':'CLINTON-WASH AV',
                                 'CONEY ISLAND-STILLWELL AV':'CONEY IS-STILLW',
                                 'COURT ST':'COURT SQ-23 ST',
                                 'CROWN HTS-UTICA AV':'CROWN HTS-UTICA',
                                 'DELANCEY ST':'CROWN HTS-UTICA',
                                 'E 105 ST':'EAST 105 ST',
                                 "E 143 ST-ST MARY'S ST":"E 143/ST MARY'S",
                                 'EASTCHESTER-DYRE AV':'EASTCHSTER/DYRE',
                                 'EASTERN PKWY-BROOKLYN MUSEUM':'EASTN PKWY-MUSM',
                                 'FAR ROCKAWAY-MOTT AV': 'FAR ROCKAWAY',
                                 'FLATBUSH AV-BROOKLYN COL':'FLATBUSH AV-B.C',
                                 'FLUSHING-MAIN ST':'FLUSHING-MAIN',
                                 'FOREST AV':'FOREST AVE',
                                 'FOREST HILLS-71 AV':'FOREST HILLS 71',
                                 'FORT HAMILTON PKWY': 'FT HAMILTON PKY',
                                 'GRAND ARMY PZ':'GRAND ARMY PLAZ',
                                 'GRAND AV-NEWTOWN':'GRAND-NEWTOWN',
                                 'GRAND CENTRAL-42 ST':'GRD CNTRL-42 ST',
                                 'HARLEM-148 ST':'HARLEM 148 ST',
                                 'HOWARD BEACH-JFK AIRPORT':'HOWARD BCH JFK',
                                 'HOYT-SCHERMERHORN STS':'HOYT-SCHER',
                                 'HUNTERS POINT AV':'HUNTERS PT AV',
                                 'JAMAICA CTR-PARSONS/ARCHER':'JAMAICA CENTER',
                                 'JAMAICA-179 ST':'JAMAICA 179 ST',
                                 'JAMAICA-VAN WYCK':'JAMAICA VAN WK',
                                 'JAY ST-METROTECH':'JAY ST-METROTEC',
                                 'KEW GARDENS-UNION TPKE':'KEW GARDENS',
                                 'KINGSTON-THROOP AVS':'KINGSTON-THROOP',
                                 'KNICKERBOCKER AV':'KNICKERBOCKER',
                                 'LEXINGTON AV/53 ST':'LEXINGTON AV/53',
                                 'LEXINGTON AV/63 ST':'LEXINGTON AV/63',
                                 'MARBLE HILL-225 ST':'MARBLE HILL-225',
                                 'METS-WILLETS POINT':'METS-WILLETS PT',
                                 'MORRISON AV- SOUND VIEW':'MORISN AV/SNDVW',
                                 'MYRTLE-WILLOUGHBY AVS': 'MYRTLE-WILLOUGH',
                                 'MYRTLE-WYCKOFF AVS':'MYRTLE-WYCKOFF',
                                 'NORWOOD-205 ST': 'NORWOOD 205 ST',
                                 'OZONE PARK-LEFFERTS BLVD':'OZONE PK LEFFRT',
                                 'PARK PL': 'PARK PLACE',
                                 'QUEENS PZ':'QUEENS PLAZA',
                                 'ROCKAWAY PARK-BEACH 116 ST':'ROCKAWAY PARK B',
                                 'ROOSEVELT ISLAND':'ROOSEVELT ISLND',
                                 'SENECA AV':'SENECA AVE',
                                 'SMITH-9 STS':'SMITH-9 ST',
                                 'ST GEORGE':'ST. GEORGE',
                                 'VAN CORTLANDT PARK-242 ST':'V.CORTLANDT PK',
                                 'VERNON BLVD-JACKSON AV':'VERNON-JACKSON',
                                 'W 4 ST':'W 4 ST-WASH SQ',
                                 'W 8 ST-NY AQUARIUM':'W 8 ST-AQUARIUM',
                                 'WAKEFIELD-241 ST':'WAKEFIELD/241',
                                 'WTC CORTLANDT':'WTC-CORTLANDT',
                                 '4 AV':'4 AV-9 ST',
                                 'ESSEX ST':'DELANCEY/ESSEX',
                                 'JACKSON HTS-ROOSEVELT AV':'JKSN HT-ROOSVLT',
                                 'NEWKIRK PZ':'NEWKIRK PLAZA',
                                 'QUEENSBORO PZ':'QUEENSBORO PLZ',
                                 'SUTPHIN BLVD-ARCHER AV-JFK AIRPORT':'SUTPHIN-ARCHER',
                                 'SUTTER AV-RUTLAND RD':'SUTTER AV-RUTLD',
                                 'UNION SQ-14 ST':'14TH STREET',
                                 'WHITEHALL ST':'WHITEHALL S-FRY',
                                 'WOODSIDE-61 ST':'61 ST WOODSIDE',
                                '34 ST-11 AV':'34 ST-HUDSON YD',
                                'JAMAICA CTR':'JAMAICA CENTER',
                                '47-50 STS-ROCKEFELLER CTR':'47-50 STS ROCK',
                                'WEST FARMS SQ-E TREMONT AV':'WEST FARMS SQ',
                                 'WESTCHESTER SQ-E TREMONT AV':'WESTCHESTER SQ'
                               },
                                        inplace = True)

In [248]:
#check for naming differences in the values of mta.station and locations.stop_name

# cols_in_both = [val for val in mta.station if val in locations.stop_name]
# cols_in_only_mta = [val for val in mta.station.unique() if val not in locations.stop_name.unique()]
# cols_in_only_locations = [val for val in locations.stop_name.unique() if val not in mta.station.unique()]

In [249]:
#look values unique to mta.station and compare with values unique to locations.stop_name below
# sorted(cols_in_only_mta)

['33 ST-RAWSON ST',
 '46 ST BLISS ST',
 '72 ST-2 AVE',
 '86 ST-2 AVE',
 '96 ST-2 AVE',
 'CITY / BUS',
 'EXCHANGE PLACE',
 'GROVE STREET',
 'HARRISON',
 'JFK JAMAICA CT1',
 'JOURNAL SQUARE',
 'LACKAWANNA',
 'NEW LOTS',
 'NEWARK BM BW',
 'NEWARK C',
 'NEWARK HM HE',
 'NEWARK HW BMEBE',
 'ORCHARD BEACH',
 'PATH NEW WTC',
 'PATH WTC 2',
 'PAVONIA/NEWPORT',
 'RIT-MANHATTAN',
 'RIT-ROOSEVELT',
 'THIRTY ST',
 'VAN SICLEN AVE']

In [250]:
# sorted(cols_in_only_locations)

['168 ST-WASHINGTON HTS',
 '62 ST',
 'ANNADALE',
 'ARTHUR KILL',
 'BAY TERRACE',
 'BEDFORD PARK BLVD-LEHMAN COL',
 'CATHEDRAL PKWY (110 ST)',
 'CLIFTON',
 'DONGAN HILLS',
 'ELTINGVILLE',
 'GRANT CITY',
 'GRASMERE',
 'GREAT KILLS',
 'HUGUENOT',
 'JEFFERSON AV',
 'LEXINGTON AV/59 ST',
 'MIDDLE VILLAGE-METROPOLITAN AV',
 'NEW DORP',
 'OAKWOOD HEIGHTS',
 'OLD TOWN',
 'PLEASANT PLAINS',
 "PRINCE'S BAY",
 'RICHMOND VALLEY',
 'STAPLETON',
 'TOTTENVILLE']

In [271]:
#merge mta and locations on station/stop_name and division 
mta_locations = mta.merge(locations, left_on=['station, division'], right_on='stop_name, division', suffixes=('_left', '_right'))

KeyError: 'stop_name, division'

In [263]:
mta_locations.head()

Unnamed: 0,unnamed: 0,c_a,unit,scp,station,linename,division_left,date,desc,entries,...,division_right,line,stop_name,borough,daytime_routes,structure,gtfs_latitude,gtfs_longitude,north_direction_label,south_direction_label
0,204606,S102,R165,00-05-01,TOMPKINSVILLE,1,SRT,2019-06-01,REGULAR,369,...,SIR,Staten Island,TOMPKINSVILLE,SI,SIR,At Grade,40.636949,-74.074835,St George,Tottenville
1,204605,S102,R165,00-05-01,TOMPKINSVILLE,1,SRT,2019-06-01,REGULAR,369,...,SIR,Staten Island,TOMPKINSVILLE,SI,SIR,At Grade,40.636949,-74.074835,St George,Tottenville
2,204604,S102,R165,00-05-01,TOMPKINSVILLE,1,SRT,2019-06-01,REGULAR,369,...,SIR,Staten Island,TOMPKINSVILLE,SI,SIR,At Grade,40.636949,-74.074835,St George,Tottenville
3,204603,S102,R165,00-05-01,TOMPKINSVILLE,1,SRT,2019-06-01,REGULAR,369,...,SIR,Staten Island,TOMPKINSVILLE,SI,SIR,At Grade,40.636949,-74.074835,St George,Tottenville
4,204602,S102,R165,00-05-01,TOMPKINSVILLE,1,SRT,2019-06-01,REGULAR,369,...,SIR,Staten Island,TOMPKINSVILLE,SI,SIR,At Grade,40.636949,-74.074835,St George,Tottenville


In [264]:
mta_locations.borough.value_counts()

M     2651942
Bk    1207452
Q      506453
Bx     319984
SI      18782
Name: borough, dtype: int64

In [265]:
mta_locations

Unnamed: 0,unnamed: 0,c_a,unit,scp,station,linename,division_left,date,desc,entries,...,division_right,line,stop_name,borough,daytime_routes,structure,gtfs_latitude,gtfs_longitude,north_direction_label,south_direction_label
0,204606,S102,R165,00-05-01,TOMPKINSVILLE,1,SRT,2019-06-01,REGULAR,369,...,SIR,Staten Island,TOMPKINSVILLE,SI,SIR,At Grade,40.636949,-74.074835,St George,Tottenville
1,204605,S102,R165,00-05-01,TOMPKINSVILLE,1,SRT,2019-06-01,REGULAR,369,...,SIR,Staten Island,TOMPKINSVILLE,SI,SIR,At Grade,40.636949,-74.074835,St George,Tottenville
2,204604,S102,R165,00-05-01,TOMPKINSVILLE,1,SRT,2019-06-01,REGULAR,369,...,SIR,Staten Island,TOMPKINSVILLE,SI,SIR,At Grade,40.636949,-74.074835,St George,Tottenville
3,204603,S102,R165,00-05-01,TOMPKINSVILLE,1,SRT,2019-06-01,REGULAR,369,...,SIR,Staten Island,TOMPKINSVILLE,SI,SIR,At Grade,40.636949,-74.074835,St George,Tottenville
4,204602,S102,R165,00-05-01,TOMPKINSVILLE,1,SRT,2019-06-01,REGULAR,369,...,SIR,Staten Island,TOMPKINSVILLE,SI,SIR,At Grade,40.636949,-74.074835,St George,Tottenville
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4704608,544,A006,R079,00-00-00,5 AV/59 ST,NQRW,BMT,2019-03-01,REGULAR,3757391,...,BMT,Astoria,5 AV/59 ST,M,N W R,Subway,40.764811,-73.973347,Queens,Downtown & Brooklyn
4704609,543,A006,R079,00-00-00,5 AV/59 ST,NQRW,BMT,2019-03-01,REGULAR,3756933,...,BMT,Astoria,5 AV/59 ST,M,N W R,Subway,40.764811,-73.973347,Queens,Downtown & Brooklyn
4704610,542,A006,R079,00-00-00,5 AV/59 ST,NQRW,BMT,2019-03-01,REGULAR,3756798,...,BMT,Astoria,5 AV/59 ST,M,N W R,Subway,40.764811,-73.973347,Queens,Downtown & Brooklyn
4704611,541,A006,R079,00-00-00,5 AV/59 ST,NQRW,BMT,2019-03-01,REGULAR,3756762,...,BMT,Astoria,5 AV/59 ST,M,N W R,Subway,40.764811,-73.973347,Queens,Downtown & Brooklyn


# daily entries by turnstile

In [None]:
mta.head(10)

In [None]:
by_turnstile_date = mta.groupby(['c_a', 'unit', 'scp', 'station', mta.datetime.dt.date])

In [None]:
mta_daily = by_turnstile_date['entries'].min().reset_index()

In [None]:
mta_daily.describe([.95,.997])

In [None]:
mta_daily['daily_entries'] = mta_daily.groupby(['c_a', 'unit', 'scp', 'station'])['entries'].diff().shift(-1)

In [None]:
#dropping rows with negative daily entries
mta_daily = mta_daily[mta_daily.daily_entries >= 0]

In [None]:
#dropping rows with the top .3% of daily entries
mta_daily = mta_daily[mta_daily.daily_entries < mta_daily.daily_entries.quantile(q =.997)]

In [None]:
mta_daily

In [None]:
mta_daily.describe()

# Determining Which Stations Have The Highest Traffic

In [None]:
mta_daily['datetime'] = pd.to_datetime(mta_daily.datetime,format="%Y-%m-%d")

In [None]:
grouped_by_station_and_week = mta_daily.groupby(['station',mta_daily.datetime.dt.week])['daily_entries'].agg([np.sum]).reset_index()
grouped_by_station_and_week = grouped_by_station_and_week.rename(columns={'datetime': 'week', 'sum':'weekly_entries'})

In [None]:
plt.figure(figsize=(16,4))

unique_stations = grouped_by_station_and_week.station.unique()
for sta in unique_stations:
    single_station = grouped_by_station_and_week[grouped_by_station_and_week.station == sta].groupby(["week"])['weekly_entries'].agg(['max'])
    plt.plot(single_station[1:]);

In [None]:
grouped_by_station_and_day = mta_daily.groupby(['station', mta_daily.datetime.dt.date])['daily_entries'].agg([np.sum])
grouped_by_station_and_day = grouped_by_station_and_day.reset_index().rename(columns={'sum': 'daily_entries', 'datetime': 'date'})

plt.figure(figsize=(16,4))

for sta in unique_stations:
    single_station = grouped_by_station_and_day[grouped_by_station_and_day.station == sta].groupby(["date"])['daily_entries'].agg(['max'])
    plt.plot(single_station[single_station.index >= pd.to_datetime('2019-05-01').date()]);

In [None]:
grp_sta_wk_mean = grouped_by_station_and_week.groupby('station')['weekly_entries'].agg(['mean'])
grp_sta_wk_mean = grp_sta_wk_mean.sort_values(by='mean', ascending=False)[:10]
grp_sta_wk_mean

In [None]:
grp_sta_day_mean = grouped_by_station_and_day.groupby('station')['daily_entries'].agg(['mean'])
grp_sta_day_mean = grp_sta_day_mean.sort_values(by='mean', ascending=False)[:10]
grp_sta_day_mean

In [None]:
top_ten_stations = grp_sta_wk_mean.index.unique()

plt.figure(figsize=(12,4))

for sta in top_ten_stations:
    single_station = grouped_by_station_and_week[grouped_by_station_and_week.station == sta]#.groupby(["week"])['weekly_entries'].agg(['max'])
    time_plot = sns.lineplot(x=single_station.week[1:-1], y = single_station.weekly_entries[1:-1], label=sta);
    #plt.plot(single_station[1:-1], label=sta)
    
plt.legend(loc=1);
time_plot.legend(loc=1, fontsize='10');
time_plot.set_title('High Traffic Stations By Weekly Entries', color=brand_blue)
time_plot.set_ylabel('Weekly Entries', color=brand_blue)
time_plot.set_xlabel('Week of the Year', color=brand_blue);
sf=ScalarFormatter()
sf.set_scientific(True)
time_plot.yaxis.set_major_formatter(sf)
sns.despine()
sns.set_style('white')
sns.set_palette("Set2");

In [None]:
plt.figure(figsize=(12,4))
brand_blue = '#042263FF'
for sta in top_ten_stations:
    single_station = grouped_by_station_and_day[grouped_by_station_and_day.station == sta]#.groupby(["date"])['daily_entries'].agg(['max'])
    ss_boundary = single_station[single_station.date >= pd.to_datetime('2019-05-01').date()]
    time_plot = sns.lineplot(x=ss_boundary.date, y = ss_boundary.daily_entries, label=sta);
    
time_plot.legend(loc=1, fontsize='10');
time_plot.set_title('High Traffic Stations By Daily Entries', color=brand_blue)
time_plot.set_ylabel('Daily Entries', color=brand_blue)
time_plot.set_xlabel('Date', color=brand_blue);
sns.despine()
sns.set_style('white')
sns.set_palette("Set2");

**NEED TO SET Y AXIS ON ABOVE TO SCIENTIFIC**