# Problem Statement:

Where should WomenTechWomenYes (WTWY) locate their street teams to collect signatures and promote a gala on October 1st?

We propose to use the most recent week of MTA turnstile data to locate subway stations with the greatest flows of people.  We will analyze this data to identify and map the best locations and times to dispatch a finite number of street teams.

More specifically, stated as an engineering problem: using the last four weeks of MTA turnstile data, we should be able to identify and plot the subway stations in New York with the highest flow of people for every 4-hour window in the week.

### Data Format
STATION | DAY | TIME | PEOPLE

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import collections
import numpy as np
import datetime
%matplotlib inline

In [2]:
#import from my challenge 9, for all data

# Read the four weekly turnstile exports, then stack them into a single frame
df1, df2, df3, df4 = [
    pd.read_csv(weekly_file)
    for weekly_file in (
        'turnstile_180901.txt',
        'turnstile_180908.txt',
        'turnstile_180915.txt',
        'turnstile_180922.txt',
    )
]

df_all = pd.concat([df1, df2, df3, df4])

#df1.info()
#df_all.info()


In [3]:
# Only the latest week (df4) is analysed for now; point df at df_all to use all of September
df = df4

# Strip surrounding whitespace from every header (the EXITS header carries trailing spaces)
df.columns = list(map(str.strip, df.columns))

# Combine the DATE and TIME text columns into one parsed timestamp column
df["DATE_TIME"] = pd.to_datetime(
    df["DATE"] + " " + df["TIME"],
    format="%m/%d/%Y %H:%M:%S",
)
df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATE_TIME
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,09/15/2018,00:00:00,REGULAR,6759219,2291425,2018-09-15 00:00:00
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,09/15/2018,04:00:00,REGULAR,6759234,2291429,2018-09-15 04:00:00
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,09/15/2018,08:00:00,REGULAR,6759251,2291453,2018-09-15 08:00:00
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,09/15/2018,12:00:00,REGULAR,6759330,2291532,2018-09-15 12:00:00
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,09/15/2018,16:00:00,REGULAR,6759538,2291574,2018-09-15 16:00:00


In [4]:
import collections
#THIS CODE SHOWS THAT MANY OF THE TURNSTILE COUNT REPORTS WERE RECORDED AT DIFFERENT TIMES (NOT 00:00:00)
#counted_times =  collections.Counter(df['TIME'])
#ct = []
#for k,v in counted_times.items() :
#       ct.append((k, v))
        
#ct[0:100]

#~1/2 the turnstiles are out of sync and need some time rounding. 
#from StackOverFlow

hr_4 = 4*60*60  # length of one reporting window, in seconds


def roundTime(ts, roundTo=hr_4):
    """
    Round a timestamp to the nearest multiple of ``roundTo`` seconds within its day.

    Parameters
    ----------
    ts : pandas.Timestamp, datetime.datetime, or str
        The moment to round. Objects exposing ``to_pydatetime()`` (such as the
        Timestamp objects produced by pd.to_datetime()) are converted with it,
        datetime objects are used as they are, and anything else is parsed from
        its string form using the '%Y-%m-%d %H:%M:%S' layout.
    roundTo : int, optional
        Closest number of seconds to round to. Defaults to ``hr_4`` (4 hours).

    Returns
    -------
    datetime.datetime
        ``ts`` moved to the closest ``roundTo`` boundary, counted from midnight.
        A value exactly halfway between two boundaries is rounded up, and any
        microseconds are discarded.

    Raises
    ------
    ValueError
        If ``ts`` has to be parsed from text and does not match
        '%Y-%m-%d %H:%M:%S'.

    Orig. Author: Thierry Husson 2012 - Use it as you want but don't blame me. => modified by TR

    Examples
    --------
    roundTime(datetime.datetime(2012,12,31,23,44,59,1234), roundTo=60*60)
    2013-01-01 00:00:00
    """
    if hasattr(ts, "to_pydatetime"):
        # Convert to a plain datetime so the return type does not depend on the input type.
        dt = ts.to_pydatetime()
    elif isinstance(ts, datetime.datetime):
        # Used directly: a str()/strptime round-trip fails when microseconds are present.
        dt = ts
    else:
        dt = datetime.datetime.strptime(str(ts), '%Y-%m-%d %H:%M:%S')

    # .seconds is only the time-of-day part of the difference, i.e. seconds since midnight.
    seconds = (dt.replace(tzinfo=None) - dt.min).seconds
    # Adding half a window before the floor division turns it into round-to-nearest.
    rounding = (seconds + roundTo / 2) // roundTo * roundTo
    return dt + datetime.timedelta(0, rounding - seconds, -dt.microsecond)

#IMPORTANT SANITY CHECKS
#str(df['DATE_TIME'][0])
#datetime.datetime.strptime(dt, '%Y-%m-%d %H:%M:%S')
#.strptime(date_str, format_str)
#datetime.datetime.strptime('2018-08-25 12:00:00', '%Y-%m-%d %H:%M:%S')
#roundTime(datetime.datetime(2012,12,31,23,44,59))
#roundTime(datetime.datetime(2012,12,31,0,0,0,0))
#roundTime(datetime.datetime(2012,12,31,0,23,0,0))
#roundTime(datetime.datetime(2012,12,31,1,23,0,0))

#type(df['DATE_TIME'][0])
#df['DATE_TIME'][0]  =>  Timestamp('2018-08-25 00:00:00')
#roundTime(df['DATE_TIME'][0])




In [5]:
# Preview df again; roundTime has only been defined so far, so there is no rounded column yet
df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATE_TIME
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,09/15/2018,00:00:00,REGULAR,6759219,2291425,2018-09-15 00:00:00
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,09/15/2018,04:00:00,REGULAR,6759234,2291429,2018-09-15 04:00:00
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,09/15/2018,08:00:00,REGULAR,6759251,2291453,2018-09-15 08:00:00
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,09/15/2018,12:00:00,REGULAR,6759330,2291532,2018-09-15 12:00:00
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,09/15/2018,16:00:00,REGULAR,6759538,2291574,2018-09-15 16:00:00


In [6]:
# Snap every reading's timestamp to its nearest 4-hour boundary, one row at a time via roundTime
df['DATE_TIME_R'] = df['DATE_TIME'].apply(roundTime) 


In [7]:
# Sanity check that roundTime works: count the readings that landed on each rounded timestamp

df['DATE_TIME_R'].value_counts()

2018-09-18 12:00:00    5373
2018-09-19 08:00:00    5246
2018-09-18 08:00:00    5004
2018-09-16 12:00:00    4956
2018-09-20 08:00:00    4906
2018-09-18 16:00:00    4879
2018-09-19 12:00:00    4748
2018-09-15 20:00:00    4718
2018-09-16 20:00:00    4716
2018-09-18 00:00:00    4708
2018-09-17 04:00:00    4702
2018-09-21 12:00:00    4701
2018-09-21 16:00:00    4697
2018-09-17 00:00:00    4695
2018-09-15 04:00:00    4693
2018-09-19 20:00:00    4691
2018-09-19 16:00:00    4691
2018-09-15 08:00:00    4690
2018-09-15 12:00:00    4688
2018-09-16 00:00:00    4687
2018-09-20 16:00:00    4687
2018-09-15 16:00:00    4685
2018-09-18 04:00:00    4685
2018-09-16 04:00:00    4684
2018-09-18 20:00:00    4684
2018-09-21 08:00:00    4683
2018-09-19 04:00:00    4682
2018-09-19 00:00:00    4681
2018-09-20 04:00:00    4680
2018-09-20 12:00:00    4679
2018-09-20 20:00:00    4679
2018-09-20 00:00:00    4678
2018-09-17 20:00:00    4677
2018-09-16 08:00:00    4675
2018-09-17 08:00:00    4673
2018-09-16 16:00:00 

In [8]:
#df['DATE_TIME_S']=str(df['DATE_TIME_R'])
#df.head()

#a = df['DATE_TIME_R'][0]

In [25]:
# One row per turnstile and rounded window, keeping the first counter values seen in that window
df_window = (
    df
    .groupby(["C/A", "UNIT", "SCP", "STATION", "DATE_TIME_R"])[["ENTRIES", "EXITS"]]
    .first()
    .reset_index()
)
df_window.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE_TIME_R,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,2018-09-15 00:00:00,6759219,2291425
1,A002,R051,02-00-00,59 ST,2018-09-15 04:00:00,6759234,2291429
2,A002,R051,02-00-00,59 ST,2018-09-15 08:00:00,6759251,2291453
3,A002,R051,02-00-00,59 ST,2018-09-15 12:00:00,6759330,2291532
4,A002,R051,02-00-00,59 ST,2018-09-15 16:00:00,6759538,2291574


In [29]:
# Pair every reading with the previous reading of the same turnstile.
# The columns are selected with a list: indexing a groupby with a bare tuple of
# names ("a", "b", "c") is deprecated and is rejected by pandas 2.x.
prev_readings = (df_window
                 .groupby(["C/A", "UNIT", "SCP", "STATION"])[["DATE_TIME_R", "ENTRIES", "EXITS"]]
                 .shift(1))

# Assign column by column so the result does not rely on multi-column
# setitem being able to create new columns.
df_window["PREV_DATE_TIME_R"] = prev_readings["DATE_TIME_R"]
df_window["PREV_ENTRIES"] = prev_readings["ENTRIES"]
df_window["PREV_EXITS"] = prev_readings["EXITS"]

# Drop the first reading of each turnstile: it has no previous reading to diff against
df_window.dropna(subset=["PREV_DATE_TIME_R"], axis=0, inplace=True)

In [30]:
def _net_count(row, current_key, previous_key, max_counter):
    """Shared logic for get_net_entries / get_net_exits.

    Returns the absolute change between row[current_key] and row[previous_key].
    When that change exceeds max_counter, both readings are printed and the
    smaller of the two readings is used instead; if that is still above
    max_counter, 0 is returned.
    """
    current, previous = row[current_key], row[previous_key]
    change = current - previous
    if change < 0:
        # The counter may be running backwards; use the magnitude
        change = -change
    if change > max_counter:
        print(current, previous)
        change = min(current, previous)
    # Final guard so an implausibly large count is never reported
    return 0 if change > max_counter else change


def get_net_entries(row, max_counter):
    """Entries counted between the previous reading and this one, capped by max_counter."""
    return _net_count(row, "ENTRIES", "PREV_ENTRIES", max_counter)


def get_net_exits(row, max_counter):
    """Exits counted between the previous reading and this one, capped by max_counter."""
    return _net_count(row, "EXITS", "PREV_EXITS", max_counter)

# Row-wise net counts; 1000000 is the max_counter handed to each helper
df_window["NET_ENTRIES"] = df_window.apply(get_net_entries, axis=1, args=(1000000,))
df_window["NET_EXITS"] = df_window.apply(get_net_exits, axis=1, args=(1000000,))

52 15832680.0
524317 3346426.0
85 1632566.0
1048831 85.0
10054 1137744.0
1138236 10054.0
10109 1138236.0
1139258 10109.0
10181 1139258.0
1139544 10181.0
10188 1139544.0
1139937 10188.0
10248 1139937.0
1140682 10248.0
10290 1140682.0
1140894 10290.0
10340 1140894.0
1143293 10340.0
10673 1143293.0
1144207 10673.0
10731 1144207.0
1144357 10731.0
10802 1144357.0
1146455 10802.0
11103 1146455.0
1147134 11103.0
11154 1147134.0
1147275 11154.0
11289 1147275.0
1149464 11289.0
11588 1149464.0
1150229 11588.0
11632 1150229.0
1150348 11632.0
11841 1150348.0
1152619 11841.0
12072 1152619.0
1153422 12072.0
12108 1153422.0
4 2794719.0
32375149 158391.0
78 3309042.0
83 3159733.0
17 7629078.0
35 4978691.0
1005042 4675.0
4732 1005042.0
1005640 4732.0
4758 1005640.0
1005918 4758.0
4813 1005918.0
1007664 4813.0
4928 1007664.0
1008231 4928.0
4960 1008231.0
1008574 4960.0
5026 1008574.0
1010459 5026.0
5170 1010459.0
2 1475467.0
23462382 273100.0
226 5754872.0


In [31]:
# Rows remaining per rounded window in df_window (rows without a previous reading were dropped earlier)
df_window.DATE_TIME_R.value_counts()

2018-09-19 20:00:00    4691
2018-09-16 12:00:00    4688
2018-09-20 08:00:00    4688
2018-09-17 00:00:00    4687
2018-09-16 00:00:00    4687
2018-09-16 04:00:00    4684
2018-09-17 04:00:00    4684
2018-09-15 08:00:00    4684
2018-09-15 20:00:00    4683
2018-09-18 04:00:00    4683
2018-09-18 20:00:00    4682
2018-09-19 04:00:00    4682
2018-09-21 12:00:00    4682
2018-09-19 12:00:00    4680
2018-09-19 16:00:00    4679
2018-09-18 12:00:00    4679
2018-09-20 12:00:00    4679
2018-09-15 16:00:00    4679
2018-09-20 04:00:00    4678
2018-09-20 00:00:00    4678
2018-09-15 12:00:00    4677
2018-09-20 16:00:00    4677
2018-09-21 08:00:00    4677
2018-09-18 00:00:00    4676
2018-09-16 08:00:00    4675
2018-09-18 08:00:00    4675
2018-09-20 20:00:00    4675
2018-09-19 00:00:00    4675
2018-09-17 20:00:00    4674
2018-09-17 08:00:00    4673
2018-09-19 08:00:00    4673
2018-09-21 16:00:00    4673
2018-09-16 16:00:00    4672
2018-09-16 20:00:00    4671
2018-09-18 16:00:00    4670
2018-09-21 00:00:00 

In [62]:


# Total traffic per station and rounded window, summed over all of the station's turnstiles
df_out = df_window.groupby(['STATION', 'DATE_TIME_R'])[['NET_ENTRIES', 'NET_EXITS']].sum().reset_index()

# PEOPLE = everyone passing through the station in that window, in either direction
df_out["PEOPLE"] = df_out.NET_ENTRIES + df_out.NET_EXITS

# Calendar features for the STATION | DAY | TIME | PEOPLE layout (weekday: Monday=0 ... Sunday=6)
df_out['WEEKDAY'] = df_out.DATE_TIME_R.dt.weekday
df_out['HOUR'] = df_out.DATE_TIME_R.dt.hour

# Average traffic per station, weekday and window start hour.
# The result is stored and displayed (it was previously computed and thrown away),
# and only the count columns are averaged, not the timestamp.
df_station_avg = (df_out
                  .groupby(['STATION', 'WEEKDAY', 'HOUR'])[['NET_ENTRIES', 'NET_EXITS', 'PEOPLE']]
                  .mean()
                  .reset_index())
df_station_avg.head(10)

Unnamed: 0,STATION,DATE_TIME_R,NET_ENTRIES,NET_EXITS,PEOPLE,WEEKDAY,HOUR
0,1 AV,2018-09-15 04:00:00,1580.0,1233.0,2813.0,5,4
1,1 AV,2018-09-15 08:00:00,507.0,1136.0,1643.0,5,8
2,1 AV,2018-09-15 12:00:00,2402.0,3046.0,5448.0,5,12
3,1 AV,2018-09-15 16:00:00,4496.0,4785.0,9281.0,5,16
4,1 AV,2018-09-15 20:00:00,4927.0,6012.0,10939.0,5,20


Unnamed: 0,STATION,WEEKDAY,HOUR,NET_ENTRIES,NET_EXITS,PEOPLE
0,1 AV,0,0,2358.0,2575.0,4933.0
1,1 AV,0,4,347.0,267.0,614.0
2,1 AV,0,8,1652.0,3308.0,4960.0
3,1 AV,0,12,4820.0,5535.0,10355.0
4,1 AV,0,16,4655.0,3817.0,8472.0
5,1 AV,0,20,6408.0,6542.0,12950.0
6,1 AV,1,0,2743.0,2682.0,5425.0
7,1 AV,1,4,0.0,4.0,4.0
8,1 AV,1,8,1641.0,3460.0,5101.0
9,1 AV,1,12,4993.0,5606.0,10599.0
