### Merge Entry and Exit Data with Tube Coordinates Data

In [1]:
import pandas as pd 
import os

In [2]:
# check file names
# Lists every entry in the current working directory so the names of the
# input CSV files can be confirmed before they are read.
files = os.listdir(os.curdir) 
files

['.ipynb_checkpoints',
 'Create Buffer and Convert to GeoJSON',
 'Create Buffer and Convert to GeoJSON.R',
 'FOI-1215.csv',
 'London Tubestations Coordinates.csv',
 'Merge Tube Data and Create List of Unique Unix Timestamps .ipynb',
 'night_unix.csv',
 'TFL Data Cleaning and Unix Timestamp Conversion.ipynb']

#### Read in Tubestations Coordinates data 

In [3]:
# Load the station coordinates table (the file is semicolon-delimited)
df1 = pd.read_csv(
    filepath_or_buffer='London Tubestations Coordinates.csv',
    delimiter=';',
)
df1.head(1)

Unnamed: 0,Station,OS X,OS Y,Latitude,Longitude,Zone,Postcode
0,Abbey Road,539081,183352,51.531952,0.003723,3,E15 3NB


In [4]:
# 641 unique stations
# dropna=False keeps the count identical to len(Series.unique())
df1['Station'].nunique(dropna=False)

641

#### Read in Exit and Entry data 

In [5]:
# Load the entry/exit counts (comma-separated, which is read_csv's default)
df2 = pd.read_csv('night_unix.csv')
df2.head(1)

Unnamed: 0.1,Unnamed: 0,Date,Station,Time,Entries,Exits,unix
0,0,19/8/2016,Bank,2:00:00,137,74,1471568000.0


In [6]:
# 148 unique stations
# dropna=False keeps the count identical to len(Series.unique())
df2['Station'].nunique(dropna=False)

148

In [7]:
# Turn off pandas' chained-assignment check so that no SettingWithCopyWarning
# is emitted by later cells.
# NOTE(review): this only hides the warning; it does not make chained indexing
# (e.g. df[col][mask] = value) safe.
pd.options.mode.chained_assignment = None

#### Join Datasets 

In [8]:
# Keep only the stations that appear in both tables
new_df = df1.merge(df2, how='inner', on='Station')

In [9]:
# dropna=False keeps the count identical to len(Series.unique())
new_df['Station'].nunique(dropna=False)
# only 122 stations left after inner join

122

Right join to get a list of station names that are not merging correctly. Afterwards, look at the London Tubestations Coordinates CSV file to check what is wrong.

In [10]:
# A right join keeps every entry/exit row; rows whose station has no match in
# the coordinates table end up with NaN in the coordinate columns.
new_df = df1.merge(df2, how='right', on='Station')
# Select the rows that contain at least one missing value
nan_rows = new_df[new_df.isnull().any(axis=1)]
nan_rows['Station'].unique()

array(['Bethnal Green LU', 'Brixton LU', 'Euston LU', 'Finsbury Park LU',
       'Highbury & Islington', 'Kings Cross LU (North)',
       'Liverpool Street LU', 'Shepherds Bush LU', 'St Pauls',
       'Vauxhall LU', 'Victoria LU', 'Canary Wharf LU (E1)',
       'London Bridge LU', 'St Johns Wood', 'Waterloo LU (Bloo/Nthn/W&C)',
       'West Hampstead LU', 'Balham LU', 'Totteridge',
       'Waterloo LU (Jubilee)', 'Canary Wharf LU (E2)', 'Hammersmith D&P',
       'Heathrow Terminal 4 LU', 'Heathrow Terminal 5 LU',
       'Heathrow Terminals 123 LU', 'Kings Cross LU (Tube)',
       'Kings Cross LU (Western)'], dtype=object)

Change station names

In [11]:
# Map the station labels used in the entry/exit data onto the spellings used
# in the coordinates file so that both frames can be joined on "Station".
station_renames = {
    'Bethnal Green LU': "Bethnal Green",
    'Brixton LU': "Brixton",
    'Euston LU': "Euston",
    'Finsbury Park LU': "Finsbury Park",
    'Highbury & Islington': "Highbury and Islington",
    'Liverpool Street LU': "Liverpool Street",
    'Shepherds Bush LU': "Shepherds Bush",
    'St Pauls': "St. Pauls",
    'Vauxhall LU': "Vauxhall",
    'Victoria LU': "Victoria",
    'London Bridge LU': "London Bridge",
    'St Johns Wood': "St. Johns Wood",
    'West Hampstead LU': "West Hampstead",
    'Balham LU': "Balham",
    'Totteridge': "Totteridge and Whetstone",
    'Hammersmith D&P': "Hammersmith (District)",
    'Heathrow Terminal 4 LU': "Heathrow Terminal 4",
    'Heathrow Terminal 5 LU': "Heathrow Terminal 5",
    'Heathrow Terminals 123 LU': "Heathrow Terminals 1 2 3",

    # mentioned only once in london tubestations coordinates file
    'Kings Cross LU (North)': "King's Cross",
    'Kings Cross LU (Tube)': "King's Cross",
    'Kings Cross LU (Western)': "King's Cross",
    'Canary Wharf LU (E1)': "Canary Wharf",
    'Canary Wharf LU (E2)': "Canary Wharf",
    'Waterloo LU (Bloo/Nthn/W&C)': "Waterloo",
    'Waterloo LU (Jubilee)': "Waterloo",
}

# Series.replace with a dict swaps exact, whole-cell matches only; labels that
# are not listed above are left unchanged. Assigning the result back to the
# column avoids chained indexing (df[col][mask] = value), which may write to a
# temporary copy instead of df2 and does nothing under pandas Copy-on-Write.
df2['Station'] = df2['Station'].replace(station_renames)

Aggregate stations that occur multiple times

In [12]:
# Total exits per (station, timestamp); this adds up the rows of stations
# that now share one name after the renaming step.
df_Exits = df2.groupby(['Station', 'unix'])['Exits'].sum().reset_index()

In [13]:
# Total entries per (station, timestamp), aggregated the same way as exits
df_Entries = df2.groupby(['Station', 'unix'])['Entries'].sum().reset_index()

In [14]:
# Number of (station, timestamp) rows in the aggregated exits table.
# Only the last expression of a cell is rendered, so a preceding bare
# .head() call would be computed and thrown away.
len(df_Exits)

31510

In [15]:
# Number of (station, timestamp) rows in the aggregated entries table.
# Only the last expression of a cell is rendered, so a preceding bare
# .head() call would be computed and thrown away.
len(df_Entries)

31510

In [16]:
# Put the exits and entries totals side by side, matched on station and time
df_combined = df_Exits.merge(df_Entries, how='inner', on=['Station', 'unix'])

In [17]:
# Number of (station, timestamp) rows in the combined table.
# Only the last expression of a cell is rendered, so a preceding bare
# .head() call would be computed and thrown away.
len(df_combined)

31510

In [18]:
# dropna=False keeps the count identical to len(Series.unique())
df_combined['Station'].nunique(dropna=False)
# 144 stations

144

In [19]:
# Preview the first row of the aggregated entries/exits table
df_combined.head(1)

Unnamed: 0,Station,unix,Exits,Entries
0,Acton Town,1481854000.0,95,9


New merge

In [20]:
# Final dataset: keep only the stations present in both tables.
new_df = pd.merge(df1, df_combined, on="Station", how='inner')

# Verify that every station in the entry/exit data found its coordinates.
# An inner join silently drops unmatched stations, so looking for NaNs in its
# result can never reveal them. A right join keeps every entry/exit row and
# leaves the coordinate columns empty (NaN) wherever no station name matched.
merge_check = pd.merge(df1, df_combined, on="Station", how='right')
nan_rows = merge_check[merge_check.isnull().any(axis=1)]
# An empty array here means all station names were matched.
nan_rows.Station.unique()

array([], dtype=object)

In [21]:
# Preview the first row of the merged coordinates + entries/exits table
new_df.head(1)

Unnamed: 0,Station,OS X,OS Y,Latitude,Longitude,Zone,Postcode,unix,Exits,Entries
0,Acton Town,519457,179639,51.503071,-0.280303,3,W3 8HN,1481854000.0,95,9


In [22]:
# drop columns that are not going to be used
new_df = new_df.drop(['OS X', 'OS Y', 'Zone', 'Postcode'], axis=1)
new_df.head(1)

Unnamed: 0,Station,Latitude,Longitude,unix,Exits,Entries
0,Acton Town,51.503071,-0.280303,1481854000.0,95,9


Save new dataframe as CSV file 

In [23]:
# Write the merged table without the row index; the header row is written by default
new_df.to_csv('London stations final unixtime coordinates.csv', index=False)

#### Create unique list of UNIX timestamps

In [24]:
# Distinct unix timestamps in the entry/exit data, kept as a one-element list
unique_unixtimes = [df2['unix'].unique()]

In [25]:
# Flatten the list-wrapped array(s) into one plain list of timestamps.
# A comprehension appends each value once, whereas rebuilding the list with
# `unix = unix + [value]` on every step copies the whole list each time.
unix = [timestamp for chunk in unique_unixtimes for timestamp in chunk]
len(unix)
# 324 unique unixtimes

324

In [26]:
# Display the list-wrapped array of unique unix timestamps
unique_unixtimes
# copy array into javascript

[array([  1.47156840e+09,   1.47157020e+09,   1.47157200e+09,
          1.47157380e+09,   1.47157560e+09,   1.47157740e+09,
          1.47165480e+09,   1.47165660e+09,   1.47165840e+09,
          1.47166020e+09,   1.47166200e+09,   1.47166380e+09,
          1.47217320e+09,   1.47217500e+09,   1.47217680e+09,
          1.47217860e+09,   1.47218040e+09,   1.47218220e+09,
          1.47225960e+09,   1.47226140e+09,   1.47226320e+09,
          1.47226500e+09,   1.47226680e+09,   1.47226860e+09,
          1.47277800e+09,   1.47277980e+09,   1.47278160e+09,
          1.47278340e+09,   1.47278520e+09,   1.47278700e+09,
          1.47286440e+09,   1.47286620e+09,   1.47286800e+09,
          1.47286980e+09,   1.47287160e+09,   1.47287340e+09,
          1.47338280e+09,   1.47338460e+09,   1.47338640e+09,
          1.47338820e+09,   1.47339000e+09,   1.47339180e+09,
          1.47346920e+09,   1.47347100e+09,   1.47347280e+09,
          1.47347460e+09,   1.47347640e+09,   1.47347820e+09,
        