In [1]:
import pandas as pd
import regex as re

[The source](https://data.cityofnewyork.us/Transportation/Subway-Entrances/drex-xx56) of the subway entrance data.

In [46]:
subway = pd.read_csv('../data_large_sets/DOITT_SUBWAY_ENTRANCE_01_13SEPT2010.csv')

In [47]:
subway.head()

Unnamed: 0,OBJECTID,URL,NAME,the_geom,LINE
0,1734,http://web.mta.info/nyct/service/,Birchall Ave & Sagamore St at NW corner,POINT (-73.86835600032798 40.84916900104506),2-5
1,1735,http://web.mta.info/nyct/service/,Birchall Ave & Sagamore St at NE corner,POINT (-73.86821300022677 40.84912800131844),2-5
2,1736,http://web.mta.info/nyct/service/,Morris Park Ave & 180th St at NW corner,POINT (-73.87349900050798 40.84122300105249),2-5
3,1737,http://web.mta.info/nyct/service/,Morris Park Ave & 180th St at NW corner,POINT (-73.8728919997833 40.84145300067447),2-5
4,1738,http://web.mta.info/nyct/service/,Boston Rd & 178th St at SW corner,POINT (-73.87962300013866 40.84081500075867),2-5


In [48]:
subway.columns = subway.columns.str.lower() 

In [49]:
subway.dtypes

objectid     int64
url         object
name        object
the_geom    object
line        object
dtype: object

In [50]:
subway.drop(columns = 'url', inplace = True)

In [51]:
subway.shape

(1928, 4)

In [52]:
test = "POINT (-73.86835600032798 40.84916900104506)"

In [12]:
test[7:]

'-73.86835600032798 40.84916900104506)'

In [14]:
test[7:-1]

'-73.86835600032798 40.84916900104506'

This could work, but I'm concerned all the strings aren't the same length. Using regex.

[This site](https://statisticsglobe.com/extract-substring-before-or-after-pattern-in-r) helped me figure out how to write the regex for extracting starting at the '-'. This [stackoverflow answer](https://stackoverflow.com/a/18213017) helped me write the regex to get everything before the final comma.

[Regex101.com](https://regex101.com/) helped me confirm I had it right.

[This site](https://linuxhint.com/extract-substring-regex-python/) helped me understand what regex function to use to extract the text I wanted.



In [20]:
test2 = re.search('-.*', test).group(0)

In [21]:
test2

'-73.86835600032798 40.84916900104506)'

In [22]:
test3 = re.search('^[^\)]*', test2).group(0)

In [23]:
test3

'-73.86835600032798 40.84916900104506'

In [25]:
test4 = test3.split()
test4

['-73.86835600032798', '40.84916900104506']

In [28]:
test_long = float(test4[0])
test_lat = float(test4[1])

print(f'Lat: {test_lat}, Long: {test_long}')

Lat: 40.84916900104506, Long: -73.86835600032798


Applying the above process to the dataframe

In [53]:
pair = lambda x: re.search('^[^\)]*', re.search('-.*', x).group(0)).group(0).split()

In [54]:
pair(test)

['-73.86835600032798', '40.84916900104506']

In [55]:
subway['the_geom'] = subway['the_geom'].map(pair)
subway.head()

Unnamed: 0,objectid,name,the_geom,line
0,1734,Birchall Ave & Sagamore St at NW corner,"[-73.86835600032798, 40.84916900104506]",2-5
1,1735,Birchall Ave & Sagamore St at NE corner,"[-73.86821300022677, 40.84912800131844]",2-5
2,1736,Morris Park Ave & 180th St at NW corner,"[-73.87349900050798, 40.84122300105249]",2-5
3,1737,Morris Park Ave & 180th St at NW corner,"[-73.8728919997833, 40.84145300067447]",2-5
4,1738,Boston Rd & 178th St at SW corner,"[-73.87962300013866, 40.84081500075867]",2-5


In [56]:
subway['latitude'] = subway['the_geom'].map(lambda x: float(x[1]))
subway.head()

Unnamed: 0,objectid,name,the_geom,line,latitude
0,1734,Birchall Ave & Sagamore St at NW corner,"[-73.86835600032798, 40.84916900104506]",2-5,40.849169
1,1735,Birchall Ave & Sagamore St at NE corner,"[-73.86821300022677, 40.84912800131844]",2-5,40.849128
2,1736,Morris Park Ave & 180th St at NW corner,"[-73.87349900050798, 40.84122300105249]",2-5,40.841223
3,1737,Morris Park Ave & 180th St at NW corner,"[-73.8728919997833, 40.84145300067447]",2-5,40.841453
4,1738,Boston Rd & 178th St at SW corner,"[-73.87962300013866, 40.84081500075867]",2-5,40.840815


In [57]:
subway['longitude'] = subway['the_geom'].map(lambda x: float(x[0]))
subway.head()

Unnamed: 0,objectid,name,the_geom,line,latitude,longitude
0,1734,Birchall Ave & Sagamore St at NW corner,"[-73.86835600032798, 40.84916900104506]",2-5,40.849169,-73.868356
1,1735,Birchall Ave & Sagamore St at NE corner,"[-73.86821300022677, 40.84912800131844]",2-5,40.849128,-73.868213
2,1736,Morris Park Ave & 180th St at NW corner,"[-73.87349900050798, 40.84122300105249]",2-5,40.841223,-73.873499
3,1737,Morris Park Ave & 180th St at NW corner,"[-73.8728919997833, 40.84145300067447]",2-5,40.841453,-73.872892
4,1738,Boston Rd & 178th St at SW corner,"[-73.87962300013866, 40.84081500075867]",2-5,40.840815,-73.879623


In [58]:
subway.drop(columns = 'the_geom', inplace = True)
subway.head()

Unnamed: 0,objectid,name,line,latitude,longitude
0,1734,Birchall Ave & Sagamore St at NW corner,2-5,40.849169,-73.868356
1,1735,Birchall Ave & Sagamore St at NE corner,2-5,40.849128,-73.868213
2,1736,Morris Park Ave & 180th St at NW corner,2-5,40.841223,-73.873499
3,1737,Morris Park Ave & 180th St at NW corner,2-5,40.841453,-73.872892
4,1738,Boston Rd & 178th St at SW corner,2-5,40.840815,-73.879623


# Recent Subway Extensions
Confirmed that 2nd Ave Phase 1 and 7 train extension are included.

In [6]:
subway[(subway['line'].str.contains('Q')) & (subway['name'].str.contains('2nd'))]

Unnamed: 0,objectid,name,line,latitude,longitude
124,1858,Broadway & 32nd St at NE corner,B-D-F-M-N-Q-R,40.748294,-73.987976
125,1859,Broadway & 32nd St at NW corner,B-D-F-M-N-Q-R,40.748477,-73.988288
406,262,7th Ave & 42nd St at NW corner,A-C-E-N-Q-R-S-1-2-3-7,40.756241,-73.987133
407,263,7th Ave & 42nd St at SW corner,A-C-E-N-Q-R-S-1-2-3-7,40.756042,-73.987242
895,751,8th Ave & 42nd St at NE corner,A-C-E-N-Q-R-S-1-2-3-7,40.757286,-73.989582
1293,1149,Broadway & 42nd St at SE corner,A-C-E-N-Q-R-S-1-2-3-7,40.755575,-73.986311
1498,1354,8th Ave & 42nd St at NW corner,A-C-E-N-Q-R-S-1-2-3-7,40.757463,-73.989984
1554,1410,6th Ave & 32nd St at NW corner,B-D-F-M-N-Q-R,40.748746,-73.988774
1563,1419,7th Ave & 42nd St at SE corner,A-C-E-N-Q-R-S-1-2-3-7,40.755747,-73.986875
1916,1921,2nd Ave & 72nd St at NW corner,Q,40.768993,-73.958567


In [8]:
subway[(subway['line'].str.contains('7')) & (subway['name'].str.contains('Hudson'))]

Unnamed: 0,objectid,name,line,latitude,longitude
1226,1082,Hudson Blvd East between 33rd St & 34th St (no...,7,40.754975,-74.000924
1227,1083,Hudson Blvd East between 33rd St & 34th St (so...,7,40.754898,-74.000982


In [60]:
subway.to_csv('data/subway_cleaned.csv', index = False)