In [1]:
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point
import os

### Get transport features from buildings and landmarks
Landmarks include: mostly railway stations, plus one marina, one transport terminal and one bridge.  
Buildings: there are 157, but it's not clear what they are.

In [2]:
def create_transport_df(transport_fp):
    """Read a transport CSV and add numeric ``Latitude`` / ``Longitude`` columns.

    The file must contain a ``Geo Point`` column holding comma-separated text.
    That text is split on commas; the first piece is stored as ``Latitude``
    and the second as ``Longitude``, both converted with ``pd.to_numeric``.

    Parameters
    ----------
    transport_fp : path (or buffer) accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Every column of the CSV plus ``Latitude`` and ``Longitude``.
    """
    coord_cols = ['Latitude', 'Longitude']
    stops = pd.read_csv(transport_fp)
    # One text column per comma-separated piece of 'Geo Point'
    geo_point_pieces = stops['Geo Point'].str.split(',', expand=True)
    stops[coord_cols] = geo_point_pieces
    # The pieces are still strings at this point; make them numeric
    stops[coord_cols] = stops[coord_cols].apply(pd.to_numeric)
    return stops

def create_other_features_gdf(fp):
    """Load a CSV of point features and give every row a point geometry.

    The file must provide ``Longitude`` and ``Latitude`` columns; each row
    gets a shapely ``Point`` built from its (longitude, latitude) pair.
    No CRS is passed when the GeoDataFrame is constructed.

    Parameters
    ----------
    fp : path (or buffer) accepted by ``pd.read_csv``.

    Returns
    -------
    geopandas.GeoDataFrame
        The CSV contents with the points set as the geometry.
    """
    features = pd.read_csv(fp)
    # x = longitude, y = latitude
    lon_lat_pairs = zip(features.Longitude, features.Latitude)
    points = [Point(lon_lat) for lon_lat in lon_lat_pairs]
    return gpd.GeoDataFrame(features, geometry=points)


# Output folder for the cleaned CSVs written by this notebook.
# It is created up front; an already existing folder is left untouched.
cleaned_data_folder_fp = '../../Cleaned_data/SpatialFeatures/'
os.makedirs(cleaned_data_folder_fp, exist_ok=True)

### Read in new extra transport features

In [3]:
# Stop / rank datasets: 'Geo Point' is split into numeric Latitude / Longitude
tram_stops_df = create_transport_df("../../Data/TransportHubs/tram-stops.csv")
metro_stations_df = create_transport_df("../../Data/TransportHubs/metro-stations.csv")
bus_stops_df = create_transport_df("../../Data/TransportHubs/bus-stops.csv")
taxis_df = create_transport_df('../../Data/TransportHubs/taxi-ranks.csv')

# Off-street car parks already carry Latitude / Longitude columns.
# Only car parks with more than 20 parking spaces are kept as "big" ones.
car_parks_df = create_other_features_gdf('../../Data/TransportHubs/offstreet-car-parks.csv')
has_over_20_spaces = car_parks_df['Parking spaces'] > 20
big_car_parks = car_parks_df[has_over_20_spaces]

In [4]:
# Reduce each table to the two coordinate columns they all share.
# NOTE(review): taxis_df is not trimmed here, so it keeps all of its original
# columns when it is saved later — confirm this is intended.
big_car_parks, tram_stops_df, metro_stations_df, bus_stops_df = (
    hub_table[['Latitude', 'Longitude']].copy()
    for hub_table in (big_car_parks, tram_stops_df, metro_stations_df, bus_stops_df)
)

In [5]:
# Count, per dataset, the rows whose coordinates repeat an earlier row
coord_cols = ['Latitude', 'Longitude']
for df in [bus_stops_df, tram_stops_df, metro_stations_df, taxis_df, big_car_parks]:
    print(df.duplicated(subset=coord_cols).sum())

# Keep exactly one row per location.
# keep='first' retains the first occurrence of each coordinate pair, whereas
# keep=False would discard *every* row of a duplicated location and so remove
# that location from the data altogether.
big_car_parks = big_car_parks.drop_duplicates(subset=coord_cols, keep='first')
bus_stops_df = bus_stops_df.drop_duplicates(subset=coord_cols, keep='first')

# Confirm that no repeated coordinates remain in any dataset
for df in [bus_stops_df, tram_stops_df, metro_stations_df, taxis_df, big_car_parks]:
    print(df.duplicated(subset=coord_cols).sum())

14
0
0
0
17680
0
0
0
0
0


### Add a type flag

In [6]:
# Tag every row of each table with the kind of transport feature it represents
type_by_table = [
    (tram_stops_df, 'tram_stop'),
    (metro_stations_df, 'metro_stop'),
    (bus_stops_df, 'bus_stop'),
    (taxis_df, 'taxi_rank'),
    (big_car_parks, 'big_car_parks'),
]
for hub_table, hub_type in type_by_table:
    # Adds the column to the existing object, so the outer names see it too
    hub_table['Type'] = hub_type

### Save cleaned dataframes to file

In [7]:
# Write each cleaned table to its own CSV in the cleaned-data folder (row index omitted)
tables_to_save = [
    (tram_stops_df, "/tram-stops_clean.csv"),
    (metro_stations_df, "/metro-stations_clean.csv"),
    (bus_stops_df, "/bus-stops_clean.csv"),
    (taxis_df, "/taxi-ranks_clean.csv"),
    (big_car_parks, "/big-car-parks_clean.csv"),
]
for cleaned_table, file_suffix in tables_to_save:
    cleaned_table.to_csv(cleaned_data_folder_fp + file_suffix, index=False)

### Join all 'stops' data into one

In [8]:
# Stack the tram, metro and bus stops into a single table
all_transport_stops = pd.concat([tram_stops_df, metro_stations_df, bus_stops_df], axis=0)
# Print how many rows repeat the coordinates of an earlier row
repeats_earlier_row = all_transport_stops.duplicated(subset=['Latitude', 'Longitude'])
print(len(all_transport_stops[repeats_earlier_row]))
# Write the combined table to the cleaned-data folder
all_transport_stops.to_csv(cleaned_data_folder_fp + "/transport_stops_clean.csv", index=False)

0


### Check for features duplicated in landmarks or buildings transport data

In [10]:
# Landmarks with the 'Transport' theme, reduced to coordinates and tagged with their source
landmarks = pd.read_csv(cleaned_data_folder_fp + "/landmarks_clean.csv")
landmark_transport = landmarks[landmarks['theme'] == 'Transport']
landmark_transport = landmark_transport[['Latitude', 'Longitude']].copy()
landmark_transport['Type'] = 'landmark_transport'
# Report how many transport landmarks repeat an earlier landmark's coordinates.
# A bare expression in the middle of a cell is evaluated but never displayed,
# so the count is printed explicitly.
print(landmark_transport.duplicated(subset=['Latitude', 'Longitude']).sum())

# Buildings recorded for 2019 whose use is 'Transport', prepared the same way
buildings = pd.read_csv(cleaned_data_folder_fp + "/buildings_clean.csv")
buildings2019 = buildings[buildings['year'] == 2019]
buildings2019_transport = buildings2019[buildings2019['building_use'] == 'Transport']
buildings2019_transport = buildings2019_transport[['Latitude', 'Longitude']].copy()
buildings2019_transport['Type'] = 'buildings_transport'

In [11]:
# Combine the stops with the transport buildings and landmarks.
# The table is rebuilt from its source frames rather than appended to itself,
# so re-running this cell cannot add the building / landmark rows a second
# time and produce false duplicates in the check that follows.
all_transport_stops = pd.concat(
    [tram_stops_df, metro_stations_df, bus_stops_df, buildings2019_transport, landmark_transport],
    axis=0,
)

In [12]:
# Show every row whose coordinates occur more than once (all occurrences, not just the repeats)
shares_location = all_transport_stops.duplicated(subset=['Latitude', 'Longitude'], keep=False)
all_transport_stops.loc[shares_location, :]

Unnamed: 0,Latitude,Longitude,Type
