# Ex 2.5 Advanced Geospatial Plotting

### 2. Importing libraries and data 

In [1]:
# Had to use 
# conda install -c conda-forge proj pyproj geopandas
# then
# pip install keplergl
# because Kepler.gl depends on GeoPandas → pyproj → PROJ and pip was trying to build pyproj from source, 
# but on macOS that requires the system PROJ library, which was not already installed.

In [1]:
import pandas as pd
import os
from keplergl import KeplerGl
from pyproj import CRS 
import numpy as np
from matplotlib import pyplot as plt

In [2]:
# all libraries now import successfully

In [3]:
# Define path
path = r'/Users/andymiller/Desktop/CareerFoundry Data Analytics/3. Data Visualisation with Python/2. Citi Bike'

## Building the map data and map the first time 

In [8]:
# Import data from pickle file

df = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'ny_data_ex_2.4.pkl'))

In [16]:
df.head()

Unnamed: 0.1,date,Unnamed: 0,ride_id,rideable_type,start_time,end_time,start_station_name,start_station_id,end_station_name,end_station_id,...,member_casual,avgTemp,_merge,no_of_trips,_merge_2,trip_duration,value,day_of_week,day_type,start_hour
0,2022-01-01,29246414,A68CF80FD4099195,electric_bike,2022-01-01 04:27:20.917,2022-01-01 05:12:45.261,W 107 St & Columbus Ave,7619.05,W 110 St & Amsterdam Ave,7646.04,...,member,11.6,both,20428,both,45.405733,1,Saturday,Weekend,4
1,2022-01-01,28544530,25C911DDFF549E96,classic_bike,2022-01-01 12:58:09.835,2022-01-01 13:31:53.029,Broadway & W 61 St,7014.12,Park Ave & E 124 St,7682.01,...,member,11.6,both,20428,both,33.7199,1,Saturday,Weekend,12
2,2022-01-01,28544606,E67B8B6533210284,electric_bike,2022-01-01 06:40:11.372,2022-01-01 06:48:52.852,E 85 St & York Ave,7146.04,W 87 St & Amsterdam Ave,7458.03,...,member,11.6,both,20428,both,8.691333,1,Saturday,Weekend,6
3,2022-01-01,28544612,AD0DA4602B983464,classic_bike,2022-01-01 01:41:34.103,2022-01-01 01:51:56.845,Fulton St & Waverly Ave,4345.11,Sullivan Pl & Franklin Ave,3704.08,...,casual,11.6,both,20428,both,10.379033,1,Saturday,Weekend,1
4,2022-01-01,28544634,21E1ADCF4F034ED0,electric_bike,2022-01-01 02:50:32.445,2022-01-01 02:56:45.541,Central Park W & W 91 St,7453.01,E 97 St & 3 Ave,7365.08,...,member,11.6,both,20428,both,6.218267,1,Saturday,Weekend,2


In [17]:
df.shape

(29838166, 24)

### 3. Create aggregated dateframe 

In [18]:
# Create grouped dataframe using 'start_station_name' and 'end_station_name'

df['value'] = 1
df_group = df.groupby(['start_station_name', 'end_station_name'])['value'].count().reset_index()

In [19]:
df_group

Unnamed: 0,start_station_name,end_station_name,value
0,1 Ave & E 110 St,1 Ave & E 110 St,791
1,1 Ave & E 110 St,1 Ave & E 18 St,2
2,1 Ave & E 110 St,1 Ave & E 30 St,4
3,1 Ave & E 110 St,1 Ave & E 39 St,1
4,1 Ave & E 110 St,1 Ave & E 44 St,12
...,...,...,...
1013392,Yankee Ferry Terminal,Water St & Main St,4
1013393,Yankee Ferry Terminal,West St & Chambers St,6
1013394,Yankee Ferry Terminal,West St & Liberty St,4
1013395,Yankee Ferry Terminal,West Thames St,1


In [20]:
# Check that all trips are accounted for

print(df_group['value'].sum())
print(df.shape)

29768282
(29838166, 24)


In [23]:
# There are approx 70000 trips missing.  Let's find out why.
# Look for missing station names in the data:

missing_keys = df[df['start_station_name'].isna() | df['end_station_name'].isna()]

missing_keys.shape[0]

69884

In [24]:
29838166-29768282

69884

In [25]:
# There are 69884 rows with a missing start or end station which explains the discrepancy

In [26]:
df['start_station_name'].isna().sum()

49

In [27]:
df['end_station_name'].isna().sum()

69884

In [40]:
# All of these 69884 are missing the end station and 49 are also missing the start station.
# These trips just won't be included in the map since there's no accurate way to plot them.

In [28]:
# Rename 'value' column

df_group.rename(columns = {'value':'trips'}, inplace = True)

In [29]:
df_group.head()

Unnamed: 0,start_station_name,end_station_name,trips
0,1 Ave & E 110 St,1 Ave & E 110 St,791
1,1 Ave & E 110 St,1 Ave & E 18 St,2
2,1 Ave & E 110 St,1 Ave & E 30 St,4
3,1 Ave & E 110 St,1 Ave & E 39 St,1
4,1 Ave & E 110 St,1 Ave & E 44 St,12


In [33]:
# Need to merge in the coordinates for the start and end stations from the main dataframe

In [34]:
df.columns

Index(['date', 'Unnamed: 0', 'ride_id', 'rideable_type', 'start_time',
       'end_time', 'start_station_name', 'start_station_id',
       'end_station_name', 'end_station_id', 'start_lat', 'start_lng',
       'end_lat', 'end_lng', 'member_casual', 'avgTemp', '_merge',
       'no_of_trips', '_merge_2', 'trip_duration', 'value', 'day_of_week',
       'day_type', 'start_hour'],
      dtype='object')

In [35]:
# Create station lookup tables from the big dataframe
# Start station lookup:

start_stations = (df[['start_station_name', 'start_lat', 'start_lng']].dropna(subset=['start_station_name'])
                  .drop_duplicates(subset=['start_station_name']).rename(columns={'start_station_name':'station_name'}))

In [36]:
start_stations

Unnamed: 0,station_name,start_lat,start_lng
0,W 107 St & Columbus Ave,40.799757,-73.962113
1,Broadway & W 61 St,40.770030,-73.981968
2,E 85 St & York Ave,40.775369,-73.948034
3,Fulton St & Waverly Ave,40.683239,-73.965996
4,Central Park W & W 91 St,40.788665,-73.966801
...,...,...,...
29238806,Columbus Pl & Atlantic Ave,40.677170,-73.922850
29473092,Troy Ave & Park Pl,40.672321,-73.936480
29518694,College Ave & E 169 St,40.835110,-73.911958
29764728,Lincoln Pl & Nostrand Ave,40.670770,-73.950700


In [37]:
# Repeat this for end_stations

end_stations = (df[['end_station_name', 'end_lat', 'end_lng']].dropna(subset=['end_station_name'])
                  .drop_duplicates(subset=['end_station_name']).rename(columns={'end_station_name':'station_name'}))

In [38]:
end_stations

Unnamed: 0,station_name,end_lat,end_lng
0,W 110 St & Amsterdam Ave,40.802692,-73.962950
1,Park Ave & E 124 St,40.804555,-73.939686
2,W 87 St & Amsterdam Ave,40.788390,-73.974700
3,Sullivan Pl & Franklin Ave,40.664080,-73.960251
4,E 97 St & 3 Ave,40.785920,-73.948603
...,...,...,...
29226047,Columbus Pl & Atlantic Ave,40.677170,-73.922850
29506345,Troy Ave & Park Pl,40.672510,-73.936420
29762040,Lincoln Pl & Nostrand Ave,40.670770,-73.950700
29771282,Lexington Ave & Stuyvesant Ave,40.689730,-73.933530


In [43]:
#  Merge start_stations into df_group

df_group = df_group.merge(start_stations, left_on = 'start_station_name', right_on = 'station_name', how = 'left').drop(columns='station_name')

In [44]:
df_group.head()

Unnamed: 0,start_station_name,end_station_name,trips,start_lat,start_lng
0,1 Ave & E 110 St,1 Ave & E 110 St,791,40.792327,-73.9383
1,1 Ave & E 110 St,1 Ave & E 18 St,2,40.792327,-73.9383
2,1 Ave & E 110 St,1 Ave & E 30 St,4,40.792327,-73.9383
3,1 Ave & E 110 St,1 Ave & E 39 St,1,40.792327,-73.9383
4,1 Ave & E 110 St,1 Ave & E 44 St,12,40.792327,-73.9383


In [45]:
df_group.tail()

Unnamed: 0,start_station_name,end_station_name,trips,start_lat,start_lng
1013392,Yankee Ferry Terminal,Water St & Main St,4,40.687066,-74.016756
1013393,Yankee Ferry Terminal,West St & Chambers St,6,40.687066,-74.016756
1013394,Yankee Ferry Terminal,West St & Liberty St,4,40.687066,-74.016756
1013395,Yankee Ferry Terminal,West Thames St,1,40.687066,-74.016756
1013396,Yankee Ferry Terminal,Yankee Ferry Terminal,5759,40.687066,-74.016756


In [46]:
# This has worked, so now we can merge in the coordinates for the end_stations 

df_group = df_group.merge(end_stations, left_on = 'end_station_name', right_on = 'station_name', how = 'left').drop(columns='station_name')

In [47]:
df_group

Unnamed: 0,start_station_name,end_station_name,trips,start_lat,start_lng,end_lat,end_lng
0,1 Ave & E 110 St,1 Ave & E 110 St,791,40.792327,-73.938300,40.792327,-73.938300
1,1 Ave & E 110 St,1 Ave & E 18 St,2,40.792327,-73.938300,40.733812,-73.980544
2,1 Ave & E 110 St,1 Ave & E 30 St,4,40.792327,-73.938300,40.741444,-73.975361
3,1 Ave & E 110 St,1 Ave & E 39 St,1,40.792327,-73.938300,40.747140,-73.971130
4,1 Ave & E 110 St,1 Ave & E 44 St,12,40.792327,-73.938300,40.750020,-73.969053
...,...,...,...,...,...,...,...
1013392,Yankee Ferry Terminal,Water St & Main St,4,40.687066,-74.016756,40.703212,-73.990409
1013393,Yankee Ferry Terminal,West St & Chambers St,6,40.687066,-74.016756,40.717548,-74.013221
1013394,Yankee Ferry Terminal,West St & Liberty St,4,40.687066,-74.016756,40.711444,-74.014847
1013395,Yankee Ferry Terminal,West Thames St,1,40.687066,-74.016756,40.708347,-74.017134


In [53]:
df_group['trips'].describe()

count    1.013397e+06
mean     2.937475e+01
std      9.925180e+01
min      1.000000e+00
25%      1.000000e+00
50%      4.000000e+00
75%      1.700000e+01
max      1.204100e+04
Name: trips, dtype: float64

In [50]:
# Check for any missing coordinates

df_group[['start_lat','start_lng','end_lat','end_lng']].isna().sum()

start_lat    0
start_lng    0
end_lat      0
end_lng      0
dtype: int64

In [49]:
# Export df_group to csv

df_group.to_csv(os.path.join(path,'02 Data','Prepared Data','ny_trips.csv'))

### 4. Initialize an instance of a kepler.gl map (Un-usable due to lag)

In [None]:
# Re-import for editing

df_group = pd.read_csv(os.path.join(path,'02 Data','Prepared Data','ny_trips.csv'))

In [51]:
# The data is now ready for KeplerGl

# Create KeplerGl instance
m = KeplerGl(height = 700, data={"Trips": df_group})

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


In [52]:
m

KeplerGl(data={'Trips':             start_station_name       end_station_name  trips  start_lat  \
0          …

The map looks great but it is almost entirely unusable due to how slow it is.  I'm going to attempt to plot only the top 1000 origin-destination pairs and round the coordinates so that the map can hopefully be usable, then I'll be able to actually look at it and see what stands out.

### 4. Initialize an instance of a kepler.gl map (Successful)

In [58]:
# Take only the top 1000 most popular trips

df_kepler = (df_group.sort_values('trips', ascending=False).head(1000))

In [59]:
# Round coordinates to 4 decimal places.  This is still accurate to 11m which is enough for this analysis.

for col in ['start_lat', 'start_lng', 'end_lat', 'end_lng']:
    df_kepler[col] = df_kepler[col].round(4)


In [60]:
df_kepler.head()

Unnamed: 0,start_station_name,end_station_name,trips,start_lat,start_lng,end_lat,end_lng
294971,Central Park S & 6 Ave,Central Park S & 6 Ave,12041,40.7659,-73.9763,40.7659,-73.9763
147754,7 Ave & Central Park South,7 Ave & Central Park South,8541,40.7667,-73.9791,40.7667,-73.9791
782289,Roosevelt Island Tramway,Roosevelt Island Tramway,8213,40.7573,-73.9536,40.7573,-73.9536
548187,Grand Army Plaza & Central Park S,Grand Army Plaza & Central Park S,7287,40.7644,-73.9737,40.7644,-73.9737
800503,Soissons Landing,Soissons Landing,7275,40.6923,-74.0149,40.6923,-74.0149


In [23]:
# Create new KeplerGl map
m_2 = KeplerGl(height = 1000, data={"Trips": df_kepler})

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


In [24]:
m_2

KeplerGl(data={'Trips':      Unnamed: 0                  start_station_name  \
0        294971              Ce…

In [36]:
m_2.save_to_html(file_name='Most popular trips_v3.html')

Map saved to Most popular trips_v3.html!


In [71]:
# Export filtered dataset as csv

# Export df_group to csv

df_kepler.to_csv(os.path.join(path,'02 Data','Prepared Data','ny_trips_filtered.csv'))

In [38]:
print((df_kepler['trips'] >= 3500).sum())

47


### 5. Customize the output of the map and explain reasoning 

This map works a lot better and is sufficient for the purpose of answering the questions in the task and in the overall analysis.

I picked the colours because they contrast well against the dark background of the map.  I like the arcs with the different colours for the start and end points.  The colour scheme is consistent with the Flare style that I chose for the project in one of the earlier exercises.

### 6. Add filter to the map and comment on common trips 

Looking at the map the first thing I notice is how busy it is at the southern end of Central Park. In fact, the top 2 trips which are round trips starting from Central Park South & 6th Ave and 7th Ave & Central Park South suggest that one of the most popular uses of CitiBiki is to ride around Central Park. Other trips starting and ending at stations around the edges of the park are also very popular routes. The route from the south of the park to the north is also popular. This all makes perfect sense because that sounds lovely! Riding in Central Park is definitely one of the more relaxing ways to ride a bike in New York City!

There are also several popular routes on Roosevelt Island near the tramway which is actually a cable car joining the Island to Manhattan's Upper East Side. 

There are also many frequently-traveled routes along the Hudson River on the west side of Manhattan on 10th Ave and 12th Ave.  These routes travel along the scenic Hudson River Greenway which is said to offer stunning views of the Hudson River all the way from Battery Park (in the south) to the Upper West Side.  There are dedicated cycle paths here which makes sense considering the number of trips we see on the map.

W21st street has a lot of popular routes and it turns out that there are dedicated crosstown bike lanes here that connect to major north-south route such as the Hudson River Greenway.  This area in the Flatiron neighborhood is said to be a good starting point for various urban and scenic rides.



### 7. Create config object and export map 

In [69]:
# Save settings and export map
config =m_2.config

In [67]:
# Export map as html file

m_2.save_to_html(file_name=os.path.join(path, '04 Analysis', 'Visualisations','NY Bike Trips Aggregated_v3.html'), read_only = False, config = config)

Map saved to /Users/andymiller/Desktop/CareerFoundry Data Analytics/3. Data Visualisation with Python/2. Citi Bike/04 Analysis/Visualisations/NY Bike Trips Aggregated_v2.html!


In [68]:
# Export config file as json

import json
with open("config.json", "w") as outfile:
    json.dump(config, outfile)

## Build new map for the final dashboard 

In [4]:
df_group = pd.read_csv(os.path.join(path,'02 Data','Prepared Data','ny_trips.csv'))

In [5]:
df_group

Unnamed: 0.1,Unnamed: 0,start_station_name,end_station_name,trips,start_lat,start_lng,end_lat,end_lng
0,0,1 Ave & E 110 St,1 Ave & E 110 St,791,40.792327,-73.938300,40.792327,-73.938300
1,1,1 Ave & E 110 St,1 Ave & E 18 St,2,40.792327,-73.938300,40.733812,-73.980544
2,2,1 Ave & E 110 St,1 Ave & E 30 St,4,40.792327,-73.938300,40.741444,-73.975361
3,3,1 Ave & E 110 St,1 Ave & E 39 St,1,40.792327,-73.938300,40.747140,-73.971130
4,4,1 Ave & E 110 St,1 Ave & E 44 St,12,40.792327,-73.938300,40.750020,-73.969053
...,...,...,...,...,...,...,...,...
1013392,1013392,Yankee Ferry Terminal,Water St & Main St,4,40.687066,-74.016756,40.703212,-73.990409
1013393,1013393,Yankee Ferry Terminal,West St & Chambers St,6,40.687066,-74.016756,40.717548,-74.013221
1013394,1013394,Yankee Ferry Terminal,West St & Liberty St,4,40.687066,-74.016756,40.711444,-74.014847
1013395,1013395,Yankee Ferry Terminal,West Thames St,1,40.687066,-74.016756,40.708347,-74.017134


In [12]:
# Round coordinates to 4 decimal places.  This is still accurate to 11m which is enough for this analysis.

for col in ['start_lat', 'start_lng', 'end_lat', 'end_lng']:
    df_group[col] = df_group[col].round(4)

In [6]:
# Reimport data for new version of the map

df_kepler = pd.read_csv(os.path.join(path,'02 Data','Prepared Data','ny_trips_filtered.csv'))

In [7]:
df_kepler

Unnamed: 0.1,Unnamed: 0,start_station_name,end_station_name,trips,start_lat,start_lng,end_lat,end_lng
0,294971,Central Park S & 6 Ave,Central Park S & 6 Ave,12041,40.7659,-73.9763,40.7659,-73.9763
1,147754,7 Ave & Central Park South,7 Ave & Central Park South,8541,40.7667,-73.9791,40.7667,-73.9791
2,782289,Roosevelt Island Tramway,Roosevelt Island Tramway,8213,40.7573,-73.9536,40.7573,-73.9536
3,548187,Grand Army Plaza & Central Park S,Grand Army Plaza & Central Park S,7287,40.7644,-73.9737,40.7644,-73.9737
4,800503,Soissons Landing,Soissons Landing,7275,40.6923,-74.0149,40.6923,-74.0149
...,...,...,...,...,...,...,...,...
995,523656,Frederick Douglass Blvd & W 115 St,Frederick Douglass Blvd & W 112 St,1183,40.8039,-73.9559,40.8017,-73.9571
996,927699,W 44 St & 11 Ave,8 Ave & W 33 St,1183,40.7620,-73.9970,40.7516,-73.9939
997,99272,44 Dr & Jackson Ave,44 Dr & Jackson Ave,1182,40.7472,-73.9433,40.7472,-73.9433
998,788902,Sands St & Jay St,Forsyth St & Canal St,1182,40.7001,-73.9862,40.7158,-73.9942


In [15]:
df_group.columns

Index(['Unnamed: 0', 'start_station_name', 'end_station_name', 'trips',
       'start_lat', 'start_lng', 'end_lat', 'end_lng'],
      dtype='object')

In [16]:
# Create dataframe of number unique start stations with the number of departures

start_stations = df_group.groupby('start_station_name').agg({
    'trips': 'sum',           # Sum all trips for each start station
    'start_lat': 'first',     # Take the first lat value for each station
    'start_lng': 'first'      # Take the first lng value for each station
}).reset_index()

# Rename columns
start_stations.columns = ['station_name', 'total_departures', 'latitude', 'longitude']

In [17]:
start_stations

Unnamed: 0,station_name,total_departures,latitude,longitude
0,1 Ave & E 110 St,21478,40.7923,-73.9383
1,1 Ave & E 16 St,67236,40.7322,-73.9817
2,1 Ave & E 18 St,70731,40.7338,-73.9805
3,1 Ave & E 30 St,45341,40.7414,-73.9754
4,1 Ave & E 39 St,52287,40.7471,-73.9711
...,...,...,...,...
1756,Wyckoff Ave & Gates Ave,16516,40.6999,-73.9117
1757,Wyckoff St & 3 Ave,250,40.6827,-73.9827
1758,Wyckoff St & Nevins St,11503,40.6834,-73.9843
1759,Wythe Ave & Metropolitan Ave,66340,40.7169,-73.9632


In [18]:
# Create new KeplerGl map
m_5 = KeplerGl(height = 500, data={"Trips": df_kepler})
m_5

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


KeplerGl(data={'total_departures':                       station_name  total_departures  latitude  longitude
0…

In [35]:
# Save the map as HTML (with all settings included)

m_5.save_to_html(file_name='start_stations_v3.html')

# Save the config separately as JSON
config = m_5.config
import json
with open('map_config.json', 'w') as f:
    json.dump(config, f, indent=2)

Map saved to start_stations_v3.html!


In [20]:
# Check current working directory
print("Current directory:", os.getcwd())

# Check if file exists in current directory
print("File exists:", os.path.exists("start_stations.html"))

# List all files in current directory
print("Files in directory:", os.listdir('.'))

# If file is elsewhere, find it
for root, dirs, files in os.walk('.'):
    if 'start_stations.html' in files:
        print(f"Found at: {os.path.join(root, 'start_stations.html')}")

Current directory: /Users/andymiller/Desktop/CareerFoundry Data Analytics/3. Data Visualisation with Python/2. Citi Bike/03 Scripts
File exists: True
Files in directory: ['Ex 2.7 Creating the Dashboard.ipynb', '.DS_Store', 'Ex 2.6 Creating Dashboards with Python.ipynb', 'Ex 2.2 Sourcing Data with an API.ipynb', 'Requirements', 'Examples', 'Ex 2.5 Advanced Geospatial Plotting_v2.ipynb', 'Ex 2.5 Advanced Geospatial Plotting.ipynb', '.ipynb_checkpoints', 'Ex 2.4 Fundamentals of Visualization Libraries - Part 2.ipynb', 'Ex 2.3 Fundamentals of Visualization Libraries Part 1.ipynb', 'station_imbalance_to_graph.csv', 'map_config.json', 'start_stations.html', 'Ex 2.5 Advanced Geospatial Plotting_v3.ipynb']
Found at: ./start_stations.html


In [30]:
# Sort into decending order

start_stations = start_stations.sort_values('total_departures', ascending =False)

In [32]:
start_stations.to_csv('start_stations.csv')

In [31]:
start_stations.head(20)

Unnamed: 0,station_name,total_departures,latitude,longitude
1587,W 21 St & 6 Ave,128823,40.7417,-73.9942
1718,West St & Chambers St,123045,40.7175,-74.0132
495,Broadway & W 58 St,114040,40.767,-73.9817
286,6 Ave & W 33 St,106236,40.749,-73.9885
8,1 Ave & E 68 St,104685,40.765,-73.9582
461,Broadway & E 14 St,98656,40.7345,-73.9907
485,Broadway & W 25 St,98237,40.7429,-73.9892
1511,University Pl & E 14 St,96941,40.7348,-73.9921
463,Broadway & E 21 St,95533,40.7399,-73.9896
1603,W 31 St & 7 Ave,94035,40.7492,-73.9916


In [33]:
start_stations.columns

Index(['station_name', 'total_departures', 'latitude', 'longitude'], dtype='object')