In [1]:
# imports 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import datetime
# see all columns
pd.set_option('display.max_columns', None)
# see all rows
pd.set_option('display.max_rows', None)
# set seaborn style
sns.set_style('whitegrid')
# set context to notebook
sns.set_context('notebook')


In [2]:
# import data

# set path
path = '../data/raw/DonkeyRepublic/'

hubs = pd.read_excel(path + 'Hubs_2019-4-2_1201.xlsx', parse_dates=['created_at'])
rentals = pd.read_excel(path + 'Rentals_2019-4-2_1456.xlsx', parse_dates=['created_at', 'finished_at'])

In [3]:
# check data
hubs.head()


Unnamed: 0,created_at,latitude,longitude,id,name,deleted_at
0,2018-05-21 09:35:00.697175,48.864936,2.310624,3268,Cours la Reine,2018-05-21 16:17:24.722283
1,2018-11-26 09:06:05.590590,55.695252,12.547185,6367,Heinesgade,
2,2018-06-05 12:12:28.639837,55.676916,12.564896,3642,Concert Hall Pumpehuset,2018-11-15 10:45:34.468962
3,2018-05-28 20:07:34.173984,55.6687,12.551114,3526,Saxogade,
4,2018-11-08 09:36:50.440822,55.699557,12.515065,6233,GrÃ¸ndal Multicenter,


In [4]:
hubs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1266 entries, 0 to 1265
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   created_at  1266 non-null   datetime64[ns]
 1   latitude    1266 non-null   float64       
 2   longitude   1266 non-null   float64       
 3   id          1266 non-null   int64         
 4   name        1266 non-null   object        
 5   deleted_at  119 non-null    object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(2)
memory usage: 59.5+ KB


In [5]:
# remove deleted_at column
hubs.drop('deleted_at', axis=1, inplace=True)

In [6]:
hubs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1266 entries, 0 to 1265
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   created_at  1266 non-null   datetime64[ns]
 1   latitude    1266 non-null   float64       
 2   longitude   1266 non-null   float64       
 3   id          1266 non-null   int64         
 4   name        1266 non-null   object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(1)
memory usage: 49.6+ KB


In [7]:
rentals.head()

Unnamed: 0,created_at,finished_at,pickup_hub_id,dropoff_hub_id,user_id
0,2018-03-01 17:43:14.707445,2018-03-01 18:14:12.145,2163,2449.0,108186
1,2018-03-02 09:55:18.823405,2018-03-02 16:56:35.013,2381,2381.0,113852
2,2018-03-02 14:00:10.755516,2018-03-02 17:13:08.047,1513,1513.0,113912
3,2018-03-01 10:25:34.429934,2018-03-02 20:23:46.544,2337,2337.0,113822
4,2018-03-02 08:51:47.459257,2018-03-02 21:18:48.813,2153,233.0,113881


In [8]:
rentals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 279860 entries, 0 to 279859
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   created_at      279860 non-null  object 
 1   finished_at     279408 non-null  object 
 2   pickup_hub_id   279860 non-null  int64  
 3   dropoff_hub_id  261293 non-null  float64
 4   user_id         279860 non-null  int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 10.7+ MB


In [9]:
# print unique values for id in hubs and for pickup_hub_id and dropoff_hub_id in rentals
print('Hubs id unique values: ', hubs.id.nunique())
print('Pickup hub id unique values: ', rentals.pickup_hub_id.nunique())
print('Dropoff hub id unique values: ', rentals.dropoff_hub_id.nunique())

Hubs id unique values:  1266
Pickup hub id unique values:  1244
Dropoff hub id unique values:  1227


In [10]:
# count nr of nan values in dropoff_hub_id
print('Nr of nan values in dropoff_hub_id: ', rentals.dropoff_hub_id.isna().sum())

Nr of nan values in dropoff_hub_id:  18567


In [11]:
# print dataframe head for rentals with nan values in dropoff_hub_id
rentals[rentals.dropoff_hub_id.isna()].head()

Unnamed: 0,created_at,finished_at,pickup_hub_id,dropoff_hub_id,user_id
24,2018-03-03 11:50:26.336953,2018-03-05 09:54:43.594933,2251,,113998
47,2018-03-07 15:07:37.999381,2018-03-08 09:14:28.575214,2281,,114435
62,2018-03-10 11:00:33.847099,2018-03-10 13:54:22.669873,2227,,27688
118,2018-03-09 15:30:12.161583,2018-03-12 10:00:32.844345,152,,16
132,2018-03-12 10:18:23.874653,2018-03-12 15:45:00.200976,2337,,115368


In [12]:
# see the proportion of nan values in dropoff_hub_id
print('Proportion of nan values in dropoff_hub_id: ', rentals.dropoff_hub_id.isna().sum() / len(rentals))

Proportion of nan values in dropoff_hub_id:  0.0663438862288287


In [13]:
# drop rows with nan values in rentals

rentals.dropna(inplace=True)

In [14]:
rentals.info()

<class 'pandas.core.frame.DataFrame'>
Index: 261189 entries, 0 to 279859
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   created_at      261189 non-null  object 
 1   finished_at     261189 non-null  object 
 2   pickup_hub_id   261189 non-null  int64  
 3   dropoff_hub_id  261189 non-null  float64
 4   user_id         261189 non-null  int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 12.0+ MB


In [15]:
# inspect how many values are there in rentals with same dropoff and pickup hub
rentals[rentals.pickup_hub_id == rentals.dropoff_hub_id].shape

(61822, 5)

In [16]:
# see rentals time interval
print('Rentals time interval: ', rentals.created_at.min(), rentals.created_at.max())

Rentals time interval:  2018-03-01 08:12:40.421178 2019-04-02 12:39:57.98495


In [17]:
hubs.head()

Unnamed: 0,created_at,latitude,longitude,id,name
0,2018-05-21 09:35:00.697175,48.864936,2.310624,3268,Cours la Reine
1,2018-11-26 09:06:05.590590,55.695252,12.547185,6367,Heinesgade
2,2018-06-05 12:12:28.639837,55.676916,12.564896,3642,Concert Hall Pumpehuset
3,2018-05-28 20:07:34.173984,55.6687,12.551114,3526,Saxogade
4,2018-11-08 09:36:50.440822,55.699557,12.515065,6233,GrÃ¸ndal Multicenter


In [18]:
# in hubs explore names that contain non english characters
hubs[hubs.name.str.contains('[^a-zA-Z0-9\s]')].head()

Unnamed: 0,created_at,latitude,longitude,id,name
4,2018-11-08 09:36:50.440822,55.699557,12.515065,6233,GrÃ¸ndal Multicenter
7,2019-03-14 11:39:00.814359,55.697218,12.584605,7148,SÃ¸nderborggade
8,2018-07-22 15:03:16.438551,55.669671,12.545823,4947,MatthÃ¦usgade
14,2018-11-29 18:53:59.393007,55.616528,12.585753,6411,PilegÃ¥rd Alle
17,2018-12-03 12:31:53.789002,55.637874,12.59054,6434,Hf. Elmebo


In [19]:
# replace Ã¦ with æ in hubs names
# replace Ã¸Ã˜ with ø, Ø in hubs names
# replace Ã¥ with å in hubs names
# replace Ã… with Å in hubs names
# replace Ã© with é in hubs names
# replace Ã¼ with ü in hubs names
# replace Ã¶ with ö in hubs names
# replace non-ascii characters in hubs names
hubs['name'] = hubs['name'].str.replace('Ã¦', 'æ', regex=True)
hubs['name'] = hubs['name'].str.replace('Ã¸', 'ø', regex=True)
hubs['name'] = hubs['name'].str.replace('Ã¥', 'å', regex=True)
hubs['name'] = hubs['name'].str.replace('Ã…', 'Å', regex=True)
hubs['name'] = hubs['name'].str.replace('Ã˜', 'Ø', regex=True)
hubs['name'] = hubs['name'].str.replace('Ã©', 'é', regex=True)
hubs['name'] = hubs['name'].str.replace('Ã¼', 'ü', regex=True)
hubs['name'] = hubs['name'].str.replace('Ã¶', 'ö', regex=True)

# remove entire substring if it is in between [] like [N], [G]...
hubs['name'] = hubs['name'].str.replace('\[[^\]]*\]', '', regex=True)

In [20]:
hubs.head()

Unnamed: 0,created_at,latitude,longitude,id,name
0,2018-05-21 09:35:00.697175,48.864936,2.310624,3268,Cours la Reine
1,2018-11-26 09:06:05.590590,55.695252,12.547185,6367,Heinesgade
2,2018-06-05 12:12:28.639837,55.676916,12.564896,3642,Concert Hall Pumpehuset
3,2018-05-28 20:07:34.173984,55.6687,12.551114,3526,Saxogade
4,2018-11-08 09:36:50.440822,55.699557,12.515065,6233,Grøndal Multicenter


In [21]:
# check how man dropoff_hub_id are not in hubs id
print('Nr of dropoff_hub_id not in hubs id: ', rentals[~rentals.dropoff_hub_id.isin(hubs.id)].dropoff_hub_id.nunique())

Nr of dropoff_hub_id not in hubs id:  36


In [22]:
# merge hubs and rentals on pickup_hub_id
rentals = rentals.merge(hubs[['id', 'latitude', 'longitude', 'name']], how='left', left_on='pickup_hub_id', right_on='id', suffixes=('', 'Start'))
rentals.head()

Unnamed: 0,created_at,finished_at,pickup_hub_id,dropoff_hub_id,user_id,id,latitude,longitude,name
0,2018-03-01 17:43:14.707445,2018-03-01 18:14:12.145,2163,2449.0,108186,2163.0,55.67344,12.564409,Central Station
1,2018-03-02 09:55:18.823405,2018-03-02 16:56:35.013,2381,2381.0,113852,2381.0,55.688937,12.562486,Ravnsborggade
2,2018-03-02 14:00:10.755516,2018-03-02 17:13:08.047,1513,1513.0,113912,1513.0,55.682558,12.580462,Møntergade
3,2018-03-01 10:25:34.429934,2018-03-02 20:23:46.544,2337,2337.0,113822,2337.0,55.670289,12.565058,København H - Bus Stops
4,2018-03-02 08:51:47.459257,2018-03-02 21:18:48.813,2153,233.0,113881,2153.0,55.680517,12.587455,Nyhavn


In [23]:
# for each unique id in dropoff_hub_id, get the corresponding latitude, longitude and name from hubs
# add these columns to rentals named LatitudeEnd, LongitudeEnd, NameEnd
rentals = rentals.merge(hubs[['id', 'latitude', 'longitude', 'name']], how='left', left_on='dropoff_hub_id', right_on='id', suffixes=('', 'End'))

In [24]:
rentals.info()  

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 261189 entries, 0 to 261188
Data columns (total 13 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   created_at      261189 non-null  object 
 1   finished_at     261189 non-null  object 
 2   pickup_hub_id   261189 non-null  int64  
 3   dropoff_hub_id  261189 non-null  float64
 4   user_id         261189 non-null  int64  
 5   id              258232 non-null  float64
 6   latitude        258232 non-null  float64
 7   longitude       258232 non-null  float64
 8   name            258232 non-null  object 
 9   idEnd           258209 non-null  float64
 10  latitudeEnd     258209 non-null  float64
 11  longitudeEnd    258209 non-null  float64
 12  nameEnd         258209 non-null  object 
dtypes: float64(7), int64(2), object(4)
memory usage: 25.9+ MB


In [25]:
# drop nan values in rentals
rentals.dropna(inplace=True)

In [26]:
# rename columns, created_at to StartTime, finished_at to EndTime, name to PickupHubName, latitude to LatitudeStart, longitude to LongitudeStart
rentals.rename(columns={'created_at': 'StartTime', 'finished_at': 'EndTime', 'latitude': 'latitudeStart', 
                        'longitude': 'longitudeStart', "pickup_hub_id": "StartHubId", "dropoff_hub_id": "EndHubId",
                        "user_id": "UserId", "name":"nameStart"}, inplace=True)

In [27]:
rentals.head()

Unnamed: 0,StartTime,EndTime,StartHubId,EndHubId,UserId,id,latitudeStart,longitudeStart,nameStart,idEnd,latitudeEnd,longitudeEnd,nameEnd
0,2018-03-01 17:43:14.707445,2018-03-01 18:14:12.145,2163,2449.0,108186,2163.0,55.67344,12.564409,Central Station,2449.0,55.658239,12.605434,Skotlands Plads
1,2018-03-02 09:55:18.823405,2018-03-02 16:56:35.013,2381,2381.0,113852,2381.0,55.688937,12.562486,Ravnsborggade,2381.0,55.688937,12.562486,Ravnsborggade
2,2018-03-02 14:00:10.755516,2018-03-02 17:13:08.047,1513,1513.0,113912,1513.0,55.682558,12.580462,Møntergade,1513.0,55.682558,12.580462,Møntergade
3,2018-03-01 10:25:34.429934,2018-03-02 20:23:46.544,2337,2337.0,113822,2337.0,55.670289,12.565058,København H - Bus Stops,2337.0,55.670289,12.565058,København H - Bus Stops
4,2018-03-02 08:51:47.459257,2018-03-02 21:18:48.813,2153,233.0,113881,2153.0,55.680517,12.587455,Nyhavn,233.0,55.668475,12.557384,Høkerboderne


In [28]:
# drop id and idEnd columns
rentals.drop(['id', 'idEnd'], axis=1, inplace=True)

In [29]:
rentals.info()  

<class 'pandas.core.frame.DataFrame'>
Index: 255990 entries, 0 to 261188
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   StartTime       255990 non-null  object 
 1   EndTime         255990 non-null  object 
 2   StartHubId      255990 non-null  int64  
 3   EndHubId        255990 non-null  float64
 4   UserId          255990 non-null  int64  
 5   latitudeStart   255990 non-null  float64
 6   longitudeStart  255990 non-null  float64
 7   nameStart       255990 non-null  object 
 8   latitudeEnd     255990 non-null  float64
 9   longitudeEnd    255990 non-null  float64
 10  nameEnd         255990 non-null  object 
dtypes: float64(5), int64(2), object(4)
memory usage: 23.4+ MB


In [30]:
rentals.head()

Unnamed: 0,StartTime,EndTime,StartHubId,EndHubId,UserId,latitudeStart,longitudeStart,nameStart,latitudeEnd,longitudeEnd,nameEnd
0,2018-03-01 17:43:14.707445,2018-03-01 18:14:12.145,2163,2449.0,108186,55.67344,12.564409,Central Station,55.658239,12.605434,Skotlands Plads
1,2018-03-02 09:55:18.823405,2018-03-02 16:56:35.013,2381,2381.0,113852,55.688937,12.562486,Ravnsborggade,55.688937,12.562486,Ravnsborggade
2,2018-03-02 14:00:10.755516,2018-03-02 17:13:08.047,1513,1513.0,113912,55.682558,12.580462,Møntergade,55.682558,12.580462,Møntergade
3,2018-03-01 10:25:34.429934,2018-03-02 20:23:46.544,2337,2337.0,113822,55.670289,12.565058,København H - Bus Stops,55.670289,12.565058,København H - Bus Stops
4,2018-03-02 08:51:47.459257,2018-03-02 21:18:48.813,2153,233.0,113881,55.680517,12.587455,Nyhavn,55.668475,12.557384,Høkerboderne


In [31]:
# for the same nameStart verify if there are different StartHubId
rentals.groupby('nameStart')['StartHubId'].nunique().sort_values(ascending=False).head()

nameStart
Badstuestræde     2
Adelgade II       2
Kvægtorvsgade     2
Sønderborggade    2
Knabrostræde      2
Name: StartHubId, dtype: int64

In [32]:
# inspect how many values are there in rentals with same dropoff and pickup hub
rentals[rentals.StartHubId == rentals.EndHubId].shape

(61111, 11)

In [33]:
# set StartTime and EndTime to datetime
rentals['StartTime'] = pd.to_datetime(rentals['StartTime'], format="mixed")
rentals['EndTime'] = pd.to_datetime(rentals['EndTime'], format="mixed")
# set EndHubId to int
rentals['EndHubId'] = rentals['EndHubId'].astype(int)

In [34]:
rentals.info()

<class 'pandas.core.frame.DataFrame'>
Index: 255990 entries, 0 to 261188
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   StartTime       255990 non-null  datetime64[ns]
 1   EndTime         255990 non-null  datetime64[ns]
 2   StartHubId      255990 non-null  int64         
 3   EndHubId        255990 non-null  int32         
 4   UserId          255990 non-null  int64         
 5   latitudeStart   255990 non-null  float64       
 6   longitudeStart  255990 non-null  float64       
 7   nameStart       255990 non-null  object        
 8   latitudeEnd     255990 non-null  float64       
 9   longitudeEnd    255990 non-null  float64       
 10  nameEnd         255990 non-null  object        
dtypes: datetime64[ns](2), float64(4), int32(1), int64(2), object(2)
memory usage: 22.5+ MB


In [35]:
duplicate_rows_new = rentals.duplicated().sum()

In [36]:
duplicate_rows_new

207

In [37]:
# drop duplicates
rentals.drop_duplicates(inplace=True)

In [38]:
# remove leading and trailing whitespaces from nameStart and nameEnd
rentals['nameStart'] = rentals['nameStart'].str.strip().str.title()
rentals['nameEnd'] = rentals['nameEnd'].str.strip().str.title()

In [39]:
# inspect how many values are there in rentals with same dropoff and pickup hub
rentals[rentals.StartHubId == rentals.EndHubId].shape

(61050, 11)

In [40]:
from jellyfish import jaro_winkler_similarity

# Extracting latitude and longitude for each unique station name for both start and end stations
unique_stations_start = rentals[['nameStart', 'latitudeStart', 'longitudeStart']].drop_duplicates(subset=['nameStart'])
unique_stations_end = rentals[['nameEnd', 'latitudeEnd', 'longitudeEnd']].drop_duplicates(subset=['nameEnd'])

# Merging both to have a complete set of unique station names with their respective coordinates
unique_stations_start = unique_stations_start.rename(columns={'nameStart': 'name', 'latitudeStart': 'latitude', 'longitudeStart': 'longitude'})
unique_stations_end = unique_stations_end.rename(columns={'nameEnd': 'name', 'latitudeEnd': 'latitude', 'longitudeEnd': 'longitude'})
unique_stations_combined = pd.concat([unique_stations_start, unique_stations_end]).drop_duplicates(subset=['name']).reset_index(drop=True)

# Displaying the merged unique stations with their coordinates
unique_stations_combined.head()
# Computing pairwise Jaro-Winkler similarities
similarity_pairs = []
for i in range(len(unique_stations_combined)):
    for j in range(i+1, len(unique_stations_combined)):
        similarity = jaro_winkler_similarity(unique_stations_combined['name'][i], unique_stations_combined['name'][j])
        if similarity > 0.85:  # Threshold set to 0.85 to capture high similarities
            similarity_pairs.append((unique_stations_combined['name'][i], unique_stations_combined['name'][j], similarity))

# Sorting pairs by similarity score for easy inspection
sorted_similarity_pairs = sorted(similarity_pairs, key=lambda x: x[2], reverse=True)

sorted_similarity_pairs

[('Overgaden Oven Vandet Ii', 'Overgaden Oven Vandet Iii', 0.992),
 ('Emil Holms Kanal Iii', 'Emil Holms Kanal Ii', 0.99),
 ('Lyshøjgårdsvej Iii', 'Lyshøjgårdsvej Ii', 0.9888888888888889),
 ('Nimbusparken Ii', 'Nimbusparken Iii', 0.9874999999999999),
 ('Vestergade Ii', 'Vestergade Iii', 0.9857142857142858),
 ('Elmegade Iii', 'Elmegade Ii', 0.9833333333333333),
 ('Ørnevej Ii', 'Ørnevej Iii', 0.9818181818181818),
 ('Lygten Ii', 'Lygten Iii', 0.98),
 ('Reffen I', 'Reffen Ii', 0.9777777777777777),
 ('Henrik Harpestrengs Vej Ii', 'Henrik Harpestrengs Vej', 0.9769230769230769),
 ('Dag Hammarskjölds Alle', 'Dag Hammarskjölds Alle Ll', 0.976),
 ('Overgaden Neden Vandet', 'Overgaden Neden Vandet Ii', 0.976),
 ('Overgaden Oven Vandet', 'Overgaden Oven Vandet Ii', 0.975),
 ('Kalkbrænderihavnsgade', 'Kalkbrænderihavnsgade Ii', 0.975),
 ('Jens Otto Krags Gade', 'Jens Otto Krags Gade Ii', 0.9739130434782608),
 ('Nimbusparken Ii', 'Nimbusparken Iv', 0.9733333333333334),
 ('Oehlenschlægersgade', 'Oehl

In [41]:
from haversine import haversine

# We'll create a dictionary for quick access to the latitude and longitude of each station
station_coords = {row['name']: (row['latitude'], row['longitude']) for index, row in unique_stations_combined.iterrows()}

# Calculate the Haversine distance for each pair in the sorted_similarity_pairs
distance_pairs = []
for pair in sorted_similarity_pairs:
    name1, name2, _ = pair
    lat1, lon1 = station_coords[name1]
    lat2, lon2 = station_coords[name2]
    distance = haversine((lat1, lon1), (lat2, lon2), unit='m')
    distance_pairs.append((name1, name2, distance))

# Sort the pairs by distance for easy inspection
sorted_distance_pairs = sorted(distance_pairs, key=lambda x: x[2])
print(len(sorted_distance_pairs))
# display all pairs with distance less than 100m
sorted_distance_pairs = [pair for pair in sorted_distance_pairs if pair[2] <= 175]
sorted_distance_pairs

722


[('Femøren (Metro St.)', 'Femøren St.', 4.343539721228821),
 ('Christianshavn St.', 'Christianshavns Torv', 7.145052439929078),
 ('Solitudvej', 'Solitudevej', 11.4266293856133),
 ('Rådhuspladsen Ii', 'Rådhuspladsen Temp', 17.75528725699629),
 ('Christianshavns Torv', 'Christianshavn', 25.529774260851184),
 ('Adelgade Alternative', 'Adelgade Ii', 28.31838697423755),
 ('Christianshavn St.', 'Christianshavn', 31.160107072740377),
 ('Jarmers Tårn', 'Jarmers Plads', 37.972311632958615),
 ('Danshøj St', 'Danshøj Station Eastside', 39.10263053093604),
 ('Lindevangs Alle', 'Lindevangs', 47.11512244769292),
 ('Vestergade Ii', 'Vestergade Iii', 50.655516177101894),
 ('Langelands Pl. Ii', 'Langelands Pl.', 56.878141400960274),
 ('Erik Ejegods Gade', 'Erik Ejegods Gade Ii', 56.93505344458927),
 ('Vanløse St. Bike Parking', 'Vanløse St.', 63.41902436378586),
 ('Birkegade', 'Birkegade Ii', 64.45309100343475),
 ('Ellebjergvej', 'Ellebjergvej Ii', 65.29599757315623),
 ('Kongens Nytorv St.', 'Kongens N

In [42]:
station_coords

{'Central Station': (55.6734396, 12.5644085),
 'Ravnsborggade': (55.6889369, 12.562486),
 'Møntergade': (55.6825584, 12.5804619),
 'København H - Bus Stops': (55.6702886, 12.5650579),
 'Nyhavn': (55.6805168, 12.5874547),
 'H. C. Andersens Blvd.': (55.6736684, 12.5715505),
 'Den Sorte Plads': (55.7010158, 12.542979),
 'Cykelslangen': (55.6628627, 12.5615546),
 'Reventlowsgade': (55.6723889, 12.5637666),
 'Kongens Nytorv St.': (55.6793107, 12.5851017),
 'Heibergsgade': (55.6791973, 12.5880951),
 'Rantzausgade': (55.6876388, 12.5467025),
 'Gammeltorv': (55.6781857, 12.5718583),
 'Dagmars Plads': (55.6812366, 12.5345716),
 'Forum St': (55.6818014, 12.5521897),
 'Nørreport St. (Depot)': (55.6825968, 12.5712267),
 'Henrik Steffens Vej': (55.6774044, 12.5377849),
 'Gammel Kongvej': (55.6768597, 12.540163),
 'Jægersborggade': (55.6929863, 12.5428972),
 'Københavns Museum': (55.6723678, 12.5537259),
 'Øresundsvej': (55.6581866, 12.6086291),
 'Ny Carlsberg Glyptotek': (55.6725808, 12.5738921),
 

In [43]:
# sort each tuple inside sorted_distance_pairs alphabetically
sorted_distance_pairs = [tuple(sorted(pair[:2])) for pair in sorted_distance_pairs]
sorted_distance_pairs

[('Femøren (Metro St.)', 'Femøren St.'),
 ('Christianshavn St.', 'Christianshavns Torv'),
 ('Solitudevej', 'Solitudvej'),
 ('Rådhuspladsen Ii', 'Rådhuspladsen Temp'),
 ('Christianshavn', 'Christianshavns Torv'),
 ('Adelgade Alternative', 'Adelgade Ii'),
 ('Christianshavn', 'Christianshavn St.'),
 ('Jarmers Plads', 'Jarmers Tårn'),
 ('Danshøj St', 'Danshøj Station Eastside'),
 ('Lindevangs', 'Lindevangs Alle'),
 ('Vestergade Ii', 'Vestergade Iii'),
 ('Langelands Pl.', 'Langelands Pl. Ii'),
 ('Erik Ejegods Gade', 'Erik Ejegods Gade Ii'),
 ('Vanløse St.', 'Vanløse St. Bike Parking'),
 ('Birkegade', 'Birkegade Ii'),
 ('Ellebjergvej', 'Ellebjergvej Ii'),
 ('Kongens Nytorv', 'Kongens Nytorv St.'),
 ('Rørholmsgade', 'Rørholmsgade Ii'),
 ('Overgaden Neden Vandet Ii', 'Overgaden Oven Vandet Ii'),
 ('Holmens Kanal', 'Holmens Kanal Ii'),
 ('Banegårdspladsen', 'Banegårdspladsen West'),
 ('Israels Pl. Ii', 'Israels Plads'),
 ('Kastrup St.', 'Kastrup St. Ii'),
 ('Solbjerg Kirke', 'Solbjergvej Ii'),


In [44]:
"""def remap_name_and_coordinates(row):
    for pair in sorted_distance_pairs:
        if row['nameStart'] == pair[1]:
            row['nameStart'] = pair[0]
            row['latitudeStart'] = station_coords[pair[0]][0]
            row['longitudeStart'] = station_coords[pair[0]][1]
        if row['nameEnd'] == pair[1]:
            row['nameEnd'] = pair[0]
            row['latitudeEnd'] = station_coords[pair[0]][0]
            row['longitudeEnd'] = station_coords[pair[0]][1]
    return row

# Apply the function to rentals
rentals = rentals.apply(remap_name_and_coordinates, axis=1)"""

"def remap_name_and_coordinates(row):\n    for pair in sorted_distance_pairs:\n        if row['nameStart'] == pair[1]:\n            row['nameStart'] = pair[0]\n            row['latitudeStart'] = station_coords[pair[0]][0]\n            row['longitudeStart'] = station_coords[pair[0]][1]\n        if row['nameEnd'] == pair[1]:\n            row['nameEnd'] = pair[0]\n            row['latitudeEnd'] = station_coords[pair[0]][0]\n            row['longitudeEnd'] = station_coords[pair[0]][1]\n    return row\n\n# Apply the function to rentals\nrentals = rentals.apply(remap_name_and_coordinates, axis=1)"

In [45]:
def optimize_remap_name_and_coordinates(rentals, sorted_distance_pairs, station_coords):
    # Create a mapping for names to their standardized version and corresponding coordinates
    name_to_standard = {pair[1]: pair[0] for pair in sorted_distance_pairs}
    coord_map = {name: station_coords[name] for name, _ in sorted_distance_pairs}

    # Vectorized operations to update names
    rentals['nameStart'] = rentals['nameStart'].map(name_to_standard).fillna(rentals['nameStart'])
    rentals['nameEnd'] = rentals['nameEnd'].map(name_to_standard).fillna(rentals['nameEnd'])

    # Vectorized operations to update coordinates
    for name, coords in coord_map.items():
        mask_start = rentals['nameStart'] == name
        rentals.loc[mask_start, 'latitudeStart'] = coords[0]
        rentals.loc[mask_start, 'longitudeStart'] = coords[1]

        mask_end = rentals['nameEnd'] == name
        rentals.loc[mask_end, 'latitudeEnd'] = coords[0]
        rentals.loc[mask_end, 'longitudeEnd'] = coords[1]

    return rentals


In [46]:
rentals = optimize_remap_name_and_coordinates(rentals, sorted_distance_pairs, station_coords)


In [47]:
import numpy as np


"""
# Create a dictionary to map station names to their new names
renaming_map = dict(sorted_distance_pairs)

# Create a boolean mask for rows where nameStart or nameEnd needs to be remapped
mask = rentals['nameStart'].isin(renaming_map.keys()) | rentals['nameEnd'].isin(renaming_map.keys())

# Replace the original columns with the new ones where necessary
rentals.loc[mask, 'nameStart'] = rentals.loc[mask, 'nameStart'].replace(renaming_map)
rentals.loc[mask, 'nameEnd'] = rentals.loc[mask, 'nameEnd'].replace(renaming_map)
rentals.loc[mask, 'latitudeStart'] = rentals.loc[mask, 'nameStart'].map(station_coords).apply(lambda x: x[0])
rentals.loc[mask, 'longitudeStart'] = rentals.loc[mask, 'nameStart'].map(station_coords).apply(lambda x: x[1])
rentals.loc[mask, 'latitudeEnd'] = rentals.loc[mask, 'nameEnd'].map(station_coords).apply(lambda x: x[0])
rentals.loc[mask, 'longitudeEnd'] = rentals.loc[mask, 'nameEnd'].map(station_coords).apply(lambda x: x[1])
"""

"\n# Create a dictionary to map station names to their new names\nrenaming_map = dict(sorted_distance_pairs)\n\n# Create a boolean mask for rows where nameStart or nameEnd needs to be remapped\nmask = rentals['nameStart'].isin(renaming_map.keys()) | rentals['nameEnd'].isin(renaming_map.keys())\n\n# Replace the original columns with the new ones where necessary\nrentals.loc[mask, 'nameStart'] = rentals.loc[mask, 'nameStart'].replace(renaming_map)\nrentals.loc[mask, 'nameEnd'] = rentals.loc[mask, 'nameEnd'].replace(renaming_map)\nrentals.loc[mask, 'latitudeStart'] = rentals.loc[mask, 'nameStart'].map(station_coords).apply(lambda x: x[0])\nrentals.loc[mask, 'longitudeStart'] = rentals.loc[mask, 'nameStart'].map(station_coords).apply(lambda x: x[1])\nrentals.loc[mask, 'latitudeEnd'] = rentals.loc[mask, 'nameEnd'].map(station_coords).apply(lambda x: x[0])\nrentals.loc[mask, 'longitudeEnd'] = rentals.loc[mask, 'nameEnd'].map(station_coords).apply(lambda x: x[1])\n"

In [48]:
# Extracting latitude and longitude for each unique station name for both start and end stations
unique_stations_start = rentals[['nameStart', 'latitudeStart', 'longitudeStart']].drop_duplicates(subset=['nameStart'])
unique_stations_end = rentals[['nameEnd', 'latitudeEnd', 'longitudeEnd']].drop_duplicates(subset=['nameEnd'])

# Merging both to have a complete set of unique station names with their respective coordinates
unique_stations_start = unique_stations_start.rename(columns={'nameStart': 'name', 'latitudeStart': 'latitude', 'longitudeStart': 'longitude'})
unique_stations_end = unique_stations_end.rename(columns={'nameEnd': 'name', 'latitudeEnd': 'latitude', 'longitudeEnd': 'longitude'})
unique_stations_combined = pd.concat([unique_stations_start, unique_stations_end]).drop_duplicates(subset=['name']).reset_index(drop=True)

# Displaying the merged unique stations with their coordinates
unique_stations_combined.head()
# Computing pairwise Jaro-Winkler similarities
similarity_pairs = []
for i in range(len(unique_stations_combined)):
    for j in range(i+1, len(unique_stations_combined)):
        similarity = jaro_winkler_similarity(unique_stations_combined['name'][i], unique_stations_combined['name'][j])
        if similarity > 0.85:  # Threshold set to 0.85 to capture high similarities
            similarity_pairs.append((unique_stations_combined['name'][i], unique_stations_combined['name'][j], similarity))

# Sorting pairs by similarity score for easy inspection
sorted_similarity_pairs = sorted(similarity_pairs, key=lambda x: x[2], reverse=True)

sorted_similarity_pairs

[('Overgaden Neden Vandet', 'Overgaden Neden Vandet Ii', 0.976),
 ('Kalkbrænderihavnsgade', 'Kalkbrænderihavnsgade Ii', 0.975),
 ('Oehlenschlægersgade', 'Oehlenschlægersgade Ii', 0.9727272727272728),
 ('Blegdamsvej', 'Blegdamsvej 2', 0.9692307692307692),
 ('P. G. Ramms Alle', 'P. G. Ramms Alle Ii', 0.968421052631579),
 ('Amaliegarden', 'Amaliegade', 0.9666666666666667),
 ('Vesterfælledvej', 'Vesterfælledvej Ii', 0.9666666666666667),
 ('Brøndby Stadion', 'Brøndby Stadion Ii', 0.9666666666666667),
 ('Sankt Annæ Gade', 'Sankt Annæ Gade Ii', 0.9666666666666667),
 ('Rentemestervej', 'Rentemestervej Ii', 0.9647058823529411),
 ('Fredericiagade', 'Fredericiagade Ii', 0.9647058823529411),
 ('Lyshøjgårdsvej Ii', 'Lyshøjgårdsvej', 0.9647058823529411),
 ('Lundtoftegade', 'Lundtoftegade Ii', 0.9625),
 ('Løjtegårdsvej', 'Løjtegårdsvej Ii', 0.9625),
 ('Kirkegårdsvej', 'Kirkegårdsvej Ii', 0.9625),
 ('Dtu Building 421', 'Dtu Building 127', 0.9616666666666668),
 ('Emil Holms Kanal', 'Emil Holms Kanal Ii

In [49]:
# We'll create a dictionary for quick access to the latitude and longitude of each station
station_coords = {row['name']: (row['latitude'], row['longitude']) for index, row in unique_stations_combined.iterrows()}

# Calculate the Haversine distance for each pair in the sorted_similarity_pairs
distance_pairs = []
for pair in sorted_similarity_pairs:
    name1, name2, _ = pair
    lat1, lon1 = station_coords[name1]
    lat2, lon2 = station_coords[name2]
    distance = haversine((lat1, lon1), (lat2, lon2), unit='m')
    distance_pairs.append((name1, name2, distance))

# Sort the pairs by distance for easy inspection
sorted_distance_pairs = sorted(distance_pairs, key=lambda x: x[2])
print(len(sorted_distance_pairs))
# display all pairs with distance less than 100m
sorted_distance_pairs = [pair for pair in sorted_distance_pairs if pair[2] <= 175]
sorted_distance_pairs

498


[('Lygten', 'Lygten Ii', 86.79185127483214),
 ('Nimbusparken', 'Nimbusparken Ii', 93.40571707178691),
 ('Amerika Plads', 'Amerika Have', 95.61153472950306),
 ('Vesterport', 'Ved Vesterport', 118.43069389620781),
 ('Ørnevej', 'Ørnevej Ii', 119.57109163320457),
 ('Hellerup Posthus', 'Hellerup Station (Esthersvej)', 121.71196975928113),
 ('Nyhavn', 'Nyhavn 17', 121.81379177979764),
 ('Vestergade', 'Vestergade Ii', 131.4161663856455),
 ('Elmegade', 'Elmegade Ii', 142.0014007084329),
 ('Refshale', 'Refshalevej', 142.2930518665444),
 ('Overgaden Neden Vandet', 'Overgaden Neden Vandet Ii', 147.05909775373254),
 ('Solbjerg Pl Ii', 'Solbjergvej', 150.90113147019517),
 ('Carlsberg Campus', 'Carlsberg St.', 159.06292180996164)]

In [50]:
# sort each tuple inside sorted_distance_pairs alphabetically
sorted_distance_pairs = [tuple(sorted(pair[:2])) for pair in sorted_distance_pairs]
sorted_distance_pairs

[('Lygten', 'Lygten Ii'),
 ('Nimbusparken', 'Nimbusparken Ii'),
 ('Amerika Have', 'Amerika Plads'),
 ('Ved Vesterport', 'Vesterport'),
 ('Ørnevej', 'Ørnevej Ii'),
 ('Hellerup Posthus', 'Hellerup Station (Esthersvej)'),
 ('Nyhavn', 'Nyhavn 17'),
 ('Vestergade', 'Vestergade Ii'),
 ('Elmegade', 'Elmegade Ii'),
 ('Refshale', 'Refshalevej'),
 ('Overgaden Neden Vandet', 'Overgaden Neden Vandet Ii'),
 ('Solbjerg Pl Ii', 'Solbjergvej'),
 ('Carlsberg Campus', 'Carlsberg St.')]

In [51]:
rentals = optimize_remap_name_and_coordinates(rentals, sorted_distance_pairs, station_coords)


In [52]:
# Create a dictionary to map station names to their new names
'''renaming_map = dict(sorted_distance_pairs)

# Create a boolean mask for rows where nameStart or nameEnd needs to be remapped
mask = rentals['nameStart'].isin(renaming_map.keys()) | rentals['nameEnd'].isin(renaming_map.keys())

# Replace the original columns with the new ones where necessary
rentals.loc[mask, 'nameStart'] = rentals.loc[mask, 'nameStart'].replace(renaming_map)
rentals.loc[mask, 'nameEnd'] = rentals.loc[mask, 'nameEnd'].replace(renaming_map)
rentals.loc[mask, 'latitudeStart'] = rentals.loc[mask, 'nameStart'].map(station_coords).apply(lambda x: x[0])
rentals.loc[mask, 'longitudeStart'] = rentals.loc[mask, 'nameStart'].map(station_coords).apply(lambda x: x[1])
rentals.loc[mask, 'latitudeEnd'] = rentals.loc[mask, 'nameEnd'].map(station_coords).apply(lambda x: x[0])
rentals.loc[mask, 'longitudeEnd'] = rentals.loc[mask, 'nameEnd'].map(station_coords).apply(lambda x: x[1])'''

"renaming_map = dict(sorted_distance_pairs)\n\n# Create a boolean mask for rows where nameStart or nameEnd needs to be remapped\nmask = rentals['nameStart'].isin(renaming_map.keys()) | rentals['nameEnd'].isin(renaming_map.keys())\n\n# Replace the original columns with the new ones where necessary\nrentals.loc[mask, 'nameStart'] = rentals.loc[mask, 'nameStart'].replace(renaming_map)\nrentals.loc[mask, 'nameEnd'] = rentals.loc[mask, 'nameEnd'].replace(renaming_map)\nrentals.loc[mask, 'latitudeStart'] = rentals.loc[mask, 'nameStart'].map(station_coords).apply(lambda x: x[0])\nrentals.loc[mask, 'longitudeStart'] = rentals.loc[mask, 'nameStart'].map(station_coords).apply(lambda x: x[1])\nrentals.loc[mask, 'latitudeEnd'] = rentals.loc[mask, 'nameEnd'].map(station_coords).apply(lambda x: x[0])\nrentals.loc[mask, 'longitudeEnd'] = rentals.loc[mask, 'nameEnd'].map(station_coords).apply(lambda x: x[1])"

In [53]:
# all that starts with Nørreport in nameStart and nameEnd, rename to Nørreport St.
rentals.loc[rentals.nameStart.str.contains('Nørreport'), 'nameStart'] = 'Nørreport St.'
rentals.loc[rentals.nameEnd.str.contains('Nørreport'), 'nameEnd'] = 'Nørreport St.'

In [54]:
# for the same nameStart verify if there are different latitudeStart and longitudeStart
different_coord = rentals.groupby('nameStart')['latitudeStart'].nunique().sort_values(ascending=False)
different_coord = different_coord[different_coord > 1]



In [55]:
# for each index in different_coord, apply a function that returns the first latitudeStart and longitudeStart
def get_first_lat_long(name):
    return rentals[rentals.nameStart == name].iloc[0]['latitudeStart'], rentals[rentals.nameStart == name].iloc[0]['longitudeStart']


In [56]:
different_coord

nameStart
Kvægtorvsgade              2
Borgmester Fischers Vej    2
Rolfsvej                   2
Nordatlantens Brygge       2
Allersgade                 2
Borthigsgade               2
Tivoli Hotel               2
Kastruplundgade            2
Blågårdsgade               2
Mågevej                    2
Halvtolv                   2
Sundparken                 2
Adelgade                   2
Cabinn Hotel               2
Bodil Ipsens Vej           2
Holsteinsgade              2
Abel Cathrines Gade        2
Schleppegrellsgade         2
Nørreport St.              2
Gasværksvej                2
Name: latitudeStart, dtype: int64

In [57]:
for i in different_coord.index:
    rentals.loc[rentals.nameStart == i, ['latitudeStart', 'longitudeStart']] = get_first_lat_long(i)

In [58]:
rentals.head()

Unnamed: 0,StartTime,EndTime,StartHubId,EndHubId,UserId,latitudeStart,longitudeStart,nameStart,latitudeEnd,longitudeEnd,nameEnd
0,2018-03-01 17:43:14.707445,2018-03-01 18:14:12.145,2163,2449,108186,55.67344,12.564409,Central Station,55.658239,12.605434,Skotlands Plads
1,2018-03-02 09:55:18.823405,2018-03-02 16:56:35.013,2381,2381,113852,55.687996,12.561522,Ravnsborg,55.687996,12.561522,Ravnsborg
2,2018-03-02 14:00:10.755516,2018-03-02 17:13:08.047,1513,1513,113912,55.682558,12.580462,Møntergade,55.682558,12.580462,Møntergade
3,2018-03-01 10:25:34.429934,2018-03-02 20:23:46.544,2337,2337,113822,55.670289,12.565058,København H - Bus Stops,55.670289,12.565058,København H - Bus Stops
4,2018-03-02 08:51:47.459257,2018-03-02 21:18:48.813,2153,233,113881,55.680517,12.587455,Nyhavn,55.668475,12.557384,Høkerboderne


In [59]:
# count nr of unique values for nameStart
rentals.nameStart.nunique()

1064

In [60]:
rentals.info()

<class 'pandas.core.frame.DataFrame'>
Index: 255783 entries, 0 to 261188
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   StartTime       255783 non-null  datetime64[ns]
 1   EndTime         255783 non-null  datetime64[ns]
 2   StartHubId      255783 non-null  int64         
 3   EndHubId        255783 non-null  int32         
 4   UserId          255783 non-null  int64         
 5   latitudeStart   255783 non-null  float64       
 6   longitudeStart  255783 non-null  float64       
 7   nameStart       255783 non-null  object        
 8   latitudeEnd     255783 non-null  float64       
 9   longitudeEnd    255783 non-null  float64       
 10  nameEnd         255783 non-null  object        
dtypes: datetime64[ns](2), float64(4), int32(1), int64(2), object(2)
memory usage: 22.4+ MB


In [61]:
rentals.head()

Unnamed: 0,StartTime,EndTime,StartHubId,EndHubId,UserId,latitudeStart,longitudeStart,nameStart,latitudeEnd,longitudeEnd,nameEnd
0,2018-03-01 17:43:14.707445,2018-03-01 18:14:12.145,2163,2449,108186,55.67344,12.564409,Central Station,55.658239,12.605434,Skotlands Plads
1,2018-03-02 09:55:18.823405,2018-03-02 16:56:35.013,2381,2381,113852,55.687996,12.561522,Ravnsborg,55.687996,12.561522,Ravnsborg
2,2018-03-02 14:00:10.755516,2018-03-02 17:13:08.047,1513,1513,113912,55.682558,12.580462,Møntergade,55.682558,12.580462,Møntergade
3,2018-03-01 10:25:34.429934,2018-03-02 20:23:46.544,2337,2337,113822,55.670289,12.565058,København H - Bus Stops,55.670289,12.565058,København H - Bus Stops
4,2018-03-02 08:51:47.459257,2018-03-02 21:18:48.813,2153,233,113881,55.680517,12.587455,Nyhavn,55.668475,12.557384,Høkerboderne


In [62]:
# calculate haversine distance between each NameStart
from haversine import haversine

# Extracting latitude and longitude for each unique station name for both start and end stations
unique_stations_start = rentals[['nameStart', 'latitudeStart', 'longitudeStart']].drop_duplicates(subset=['nameStart'])
unique_stations_end = rentals[['nameEnd', 'latitudeEnd', 'longitudeEnd']].drop_duplicates(subset=['nameEnd'])

# Merging both to have a complete set of unique station names with their respective coordinates
unique_stations_start = unique_stations_start.rename(columns={'nameStart': 'name', 'latitudeStart': 'latitude', 'longitudeStart': 'longitude'})
unique_stations_end = unique_stations_end.rename(columns={'nameEnd': 'name', 'latitudeEnd': 'latitude', 'longitudeEnd': 'longitude'})
unique_stations_combined = pd.concat([unique_stations_start, unique_stations_end]).drop_duplicates(subset=['name']).reset_index(drop=True)
# create a dictionary for quick access to the latitude and longitude of each station
station_coords = {row['name']: (row['latitude'], row['longitude']) for index, row in unique_stations_combined.iterrows()}

# calculate the haversine distance between keys in station_coords
distance_pairs = []
keys = list(station_coords.keys())
for i, key1 in enumerate(keys[:-1]):  # Exclude the last key because it will have been compared with all others
    for key2 in keys[i+1:]:  # Only compare with stations that haven't been compared with key1
        lat1, lon1 = station_coords[key1]
        lat2, lon2 = station_coords[key2]
        distance = haversine((lat1, lon1), (lat2, lon2), unit='m')
        if distance <= 200:
            distance_pairs.append((key1, key2, distance))



In [63]:
# sort the pairs by distance for easy inspection
sorted_distance_pairs = sorted(distance_pairs, key=lambda x: x[2])

In [64]:
sorted_distance_pairs

[('Lavendelgade', 'Brasserie Royal', 10.372971640243634),
 ('Arendalsgade', 'Ved Classens Have', 12.411686591194647),
 ('Kongens Nytorv', 'Den Kongelige Teater', 17.071966150526617),
 ('Nyhavn', 'Mindeankeret', 17.120221629057596),
 ('Cph Airport', 'Lufthavn Ii', 31.816922925038078),
 ('Amager Strand St.', 'Roselillevej', 35.792644878795485),
 ('Lundtoftegade Ii', 'Bispeenbungen', 37.65711761576419),
 ('Peblinge Dossering', 'Wesselsgade', 39.14449123165206),
 ('Arresøgade', 'Nøddebogade', 40.23291349473647),
 ('Uplandsgade Ii', 'Dalslandsgade', 42.43395649781087),
 ('Købmagergade', 'Klareboderne', 46.22842885992032),
 ('Egilsgade', 'Gunløgsgade', 46.28990644311582),
 ('H. C. Andersens Blvd.', 'Tivoli Corner', 47.35220666425825),
 ('Mysundegade', 'Flensborggade', 48.156527146361974),
 ('Ved Glyptoteket', 'Glypoteket Have', 48.231083605252465),
 ('Dantes Pl.', 'Nationalmuseet', 48.47609240394781),
 ('Andreas Bjorns Gade', 'Burmeistersgade', 49.32434263527039),
 ('Lange-Müllers Gade', 'Vi

In [65]:
import networkx as nx

# Assuming distance_pairs is your list of station pairs with distances
G = nx.Graph()
for station1, station2, distance in distance_pairs:
    if distance <= 100:
        G.add_edge(station1, station2)

# Find the connected components (hubs)
hubs = list(nx.connected_components(G))

# Now hubs is a list of sets, where each set is a connected component


In [66]:
import plotly.graph_objects as go
import networkx as nx
from haversine import haversine

# Assuming 'G' is your graph created from the station data
G = nx.Graph()
# ... (your code to add nodes and edges)
for station1, station2, distance in distance_pairs:
    if distance <= 100:
        G.add_edge(station1, station2, distance=distance)

# Find the connected components (hubs)
hubs = list(nx.connected_components(G))
# Position the nodes using one of NetworkX's layout algorithms
pos = nx.spring_layout(G)  # positions for all nodes

# Create the Plotly figure
fig = go.Figure()

# Add edges as lines
for edge in G.edges(data=True):
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    distance = edge[2]['distance']
    fig.add_trace(go.Scatter(x=[x0, x1], y=[y0, y1], mode='lines',
                             line=dict(width=2, color='blue'),
                             hoverinfo='text',
                             hovertext=f"{edge[0]} - {edge[1]}: {distance:.2f} meters"
                             ))

# Add nodes as scatter points
node_x = []
node_y = []
for node in G.nodes():
    x, y = pos[node]
    node_x.append(x)
    node_y.append(y)

fig.add_trace(go.Scatter(x=node_x, y=node_y, mode='markers', 
                         marker=dict(size=10, color='red', line=dict(width=2)),
                         text=list(G.nodes()), hoverinfo='text'))

# Set the layout of the figure
fig.update_layout(showlegend=False, hovermode='closest', width=1200, height=700,
                  margin=dict(b=0,l=0,r=0,t=0),
                  xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                  yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))

# 
# The figure is interactive and can be displayed in a Jupyter notebook or saved as an HTML file
fig.show()


In [74]:
from pyvis.network import Network
import networkx as nx

# Create a graph from the station data
G = nx.Graph()

# Add edges to the graph based on your distance pairs
for station1, station2, distance in distance_pairs:
    if distance <= 100:
        G.add_edge(station1, station2, weight=distance, title=f"{distance:.2f} meters")

# Initialize the Pyvis network with custom options
nt = Network(notebook=True, height="600px", width="100%", heading='')

# Use the positions from spring_layout for better appearance
positions = nx.spring_layout(G)
nx.set_node_attributes(G, positions, 'pos')

# Add nodes and edges to the Pyvis network
for node, pos in positions.items():
    nt.add_node(node, title=str(node), x=pos[0]*1000, y=pos[1]*1000)

for edge in G.edges(data=True):
    nt.add_edge(edge[0], edge[1], title=edge[2]['title'])

# Customization options for the network
nt.set_options("""
{
  "nodes": {
    "borderWidthSelected": 2,
    "color": {
      "border": "rgba(0,97,216,1)",
      "background": "rgba(0,97,216,0.8)",
      "highlight": {
        "border": "rgba(255,0,0,1)",
        "background": "rgba(255,0,0,0.8)"
      }
    },
    "font": {
      "color": "#ffffff"
    }
  },
  "edges": {
    "color": {
      "color": "rgba(129,129,129,0.8)",
      "highlight": "rgba(255,0,0,1)"
    },
    "hoverWidth": 1.5
  },
  "interaction": {
    "dragNodes": true,
    "hover": true,
    "tooltipDelay": 30
  },
  "physics": {
    "enabled": true,
    "barnesHut": {
      "gravitationalConstant": -20000,
      "centralGravity": 0.3,
      "springLength": 100,
      "springConstant": 0.05,
      "damping": 0.09,
      "avoidOverlap": 0.1
    },
    "minVelocity": 0.75,
    "solver": "barnesHut",
    "timestep": 0.5
  }
}
""")

# Show the network
nt.show("stations_network.html")


Local cdn resources have problems on chrome/safari when used in jupyter-notebook. 


In [68]:
hubs

[{'Banegårdspladsen',
  'Central Station',
  'Tourist Information Office',
  'Vesterbrogade Temp'},
 {'Baggesensgade', 'Blågårdsgade', 'Ravnsborg'},
 {'Gammel Mønt', 'Møntergade'},
 {'Dgi Byen', 'København H - Bus Stops', 'Tietgensgade'},
 {'Charlottenborg Slot (Cph:Dox)',
  'Den Kongelige Teater',
  'Heibergsgade',
  'Herluf Trolles Gade',
  'Kongens Nytorv',
  'Mindeankeret',
  'Nyhavn',
  'Store Strandstræde',
  'Tordenskjoldsgade'},
 {'Badstuestræde',
  'Bag Rådhuset',
  'Brasserie Royal',
  'Brolæggerstræde',
  'Farvergade',
  'Gammel Strand',
  'Glypoteket Have',
  'H. C. Andersens Blvd.',
  'Knabrostræde',
  'Lavendelgade',
  'Mambo',
  'Mikkel Bryggers Gade',
  'Naboløs',
  'Nygade',
  'Nytorv',
  'Politikens Hus',
  'Regnbuepladsen',
  'Rådhuspladsen',
  'Rådhusstræde',
  'Tivoli Corner',
  'Vandkunsten',
  'Ved Glyptoteket',
  'Vestergade'},
 {'Kvægtorvsgade',
  'Maria Kirkeplads',
  'Reventlowsgade',
  'Stampesgade',
  'Urban House'},
 {'Christian Winthers Vej', 'Dagmars Pla

In [69]:
from math import pi

# Earth's radius in meters
earth_radius = 6371000

# Epsilon value in meters
eps_meters = 400

# Conversion of epsilon from meters to radians
eps_radians = eps_meters / (2 * pi * earth_radius)


from sklearn.cluster import DBSCAN
import numpy as np

# Assuming station_coords is a dict with station names as keys and coordinates as values
coords = np.array(list(station_coords.values()))

# DBSCAN expects the data to be in a NumPy array of shape (n_samples, n_features)
# In this case, our features are the latitude and longitude
db = DBSCAN(eps=eps_radians, min_samples=1, metric='haversine').fit(np.radians(coords))

# The labels_ attribute contains the cluster labels for each point
clusters = db.labels_

# To get a list of stations for each hub
hub_stations = {}
for station, cluster in zip(station_coords.keys(), clusters):
    if cluster not in hub_stations:
        hub_stations[cluster] = []
    hub_stations[cluster].append(station)

# Now hub_stations is a dict where each key is a cluster label and each value is a list of station names in that cluster


In [70]:
hub_stations

{0: ['Central Station'],
 1: ['Ravnsborg'],
 2: ['Møntergade'],
 3: ['København H - Bus Stops'],
 4: ['Nyhavn',
  'Charlottenborg Slot (Cph:Dox)',
  'Mindeankeret',
  'Store Strandstræde'],
 5: ['H. C. Andersens Blvd.', 'Tivoli Corner'],
 6: ['Den Sorte Plads'],
 7: ['Cykelslangen'],
 8: ['Reventlowsgade'],
 9: ['Kongens Nytorv', 'Den Kongelige Teater'],
 10: ['Heibergsgade'],
 11: ['Rantzausgade'],
 12: ['Gammeltorv'],
 13: ['Dagmars Plads', 'Christian Winthers Vej'],
 14: ['Forum St'],
 15: ['Nørreport St.'],
 16: ['Henrik Steffens Vej'],
 17: ['Gammel Kongvej'],
 18: ['Jægersborggade'],
 19: ['Københavns Museum'],
 20: ['Øresundsvej'],
 21: ['Ny Carlsberg Glyptotek'],
 22: ['Skibbroen'],
 23: ['Christianshavn'],
 24: ['Dronning Louises Bro'],
 25: ['Frederikssundsvej'],
 26: ['Jemtelandsgade'],
 27: ['Marmorbroen'],
 28: ['Sankt Thomas Alle'],
 29: ['Dybbølsgade'],
 30: ['Lygten'],
 31: ['Vega'],
 32: ['Gasværksvej', 'Vesterbros Torv'],
 33: ['Dr. Abildgaards Alle'],
 34: ['Dantes P

In [79]:
import folium

# Assuming station_coords is a dictionary like:
# station_coords = {'Station1': (lat1, lon1), 'Station2': (lat2, lon2), ...}

# Create a map object centered on an approximate central point of your coordinates
average_lat = sum(lat for lat, _ in station_coords.values()) / len(station_coords)
average_lon = sum(lon for _, lon in station_coords.values()) / len(station_coords)
mymap = folium.Map(location=[average_lat, average_lon], zoom_start=13)

# Add markers for each station
for station, (lat, lon) in station_coords.items():
    folium.Marker([lat, lon], popup=station).add_to(mymap)

# Draw lines between each pair of stations within the specified distance
for station1, station2, distance in distance_pairs:
    if distance <= 100:  # or any other threshold you set
        if station1 in station_coords and station2 in station_coords:
            loc1 = station_coords[station1]
            loc2 = station_coords[station2]
            folium.PolyLine([loc1, loc2], color='blue', weight=2.5, opacity=1).add_to(mymap)
        else:
            print(f"Station coordinates not found for: {station1} or {station2}")

# Show the map
mymap


In [71]:
# drop StartHubId and EndHubId
rentals.drop(['StartHubId', 'EndHubId'], axis=1, inplace=True)

In [72]:
# save rentals to csv
rentals.to_csv('../data/processed/donkey_rentals.csv', index=False)