In [748]:
# imports 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import datetime
# see all columns
pd.set_option('display.max_columns', None)
# see all rows
pd.set_option('display.max_rows', None)
# set seaborn style
sns.set_style('whitegrid')
# set context to notebook
sns.set_context('notebook')


In [749]:
# import data

# set path
path = '../data/raw/DonkeyRepublic/'

hubs = pd.read_excel(path + 'Hubs_2019-4-2_1201.xlsx', parse_dates=['created_at'])
rentals = pd.read_excel(path + 'Rentals_2019-4-2_1456.xlsx', parse_dates=['created_at', 'finished_at'])

In [750]:
# check data
hubs.head()


Unnamed: 0,created_at,latitude,longitude,id,name,deleted_at
0,2018-05-21 09:35:00.697175,48.864936,2.310624,3268,Cours la Reine,2018-05-21 16:17:24.722283
1,2018-11-26 09:06:05.590590,55.695252,12.547185,6367,Heinesgade,
2,2018-06-05 12:12:28.639837,55.676916,12.564896,3642,Concert Hall Pumpehuset,2018-11-15 10:45:34.468962
3,2018-05-28 20:07:34.173984,55.6687,12.551114,3526,Saxogade,
4,2018-11-08 09:36:50.440822,55.699557,12.515065,6233,GrÃ¸ndal Multicenter,


In [751]:
hubs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1266 entries, 0 to 1265
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   created_at  1266 non-null   datetime64[ns]
 1   latitude    1266 non-null   float64       
 2   longitude   1266 non-null   float64       
 3   id          1266 non-null   int64         
 4   name        1266 non-null   object        
 5   deleted_at  119 non-null    object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(2)
memory usage: 59.5+ KB


In [752]:
# remove deleted_at column
hubs.drop('deleted_at', axis=1, inplace=True)

In [753]:
hubs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1266 entries, 0 to 1265
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   created_at  1266 non-null   datetime64[ns]
 1   latitude    1266 non-null   float64       
 2   longitude   1266 non-null   float64       
 3   id          1266 non-null   int64         
 4   name        1266 non-null   object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(1)
memory usage: 49.6+ KB


In [754]:
rentals.head()

Unnamed: 0,created_at,finished_at,pickup_hub_id,dropoff_hub_id,user_id
0,2018-03-01 17:43:14.707445,2018-03-01 18:14:12.145,2163,2449.0,108186
1,2018-03-02 09:55:18.823405,2018-03-02 16:56:35.013,2381,2381.0,113852
2,2018-03-02 14:00:10.755516,2018-03-02 17:13:08.047,1513,1513.0,113912
3,2018-03-01 10:25:34.429934,2018-03-02 20:23:46.544,2337,2337.0,113822
4,2018-03-02 08:51:47.459257,2018-03-02 21:18:48.813,2153,233.0,113881


In [755]:
rentals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 279860 entries, 0 to 279859
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   created_at      279860 non-null  object 
 1   finished_at     279408 non-null  object 
 2   pickup_hub_id   279860 non-null  int64  
 3   dropoff_hub_id  261293 non-null  float64
 4   user_id         279860 non-null  int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 10.7+ MB


In [756]:
# print unique values for id in hubs and for pickup_hub_id and dropoff_hub_id in rentals
print('Hubs id unique values: ', hubs.id.nunique())
print('Pickup hub id unique values: ', rentals.pickup_hub_id.nunique())
print('Dropoff hub id unique values: ', rentals.dropoff_hub_id.nunique())

Hubs id unique values:  1266
Pickup hub id unique values:  1244
Dropoff hub id unique values:  1227


In [757]:
# count nr of nan values in dropoff_hub_id
print('Nr of nan values in dropoff_hub_id: ', rentals.dropoff_hub_id.isna().sum())

Nr of nan values in dropoff_hub_id:  18567


In [758]:
# print dataframe head for rentals with nan values in dropoff_hub_id
rentals[rentals.dropoff_hub_id.isna()].head()

Unnamed: 0,created_at,finished_at,pickup_hub_id,dropoff_hub_id,user_id
24,2018-03-03 11:50:26.336953,2018-03-05 09:54:43.594933,2251,,113998
47,2018-03-07 15:07:37.999381,2018-03-08 09:14:28.575214,2281,,114435
62,2018-03-10 11:00:33.847099,2018-03-10 13:54:22.669873,2227,,27688
118,2018-03-09 15:30:12.161583,2018-03-12 10:00:32.844345,152,,16
132,2018-03-12 10:18:23.874653,2018-03-12 15:45:00.200976,2337,,115368


In [759]:
# see the proportion of nan values in dropoff_hub_id
print('Proportion of nan values in dropoff_hub_id: ', rentals.dropoff_hub_id.isna().sum() / len(rentals))

Proportion of nan values in dropoff_hub_id:  0.0663438862288287


In [760]:
# drop rows with nan values in rentals

rentals.dropna(inplace=True)

In [761]:
rentals.info()

<class 'pandas.core.frame.DataFrame'>
Index: 261189 entries, 0 to 279859
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   created_at      261189 non-null  object 
 1   finished_at     261189 non-null  object 
 2   pickup_hub_id   261189 non-null  int64  
 3   dropoff_hub_id  261189 non-null  float64
 4   user_id         261189 non-null  int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 12.0+ MB


In [762]:
# inspect how many values are there in rentals with same dropoff and pickup hub
rentals[rentals.pickup_hub_id == rentals.dropoff_hub_id].shape

(61822, 5)

In [763]:
# see rentals time interval
print('Rentals time interval: ', rentals.created_at.min(), rentals.created_at.max())

Rentals time interval:  2018-03-01 08:12:40.421178 2019-04-02 12:39:57.98495


In [764]:
hubs.head()

Unnamed: 0,created_at,latitude,longitude,id,name
0,2018-05-21 09:35:00.697175,48.864936,2.310624,3268,Cours la Reine
1,2018-11-26 09:06:05.590590,55.695252,12.547185,6367,Heinesgade
2,2018-06-05 12:12:28.639837,55.676916,12.564896,3642,Concert Hall Pumpehuset
3,2018-05-28 20:07:34.173984,55.6687,12.551114,3526,Saxogade
4,2018-11-08 09:36:50.440822,55.699557,12.515065,6233,GrÃ¸ndal Multicenter


In [765]:
# in hubs explore names that contain non english characters
hubs[hubs.name.str.contains('[^a-zA-Z0-9\s]')].head()

Unnamed: 0,created_at,latitude,longitude,id,name
4,2018-11-08 09:36:50.440822,55.699557,12.515065,6233,GrÃ¸ndal Multicenter
7,2019-03-14 11:39:00.814359,55.697218,12.584605,7148,SÃ¸nderborggade
8,2018-07-22 15:03:16.438551,55.669671,12.545823,4947,MatthÃ¦usgade
14,2018-11-29 18:53:59.393007,55.616528,12.585753,6411,PilegÃ¥rd Alle
17,2018-12-03 12:31:53.789002,55.637874,12.59054,6434,Hf. Elmebo


In [766]:
# replace Ã¦ with æ in hubs names
# replace Ã¸Ã˜ with ø, Ø in hubs names
# replace Ã¥ with å in hubs names
# replace Ã… with Å in hubs names
# replace Ã© with é in hubs names
# replace Ã¼ with ü in hubs names
# replace Ã¶ with ö in hubs names
# replace non-ascii characters in hubs names
hubs['name'] = hubs['name'].str.replace('Ã¦', 'æ', regex=True)
hubs['name'] = hubs['name'].str.replace('Ã¸', 'ø', regex=True)
hubs['name'] = hubs['name'].str.replace('Ã¥', 'å', regex=True)
hubs['name'] = hubs['name'].str.replace('Ã…', 'Å', regex=True)
hubs['name'] = hubs['name'].str.replace('Ã˜', 'Ø', regex=True)
hubs['name'] = hubs['name'].str.replace('Ã©', 'é', regex=True)
hubs['name'] = hubs['name'].str.replace('Ã¼', 'ü', regex=True)
hubs['name'] = hubs['name'].str.replace('Ã¶', 'ö', regex=True)

# remove entire substring if it is in between [] like [N], [G]...
hubs['name'] = hubs['name'].str.replace('\[[^\]]*\]', '', regex=True)

In [767]:
hubs.head()

Unnamed: 0,created_at,latitude,longitude,id,name
0,2018-05-21 09:35:00.697175,48.864936,2.310624,3268,Cours la Reine
1,2018-11-26 09:06:05.590590,55.695252,12.547185,6367,Heinesgade
2,2018-06-05 12:12:28.639837,55.676916,12.564896,3642,Concert Hall Pumpehuset
3,2018-05-28 20:07:34.173984,55.6687,12.551114,3526,Saxogade
4,2018-11-08 09:36:50.440822,55.699557,12.515065,6233,Grøndal Multicenter


In [768]:
# check how man dropoff_hub_id are not in hubs id
print('Nr of dropoff_hub_id not in hubs id: ', rentals[~rentals.dropoff_hub_id.isin(hubs.id)].dropoff_hub_id.nunique())

Nr of dropoff_hub_id not in hubs id:  36


In [769]:
# merge hubs and rentals on pickup_hub_id
rentals = rentals.merge(hubs[['id', 'latitude', 'longitude', 'name']], how='left', left_on='pickup_hub_id', right_on='id', suffixes=('', 'Start'))
rentals.head()

Unnamed: 0,created_at,finished_at,pickup_hub_id,dropoff_hub_id,user_id,id,latitude,longitude,name
0,2018-03-01 17:43:14.707445,2018-03-01 18:14:12.145,2163,2449.0,108186,2163.0,55.67344,12.564409,Central Station
1,2018-03-02 09:55:18.823405,2018-03-02 16:56:35.013,2381,2381.0,113852,2381.0,55.688937,12.562486,Ravnsborggade
2,2018-03-02 14:00:10.755516,2018-03-02 17:13:08.047,1513,1513.0,113912,1513.0,55.682558,12.580462,Møntergade
3,2018-03-01 10:25:34.429934,2018-03-02 20:23:46.544,2337,2337.0,113822,2337.0,55.670289,12.565058,København H - Bus Stops
4,2018-03-02 08:51:47.459257,2018-03-02 21:18:48.813,2153,233.0,113881,2153.0,55.680517,12.587455,Nyhavn


In [770]:
# for each unique id in dropoff_hub_id, get the corresponding latitude, longitude and name from hubs
# add these columns to rentals named LatitudeEnd, LongitudeEnd, NameEnd
rentals = rentals.merge(hubs[['id', 'latitude', 'longitude', 'name']], how='left', left_on='dropoff_hub_id', right_on='id', suffixes=('', 'End'))

In [771]:
rentals.info()  

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 261189 entries, 0 to 261188
Data columns (total 13 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   created_at      261189 non-null  object 
 1   finished_at     261189 non-null  object 
 2   pickup_hub_id   261189 non-null  int64  
 3   dropoff_hub_id  261189 non-null  float64
 4   user_id         261189 non-null  int64  
 5   id              258232 non-null  float64
 6   latitude        258232 non-null  float64
 7   longitude       258232 non-null  float64
 8   name            258232 non-null  object 
 9   idEnd           258209 non-null  float64
 10  latitudeEnd     258209 non-null  float64
 11  longitudeEnd    258209 non-null  float64
 12  nameEnd         258209 non-null  object 
dtypes: float64(7), int64(2), object(4)
memory usage: 25.9+ MB


In [772]:
# drop nan values in rentals
rentals.dropna(inplace=True)

In [773]:
# rename columns, created_at to StartTime, finished_at to EndTime, name to PickupHubName, latitude to LatitudeStart, longitude to LongitudeStart
rentals.rename(columns={'created_at': 'StartTime', 'finished_at': 'EndTime', 'latitude': 'latitudeStart', 
                        'longitude': 'longitudeStart', "pickup_hub_id": "StartHubId", "dropoff_hub_id": "EndHubId",
                        "user_id": "UserId", "name":"nameStart"}, inplace=True)

In [774]:
rentals.head()

Unnamed: 0,StartTime,EndTime,StartHubId,EndHubId,UserId,id,latitudeStart,longitudeStart,nameStart,idEnd,latitudeEnd,longitudeEnd,nameEnd
0,2018-03-01 17:43:14.707445,2018-03-01 18:14:12.145,2163,2449.0,108186,2163.0,55.67344,12.564409,Central Station,2449.0,55.658239,12.605434,Skotlands Plads
1,2018-03-02 09:55:18.823405,2018-03-02 16:56:35.013,2381,2381.0,113852,2381.0,55.688937,12.562486,Ravnsborggade,2381.0,55.688937,12.562486,Ravnsborggade
2,2018-03-02 14:00:10.755516,2018-03-02 17:13:08.047,1513,1513.0,113912,1513.0,55.682558,12.580462,Møntergade,1513.0,55.682558,12.580462,Møntergade
3,2018-03-01 10:25:34.429934,2018-03-02 20:23:46.544,2337,2337.0,113822,2337.0,55.670289,12.565058,København H - Bus Stops,2337.0,55.670289,12.565058,København H - Bus Stops
4,2018-03-02 08:51:47.459257,2018-03-02 21:18:48.813,2153,233.0,113881,2153.0,55.680517,12.587455,Nyhavn,233.0,55.668475,12.557384,Høkerboderne


In [775]:
# drop id and idEnd columns
rentals.drop(['id', 'idEnd'], axis=1, inplace=True)

In [776]:
rentals.info()  

<class 'pandas.core.frame.DataFrame'>
Index: 255990 entries, 0 to 261188
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   StartTime       255990 non-null  object 
 1   EndTime         255990 non-null  object 
 2   StartHubId      255990 non-null  int64  
 3   EndHubId        255990 non-null  float64
 4   UserId          255990 non-null  int64  
 5   latitudeStart   255990 non-null  float64
 6   longitudeStart  255990 non-null  float64
 7   nameStart       255990 non-null  object 
 8   latitudeEnd     255990 non-null  float64
 9   longitudeEnd    255990 non-null  float64
 10  nameEnd         255990 non-null  object 
dtypes: float64(5), int64(2), object(4)
memory usage: 23.4+ MB


In [777]:
rentals.head()

Unnamed: 0,StartTime,EndTime,StartHubId,EndHubId,UserId,latitudeStart,longitudeStart,nameStart,latitudeEnd,longitudeEnd,nameEnd
0,2018-03-01 17:43:14.707445,2018-03-01 18:14:12.145,2163,2449.0,108186,55.67344,12.564409,Central Station,55.658239,12.605434,Skotlands Plads
1,2018-03-02 09:55:18.823405,2018-03-02 16:56:35.013,2381,2381.0,113852,55.688937,12.562486,Ravnsborggade,55.688937,12.562486,Ravnsborggade
2,2018-03-02 14:00:10.755516,2018-03-02 17:13:08.047,1513,1513.0,113912,55.682558,12.580462,Møntergade,55.682558,12.580462,Møntergade
3,2018-03-01 10:25:34.429934,2018-03-02 20:23:46.544,2337,2337.0,113822,55.670289,12.565058,København H - Bus Stops,55.670289,12.565058,København H - Bus Stops
4,2018-03-02 08:51:47.459257,2018-03-02 21:18:48.813,2153,233.0,113881,55.680517,12.587455,Nyhavn,55.668475,12.557384,Høkerboderne


In [778]:
# for the same nameStart verify if there are different StartHubId
rentals.groupby('nameStart')['StartHubId'].nunique().sort_values(ascending=False).head()

nameStart
Badstuestræde     2
Adelgade II       2
Kvægtorvsgade     2
Sønderborggade    2
Knabrostræde      2
Name: StartHubId, dtype: int64

In [779]:
# inspect how many values are there in rentals with same dropoff and pickup hub
rentals[rentals.StartHubId == rentals.EndHubId].shape

(61111, 11)

In [780]:
# set StartTime and EndTime to datetime
rentals['StartTime'] = pd.to_datetime(rentals['StartTime'], format="mixed")
rentals['EndTime'] = pd.to_datetime(rentals['EndTime'], format="mixed")
# set EndHubId to int
rentals['EndHubId'] = rentals['EndHubId'].astype(int)

In [781]:
rentals.info()

<class 'pandas.core.frame.DataFrame'>
Index: 255990 entries, 0 to 261188
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   StartTime       255990 non-null  datetime64[ns]
 1   EndTime         255990 non-null  datetime64[ns]
 2   StartHubId      255990 non-null  int64         
 3   EndHubId        255990 non-null  int32         
 4   UserId          255990 non-null  int64         
 5   latitudeStart   255990 non-null  float64       
 6   longitudeStart  255990 non-null  float64       
 7   nameStart       255990 non-null  object        
 8   latitudeEnd     255990 non-null  float64       
 9   longitudeEnd    255990 non-null  float64       
 10  nameEnd         255990 non-null  object        
dtypes: datetime64[ns](2), float64(4), int32(1), int64(2), object(2)
memory usage: 22.5+ MB


In [782]:
duplicate_rows_new = rentals.duplicated().sum()

In [783]:
duplicate_rows_new

207

In [784]:
# drop duplicates
rentals.drop_duplicates(inplace=True)

In [785]:
# remove leading and trailing whitespaces from nameStart and nameEnd
rentals['nameStart'] = rentals['nameStart'].str.strip().str.title()
rentals['nameEnd'] = rentals['nameEnd'].str.strip().str.title()

In [786]:
# inspect how many values are there in rentals with same dropoff and pickup hub
rentals[rentals.StartHubId == rentals.EndHubId].shape

(61050, 11)

In [787]:
from jellyfish import jaro_winkler_similarity

# Extracting latitude and longitude for each unique station name for both start and end stations
unique_stations_start = rentals[['nameStart', 'latitudeStart', 'longitudeStart']].drop_duplicates(subset=['nameStart'])
unique_stations_end = rentals[['nameEnd', 'latitudeEnd', 'longitudeEnd']].drop_duplicates(subset=['nameEnd'])

# Merging both to have a complete set of unique station names with their respective coordinates
unique_stations_start = unique_stations_start.rename(columns={'nameStart': 'name', 'latitudeStart': 'latitude', 'longitudeStart': 'longitude'})
unique_stations_end = unique_stations_end.rename(columns={'nameEnd': 'name', 'latitudeEnd': 'latitude', 'longitudeEnd': 'longitude'})
unique_stations_combined = pd.concat([unique_stations_start, unique_stations_end]).drop_duplicates(subset=['name']).reset_index(drop=True)

# Displaying the merged unique stations with their coordinates
unique_stations_combined.head()
# Computing pairwise Jaro-Winkler similarities
similarity_pairs = []
for i in range(len(unique_stations_combined)):
    for j in range(i+1, len(unique_stations_combined)):
        similarity = jaro_winkler_similarity(unique_stations_combined['name'][i], unique_stations_combined['name'][j])
        if similarity > 0.85:  # Threshold set to 0.85 to capture high similarities
            similarity_pairs.append((unique_stations_combined['name'][i], unique_stations_combined['name'][j], similarity))

# Sorting pairs by similarity score for easy inspection
sorted_similarity_pairs = sorted(similarity_pairs, key=lambda x: x[2], reverse=True)

sorted_similarity_pairs

[('Overgaden Oven Vandet Ii', 'Overgaden Oven Vandet Iii', 0.992),
 ('Emil Holms Kanal Iii', 'Emil Holms Kanal Ii', 0.99),
 ('Lyshøjgårdsvej Iii', 'Lyshøjgårdsvej Ii', 0.9888888888888889),
 ('Nimbusparken Ii', 'Nimbusparken Iii', 0.9874999999999999),
 ('Vestergade Ii', 'Vestergade Iii', 0.9857142857142858),
 ('Elmegade Iii', 'Elmegade Ii', 0.9833333333333333),
 ('Ørnevej Ii', 'Ørnevej Iii', 0.9818181818181818),
 ('Lygten Ii', 'Lygten Iii', 0.98),
 ('Reffen I', 'Reffen Ii', 0.9777777777777777),
 ('Henrik Harpestrengs Vej Ii', 'Henrik Harpestrengs Vej', 0.9769230769230769),
 ('Dag Hammarskjölds Alle', 'Dag Hammarskjölds Alle Ll', 0.976),
 ('Overgaden Neden Vandet', 'Overgaden Neden Vandet Ii', 0.976),
 ('Overgaden Oven Vandet', 'Overgaden Oven Vandet Ii', 0.975),
 ('Kalkbrænderihavnsgade', 'Kalkbrænderihavnsgade Ii', 0.975),
 ('Jens Otto Krags Gade', 'Jens Otto Krags Gade Ii', 0.9739130434782608),
 ('Nimbusparken Ii', 'Nimbusparken Iv', 0.9733333333333334),
 ('Oehlenschlægersgade', 'Oehl

In [788]:
from haversine import haversine

# We'll create a dictionary for quick access to the latitude and longitude of each station
station_coords = {row['name']: (row['latitude'], row['longitude']) for index, row in unique_stations_combined.iterrows()}

# Calculate the Haversine distance for each pair in the sorted_similarity_pairs
distance_pairs = []
for pair in sorted_similarity_pairs:
    name1, name2, _ = pair
    lat1, lon1 = station_coords[name1]
    lat2, lon2 = station_coords[name2]
    distance = haversine((lat1, lon1), (lat2, lon2), unit='m')
    distance_pairs.append((name1, name2, distance))

# Sort the pairs by distance for easy inspection
sorted_distance_pairs = sorted(distance_pairs, key=lambda x: x[2])
print(len(sorted_distance_pairs))
# display all pairs with distance less than 100m
sorted_distance_pairs = [pair for pair in sorted_distance_pairs if pair[2] <= 175]
sorted_distance_pairs

722


[('Femøren (Metro St.)', 'Femøren St.', 4.343539721228821),
 ('Christianshavn St.', 'Christianshavns Torv', 7.145052439929078),
 ('Solitudvej', 'Solitudevej', 11.4266293856133),
 ('Rådhuspladsen Ii', 'Rådhuspladsen Temp', 17.75528725699629),
 ('Christianshavns Torv', 'Christianshavn', 25.529774260851184),
 ('Adelgade Alternative', 'Adelgade Ii', 28.31838697423755),
 ('Christianshavn St.', 'Christianshavn', 31.160107072740377),
 ('Jarmers Tårn', 'Jarmers Plads', 37.972311632958615),
 ('Danshøj St', 'Danshøj Station Eastside', 39.10263053093604),
 ('Lindevangs Alle', 'Lindevangs', 47.11512244769292),
 ('Vestergade Ii', 'Vestergade Iii', 50.655516177101894),
 ('Langelands Pl. Ii', 'Langelands Pl.', 56.878141400960274),
 ('Erik Ejegods Gade', 'Erik Ejegods Gade Ii', 56.93505344458927),
 ('Vanløse St. Bike Parking', 'Vanløse St.', 63.41902436378586),
 ('Birkegade', 'Birkegade Ii', 64.45309100343475),
 ('Ellebjergvej', 'Ellebjergvej Ii', 65.29599757315623),
 ('Kongens Nytorv St.', 'Kongens N

In [789]:
station_coords

{'Central Station': (55.6734396, 12.5644085),
 'Ravnsborggade': (55.6889369, 12.562486),
 'Møntergade': (55.6825584, 12.5804619),
 'København H - Bus Stops': (55.6702886, 12.5650579),
 'Nyhavn': (55.6805168, 12.5874547),
 'H. C. Andersens Blvd.': (55.6736684, 12.5715505),
 'Den Sorte Plads': (55.7010158, 12.542979),
 'Cykelslangen': (55.6628627, 12.5615546),
 'Reventlowsgade': (55.6723889, 12.5637666),
 'Kongens Nytorv St.': (55.6793107, 12.5851017),
 'Heibergsgade': (55.6791973, 12.5880951),
 'Rantzausgade': (55.6876388, 12.5467025),
 'Gammeltorv': (55.6781857, 12.5718583),
 'Dagmars Plads': (55.6812366, 12.5345716),
 'Forum St': (55.6818014, 12.5521897),
 'Nørreport St. (Depot)': (55.6825968, 12.5712267),
 'Henrik Steffens Vej': (55.6774044, 12.5377849),
 'Gammel Kongvej': (55.6768597, 12.540163),
 'Jægersborggade': (55.6929863, 12.5428972),
 'Københavns Museum': (55.6723678, 12.5537259),
 'Øresundsvej': (55.6581866, 12.6086291),
 'Ny Carlsberg Glyptotek': (55.6725808, 12.5738921),
 

In [790]:
# sort each tuple inside sorted_distance_pairs alphabetically
sorted_distance_pairs = [tuple(sorted(pair[:2])) for pair in sorted_distance_pairs]
sorted_distance_pairs

[('Femøren (Metro St.)', 'Femøren St.'),
 ('Christianshavn St.', 'Christianshavns Torv'),
 ('Solitudevej', 'Solitudvej'),
 ('Rådhuspladsen Ii', 'Rådhuspladsen Temp'),
 ('Christianshavn', 'Christianshavns Torv'),
 ('Adelgade Alternative', 'Adelgade Ii'),
 ('Christianshavn', 'Christianshavn St.'),
 ('Jarmers Plads', 'Jarmers Tårn'),
 ('Danshøj St', 'Danshøj Station Eastside'),
 ('Lindevangs', 'Lindevangs Alle'),
 ('Vestergade Ii', 'Vestergade Iii'),
 ('Langelands Pl.', 'Langelands Pl. Ii'),
 ('Erik Ejegods Gade', 'Erik Ejegods Gade Ii'),
 ('Vanløse St.', 'Vanløse St. Bike Parking'),
 ('Birkegade', 'Birkegade Ii'),
 ('Ellebjergvej', 'Ellebjergvej Ii'),
 ('Kongens Nytorv', 'Kongens Nytorv St.'),
 ('Rørholmsgade', 'Rørholmsgade Ii'),
 ('Overgaden Neden Vandet Ii', 'Overgaden Oven Vandet Ii'),
 ('Holmens Kanal', 'Holmens Kanal Ii'),
 ('Banegårdspladsen', 'Banegårdspladsen West'),
 ('Israels Pl. Ii', 'Israels Plads'),
 ('Kastrup St.', 'Kastrup St. Ii'),
 ('Solbjerg Kirke', 'Solbjergvej Ii'),


In [791]:
"""def remap_name_and_coordinates(row):
    for pair in sorted_distance_pairs:
        if row['nameStart'] == pair[1]:
            row['nameStart'] = pair[0]
            row['latitudeStart'] = station_coords[pair[0]][0]
            row['longitudeStart'] = station_coords[pair[0]][1]
        if row['nameEnd'] == pair[1]:
            row['nameEnd'] = pair[0]
            row['latitudeEnd'] = station_coords[pair[0]][0]
            row['longitudeEnd'] = station_coords[pair[0]][1]
    return row

# Apply the function to rentals
rentals = rentals.apply(remap_name_and_coordinates, axis=1)"""

"def remap_name_and_coordinates(row):\n    for pair in sorted_distance_pairs:\n        if row['nameStart'] == pair[1]:\n            row['nameStart'] = pair[0]\n            row['latitudeStart'] = station_coords[pair[0]][0]\n            row['longitudeStart'] = station_coords[pair[0]][1]\n        if row['nameEnd'] == pair[1]:\n            row['nameEnd'] = pair[0]\n            row['latitudeEnd'] = station_coords[pair[0]][0]\n            row['longitudeEnd'] = station_coords[pair[0]][1]\n    return row\n\n# Apply the function to rentals\nrentals = rentals.apply(remap_name_and_coordinates, axis=1)"

In [792]:
def optimize_remap_name_and_coordinates(rentals, sorted_distance_pairs, station_coords):
    # Create a mapping for names to their standardized version and corresponding coordinates
    name_to_standard = {pair[1]: pair[0] for pair in sorted_distance_pairs}
    coord_map = {name: station_coords[name] for name, _ in sorted_distance_pairs}

    # Vectorized operations to update names
    rentals['nameStart'] = rentals['nameStart'].map(name_to_standard).fillna(rentals['nameStart'])
    rentals['nameEnd'] = rentals['nameEnd'].map(name_to_standard).fillna(rentals['nameEnd'])

    # Vectorized operations to update coordinates
    for name, coords in coord_map.items():
        mask_start = rentals['nameStart'] == name
        rentals.loc[mask_start, 'latitudeStart'] = coords[0]
        rentals.loc[mask_start, 'longitudeStart'] = coords[1]

        mask_end = rentals['nameEnd'] == name
        rentals.loc[mask_end, 'latitudeEnd'] = coords[0]
        rentals.loc[mask_end, 'longitudeEnd'] = coords[1]

    return rentals


In [793]:
rentals = optimize_remap_name_and_coordinates(rentals, sorted_distance_pairs, station_coords)


In [794]:
import numpy as np


"""
# Create a dictionary to map station names to their new names
renaming_map = dict(sorted_distance_pairs)

# Create a boolean mask for rows where nameStart or nameEnd needs to be remapped
mask = rentals['nameStart'].isin(renaming_map.keys()) | rentals['nameEnd'].isin(renaming_map.keys())

# Replace the original columns with the new ones where necessary
rentals.loc[mask, 'nameStart'] = rentals.loc[mask, 'nameStart'].replace(renaming_map)
rentals.loc[mask, 'nameEnd'] = rentals.loc[mask, 'nameEnd'].replace(renaming_map)
rentals.loc[mask, 'latitudeStart'] = rentals.loc[mask, 'nameStart'].map(station_coords).apply(lambda x: x[0])
rentals.loc[mask, 'longitudeStart'] = rentals.loc[mask, 'nameStart'].map(station_coords).apply(lambda x: x[1])
rentals.loc[mask, 'latitudeEnd'] = rentals.loc[mask, 'nameEnd'].map(station_coords).apply(lambda x: x[0])
rentals.loc[mask, 'longitudeEnd'] = rentals.loc[mask, 'nameEnd'].map(station_coords).apply(lambda x: x[1])
"""

"\n# Create a dictionary to map station names to their new names\nrenaming_map = dict(sorted_distance_pairs)\n\n# Create a boolean mask for rows where nameStart or nameEnd needs to be remapped\nmask = rentals['nameStart'].isin(renaming_map.keys()) | rentals['nameEnd'].isin(renaming_map.keys())\n\n# Replace the original columns with the new ones where necessary\nrentals.loc[mask, 'nameStart'] = rentals.loc[mask, 'nameStart'].replace(renaming_map)\nrentals.loc[mask, 'nameEnd'] = rentals.loc[mask, 'nameEnd'].replace(renaming_map)\nrentals.loc[mask, 'latitudeStart'] = rentals.loc[mask, 'nameStart'].map(station_coords).apply(lambda x: x[0])\nrentals.loc[mask, 'longitudeStart'] = rentals.loc[mask, 'nameStart'].map(station_coords).apply(lambda x: x[1])\nrentals.loc[mask, 'latitudeEnd'] = rentals.loc[mask, 'nameEnd'].map(station_coords).apply(lambda x: x[0])\nrentals.loc[mask, 'longitudeEnd'] = rentals.loc[mask, 'nameEnd'].map(station_coords).apply(lambda x: x[1])\n"

In [795]:
# Extracting latitude and longitude for each unique station name for both start and end stations
unique_stations_start = rentals[['nameStart', 'latitudeStart', 'longitudeStart']].drop_duplicates(subset=['nameStart'])
unique_stations_end = rentals[['nameEnd', 'latitudeEnd', 'longitudeEnd']].drop_duplicates(subset=['nameEnd'])

# Merging both to have a complete set of unique station names with their respective coordinates
unique_stations_start = unique_stations_start.rename(columns={'nameStart': 'name', 'latitudeStart': 'latitude', 'longitudeStart': 'longitude'})
unique_stations_end = unique_stations_end.rename(columns={'nameEnd': 'name', 'latitudeEnd': 'latitude', 'longitudeEnd': 'longitude'})
unique_stations_combined = pd.concat([unique_stations_start, unique_stations_end]).drop_duplicates(subset=['name']).reset_index(drop=True)

# Displaying the merged unique stations with their coordinates
unique_stations_combined.head()
# Computing pairwise Jaro-Winkler similarities
similarity_pairs = []
for i in range(len(unique_stations_combined)):
    for j in range(i+1, len(unique_stations_combined)):
        similarity = jaro_winkler_similarity(unique_stations_combined['name'][i], unique_stations_combined['name'][j])
        if similarity > 0.85:  # Threshold set to 0.85 to capture high similarities
            similarity_pairs.append((unique_stations_combined['name'][i], unique_stations_combined['name'][j], similarity))

# Sorting pairs by similarity score for easy inspection
sorted_similarity_pairs = sorted(similarity_pairs, key=lambda x: x[2], reverse=True)

sorted_similarity_pairs

[('Overgaden Neden Vandet', 'Overgaden Neden Vandet Ii', 0.976),
 ('Kalkbrænderihavnsgade', 'Kalkbrænderihavnsgade Ii', 0.975),
 ('Oehlenschlægersgade', 'Oehlenschlægersgade Ii', 0.9727272727272728),
 ('Blegdamsvej', 'Blegdamsvej 2', 0.9692307692307692),
 ('P. G. Ramms Alle', 'P. G. Ramms Alle Ii', 0.968421052631579),
 ('Amaliegarden', 'Amaliegade', 0.9666666666666667),
 ('Vesterfælledvej', 'Vesterfælledvej Ii', 0.9666666666666667),
 ('Brøndby Stadion', 'Brøndby Stadion Ii', 0.9666666666666667),
 ('Sankt Annæ Gade', 'Sankt Annæ Gade Ii', 0.9666666666666667),
 ('Rentemestervej', 'Rentemestervej Ii', 0.9647058823529411),
 ('Fredericiagade', 'Fredericiagade Ii', 0.9647058823529411),
 ('Lyshøjgårdsvej Ii', 'Lyshøjgårdsvej', 0.9647058823529411),
 ('Lundtoftegade', 'Lundtoftegade Ii', 0.9625),
 ('Løjtegårdsvej', 'Løjtegårdsvej Ii', 0.9625),
 ('Kirkegårdsvej', 'Kirkegårdsvej Ii', 0.9625),
 ('Dtu Building 421', 'Dtu Building 127', 0.9616666666666668),
 ('Emil Holms Kanal', 'Emil Holms Kanal Ii

In [796]:
# We'll create a dictionary for quick access to the latitude and longitude of each station
station_coords = {row['name']: (row['latitude'], row['longitude']) for index, row in unique_stations_combined.iterrows()}

# Calculate the Haversine distance for each pair in the sorted_similarity_pairs
distance_pairs = []
for pair in sorted_similarity_pairs:
    name1, name2, _ = pair
    lat1, lon1 = station_coords[name1]
    lat2, lon2 = station_coords[name2]
    distance = haversine((lat1, lon1), (lat2, lon2), unit='m')
    distance_pairs.append((name1, name2, distance))

# Sort the pairs by distance for easy inspection
sorted_distance_pairs = sorted(distance_pairs, key=lambda x: x[2])
print(len(sorted_distance_pairs))
# display all pairs with distance less than 100m
sorted_distance_pairs = [pair for pair in sorted_distance_pairs if pair[2] <= 100]
sorted_distance_pairs

498


[('Lygten', 'Lygten Ii', 86.79185127483214),
 ('Nimbusparken', 'Nimbusparken Ii', 93.40571707178691),
 ('Amerika Plads', 'Amerika Have', 95.61153472950306)]

In [797]:
# sort each tuple inside sorted_distance_pairs alphabetically
sorted_distance_pairs = [tuple(sorted(pair[:2])) for pair in sorted_distance_pairs]
sorted_distance_pairs

[('Lygten', 'Lygten Ii'),
 ('Nimbusparken', 'Nimbusparken Ii'),
 ('Amerika Have', 'Amerika Plads')]

In [798]:
rentals = optimize_remap_name_and_coordinates(rentals, sorted_distance_pairs, station_coords)


In [799]:
# Create a dictionary to map station names to their new names
'''renaming_map = dict(sorted_distance_pairs)

# Create a boolean mask for rows where nameStart or nameEnd needs to be remapped
mask = rentals['nameStart'].isin(renaming_map.keys()) | rentals['nameEnd'].isin(renaming_map.keys())

# Replace the original columns with the new ones where necessary
rentals.loc[mask, 'nameStart'] = rentals.loc[mask, 'nameStart'].replace(renaming_map)
rentals.loc[mask, 'nameEnd'] = rentals.loc[mask, 'nameEnd'].replace(renaming_map)
rentals.loc[mask, 'latitudeStart'] = rentals.loc[mask, 'nameStart'].map(station_coords).apply(lambda x: x[0])
rentals.loc[mask, 'longitudeStart'] = rentals.loc[mask, 'nameStart'].map(station_coords).apply(lambda x: x[1])
rentals.loc[mask, 'latitudeEnd'] = rentals.loc[mask, 'nameEnd'].map(station_coords).apply(lambda x: x[0])
rentals.loc[mask, 'longitudeEnd'] = rentals.loc[mask, 'nameEnd'].map(station_coords).apply(lambda x: x[1])'''

"renaming_map = dict(sorted_distance_pairs)\n\n# Create a boolean mask for rows where nameStart or nameEnd needs to be remapped\nmask = rentals['nameStart'].isin(renaming_map.keys()) | rentals['nameEnd'].isin(renaming_map.keys())\n\n# Replace the original columns with the new ones where necessary\nrentals.loc[mask, 'nameStart'] = rentals.loc[mask, 'nameStart'].replace(renaming_map)\nrentals.loc[mask, 'nameEnd'] = rentals.loc[mask, 'nameEnd'].replace(renaming_map)\nrentals.loc[mask, 'latitudeStart'] = rentals.loc[mask, 'nameStart'].map(station_coords).apply(lambda x: x[0])\nrentals.loc[mask, 'longitudeStart'] = rentals.loc[mask, 'nameStart'].map(station_coords).apply(lambda x: x[1])\nrentals.loc[mask, 'latitudeEnd'] = rentals.loc[mask, 'nameEnd'].map(station_coords).apply(lambda x: x[0])\nrentals.loc[mask, 'longitudeEnd'] = rentals.loc[mask, 'nameEnd'].map(station_coords).apply(lambda x: x[1])"

In [800]:
# all that starts with Nørreport in nameStart and nameEnd, rename to Nørreport St.
rentals.loc[rentals.nameStart.str.contains('Nørreport'), 'nameStart'] = 'Nørreport St.'
rentals.loc[rentals.nameEnd.str.contains('Nørreport'), 'nameEnd'] = 'Nørreport St.'

In [801]:
# for the same nameStart verify if there are different latitudeStart and longitudeStart
different_coord = rentals.groupby('nameStart')['latitudeStart'].nunique().sort_values(ascending=False)
different_coord = different_coord[different_coord > 1]



In [802]:
# for each index in different_coord, apply a function that returns the first latitudeStart and longitudeStart
def get_first_lat_long(name):
    return rentals[rentals.nameStart == name].iloc[0]['latitudeStart'], rentals[rentals.nameStart == name].iloc[0]['longitudeStart']


In [803]:
different_coord

nameStart
Rolfsvej                   2
Bodil Ipsens Vej           2
Mågevej                    2
Kvægtorvsgade              2
Halvtolv                   2
Allersgade                 2
Blågårdsgade               2
Tivoli Hotel               2
Borthigsgade               2
Nordatlantens Brygge       2
Gasværksvej                2
Kastruplundgade            2
Borgmester Fischers Vej    2
Adelgade                   2
Nørreport St.              2
Cabinn Hotel               2
Schleppegrellsgade         2
Abel Cathrines Gade        2
Holsteinsgade              2
Sundparken                 2
Name: latitudeStart, dtype: int64

In [804]:
for i in different_coord.index:
    rentals.loc[rentals.nameStart == i, ['latitudeStart', 'longitudeStart']] = get_first_lat_long(i)

In [805]:
rentals.head()

Unnamed: 0,StartTime,EndTime,StartHubId,EndHubId,UserId,latitudeStart,longitudeStart,nameStart,latitudeEnd,longitudeEnd,nameEnd
0,2018-03-01 17:43:14.707445,2018-03-01 18:14:12.145,2163,2449,108186,55.67344,12.564409,Central Station,55.658239,12.605434,Skotlands Plads
1,2018-03-02 09:55:18.823405,2018-03-02 16:56:35.013,2381,2381,113852,55.687996,12.561522,Ravnsborg,55.687996,12.561522,Ravnsborg
2,2018-03-02 14:00:10.755516,2018-03-02 17:13:08.047,1513,1513,113912,55.682558,12.580462,Møntergade,55.682558,12.580462,Møntergade
3,2018-03-01 10:25:34.429934,2018-03-02 20:23:46.544,2337,2337,113822,55.670289,12.565058,København H - Bus Stops,55.670289,12.565058,København H - Bus Stops
4,2018-03-02 08:51:47.459257,2018-03-02 21:18:48.813,2153,233,113881,55.680517,12.587455,Nyhavn,55.668475,12.557384,Høkerboderne


In [806]:
# count nr of unique values for nameStart
rentals.nameStart.nunique()

1074

In [807]:
rentals.info()

<class 'pandas.core.frame.DataFrame'>
Index: 255783 entries, 0 to 261188
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   StartTime       255783 non-null  datetime64[ns]
 1   EndTime         255783 non-null  datetime64[ns]
 2   StartHubId      255783 non-null  int64         
 3   EndHubId        255783 non-null  int32         
 4   UserId          255783 non-null  int64         
 5   latitudeStart   255783 non-null  float64       
 6   longitudeStart  255783 non-null  float64       
 7   nameStart       255783 non-null  object        
 8   latitudeEnd     255783 non-null  float64       
 9   longitudeEnd    255783 non-null  float64       
 10  nameEnd         255783 non-null  object        
dtypes: datetime64[ns](2), float64(4), int32(1), int64(2), object(2)
memory usage: 22.4+ MB


In [808]:
# find nameStart Its Be 
rentals[rentals.nameStart.str.contains('Its Be')]
# remove Its Be from dataset
rentals = rentals[~rentals.nameStart.str.contains('Its Be')]  

In [809]:
rentals.head()

Unnamed: 0,StartTime,EndTime,StartHubId,EndHubId,UserId,latitudeStart,longitudeStart,nameStart,latitudeEnd,longitudeEnd,nameEnd
0,2018-03-01 17:43:14.707445,2018-03-01 18:14:12.145,2163,2449,108186,55.67344,12.564409,Central Station,55.658239,12.605434,Skotlands Plads
1,2018-03-02 09:55:18.823405,2018-03-02 16:56:35.013,2381,2381,113852,55.687996,12.561522,Ravnsborg,55.687996,12.561522,Ravnsborg
2,2018-03-02 14:00:10.755516,2018-03-02 17:13:08.047,1513,1513,113912,55.682558,12.580462,Møntergade,55.682558,12.580462,Møntergade
3,2018-03-01 10:25:34.429934,2018-03-02 20:23:46.544,2337,2337,113822,55.670289,12.565058,København H - Bus Stops,55.670289,12.565058,København H - Bus Stops
4,2018-03-02 08:51:47.459257,2018-03-02 21:18:48.813,2153,233,113881,55.680517,12.587455,Nyhavn,55.668475,12.557384,Høkerboderne


In [810]:
# calculate haversine distance between each NameStart
from haversine import haversine

# Extracting latitude and longitude for each unique station name for both start and end stations
unique_stations_start = rentals[['nameStart', 'latitudeStart', 'longitudeStart']].drop_duplicates(subset=['nameStart'])
unique_stations_end = rentals[['nameEnd', 'latitudeEnd', 'longitudeEnd']].drop_duplicates(subset=['nameEnd'])

# Merging both to have a complete set of unique station names with their respective coordinates
unique_stations_start = unique_stations_start.rename(columns={'nameStart': 'name', 'latitudeStart': 'latitude', 'longitudeStart': 'longitude'})
unique_stations_end = unique_stations_end.rename(columns={'nameEnd': 'name', 'latitudeEnd': 'latitude', 'longitudeEnd': 'longitude'})
unique_stations_combined = pd.concat([unique_stations_start, unique_stations_end]).drop_duplicates(subset=['name']).reset_index(drop=True)
# create a dictionary for quick access to the latitude and longitude of each station
station_coords = {row['name']: (row['latitude'], row['longitude']) for index, row in unique_stations_combined.iterrows()}

# calculate the haversine distance between keys in station_coords
distance_pairs = []
keys = list(station_coords.keys())
for i, key1 in enumerate(keys[:-1]):  # Exclude the last key because it will have been compared with all others
    for key2 in keys[i+1:]:  # Only compare with stations that haven't been compared with key1
        lat1, lon1 = station_coords[key1]
        lat2, lon2 = station_coords[key2]
        distance = haversine((lat1, lon1), (lat2, lon2), unit='m')
        distance_pairs.append((key1, key2, distance))



In [811]:
# sort the pairs by distance for easy inspection
sorted_distance_pairs = sorted(distance_pairs, key=lambda x: x[2])

In [812]:
sorted_distance_pairs[:10]

[('Lavendelgade', 'Brasserie Royal', 10.372971640243634),
 ('Arendalsgade', 'Ved Classens Have', 12.411686591194647),
 ('Kongens Nytorv', 'Den Kongelige Teater', 17.071966150526617),
 ('Nyhavn', 'Mindeankeret', 17.120221629057596),
 ('Cph Airport', 'Lufthavn Ii', 31.816922925038078),
 ('Amager Strand St.', 'Roselillevej', 35.792644878795485),
 ('Lundtoftegade Ii', 'Bispeenbungen', 37.65711761576419),
 ('Peblinge Dossering', 'Wesselsgade', 39.14449123165206),
 ('Arresøgade', 'Nøddebogade', 40.23291349473647),
 ('Uplandsgade Ii', 'Dalslandsgade', 42.43395649781087)]

In [813]:
import networkx as nx

# Assuming distance_pairs is your list of station pairs with distances
G = nx.Graph()
for station1, station2, distance in distance_pairs:
    if distance <= 150:
        G.add_edge(station1, station2)

# Find the connected components (hubs)
hubs = list(nx.connected_components(G))

# Now hubs is a list of sets, where each set is a connected component


In [814]:
# Full code for hierarchical clustering using the complete linkage method, 
# based on the provided code for calculating haversine distances and the precomputed 'distance_pairs'.

from scipy.cluster.hierarchy import complete, fcluster
from scipy.spatial.distance import squareform

# Assuming 'distance_pairs' is populated with the precomputed haversine distances
# and 'station_coords' is a dictionary with station names and their coordinates

# Use the unique station keys directly for indexing
keys = list(station_coords.keys())

# Create a square matrix with 'inf' as default values
distance_matrix = np.full((len(keys), len(keys)), np.inf)

# Set the diagonal to 0 since the distance from a hub to itself is always 0
np.fill_diagonal(distance_matrix, 0)

# Create a mapping of station names to their indices in the matrix
hub_index = {hub_name: index for index, hub_name in enumerate(keys)}

# Populate the matrix with the distances from 'distance_pairs'
for start, end, distance in distance_pairs:
    index_start = hub_index[start]
    index_end = hub_index[end]
    distance_matrix[index_start, index_end] = distance
    distance_matrix[index_end, index_start] = distance  # Ensure the matrix is symmetric

# Perform complete linkage hierarchical clustering on the condensed distance matrix
# We use 'squareform' to get the condensed distance matrix, which is needed for 'complete'
condensed_distance_matrix = squareform(distance_matrix)
Z = complete(condensed_distance_matrix)

# Form clusters based on a maximum distance of 150 meters
clusters = fcluster(Z, t=200, criterion='distance')

# Map each station to its respective cluster
cluster_labels = {key: cluster for key, cluster in zip(keys, clusters)}

cluster_labels  # This will contain each station's name and its cluster label




{'Central Station': 390,
 'Ravnsborg': 190,
 'Møntergade': 220,
 'København H - Bus Stops': 372,
 'Nyhavn': 402,
 'H. C. Andersens Blvd.': 379,
 'Den Sorte Plads': 132,
 'Cykelslangen': 348,
 'Reventlowsgade': 390,
 'Kongens Nytorv': 404,
 'Heibergsgade': 404,
 'Rantzausgade': 153,
 'Gammeltorv': 313,
 'Dagmars Plads': 160,
 'Forum St': 199,
 'Nørreport St.': 227,
 'Henrik Steffens Vej': 304,
 'Gammel Kongvej': 305,
 'Jægersborggade': 139,
 'Københavns Museum': 366,
 'Øresundsvej': 504,
 'Ny Carlsberg Glyptotek': 378,
 'Skibbroen': 347,
 'Christianshavn': 319,
 'Dronning Louises Bro': 190,
 'Frederikssundsvej': 247,
 'Jemtelandsgade': 499,
 'Marmorbroen': 327,
 'Sankt Thomas Alle': 298,
 'Dybbølsgade': 360,
 'Lygten': 275,
 'Vega': 362,
 'Gasværksvej': 366,
 'Dr. Abildgaards Alle': 159,
 'Dantes Pl.': 326,
 'Dag Hammarskjölds Alle': 445,
 'Guldbergs Pl.': 170,
 'Prags Boulevard': 501,
 'Filosofgangen': 472,
 'Åboulevard': 198,
 'Statens Museum For Kunst': 231,
 'Rådhuspladsen': 314,
 '

In [815]:
# create 2 new columns in rentals, StartHubCluster and EndHubCluster
rentals['StartHubClusterId'] = rentals['nameStart'].map(cluster_labels)
rentals['EndHubClusterId'] = rentals['nameEnd'].map(cluster_labels)

In [817]:
# Create a DataFrame from nameStart and nameEnd columns for counting
start_names = rentals[['nameStart', 'StartHubClusterId']].rename(columns={'nameStart': 'name', 'StartHubClusterId': 'ClusterId'})
end_names = rentals[['nameEnd', 'EndHubClusterId']].rename(columns={'nameEnd': 'name', 'EndHubClusterId': 'ClusterId'})

# Concatenate the two DataFrames
all_names = pd.concat([start_names, end_names])

# Calculate the combined count for each hub name within each cluster
combined_counts = all_names.groupby(['ClusterId', 'name']).size().reset_index(name='count')

# Determine the most frequent hub name for each cluster
def most_frequent_hub(group):
    return group.loc[group['count'].idxmax()]['name']

cluster_names = combined_counts.groupby('ClusterId').apply(most_frequent_hub).reset_index(name='MostFrequentHub')

# Map the most frequent hub name to each rental based on the cluster ID
rentals['StartClusterName'] = rentals['StartHubClusterId'].map(cluster_names.set_index('ClusterId')['MostFrequentHub'])
rentals['EndClusterName'] = rentals['EndHubClusterId'].map(cluster_names.set_index('ClusterId')['MostFrequentHub'])


In [None]:
# create 2 new columns in rentals, StartClusterName and EndClusterName and assign the corresponding cluster name to each cluster id
# based on the most frequent nameStart and nameEnd for each cluster id
#rentals['StartClusterName'] = rentals.groupby('StartHubClusterId')['nameStart'].transform(lambda x: x.value_counts().index[0])
#rentals['EndClusterName'] = rentals.groupby('EndHubClusterId')['nameEnd'].transform(lambda x: x.value_counts().index[0]) 

In [818]:
# create 4 more columns in rentals, StartClusterLatitude, StartClusterLongitude, EndClusterLatitude, EndClusterLongitude
# and assign the corresponding latitude and longitude to each cluster id based on the centroid of the cluster
rentals['StartClusterLatitude'] = rentals.groupby('StartHubClusterId')['latitudeStart'].transform('mean')
rentals['StartClusterLongitude'] = rentals.groupby('StartHubClusterId')['longitudeStart'].transform('mean')
rentals['EndClusterLatitude'] = rentals.groupby('EndHubClusterId')['latitudeEnd'].transform('mean')
rentals['EndClusterLongitude'] = rentals.groupby('EndHubClusterId')['longitudeEnd'].transform('mean')

In [819]:
rentals.head()

Unnamed: 0,StartTime,EndTime,StartHubId,EndHubId,UserId,latitudeStart,longitudeStart,nameStart,latitudeEnd,longitudeEnd,nameEnd,StartHubClusterId,EndHubClusterId,StartClusterName,EndClusterName,StartClusterLatitude,StartClusterLongitude,EndClusterLatitude,EndClusterLongitude
0,2018-03-01 17:43:14.707445,2018-03-01 18:14:12.145,2163,2449,108186,55.67344,12.564409,Central Station,55.658239,12.605434,Skotlands Plads,390,505,Central Station,Skotlands Plads,55.673117,12.564212,55.658397,12.605787
1,2018-03-02 09:55:18.823405,2018-03-02 16:56:35.013,2381,2381,113852,55.687996,12.561522,Ravnsborg,55.687996,12.561522,Ravnsborg,190,190,Dronning Louises Bro,Dronning Louises Bro,55.687272,12.562037,55.687272,12.562039
2,2018-03-02 14:00:10.755516,2018-03-02 17:13:08.047,1513,1513,113912,55.682558,12.580462,Møntergade,55.682558,12.580462,Møntergade,220,220,Møntergade,Møntergade,55.682533,12.579492,55.682535,12.579493
3,2018-03-01 10:25:34.429934,2018-03-02 20:23:46.544,2337,2337,113822,55.670289,12.565058,København H - Bus Stops,55.670289,12.565058,København H - Bus Stops,372,372,Tietgensgade,Tietgensgade,55.67006,12.564139,55.670053,12.564123
4,2018-03-02 08:51:47.459257,2018-03-02 21:18:48.813,2153,233,113881,55.680517,12.587455,Nyhavn,55.668475,12.557384,Høkerboderne,402,375,Mindeankeret,Kødbyen,55.680611,12.587633,55.668317,12.557507


In [820]:
# see the max value in cluster_labels
max(cluster_labels.values())

591

In [824]:
import folium


# Assuming 'station_coords' is your dictionary of station names and their coordinates
# Assuming 'cluster_labels' is your dictionary of station names and their cluster numbers


# Create a map object, centered on the average coordinates of all stations
average_lat = sum(lat for lat, _ in station_coords.values()) / len(station_coords)
average_lon = sum(lon for _, lon in station_coords.values()) / len(station_coords) 
mymap = folium.Map(location=[average_lat, average_lon], zoom_start=13)

# Add the cluster name on top of the map
# Define a dictionary that maps each cluster number to the corresponding station names
cluster_names_to_stations = {}
for station, cluster_number in cluster_labels.items():
    if cluster_number not in cluster_names_to_stations:
        cluster_names_to_stations[cluster_number] = []
    cluster_names_to_stations[cluster_number].append(station)

# Draw lines between each pair of stations within the same cluster
for (station1, station2, distance) in distance_pairs:
    if cluster_labels[station1] == cluster_labels[station2]:
        loc1 = station_coords[station1]
        loc2 = station_coords[station2]
        cluster_number = cluster_labels[station1]
        line = folium.PolyLine(locations=[loc1, loc2], color='blue', weight=2.5, opacity=1)
        mymap.add_child(line)
        folium.Marker(loc1, icon=folium.Icon(color='red'), popup=station1).add_to(mymap)
        folium.Marker(loc2, icon=folium.Icon(color='red'),popup=station2).add_to(mymap)
        line.add_child(folium.Popup(f'Cluster {cluster_number}'))


# Show the map
mymap


In [None]:
dist_matrix

array([[   0.        , 1628.66242031, 1428.69018842, ..., 2413.39015258,
        2070.26325494, 6332.91417881],
       [1628.66242031,    0.        , 1332.33237037, ..., 1966.08034524,
        2447.90820927, 7594.39911869],
       [1428.69018842, 1332.33237037,    0.        , ..., 3111.03827252,
        1115.59793602, 6373.20241241],
       ...,
       [2413.39015258, 1966.08034524, 3111.03827252, ...,    0.        ,
        4159.39095191, 8717.5799393 ],
       [2070.26325494, 2447.90820927, 1115.59793602, ..., 4159.39095191,
           0.        , 5396.25294087],
       [6332.91417881, 7594.39911869, 6373.20241241, ..., 8717.5799393 ,
        5396.25294087,    0.        ]])

In [None]:
stations.head()

Unnamed: 0,name,latitude,longitude,Cluster
0,Central Station,55.67344,12.564409,430
1,Ravnsborg,55.687996,12.561522,231
2,Møntergade,55.682558,12.580462,491
3,København H - Bus Stops,55.670289,12.565058,441
4,Nyhavn,55.680517,12.587455,513


In [None]:
from math import pi

# Earth's radius in meters
earth_radius = 6371000

# Epsilon value in meters
eps_meters = 400

# Conversion of epsilon from meters to radians
eps_radians = eps_meters / (2 * pi * earth_radius)


from sklearn.cluster import DBSCAN
import numpy as np

# Assuming station_coords is a dict with station names as keys and coordinates as values
coords = np.array(list(station_coords.values()))

# DBSCAN expects the data to be in a NumPy array of shape (n_samples, n_features)
# In this case, our features are the latitude and longitude
db = DBSCAN(eps=eps_radians, min_samples=1, metric='haversine').fit(np.radians(coords))

# The labels_ attribute contains the cluster labels for each point
clusters = db.labels_

# To get a list of stations for each hub
hub_stations = {}
for station, cluster in zip(station_coords.keys(), clusters):
    if cluster not in hub_stations:
        hub_stations[cluster] = []
    hub_stations[cluster].append(station)

# Now hub_stations is a dict where each key is a cluster label and each value is a list of station names in that cluster


In [None]:
hub_stations

{0: ['Central Station'],
 1: ['Ravnsborg'],
 2: ['Møntergade'],
 3: ['København H - Bus Stops'],
 4: ['Nyhavn',
  'Charlottenborg Slot (Cph:Dox)',
  'Mindeankeret',
  'Store Strandstræde'],
 5: ['H. C. Andersens Blvd.', 'Tivoli Corner'],
 6: ['Den Sorte Plads'],
 7: ['Cykelslangen'],
 8: ['Reventlowsgade'],
 9: ['Kongens Nytorv', 'Den Kongelige Teater'],
 10: ['Heibergsgade'],
 11: ['Rantzausgade'],
 12: ['Gammeltorv'],
 13: ['Dagmars Plads', 'Christian Winthers Vej'],
 14: ['Forum St'],
 15: ['Nørreport St.'],
 16: ['Henrik Steffens Vej'],
 17: ['Gammel Kongvej'],
 18: ['Jægersborggade'],
 19: ['Københavns Museum'],
 20: ['Øresundsvej'],
 21: ['Ny Carlsberg Glyptotek'],
 22: ['Skibbroen'],
 23: ['Christianshavn'],
 24: ['Dronning Louises Bro'],
 25: ['Frederikssundsvej'],
 26: ['Jemtelandsgade'],
 27: ['Marmorbroen'],
 28: ['Sankt Thomas Alle'],
 29: ['Dybbølsgade'],
 30: ['Lygten'],
 31: ['Vega'],
 32: ['Gasværksvej', 'Vesterbros Torv'],
 33: ['Dr. Abildgaards Alle'],
 34: ['Dantes P

In [None]:
import folium

# Assuming station_coords is a dictionary like:
# station_coords = {'Station1': (lat1, lon1), 'Station2': (lat2, lon2), ...}

# Create a map object centered on an approximate central point of your coordinates
average_lat = sum(lat for lat, _ in station_coords.values()) / len(station_coords)
average_lon = sum(lon for _, lon in station_coords.values()) / len(station_coords)
mymap = folium.Map(location=[average_lat, average_lon], zoom_start=13)

# Add markers for each station
for station, (lat, lon) in station_coords.items():
    folium.Marker([lat, lon], popup=station).add_to(mymap)

# Draw lines between each pair of stations within the specified distance
for station1, station2, distance in distance_pairs:
    if distance <= 100:  # or any other threshold you set
        if station1 in station_coords and station2 in station_coords:
            loc1 = station_coords[station1]
            loc2 = station_coords[station2]
            tooltip_text = f"{station1} - {station2}: {distance:.2f} meters"
            folium.PolyLine([loc1, loc2], tooltip=tooltip_text, color='blue', weight=2.5, opacity=1).add_to(mymap)
        else:
            print(f"Station coordinates not found for: {station1} or {station2}")

# Show the map
mymap


In [None]:
# drop StartHubId and EndHubId
rentals.drop(['StartHubId', 'EndHubId'], axis=1, inplace=True)

In [None]:
# save rentals to csv
rentals.to_csv('../data/processed/donkey_rentals.csv', index=False)

In [None]:
rentals.head()

Unnamed: 0,StartTime,EndTime,UserId,latitudeStart,longitudeStart,nameStart,latitudeEnd,longitudeEnd,nameEnd,StartHubClusterId,EndHubClusterId,StartClusterName,EndClusterName,StartClusterLatitude,StartClusterLongitude,EndClusterLatitude,EndClusterLongitude
0,2018-03-01 17:43:14.707445,2018-03-01 18:14:12.145,108186,55.67344,12.564409,Central Station,55.658239,12.605434,Skotlands Plads,590,331,Central Station,Skotlands Plads,55.673117,12.564212,55.658397,12.605787
1,2018-03-02 09:55:18.823405,2018-03-02 16:56:35.013,113852,55.687996,12.561522,Ravnsborg,55.687996,12.561522,Ravnsborg,193,193,Dronning Louises Bro,Dronning Louises Bro,55.687272,12.562037,55.687272,12.562039
2,2018-03-02 14:00:10.755516,2018-03-02 17:13:08.047,113912,55.682558,12.580462,Møntergade,55.682558,12.580462,Møntergade,33,33,Møntergade,Møntergade,55.682533,12.579492,55.682535,12.579493
3,2018-03-01 10:25:34.429934,2018-03-02 20:23:46.544,113822,55.670289,12.565058,København H - Bus Stops,55.670289,12.565058,København H - Bus Stops,572,572,Tietgensgade,Tietgensgade,55.67006,12.564139,55.670053,12.564123
4,2018-03-02 08:51:47.459257,2018-03-02 21:18:48.813,113881,55.680517,12.587455,Nyhavn,55.668475,12.557384,Høkerboderne,77,575,Nyhavn,Høkerboderne,55.680611,12.587585,55.668317,12.557507


In [None]:
import requests
import datetime

def get_quickest_itinerary(start_lat, start_lon, end_lat, end_lon, start_datetime, num_itineraries, waitAtBeginningFactor):
    try:
        if not isinstance(start_datetime, datetime.datetime):
            raise ValueError("start_datetime must be a datetime object")

        response = requests.get(
            'http://localhost:8080/otp/routers/cph/plan',
            params={
                'fromPlace': '{},{}'.format(start_lat, start_lon),
                'toPlace': '{},{}'.format(end_lat, end_lon),
                'optimize': 'QUICK',
                'mode' : 'TRANSIT',
                'numItineraries': num_itineraries,
                'waitAtBeginningFactor': waitAtBeginningFactor,
                'date': start_datetime.strftime('%m-%d-%Y'),
                'time': start_datetime.strftime('%H:%M:%S')
            }
        )
        response.raise_for_status()  # Raises an HTTPError if the HTTP request returned an unsuccessful status code
        return response.json()  # Parses the response content as JSON
    except requests.RequestException as e:
        return {"error": str(e)}
    except ValueError as e:
        return {"error": str(e)}

# Example usage:
# result = get_quickest_itinerary(55.6761, 12.5683, 55.6833, 12.5711, datetime.datetime.now(), 3, 0.5)
# print(result)



In [None]:
import datetime
datetime.datetime(2018, 3, 1, 17, 43, 14, 707445)

datetime.datetime(2018, 3, 1, 17, 43, 14, 707445)

In [None]:
response = get_quickest_itinerary(55.673440,12.564409,55.658239, 12.605434, datetime.datetime(2018, 3, 1, 17, 43, 14, 707445), 1, 0 )

In [None]:
type(response)

dict

In [None]:
response

{'requestParameters': {'mode': 'TRANSIT',
  'date': '03-01-2018',
  'optimize': 'QUICK',
  'fromPlace': '55.67344,12.564409',
  'toPlace': '55.658239,12.605434',
  'time': '17:43:14',
  'numItineraries': '1',
  'waitAtBeginningFactor': '0'},
 'plan': {'date': 1519922594000,
  'from': {'name': 'Origin',
   'lon': 12.564409,
   'lat': 55.67344,
   'vertexType': 'NORMAL'},
  'to': {'name': 'Destination',
   'lon': 12.605434,
   'lat': 55.658239,
   'vertexType': 'NORMAL'},
  'itineraries': [{'duration': 3000,
    'startTime': 1519922594000,
    'endTime': 1519925594000,
    'walkTime': 3000,
    'transitTime': 0,
    'waitingTime': 0,
    'walkDistance': 3640.85,
    'walkLimitExceeded': False,
    'generalizedCost': 5665,
    'elevationLost': 0.0,
    'elevationGained': 0.0,
    'transfers': 0,
    'fare': {'fare': {}, 'details': {}},
    'legs': [{'startTime': 1519922594000,
      'endTime': 1519925594000,
      'departureDelay': 0,
      'arrivalDelay': 0,
      'realTime': False,
    

In [None]:
import json

# Assuming 'response' is the JSON response you have
data = response

# Check if itineraries are available
if 'plan' in data and 'itineraries' in data['plan'] and len(data['plan']['itineraries']) > 0:
    itinerary = data['plan']['itineraries'][0]  # First itinerary

    # Extract the desired information
    total_duration = itinerary['duration'] / 60  # Convert milliseconds to minutes
    number_of_transfers = itinerary['transfers']
    walking_time = itinerary['walkTime'] / 60  # Convert seconds to minutes

    # Output the extracted information
    print(f"Total Duration: {total_duration} minutes")
    print(f"Number of Transfers: {number_of_transfers}")
    print(f"Walking Time: {walking_time} minutes")
else:
    print("No itineraries found in the response.")


Total Duration: 50.0 minutes
Number of Transfers: 0
Walking Time: 50.0 minutes


In [None]:
#http://localhost:8080/?module=planner&fromPlace=55.682087406447%2C12.55170700906874&toPlace=55.66751436732039%2C12.609212811557008&time=2%3A16pm&date=11-11-2023&mode=TRANSIT%2CWALK&arriveBy=false&wheelchair=false&showIntermediateStops=true&locale=en&baseLayer=OSM%20Standard%20Tiles

In [None]:
import requests
import datetime

def get_top_itineraries(from_lat, from_lon, to_lat, to_lon, datetime_obj, 
                        mode='TRANSIT,WALK', arrive_by=False, wheelchair=False, 
                        show_intermediate_stops=True, locale='en', num_itineraries=3):
    
    # Format the date and time
    date = datetime_obj.strftime('%Y-%m-%d')  # Format: YYYY-MM-DD
    time = datetime_obj.strftime('%H:%M')     # Format: HH:MM (24-hour)

    # Construct the URL with query parameters
    url = (
        'http://localhost:8080/otp/routers/default/plan'
        '?fromPlace={},{}'
        '&toPlace={},{}'
        '&time={}'
        '&date={}'
        '&mode={}'
        '&arriveBy={}'
        '&wheelchair={}'
        '&showIntermediateStops={}'
        '&locale={}'
        '&numItineraries={}'
        '&optimize=QUICK'  # Ensure the fastest route is prioritized
    ).format(
        from_lat, from_lon, 
        to_lat, to_lon, 
        time, 
        date, 
        mode, 
        str(arrive_by).lower(), 
        str(wheelchair).lower(), 
        str(show_intermediate_stops).lower(), 
        locale,
        num_itineraries
    )

    # Send the request
    response = requests.get(url)
    return response.json()

# Example usage
result = get_top_itineraries(55.680517,	12.587455, 55.668475,	12.557384, datetime.datetime(2023, 11, 11, 14, 16))
# print(result)


In [None]:
# save result to json
import json
with open('../data/processed/donkey_itineraries.json', 'w') as f:
    json.dump(result, f)
    

In [None]:
# Updated function to handle missing 'to' and 'from' keys in legs

def process_itineraries(response):
    itineraries = response["plan"]["itineraries"]
    itinerary_details = []

    for i, itinerary in enumerate(itineraries):
        # Convert duration from seconds to minutes and seconds
        minutes, seconds = divmod(itinerary["duration"], 60)
        # Convert startTime and endTime from Unix timestamp to readable format
        start_time = datetime.datetime.fromtimestamp(itinerary["startTime"] / 1000).strftime('%H:%M')
        end_time = datetime.datetime.fromtimestamp(itinerary["endTime"] / 1000).strftime('%H:%M')

        details = {
            "Itinerary": i + 1,
            "Duration": f"{minutes} minutes and {seconds} seconds",
            "Start Time": start_time,
            "End Time": end_time,
            "Distance": f"{itinerary['walkDistance'] / 1000:.2f} km",  # converting meters to kilometers
            "Steps": []
        }

        for leg in itinerary["legs"]:
            if leg["mode"] == "WALK":
                step_detail = f"Walk"
                if 'to' in leg:
                    step_detail += f" to {leg['to']['name']}"
                step_detail += f": {leg['distance']} meters"
            else:  # For transit modes like BUS, METRO
                step_detail = f"{leg['mode']} from {leg.get('from', {}).get('name', 'Unknown')} to {leg.get('to', {}).get('name', 'Unknown')}: {leg['distance']} meters"

            details["Steps"].append(step_detail)

        itinerary_details.append(details)

    return itinerary_details

# Process the mock response again
itinerary_info = process_itineraries(result)
itinerary_info



[{'Itinerary': 1,
  'Duration': '36 minutes and 51 seconds',
  'Start Time': '14:16',
  'End Time': '14:52',
  'Distance': '2.58 km',
  'Steps': ['Walk to Destination: 2584.05 meters']},
 {'Itinerary': 2,
  'Duration': '22 minutes and 31 seconds',
  'Start Time': '14:16',
  'End Time': '14:39',
  'Distance': '0.99 km',
  'Steps': ['Walk to Kongens Nytorv St. (Metro): 361.09 meters',
   'SUBWAY from Kongens Nytorv St. (Metro) to København H (Metro): 1782.42 meters',
   'Walk to Destination: 632.84 meters']},
 {'Itinerary': 3,
  'Duration': '22 minutes and 31 seconds',
  'Start Time': '14:18',
  'End Time': '14:41',
  'Distance': '0.99 km',
  'Steps': ['Walk to Kongens Nytorv St. (Metro): 361.09 meters',
   'SUBWAY from Kongens Nytorv St. (Metro) to København H (Metro): 1782.42 meters',
   'Walk to Destination: 632.84 meters']}]

In [None]:
def process_itineraries(response):
    itineraries = response["plan"]["itineraries"]
    itinerary_details = []

    for itinerary in itineraries:
        # Convert duration from seconds to minutes and seconds
        minutes, seconds = divmod(itinerary["duration"], 60)
        # Convert startTime and endTime from Unix timestamp to readable format
        start_time = datetime.datetime.fromtimestamp(itinerary["startTime"] / 1000).strftime('%H:%M')
        end_time = datetime.datetime.fromtimestamp(itinerary["endTime"] / 1000).strftime('%H:%M')

        details = {
            "Duration": f"{minutes} minutes and {seconds} seconds",
            "Start Time": start_time,
            "End Time": end_time,
            "Distance": f"{itinerary['walkDistance'] / 1000:.2f} km",  # converting meters to kilometers
            "Steps": [],
            "Duration Seconds": itinerary["duration"]  # For sorting purposes
        }

        for leg in itinerary["legs"]:
            if leg["mode"] == "WALK":
                step_detail = f"Walk"
                if 'to' in leg:
                    step_detail += f" to {leg['to']['name']}"
                step_detail += f": {leg['distance']} meters"
            else:  # For transit modes like BUS, METRO
                step_detail = f"{leg['mode']} from {leg.get('from', {}).get('name', 'Unknown')} to {leg.get('to', {}).get('name', 'Unknown')}: {leg['distance']} meters"

            details["Steps"].append(step_detail)

        itinerary_details.append(details)

    # Sort itineraries by total duration in seconds
    sorted_itineraries = sorted(itinerary_details, key=lambda x: x['Duration Seconds'])

    # Remove the 'Duration Seconds' key as it's no longer needed
    for itinerary in sorted_itineraries:
        itinerary.pop('Duration Seconds', None)

    return sorted_itineraries

# Process the mock response again
itinerary_info = process_itineraries(result)
itinerary_info


[{'Duration': '22 minutes and 31 seconds',
  'Start Time': '14:16',
  'End Time': '14:39',
  'Distance': '0.99 km',
  'Steps': ['Walk to Kongens Nytorv St. (Metro): 361.09 meters',
   'SUBWAY from Kongens Nytorv St. (Metro) to København H (Metro): 1782.42 meters',
   'Walk to Destination: 632.84 meters']},
 {'Duration': '22 minutes and 31 seconds',
  'Start Time': '14:18',
  'End Time': '14:41',
  'Distance': '0.99 km',
  'Steps': ['Walk to Kongens Nytorv St. (Metro): 361.09 meters',
   'SUBWAY from Kongens Nytorv St. (Metro) to København H (Metro): 1782.42 meters',
   'Walk to Destination: 632.84 meters']},
 {'Duration': '36 minutes and 51 seconds',
  'Start Time': '14:16',
  'End Time': '14:52',
  'Distance': '2.58 km',
  'Steps': ['Walk to Destination: 2584.05 meters']}]

In [None]:
import datetime

def format_duration(seconds):
    """Format duration from seconds to minutes and seconds."""
    m, s = divmod(seconds, 60)
    return f"{m} minutes and {s} seconds"

def parse_itineraries(data):
    itineraries = data.get('plan', {}).get('itineraries', [])
    parsed_itineraries = []

    for itinerary in itineraries:
        details = {
            'Duration': format_duration(itinerary['duration']),
            'Start Time': datetime.datetime.fromtimestamp(itinerary['startTime'] / 1000).strftime('%H:%M'),
            'End Time': datetime.datetime.fromtimestamp(itinerary['endTime'] / 1000).strftime('%H:%M'),
            'Distance': f"{itinerary['walkDistance'] / 1000:.2f} km",
            'Steps': [],
            'Duration Seconds': itinerary['duration']  # Add duration in seconds for sorting
        }

        for leg in itinerary['legs']:
            if leg['mode'] in ['WALK', 'BICYCLE']:
                details['Steps'].append(f"{leg['mode'].title()}: {leg['distance']:.2f} meters")
            elif leg['mode'] in ['BUS', 'TRAM', 'SUBWAY', 'RAIL', 'FERRY']:
                route = leg.get('routeShortName', 'Unknown Route')
                details['Steps'].append(f"{leg['mode'].title()} {route} from {leg['from']['name']} to {leg['to']['name']}: {leg['distance']:.2f} meters")

        parsed_itineraries.append(details)

    # Sort itineraries by duration in seconds
    sorted_itineraries = sorted(parsed_itineraries, key=lambda x: x['Duration Seconds'])

    # Remove the 'Duration Seconds' key as it's no longer needed
    for itinerary in sorted_itineraries:
        itinerary.pop('Duration Seconds', None)

    return sorted_itineraries

# Assuming result is the data fetched from the API
parsed_itineraries = parse_itineraries(result)
for index, itinerary in enumerate(parsed_itineraries, start=1):
    print(f"Itinerary {index}:")
    for key, value in itinerary.items():
        print(f"- {key}: {value}")
    print()


Itinerary 1:
- Duration: 22 minutes and 31 seconds
- Start Time: 14:16
- End Time: 14:39
- Distance: 0.99 km
- Steps: ['Walk: 361.09 meters', 'Subway M3 from Kongens Nytorv St. (Metro) to København H (Metro): 1782.42 meters', 'Walk: 632.84 meters']

Itinerary 2:
- Duration: 22 minutes and 31 seconds
- Start Time: 14:18
- End Time: 14:41
- Distance: 0.99 km
- Steps: ['Walk: 361.09 meters', 'Subway M4 from Kongens Nytorv St. (Metro) to København H (Metro): 1782.42 meters', 'Walk: 632.84 meters']

Itinerary 3:
- Duration: 36 minutes and 51 seconds
- Start Time: 14:16
- End Time: 14:52
- Distance: 2.58 km
- Steps: ['Walk: 2584.05 meters']



In [None]:
def process_itineraries_v2(response):
    itineraries = response["plan"]["itineraries"]
    itinerary_details = []

    for itinerary in itineraries:
        total_duration = itinerary["duration"] / 60  # duration in minutes
        total_distance = sum(leg["distance"] for leg in itinerary["legs"]) / 1000  # distance in km

        walking_time = sum(leg["duration"] for leg in itinerary["legs"] if leg["mode"] == "WALK") / 60  # walking time in minutes
        transit_time = total_duration - walking_time  # transit time in minutes

        transfers = sum(1 for i in range(len(itinerary["legs"]) - 1) if itinerary["legs"][i]["mode"] != "WALK" and itinerary["legs"][i + 1]["mode"] != "WALK")

        details = {
            "TotalDurationMin": total_duration,
            "TripDistanceKm": total_distance,
            "TotalWalkingTimeMin": walking_time,
            "TotalTransitTimeMin": transit_time,
            "Changes": transfers
        }

        itinerary_details.append(details)

    # Sort itineraries by total duration and keep only the fastest trip
    sorted_itineraries = sorted(itinerary_details, key=lambda x: x['TotalDurationMin'])[0]

    return sorted_itineraries

# Process the response
fastest_itinerary = process_itineraries_v2(result)
fastest_itinerary



{'TotalDurationMin': 22.516666666666666,
 'TripDistanceKm': 2.7763500000000003,
 'TotalWalkingTimeMin': 18.516666666666666,
 'TotalTransitTimeMin': 4.0,
 'Changes': 0}

In [None]:
'''import pandas as pd
from multiprocessing import Pool
from functools import partial

def fetch_and_process_itinerary(row, num_itineraries):
    try:
        response = get_top_itineraries(
            row['latitudeStart'], row['longitudeStart'],
            row['latitudeEnd'], row['longitudeEnd'],
            row['StartTime'],
            num_itineraries=num_itineraries
        )
        itinerary_info = process_itineraries_v2(response)
        return itinerary_info[0] if itinerary_info else None
    except Exception as e:
        print(f"Error processing row {row.name}: {e}")
        return None

def parallel_process_dataframe(df, num_itineraries, num_processes):
    with Pool(num_processes) as pool:
        results = pool.map(partial(fetch_and_process_itinerary, num_itineraries=num_itineraries), [row for _, row in df.iterrows()])
    return results

# Example usage
num_processes = 8  # Adjust based on your system capabilities
num_itineraries = 1  # Fetching only the fastest itinerary
itinerary_details = parallel_process_dataframe(rentals, num_itineraries, num_processes)

# Add details to DataFrame
for i, details in enumerate(itinerary_details):
    if details:
        rentals.at[i, 'TotalDurationMin'] = details['TotalDurationMin']
        rentals.at[i, 'TripDistanceKm'] = details['TripDistanceKm']
        rentals.at[i, 'TotalWalkingTimeMin'] = details['TotalWalkingTimeMin']
        rentals.at[i, 'TotalTransitTimeMin'] = details['TotalTransitTimeMin']
        rentals.at[i, 'Changes'] = details['Changes']'''


'import pandas as pd\nfrom multiprocessing import Pool\nfrom functools import partial\n\ndef fetch_and_process_itinerary(row, num_itineraries):\n    try:\n        response = get_top_itineraries(\n            row[\'latitudeStart\'], row[\'longitudeStart\'],\n            row[\'latitudeEnd\'], row[\'longitudeEnd\'],\n            row[\'StartTime\'],\n            num_itineraries=num_itineraries\n        )\n        itinerary_info = process_itineraries_v2(response)\n        return itinerary_info[0] if itinerary_info else None\n    except Exception as e:\n        print(f"Error processing row {row.name}: {e}")\n        return None\n\ndef parallel_process_dataframe(df, num_itineraries, num_processes):\n    with Pool(num_processes) as pool:\n        results = pool.map(partial(fetch_and_process_itinerary, num_itineraries=num_itineraries), [row for _, row in df.iterrows()])\n    return results\n\n# Example usage\nnum_processes = 8  # Adjust based on your system capabilities\nnum_itineraries = 1  # 