In [1]:
# Reload imported modules automatically before each cell runs, so edits to
# source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
import sys
# Add ../src/ (resolved against the current working directory) to the import
# path; presumably this is where the local `package` module lives — confirm.
sys.path.append('../src/')

In [2]:
import folium
import random
import pandas as pd

In [3]:
# Load the raw nextbike availability records; the preview below shows one row
# per vehicle position together with the interval during which it was valid.
availabilities = pd.read_csv('../data/nextbike_availability.csv')
availabilities

Unnamed: 0,nextbike_availability_at_station,nextbike_availability_vehicle_id,nextbike_availability_geometry,nextbike_availability_status,nextbike_availability_valid_from,nextbike_availability_valid_till
0,False,6045,POINT(6.923261165618896 50.92481231689453),available,2022-02-07 12:01:00,2022-02-07 12:31:00
1,False,5987,POINT(6.981607913970947 50.94987869262695),available,2022-03-12 15:01:00,2022-03-12 15:01:00
2,False,5987,POINT(6.981634140014648 50.94987869262695),available,2022-03-12 15:06:00,2022-03-12 16:31:00
3,False,5987,POINT(6.956296920776367 50.94599533081055),available,2022-03-12 16:51:00,2022-03-12 16:56:00
4,False,5987,POINT(6.956347942352295 50.94609069824219),available,2022-03-12 17:01:00,2022-03-12 17:01:00
...,...,...,...,...,...,...
8329914,False,28428,POINT(6.954068183898926 50.94620132446289),available,2023-08-25 11:11:00,2023-08-25 12:26:00
8329915,False,28428,POINT(6.953924179077148 50.9462776184082),available,2023-08-25 12:31:00,2023-08-25 12:31:00
8329916,False,28428,POINT(6.954151153564453 50.94615936279297),available,2023-08-25 12:36:00,2023-08-25 17:11:00
8329917,False,28428,POINT(6.924293041229248 50.9334716796875),available,2023-08-25 17:31:00,2023-08-25 17:51:00


## Availabilities Data Preparation

In [4]:
# Drop the shared "nextbike_availability_" prefix so columns can be addressed
# by their short names (geometry, valid_from, ...). The pattern is anchored
# with "^" so only a leading prefix is removed, and `regex=True` is passed
# explicitly because the default of `regex` differs between pandas versions.
availabilities.columns = availabilities.columns.str.replace(
    r"^nextbike_availability_", "", regex=True
)

In [5]:
# Parse the WKT-style "POINT(<lon> <lat>)" strings into numeric columns.
# One vectorised regex extraction replaces four row-wise Python lambdas and
# tolerates optional whitespace after the opening parenthesis.
_point_coords = availabilities["geometry"].str.extract(
    r"\(\s*(?P<lon>[^\s()]+)\s+(?P<lat>[^\s()]+)"
)

# Keep the fail-fast behaviour of the split-based parsing: a missing or
# malformed geometry must not silently turn into NaN coordinates.
if _point_coords.isna().to_numpy().any():
    raise ValueError(
        "geometry column contains values that are not of the form 'POINT(<lon> <lat>)'"
    )

availabilities["lon"] = _point_coords["lon"].astype(float)
availabilities["lat"] = _point_coords["lat"].astype(float)


In [6]:
# Convert both interval bounds from strings to pandas datetimes so they can be
# compared against timestamps in the cells further down.
for _bound_col in ("valid_from", "valid_till"):
    availabilities[_bound_col] = pd.to_datetime(availabilities[_bound_col])

In [7]:
from package import h3

In [8]:
# Run the project helper that adds the `h3_cell_id` column used by later cells.
# NOTE(review): the positional 8 is presumably the H3 resolution and
# `n_batches` the number of chunks the frame is processed in — confirm against
# package.h3.
availabilities = h3.add_h3_cell_id_to_df_with_batching(
    availabilities, 8, n_batches=16 * 10
)


  return bound(*args, **kwds)


## Spatial and Temporal Discretization 

In [9]:
def get_locations_at_time(df, time):
    """Return the rows of ``df`` whose validity interval contains ``time``.

    A row is kept when ``valid_from <= time <= valid_till``; both bounds are
    inclusive. ``df`` must provide ``valid_from`` and ``valid_till`` columns.
    """
    has_started = df["valid_from"] <= time
    not_yet_ended = df["valid_till"] >= time
    return df[has_started & not_yet_ended]

In [10]:
# Hourly sampling grid spanning the whole observation period, starting at the
# earliest `valid_from`. The step is given as a Timedelta instead of the "1H"
# alias because the uppercase "H" alias is deprecated in recent pandas
# releases, while a Timedelta is accepted by old and new versions alike.
buckets = pd.date_range(
    availabilities.valid_from.min(),
    availabilities.valid_till.max(),
    freq=pd.Timedelta(hours=1),
)
buckets

DatetimeIndex(['2022-01-15 00:01:00', '2022-01-15 01:01:00',
               '2022-01-15 02:01:00', '2022-01-15 03:01:00',
               '2022-01-15 04:01:00', '2022-01-15 05:01:00',
               '2022-01-15 06:01:00', '2022-01-15 07:01:00',
               '2022-01-15 08:01:00', '2022-01-15 09:01:00',
               ...
               '2023-08-31 14:01:00', '2023-08-31 15:01:00',
               '2023-08-31 16:01:00', '2023-08-31 17:01:00',
               '2023-08-31 18:01:00', '2023-08-31 19:01:00',
               '2023-08-31 20:01:00', '2023-08-31 21:01:00',
               '2023-08-31 22:01:00', '2023-08-31 23:01:00'],
              dtype='datetime64[ns]', length=14256, freq='H')

In [11]:
import multiprocessing
from tqdm.auto import tqdm
from package import key

# Every H3 cell id that occurs anywhere in the data; used below to give each
# per-time count vector the same set of cells.
all_hex_ids = availabilities.h3_cell_id.unique()


def count_bikes_at_time(time_point):
    """Count the bikes per H3 cell that are valid at ``time_point``.

    Reads the module-level ``availabilities`` frame and ``all_hex_ids``.
    Returns a Series indexed by ``all_hex_ids`` (cells without any bike get 0)
    whose ``name`` is ``time_point``.
    """
    counts = (
        get_locations_at_time(availabilities, time_point)
        .groupby("h3_cell_id")
        .size()
        # Align to the full cell list so every time point has the same index.
        .reindex(all_hex_ids, fill_value=0)
    )
    counts.name = time_point
    return counts


# Evaluate every time bucket in a worker pool. `imap` yields results in the
# order of `buckets`, so the resulting list lines up with the bucket sequence;
# tqdm only wraps the iterator to show progress.
# NOTE(review): the workers read the module-level `availabilities` frame, which
# presumably relies on a fork-based start method — confirm on non-Linux hosts.
with multiprocessing.Pool(key.DEFAULT_N_PROCESSES) as pool:
    n_bikes_per_hex_per_time = list(
        tqdm(pool.imap(count_bikes_at_time, buckets), total=len(buckets))
    )


  0%|          | 0/14256 [00:00<?, ?it/s]

In [12]:
# Stack the per-bucket Series into one frame: each Series becomes a row that is
# labelled with its `name` (the time point), and the row axis is called "time".
n_bikes_per_hex_per_time = pd.DataFrame(n_bikes_per_hex_per_time).rename_axis(
    index="time"
)


In [13]:
# Preview: one row per hourly time point, one column per H3 cell, values are
# the number of bikes counted in that cell.
n_bikes_per_hex_per_time.head()

h3_cell_id,881fa199abfffff,881fa18a4dfffff,881fa1999dfffff,881fa19999fffff,881fa199a1fffff,881fa199d3fffff,881fa199c3fffff,881fa199c1fffff,881fa199e1fffff,881fa199a9fffff,...,881fa18027fffff,881fa18f2dfffff,881fa18305fffff,881fa18133fffff,881fa1812dfffff,881fa18855fffff,881fa19a85fffff,881fa18909fffff,881fa19ab1fffff,881fa181e5fffff
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-15 00:01:00,8,3,35,36,17,11,26,21,5,24,...,0,0,0,0,0,0,0,0,0,0
2022-01-15 01:01:00,7,3,35,36,20,10,27,17,5,23,...,0,0,0,0,0,0,0,0,0,0
2022-01-15 02:01:00,9,3,36,33,20,10,27,17,4,28,...,0,0,0,0,0,0,0,0,0,0
2022-01-15 03:01:00,10,3,34,33,23,9,29,19,3,28,...,0,0,0,0,0,0,0,0,0,0
2022-01-15 04:01:00,12,3,34,34,23,9,29,21,3,29,...,0,0,0,0,0,0,0,0,0,0


In [17]:
import pandas as pd
from sklearn_extra.cluster import KMedoids

# Number of clusters, i.e. how many representative hourly snapshots (medoids)
# are selected from the count matrix.
n_clusters = 4

# Each row of the count matrix (one time point) is one sample for the model;
# random_state is fixed so repeated runs use the same seed.
model = KMedoids(n_clusters=n_clusters, random_state=4711)
model.fit(n_bikes_per_hex_per_time)

# n_bikes_per_hex_per_time["cluster"] = model.labels_


In [18]:
# Select the rows (time points) at the positions the fitted model reports as
# medoid indices.
medoids = n_bikes_per_hex_per_time.iloc[model.medoid_indices_]

In [31]:
from package.geometa import GeoMeta
# Load the project's geo metadata object from disk.
# NOTE(review): the .pkl extension suggests pickle-based deserialisation — only
# load trusted files; confirm in GeoMeta.load.
geo_meta_path = "../data/geometa.pkl"
geo_meta = GeoMeta.load(geo_meta_path)

In [32]:
# Map of Cologne showing the geo metadata layer plus the per-cell bike counts
# of the first medoid.
cologne_center = [50.938361, 6.959974]
m = folium.Map(location=cologne_center, zoom_start=12)
geo_meta.add_to_folium_map(m)

# Mapping of H3 cell id -> bike count for the first medoid time point.
first_medoid_counts = medoids.iloc[0].to_dict()
h3.plot_h3_cells_on_folium(
    first_medoid_counts,
    m,
    # The popup shows the callback's first argument unchanged.
    popup_callback=lambda x, y: x,
)
m

In [20]:
# Inspect the medoid at position 3 and the bike locations valid at that moment.
# NOTE(review): the hard-coded position requires at least four medoids.
time = medoids.index[3]
locations_at_time = get_locations_at_time(availabilities, time)
time

Timestamp('2022-07-18 06:01:00')

In [22]:
# Overlay the bike positions of every medoid time point on one map centred on
# Cologne; markers of the same medoid share an outline colour.
m = folium.Map(location=[50.938361, 6.959974], zoom_start=13)

colors = ["red", "blue", "green", "yellow"]

for i, time in enumerate(medoids.index):
    # Cycle through the palette so that choosing more clusters than there are
    # colours reuses colours instead of raising IndexError.
    marker_color = colors[i % len(colors)]
    locations_at_time = get_locations_at_time(availabilities, time)
    # Each `point` is a (lat, lon) pair, the order folium expects.
    for point in locations_at_time[["lat", "lon"]].values:
        folium.CircleMarker(
            location=point,
            radius=2,
            color=marker_color,
            fill=True,
            fill_color="#000000",
        ).add_to(m)


m

In [None]:
# Time points that were selected as medoids.
medoids.index

DatetimeIndex(['2022-11-02 16:01:00', '2022-11-01 09:01:00',
               '2022-04-19 17:01:00', '2022-07-18 06:01:00'],
              dtype='datetime64[ns]', name='time', freq=None)

In [None]:
import os

In [None]:
import re


def derive_filename(s) -> str:
    """Turn an arbitrary value into a file-name-friendly string.

    The value is converted with ``str()``. Every run of characters other than
    ASCII letters, digits, ``-`` and ``.`` (spaces and underscores included)
    is replaced by a single ``_``.

    Example: ``"2022-11-02 16:01:00"`` becomes ``"2022-11-02_16_01_00"``.
    """
    # Treating "_" as a separator too lets a single substitution collapse mixed
    # runs such as " _:" directly. This yields the same result as substituting
    # per character and squeezing underscores afterwards, without the extra
    # pass and without a space replacement that could never match anything.
    return re.sub(r"[^a-zA-Z0-9\-.]+", "_", str(s))


In [None]:
# Export the bike coordinates at every medoid time point, one CSV per medoid,
# named after the sanitised timestamp.
directory = "../data/bicycle_locations"
os.makedirs(directory, exist_ok=True)

for time in medoids.index:
    filename = f"{derive_filename(time)}.csv"
    file_path = os.path.join(directory, filename)
    locations_at_time = get_locations_at_time(availabilities, time)
    locations_at_time.loc[:, ["lat", "lon"]].to_csv(file_path, index=False)