In [1]:
import pandas as pd
import folium
from folium.plugins import HeatMap
from folium.plugins import HeatMapWithTime
from itertools import islice
import os
import time
from selenium import webdriver

In [2]:
# Default size (width, height in inches) for matplotlib figures in this notebook.
# rcParams is imported from matplotlib itself: the pylab interface is discouraged
# by the matplotlib documentation and pulls in a large star-import namespace.
from matplotlib import rcParams
rcParams['figure.figsize'] = 16, 9

In [3]:
# Load the ride data. The file is ';'-separated, and the two ride timestamps
# are parsed into datetimes at import time.
url = "clust_data_nextbike_2020-06cc.csv"
df = pd.read_csv(
    url,
    sep=';',
    parse_dates=['ride_begin', 'ride_end'],
    low_memory=False,
)

In [4]:
# Remove the row limit when displaying DataFrames/Series (None = show all rows).
# Use the exact registered option name, without trailing parentheses, so the
# lookup does not depend on pandas' regex matching of option names.
pd.set_option('display.max_rows', None)

In [5]:
def drop_duplicates(l):
    '''Return the items of ``l`` as a list, keeping only the first occurrence of each.

    The relative order of the first occurrences is preserved. Items must be
    hashable because membership is tracked in a set.
    '''
    unique_items = []
    already_seen = set()
    for item in l:
        if item in already_seen:
            continue
        already_seen.add(item)
        unique_items.append(item)
    return unique_items

In [6]:
def trim_coordinates(column_name, frame=None):
    '''Return the values of one column rounded to 5 decimal places.

    Parameters
    ----------
    column_name : str
        Name of the column to read.
    frame : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level ``df``
        is used, so existing single-argument calls keep working.

    Returns
    -------
    list
        One rounded value per row, in row order.
    '''
    # Fall back to the global frame only when the caller did not supply one.
    source = df if frame is None else frame
    return [round(value, 5) for value in source[column_name]]

In [7]:
# Drop the columns listed below.
# errors='ignore' skips names that are no longer present, so re-running this
# cell after it has already been executed does not raise a KeyError.
df = df.drop(
    columns=[
        'bike', 'bike_racks', 'bikes', 'booked_bikes', 'free_racks', 'free_special_racks',
        'maintenance', 'number', 'place_type', 'rack_locks', 'special_racks', 'spot',
        'terminal_type', 'city', 'country_code', 'company', 'timezone', 'available_bikes',
    ],
    errors='ignore',
)

In [8]:
# Preview the first ten rows of the frame.
df.head(n=10)

Unnamed: 0,lat_begin,lng_begin,lat_end,lng_end,ride_begin,ride_end,ride_time_minutes,distance,uid_begin,uid_end,address,name
0,50.96738,6.913591,50.968653,6.909912,2020-06-01 11:23:02.135730,2020-06-01 11:52:01.971278,29.0,294.0,30468631,30480098,,BIKE 21261
1,50.968653,6.909912,50.972235,6.924658,2020-06-05 17:07:01.701984,2020-06-05 19:36:02.939119,149.0,1107.0,30480098,30654954,,BIKE 21261
2,50.972235,6.924658,50.954284,6.919008,2020-06-05 21:19:02.341096,2020-06-05 23:04:02.169224,105.0,2035.0,30654954,30665132,,BIKE 21261
3,50.954284,6.919008,50.955787,6.914857,2020-06-05 23:30:02.456473,2020-06-06 04:53:01.679870,323.0,335.0,30665132,30669192,,BIKE 21261
4,50.955787,6.914857,50.932989,6.92249,2020-06-06 09:38:02.380791,2020-06-06 09:58:01.499404,20.0,2592.0,30669192,30673723,,BIKE 21261
5,50.932989,6.92249,50.936811,6.922821,2020-06-06 14:32:02.513259,2020-06-06 15:13:01.452866,41.0,426.0,30673723,30683067,,BIKE 21261
6,50.936811,6.922821,50.936704,6.963254,2020-06-06 16:22:02.042554,2020-06-06 17:20:02.849924,58.0,2834.0,30683067,30690361,,BIKE 21261
7,50.936704,6.963254,50.935375,6.956512,2020-06-06 17:25:01.986203,2020-06-06 17:31:02.264822,6.0,495.0,30690361,30691326,,BIKE 21261
8,50.935375,6.956512,50.937103,6.96217,2020-06-06 17:37:02.155707,2020-06-06 17:44:01.614297,7.0,441.0,30691326,30691955,,BIKE 21261
9,50.937103,6.96217,50.90601,6.956364,2020-06-06 19:57:01.447900,2020-06-06 20:29:01.743729,32.0,3482.0,30691955,30699364,,BIKE 21261


In [9]:
# Check the column dtypes: ride_begin / ride_end should be listed as datetime64
# if the timestamps were parsed correctly on import.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71566 entries, 0 to 71565
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   lat_begin          71566 non-null  float64       
 1   lng_begin          71566 non-null  float64       
 2   lat_end            71566 non-null  float64       
 3   lng_end            71566 non-null  float64       
 4   ride_begin         71566 non-null  datetime64[ns]
 5   ride_end           71566 non-null  datetime64[ns]
 6   ride_time_minutes  71566 non-null  float64       
 7   distance           71566 non-null  float64       
 8   uid_begin          71566 non-null  int64         
 9   uid_end            71566 non-null  int64         
 10  address            351 non-null    object        
 11  name               71566 non-null  object        
dtypes: datetime64[ns](2), float64(6), int64(2), object(2)
memory usage: 6.6+ MB


In [10]:
# Round all four coordinate columns to 5 decimal places with one vectorized
# call instead of four separate row-by-row apply() passes.
# NOTE: DataFrame.round relies on NumPy rounding, which may differ from the
# built-in round() in the last digit for values sitting on a rounding boundary.
coordinate_columns = ['lat_begin', 'lng_begin', 'lat_end', 'lng_end']
df[coordinate_columns] = df[coordinate_columns].round(5)

In [11]:
# Derive calendar parts of the ride start with the vectorized .dt accessor
# instead of calling a Python lambda once per row.
# The int64 casts keep the dtypes the per-row version produced; they assume
# ride_begin has no missing values (the df.info() output shows none).
df['month_b'] = df['ride_begin'].dt.month.astype('int64')
df['week_b'] = df['ride_begin'].dt.isocalendar().week.astype('int64')  # ISO week number
df['day_b'] = df['ride_begin'].dt.day.astype('int64')
df['hour_b'] = df['ride_begin'].dt.hour.astype('int64')

In [12]:
# Preview the first five rows, now including the derived *_b columns.
df.head(n=5)

Unnamed: 0,lat_begin,lng_begin,lat_end,lng_end,ride_begin,ride_end,ride_time_minutes,distance,uid_begin,uid_end,address,name,month_b,week_b,day_b,hour_b
0,50.96738,6.91359,50.96865,6.90991,2020-06-01 11:23:02.135730,2020-06-01 11:52:01.971278,29.0,294.0,30468631,30480098,,BIKE 21261,6,23,1,11
1,50.96865,6.90991,50.97223,6.92466,2020-06-05 17:07:01.701984,2020-06-05 19:36:02.939119,149.0,1107.0,30480098,30654954,,BIKE 21261,6,23,5,17
2,50.97223,6.92466,50.95428,6.91901,2020-06-05 21:19:02.341096,2020-06-05 23:04:02.169224,105.0,2035.0,30654954,30665132,,BIKE 21261,6,23,5,21
3,50.95428,6.91901,50.95579,6.91486,2020-06-05 23:30:02.456473,2020-06-06 04:53:01.679870,323.0,335.0,30665132,30669192,,BIKE 21261,6,23,5,23
4,50.95579,6.91486,50.93299,6.92249,2020-06-06 09:38:02.380791,2020-06-06 09:58:01.499404,20.0,2592.0,30669192,30673723,,BIKE 21261,6,23,6,9


In [13]:
def generateBaseMap(default_location=(50.937373, 6.954983), default_zoom_start=12):
    '''Create a new, empty folium map.

    Parameters
    ----------
    default_location : sequence of two floats, optional
        Latitude and longitude of the initial map centre. The default is the
        centre used for the Cologne maps in this notebook. It is a tuple
        rather than a list so the default value cannot be mutated between calls.
    default_zoom_start : int, optional
        Initial zoom level, passed to folium as ``zoom_start``.

    Returns
    -------
    folium.Map
        A fresh map object with the scale control enabled.
    '''
    return folium.Map(
        location=default_location,
        control_scale=True,
        zoom_start=default_zoom_start,
    )

In [14]:
# One independent base map per visualisation: pickups, drop-offs and the
# time-based heatmap.
cologne_map_pickup, cologne_map_dropoff, cologne_time_map = (
    generateBaseMap() for _ in range(3)
)

In [15]:
# Work on a copy of df that carries a constant counter column (count = 1 per
# ride); the following steps sum it per location to build the heatmaps.
df_copy = df.assign(count=1)

In [16]:
# List the columns and dtypes of the working copy.
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71566 entries, 0 to 71565
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   lat_begin          71566 non-null  float64       
 1   lng_begin          71566 non-null  float64       
 2   lat_end            71566 non-null  float64       
 3   lng_end            71566 non-null  float64       
 4   ride_begin         71566 non-null  datetime64[ns]
 5   ride_end           71566 non-null  datetime64[ns]
 6   ride_time_minutes  71566 non-null  float64       
 7   distance           71566 non-null  float64       
 8   uid_begin          71566 non-null  int64         
 9   uid_end            71566 non-null  int64         
 10  address            351 non-null    object        
 11  name               71566 non-null  object        
 12  month_b            71566 non-null  int64         
 13  week_b             71566 non-null  int64         
 14  day_b 

In [17]:
#creating heatmap grouped by pickup locations
HeatMap(data=df_copy[['lat_begin', 'lng_begin', 'count']].groupby(['lat_begin', 'lng_begin']).sum().reset_index().values.tolist(), radius=8, max_zoom=13).add_to(cologne_map_pickup)
cologne_map_pickup

In [18]:
#creating heatmap grouped by dropoff locations
HeatMap(data=df_copy[['lat_end', 'lng_end', 'count']].groupby(['lat_end', 'lng_end']).sum().reset_index().values.tolist(), radius=8, max_zoom=13).add_to(cologne_map_dropoff)
cologne_map_dropoff

In [19]:
# Build one list of [lat, lng, count] rows per distinct start hour, in
# ascending hour order; each list becomes one frame of the time-based heatmap.
df_hour_list = []
for hour in df_copy.hour_b.sort_values().unique():
    rides_in_hour = df_copy.loc[df_copy.hour_b == hour, ['lat_begin', 'lng_begin', 'count']]
    counts_per_location = rides_in_hour.groupby(['lat_begin', 'lng_begin']).sum().reset_index()
    df_hour_list.append(counts_per_location.values.tolist())

In [20]:
# Time-based heatmap: one frame per entry of df_hour_list.
hourly_gradient = {0.2: 'blue', 0.4: 'lime', 0.6: 'orange', 1: 'red'}
hourly_layer = HeatMapWithTime(
    df_hour_list,
    radius=5,
    gradient=hourly_gradient,
    min_opacity=0.5,
    max_opacity=0.8,
    use_local_extrema=True,
)
hourly_layer.add_to(cologne_time_map)
cologne_time_map