In [2]:
# To run this notebook, the Python packages imported below
# must be installed.
# In particular, awkde can be installed with:

# git clone https://github.com/mennthor/awkde
# pip install [--user] [-e] ./awkde

# For more details please consult awkde github page at: https://github.com/mennthor/awkde

In [7]:
# Please note: the dataset attached to this script is only a subset of the entire dataset,
# namely the first six months of 2017 for the weekday 23 time slot.

# Also, for the sake of brevity, all the preprocessing steps (both temporal and spatial)
# have already been applied to this dataset but are not reported here, so that the data
# are ready to be fitted by the model.

In [5]:
import pandas as pd
import geopandas as gpd
import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.model_selection import GridSearchCV
from awkde import GaussianKDE

In [6]:
# Load the preprocessed trips table; the path is relative to the notebook's working directory.
dataset = pd.read_csv(filepath_or_buffer='Amsterdam_dataset_WE23.csv')

In [11]:
# Even though WGS84 coordinates are also available, the UTM ones are preferred
# because they are the best choice for the default Euclidean distance metric used with GridSearchCV

In [21]:
# Columns used for the 2-dimensional KDE: the start point of each record.
# The end point (final destination) can be modelled instead by using
# ['end_longitude_utm', 'end_latitude_utm'] here.
kde_columns_utm_2d = ['start_longitude_utm', 'start_latitude_utm']

# Columns used for the 4-dimensional KDE: start and end points together.
kde_columns_utm_4d = ['start_longitude_utm', 'start_latitude_utm', 'end_longitude_utm', 'end_latitude_utm']

# Candidate bandwidths: a dense linear grid on [0.5, 100] followed by a geometric
# grid on [100, 10000]. Both sub-grids contain the value 100 exactly, so np.unique
# is used to drop the duplicate (otherwise the same candidate would be
# cross-validated twice); it also returns the candidates in ascending order.
bandwidths_utm = np.unique(np.concatenate((
    np.linspace(0.5, 100, 50),
    np.geomspace(100, 10000, 50),
)))

In [23]:
# Search for the optimal fixed bandwidth in two-dimensional space:
# every candidate in bandwidths_utm is evaluated with cv=10 on the rows
# that have no missing start coordinates.
grid_search_fixedKDE_2D = GridSearchCV(
    estimator=KernelDensity(kernel='gaussian'),
    param_grid={'bandwidth': bandwidths_utm},
    cv=10,
    n_jobs=-1,
).fit(dataset[kde_columns_utm_2d].dropna())

In [24]:
# One row per candidate bandwidth, paired with its mean cross-validated test score.
log_likelihood_results_fixedKDE_2D = pd.DataFrame(
    grid_search_fixedKDE_2D.cv_results_['params']
).assign(
    mean_likelihood_score=grid_search_fixedKDE_2D.cv_results_['mean_test_score']
)

In [34]:
# Fit the final 2D fixed-bandwidth KDE with the bandwidth selected by the grid search.
# Rows with missing coordinates are dropped, exactly as was done for the
# cross-validation, so the model is fitted on the same sample the bandwidth was
# tuned on (scikit-learn's input validation also rejects NaN values).
fixedKDE_2D = KernelDensity(
    kernel='gaussian',
    bandwidth=grid_search_fixedKDE_2D.best_params_['bandwidth'],
).fit(dataset[kde_columns_utm_2d].dropna())

In [25]:
# Search for the optimal fixed bandwidth in four-dimensional space:
# every candidate in bandwidths_utm is evaluated with cv=10 on the rows
# that have no missing start or end coordinates.
grid_search_fixedKDE_4D = GridSearchCV(
    estimator=KernelDensity(kernel='gaussian'),
    param_grid={'bandwidth': bandwidths_utm},
    cv=10,
    n_jobs=-1,
).fit(dataset[kde_columns_utm_4d].dropna())

In [26]:
# One row per candidate bandwidth, paired with its mean cross-validated test score.
log_likelihood_results_fixedKDE_4D = pd.DataFrame(
    grid_search_fixedKDE_4D.cv_results_['params']
).assign(
    mean_likelihood_score=grid_search_fixedKDE_4D.cv_results_['mean_test_score']
)

In [35]:
# Fit the final 4D fixed-bandwidth KDE with the bandwidth selected by the grid search.
# Rows with missing coordinates are dropped, exactly as was done for the
# cross-validation, so the model is fitted on the same sample the bandwidth was
# tuned on (scikit-learn's input validation also rejects NaN values).
fixedKDE_4D = KernelDensity(
    kernel='gaussian',
    bandwidth=grid_search_fixedKDE_4D.best_params_['bandwidth'],
).fit(dataset[kde_columns_utm_4d].dropna())

In [27]:
######################################################################
#                  VARIABLE BANDWIDTH (ADAPTIVE) KDE                 #
######################################################################

In [29]:
# The AwKDE package applies a transformation to the input data
# in order to obtain a sample space with zero mean vector and
# identity covariance matrix, so the bandwidth has to be scaled as well.
# Therefore, for the global_bw search we start from the optimum found with the
# scikit-learn library and divide it by the sample's average standard deviation
# across the different dimensions.

In [28]:
# Root-mean-square of the per-axis sample standard deviations of the start
# coordinates: square each axis' std, average the two squares, take the root.
start_axis_variances = [
    np.power(dataset[axis_column].std(), 2)
    for axis_column in ('start_longitude_utm', 'start_latitude_utm')
]
average_std = np.sqrt(np.sum(start_axis_variances) / 2)

In [31]:
# Rescale the optimal fixed 2D bandwidth by the average standard deviation of the
# sample, giving the starting point for the global bandwidth search.
global_bw_std = grid_search_fixedKDE_2D.best_params_['bandwidth'] / average_std

# Candidate global bandwidths: 90 linearly spaced values from 1/5 to 5 times the
# rescaled optimum, plus 10 geometrically spaced values between 5 times the
# optimum and 10. Both sub-grids contain the endpoint global_bw_std * 5 exactly,
# so np.unique is used to drop the duplicate (otherwise that candidate would be
# cross-validated twice in every grid search); it also sorts the candidates.
global_bw_ranges = np.unique(np.concatenate((
    np.linspace(global_bw_std / 5, global_bw_std * 5, 90),
    np.geomspace(global_bw_std * 5, 10, 10),
)))

In [36]:
# It is possible to find the optimal global bandwidth with AwKDE and then fit a VKDE
# with alpha=0.5, according to the Silverman/Abramson law

In [45]:
# Cross-validated (cv=10) search of the global bandwidth for the 2D awkde
# estimator with alpha=None, on the rows without missing start coordinates.
grid_search_2D_VKDE_alphaNone = GridSearchCV(
    estimator=GaussianKDE(diag_cov=False, alpha=None),
    param_grid={'glob_bw': global_bw_ranges},
    cv=10,
    n_jobs=-1,
).fit(dataset[kde_columns_utm_2d].dropna())

In [46]:
# 2D adaptive KDE with alpha=0.5, using the global bandwidth selected by the
# alpha=None grid search. diag_cov=False is passed explicitly so the final model
# uses the same covariance setting as the estimator that was cross-validated.
VKDE_2D_alpha05 = GaussianKDE(
    diag_cov=False,
    alpha=0.5,
    glob_bw=grid_search_2D_VKDE_alphaNone.best_params_['glob_bw'],
)
# Rows with missing coordinates are dropped, as in the grid search, so the model
# is fitted on the same cleaned sample. fit() is left as the last expression so
# that its return value is shown as the cell output.
VKDE_2D_alpha05.fit(dataset[kde_columns_utm_2d].dropna())

(start_longitude_utm    121303.331469
 start_latitude_utm     486393.683503
 dtype: float64,
 array([[4656834.09117353, -961766.58112438],
        [-961766.58112438, 3665726.16457077]]))

In [39]:
# Alternatively, it is possible to perform a cross-validation
# over both the glob_bw and alpha hyperparameters

In [47]:
# Joint cross-validated (cv=10) search over the global bandwidth and alpha for
# the 2D awkde estimator, on the rows without missing start coordinates.
grid_search_2D_VKDE = GridSearchCV(
    estimator=GaussianKDE(diag_cov=False),
    param_grid={
        'glob_bw': global_bw_ranges,
        'alpha': [None, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    },
    cv=10,
    n_jobs=-1,
).fit(dataset[kde_columns_utm_2d].dropna())

In [48]:
# 2D adaptive KDE using the (alpha, glob_bw) pair selected by the joint grid
# search. diag_cov=False is passed explicitly so the final model uses the same
# covariance setting as the estimator that was cross-validated.
VKDE_2D = GaussianKDE(
    diag_cov=False,
    alpha=grid_search_2D_VKDE.best_params_['alpha'],
    glob_bw=grid_search_2D_VKDE.best_params_['glob_bw'],
)
# Rows with missing coordinates are dropped, as in the grid search, so the model
# is fitted on the same cleaned sample. fit() is left as the last expression so
# that its return value is shown as the cell output.
VKDE_2D.fit(dataset[kde_columns_utm_2d].dropna())

(start_longitude_utm    121303.331469
 start_latitude_utm     486393.683503
 dtype: float64,
 array([[4656834.09117353, -961766.58112438],
        [-961766.58112438, 3665726.16457077]]))

In [44]:
#################################################
#the same can be replicated in four dimensions
#################################################

In [51]:
# Cross-validated (cv=10) search of the global bandwidth for the 4D awkde
# estimator with alpha=None, on the rows without missing start/end coordinates.
# NOTE(review): global_bw_ranges is built from the 2D optimal bandwidth and the
# start-coordinate spread only - confirm this candidate grid is adequate in 4D.
grid_search_4D_VKDE_alphaNone = GridSearchCV(
    estimator=GaussianKDE(diag_cov=False, alpha=None),
    param_grid={'glob_bw': global_bw_ranges},
    cv=10,
    n_jobs=-1,
).fit(dataset[kde_columns_utm_4d].dropna())

  array_means[:, np.newaxis]) ** 2,


In [52]:
# 4D adaptive KDE with alpha=0.5, using the global bandwidth selected by the
# alpha=None grid search. diag_cov=False is passed explicitly so the final model
# uses the same covariance setting as the estimator that was cross-validated.
VKDE_4D_alpha05 = GaussianKDE(
    diag_cov=False,
    alpha=0.5,
    glob_bw=grid_search_4D_VKDE_alphaNone.best_params_['glob_bw'],
)
# Rows with missing coordinates are dropped, as in the grid search, so the model
# is fitted on the same cleaned sample. fit() is left as the last expression so
# that its return value is shown as the cell output.
VKDE_4D_alpha05.fit(dataset[kde_columns_utm_4d].dropna())

(start_longitude_utm    121303.331469
 start_latitude_utm     486393.683503
 end_longitude_utm      121734.844101
 end_latitude_utm       486103.000174
 dtype: float64,
 array([[4656834.09117353, -961766.58112438, 1325613.14115149,
         -298552.87996605],
        [-961766.58112438, 3665726.16457077, -243796.73036927,
          555690.24327406],
        [1325613.14115149, -243796.73036927, 8547872.86124695,
         -991202.28692073],
        [-298552.87996605,  555690.24327406, -991202.28692073,
         5417474.50794197]]))

In [53]:
# Joint cross-validated (cv=10) search over the global bandwidth and alpha for
# the 4D awkde estimator, on the rows without missing start/end coordinates.
# NOTE(review): global_bw_ranges is built from the 2D optimal bandwidth and the
# start-coordinate spread only - confirm this candidate grid is adequate in 4D.
grid_search_4D_VKDE = GridSearchCV(
    estimator=GaussianKDE(diag_cov=False),
    param_grid={
        'glob_bw': global_bw_ranges,
        'alpha': [None, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    },
    cv=10,
    n_jobs=-1,
).fit(dataset[kde_columns_utm_4d].dropna())

  array_means[:, np.newaxis]) ** 2,


In [54]:
# 4D adaptive KDE using the (alpha, glob_bw) pair selected by the joint grid
# search. diag_cov=False is passed explicitly so the final model uses the same
# covariance setting as the estimator that was cross-validated.
VKDE_4D = GaussianKDE(
    diag_cov=False,
    alpha=grid_search_4D_VKDE.best_params_['alpha'],
    glob_bw=grid_search_4D_VKDE.best_params_['glob_bw'],
)
# Rows with missing coordinates are dropped, as in the grid search, so the model
# is fitted on the same cleaned sample. fit() is left as the last expression so
# that its return value is shown as the cell output.
VKDE_4D.fit(dataset[kde_columns_utm_4d].dropna())

(start_longitude_utm    121303.331469
 start_latitude_utm     486393.683503
 end_longitude_utm      121734.844101
 end_latitude_utm       486103.000174
 dtype: float64,
 array([[4656834.09117353, -961766.58112438, 1325613.14115149,
         -298552.87996605],
        [-961766.58112438, 3665726.16457077, -243796.73036927,
          555690.24327406],
        [1325613.14115149, -243796.73036927, 8547872.86124695,
         -991202.28692073],
        [-298552.87996605,  555690.24327406, -991202.28692073,
         5417474.50794197]]))