In [3]:
from utils import data_from_url
import os
import pandas as pd

# Source CSV of yellow-taxi trips.
# NOTE(review): the constant name says "201701" but the URL targets the 2016-01 file — confirm which month is intended.
URL_YELLOW_TAXI_201701 = "https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2016-01.csv"
# Directory (relative path) that holds the local copy of the CSV.
DATA_DIRECTORY = "../data"

### Télécharge les données si elles ne sont pas présentes en local

In [4]:
# Fetch the CSV into DATA_DIRECTORY. The helper's implementation is not visible here;
# the recorded output suggests it skips the download when the file already exists.
data_from_url(URL_YELLOW_TAXI_201701, DATA_DIRECTORY)

File already downloaded


### Chargement des données

In [5]:
# Only these columns are read from the CSV.
columns_selected = [
    'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count',
    'trip_distance', 'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude', 'total_amount',
]

# The local file name is the last path segment of the download URL.
name = URL_YELLOW_TAXI_201701.split('/')[-1]
csv_path = os.path.join(DATA_DIRECTORY, name)

dataframe = pd.read_csv(csv_path, usecols=columns_selected)

In [6]:
# Display the number of rows loaded from the CSV.
f"Nombre de lignes : {len(dataframe)}"

'Nombre de lignes : 10906858'

### Filtre les données selon bounding box sur la ville de New York

In [7]:
# Bounding box used to filter trips: each tuple is (lower bound, upper bound),
# consumed as BOX[0] / BOX[1] by the `between` filters in the next cells.
LATITUDE_BOX = (40.492615, 40.920618)
LONGITUDE_BOX = (-74.256196, -73.676230)

In [8]:
# Sanity check: list the columns that were actually loaded.
dataframe.columns

Index(['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count',
       'trip_distance', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'total_amount'],
      dtype='object')

In [9]:
# Keep a trip only when both its pickup and its dropoff lie inside the bounding box.
west, east = LONGITUDE_BOX
south, north = LATITUDE_BOX

pickup_in_box = (dataframe.pickup_longitude.between(west, east)
                 & dataframe.pickup_latitude.between(south, north))
dropoff_in_box = (dataframe.dropoff_longitude.between(west, east)
                  & dataframe.dropoff_latitude.between(south, north))

df_filter = dataframe[pickup_in_box & dropoff_in_box]

In [10]:
# Preview the first rows kept by the bounding-box filter.
df_filter.head()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,total_amount
0,2016-01-01 00:00:00,2016-01-01 00:00:00,2,1.1,-73.990372,40.734695,-73.981842,40.732407,8.8
1,2016-01-01 00:00:00,2016-01-01 00:00:00,5,4.9,-73.980782,40.729912,-73.944473,40.716679,19.3
2,2016-01-01 00:00:00,2016-01-01 00:00:00,1,10.54,-73.98455,40.679565,-73.950272,40.788925,34.3
3,2016-01-01 00:00:00,2016-01-01 00:00:00,1,4.75,-73.993469,40.71899,-73.962242,40.657333,17.3
4,2016-01-01 00:00:00,2016-01-01 00:00:00,3,1.76,-73.960625,40.78133,-73.977264,40.758514,8.8


In [11]:
# Drop the reference to the unfiltered frame; only df_filter is used from here on.
# NOTE: cells above that read `dataframe` cannot be re-run after this without reloading the CSV.
del dataframe

### On repère les données aberrantes (quantiles extrêmes)

In [12]:
# Extreme quantiles of the fare total, to spot outliers (inspection only; nothing is removed here).
df_filter.total_amount.quantile([.0001, .9999])

0.0001     -5.80000
0.9999    154.16641
Name: total_amount, dtype: float64

In [13]:
# Extreme quantiles of the trip distance (inspection only).
# NOTE(review): the lower quantile is .001 here but .0001 for total_amount — confirm this is intentional.
df_filter.trip_distance.quantile([.001, .9999])

0.0010     0.00
0.9999    33.32
Name: trip_distance, dtype: float64

On récupère un échantillon des trajets

In [14]:
# Draw a 0.1 % random sample of the filtered trips.
# The fixed seed makes a fresh "Restart & Run All" reproduce the same sample,
# and therefore the same clusters, maps and flow counts downstream.
df_filter_light = df_filter.sample(frac=0.001, random_state=42)

In [15]:
# (rows, columns) of the sample.
df_filter_light.shape

(10712, 9)

In [16]:
# Display the sampled trips (the row labels are the original positions in the full table).
df_filter_light

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,total_amount
6264359,2016-01-27 21:17:52,2016-01-27 21:28:48,1,0.88,-73.975876,40.756535,-73.980042,40.765747,9.30
1191897,2016-01-05 15:44:47,2016-01-05 15:48:13,2,0.30,-73.954247,40.811455,-73.954620,40.807476,4.80
7553152,2016-01-30 18:49:55,2016-01-30 18:55:38,1,0.77,-73.992271,40.755260,-73.979668,40.751968,6.30
6292638,2016-01-27 21:45:21,2016-01-27 21:55:13,1,2.40,-73.972527,40.756599,-73.947853,40.776169,10.80
7829302,2016-01-31 11:38:11,2016-01-31 11:44:31,1,0.90,-73.992874,40.755165,-73.985680,40.747021,8.16
...,...,...,...,...,...,...,...,...,...
808877,2016-01-03 14:24:17,2016-01-03 14:32:51,1,1.50,-73.959404,40.767410,-73.975136,40.752743,8.80
7209687,2016-01-29 22:35:18,2016-01-29 22:58:43,1,3.00,-73.993698,40.751434,-73.987747,40.724438,21.36
4303405,2016-01-26 00:17:38,2016-01-26 00:28:04,1,2.90,-73.971611,40.757206,-73.997284,40.722382,15.35
4799466,2016-01-11 14:57:12,2016-01-11 15:15:08,1,5.13,-73.973862,40.784309,-74.006599,40.730469,19.30


On observe que ...

In [17]:
# Stack pickup and dropoff coordinates into one table of stops with shared
# LONGITUDE / LATITUDE column names (pickups first, then dropoffs).
pickup_stops = df_filter_light[['pickup_longitude', 'pickup_latitude']].rename(
    columns={"pickup_longitude": "LONGITUDE", "pickup_latitude": "LATITUDE"}
)
dropoff_stops = df_filter_light[['dropoff_longitude', 'dropoff_latitude']].rename(
    columns={"dropoff_longitude": "LONGITUDE", "dropoff_latitude": "LATITUDE"}
)
df_stop = pd.concat([pickup_stops, dropoff_stops])

In [18]:
# Column-wise maxima of the stop coordinates.
df_stop.max()

LONGITUDE   -73.678345
LATITUDE     40.914452
dtype: float64

In [19]:
# (rows, columns) of the stop table: one row per pickup plus one per dropoff.
df_stop.shape

(21424, 2)

In [20]:
import sys

# Put the directory two levels above the working directory on the import path
# (presumably so the project package imported further down can be resolved).
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)

In [21]:

import pyhaversine
import numpy as np

In [22]:
# Column-wise mean of all stops; used below as the reference point for distances and as the map centre.
centroid = df_stop.mean()

In [23]:
# Pair the mean stop position with every stop. Each pair is laid out as
# ((lat, lon), (lat, lon)), the structure handed to pyhaversine.bulk_haversine.
origin = (centroid['LATITUDE'], centroid['LONGITUDE'])
pairs = [
    (origin, (lat, lon))
    for lat, lon in zip(df_stop['LATITUDE'].to_numpy(), df_stop['LONGITUDE'].to_numpy())
]

# NOTE(review): dividing by 1000 presumably converts metres to kilometres —
# confirm the unit returned by pyhaversine.bulk_haversine.
distances = np.array(pyhaversine.bulk_haversine(pairs)) / 1000

In [24]:
# Largest distance between the mean stop position and any stop.
distances.max()

35.44116257945012

In [25]:
# Replace the duplicated trip labels inherited from the concat with a fresh 0..n-1 index.
df_stop = df_stop.reset_index(drop=True)

## Calcul des Centroids

Un seul paramètre à définir : le rayon maximal (`maxRadius`).

In [26]:
# Maximum radius passed to algo_2 and to the Voronoi builder below.
# NOTE(review): the unit is not visible in this notebook (presumably kilometres) — confirm.
maxRadius = 5

In [27]:
from aggofmassivemvtdata.clustering.part2_algo_2 import algo_2

Attention : l'ordre `LATITUDE`, `LONGITUDE` est important lors de la conversion du `pd.DataFrame` en `np.ndarray`, puis lors du passage de ce dernier en argument de la fonction `algo_2`.

In [28]:
# Cluster the stops. Columns are passed explicitly in (LATITUDE, LONGITUDE) order.
# The meaning of redistribute_point=False is defined inside algo_2 (not visible here).
grille = algo_2(df_stop[['LATITUDE', 'LONGITUDE']].to_numpy(), maxRadius, redistribute_point=False)


In [29]:
# Print the stop table's shape after clustering.
print(df_stop.shape)

(21424, 2)


In [30]:
# Retrieve every centroid from the clustering result.
centroids = grille.getAllCentroids()

In [31]:
# Display how many centroids were produced (first dimension of `centroids`).
f"Nombre de centroids : {centroids.shape[0]}"

'Nombre de centroids : 26'

In [32]:
import folium

In [33]:
# Mean stop position, reordered as (LATITUDE, LONGITUDE).
centroid[['LATITUDE', 'LONGITUDE']]

LATITUDE     40.751836
LONGITUDE   -73.973827
dtype: float64

In [34]:
# Base map centred on the mean stop position.
# NOTE(review): the name `map` shadows the Python builtin for the rest of the notebook.
map = folium.Map(location=centroid[['LATITUDE', 'LONGITUDE']])

In [35]:
# [[x[1], x[0]] for x in centroids]

In [36]:
# for marker in df_test.values.tolist():
#     folium.Circle(
#         radius=100,
#         location=marker,
#         color="#3186cc",
#     ).add_to(map)

# Outline each centroid on the map with a small, unfilled crimson circle.
for centroid_position in centroids.tolist():
    centroid_marker = folium.Circle(radius=50, location=centroid_position,
                                    color="crimson", fill=False)
    centroid_marker.add_to(map)

### Visualisation des centroids

In [37]:
# Render the map with the centroid markers.
map

### Calcul des zones 

In [38]:
from aggofmassivemvtdata.voronoi_map.part3_voronoi import build_voronoi_map_from_centroids

In [39]:
# Border limits: the bounding box of all sampled stops.
latitudes, longitudes = df_stop.LATITUDE, df_stop.LONGITUDE
lat_min, lat_max = latitudes.min(), latitudes.max()
lon_min, lon_max = longitudes.min(), longitudes.max()

voronoi = build_voronoi_map_from_centroids(
    centroids, maxRadius, lat_min, lat_max, lon_min, lon_max
)

# For each ridge, collect the coordinates of its two vertices as a drawable segment.
# Ridges with a -1 vertex index are skipped (presumably the marker for a vertex
# at infinity — confirm against the Voronoi builder).
voronoi_rver = []
for ridge in voronoi.ridge_vertices:
    if ridge[0] == -1 or ridge[1] == -1:
        continue
    voronoi_rver.append([tuple(voronoi.vertices[ridge[0]]),
                         tuple(voronoi.vertices[ridge[1]])])

In [40]:
# Start a fresh map (replacing the previous one), centred on the first centroid.
map = folium.Map(location=centroids[0])

In [41]:
# Layer 1: a 10 % sample of the stops, drawn as blue circles. The fixed seed
# keeps the rendered map identical from one run to the next.
stops_to_plot = df_stop[['LATITUDE', 'LONGITUDE']].sample(frac=0.1, random_state=42)
for marker in stops_to_plot.values.tolist():
    folium.Circle(
        radius=100,
        location=marker,
        color="#3186cc",
    ).add_to(map)

# Layer 2: the Voronoi ridges, drawn as thin blue segments.
for rver in voronoi_rver:
    folium.PolyLine(
        locations=rver,
        weight=1.0,
        color='blue',
    ).add_to(map)

# Layer 3: the centroids, drawn as small unfilled crimson circles.
for c in centroids.tolist():
    folium.Circle(
        radius=50,
        location=c,
        color="crimson",
        fill=False,
    ).add_to(map)
    

In [42]:
# Render the map with the stops, Voronoi ridges and centroids.
map

Seules les zones avec un centroid (point rouge) sont à considérer.

### Calcul des flux entre les différentes zones

In [43]:
from aggofmassivemvtdata.clustering.part2_algo_2 import assign_centroid_to_each_point

In [44]:
# Re-bind df_stop to the helper's result; the preview in the next cell shows a CENTROID_NUMBER column.
df_stop = assign_centroid_to_each_point(df_stop, centroids)

In [45]:
# Preview the stops together with their assigned centroid number.
df_stop.head(5)

Unnamed: 0,LONGITUDE,LATITUDE,CENTROID_NUMBER
0,-73.975876,40.756535,15
1,-73.954247,40.811455,21
2,-73.992271,40.75526,15
3,-73.972527,40.756599,15
4,-73.992874,40.755165,15


In [46]:
# Largest centroid number in use; the flow matrix below is sized max_num_cluster + 1 per side.
max_num_cluster = df_stop.CENTROID_NUMBER.max()

Pour le dataframe des stops, les longitudes et latitudes passent en index afin de retrouver plus rapidement les numéros de clusters correspondants.

In [54]:
# Turn the stop table into a lookup keyed by (LONGITUDE, LATITUDE): index on the
# coordinates, sort, then keep only the first row of each duplicated coordinate pair.
df_stop = (
    df_stop
    .set_index(['LONGITUDE', 'LATITUDE'])
    .sort_index()
    .loc[lambda stops: ~stops.index.duplicated(keep='first')]
)

In [74]:
def _lookup_centroids(trips, lon_col, lat_col, stops):
    """Return the centroid number of one endpoint of every trip, in trip order.

    Args:
        trips: DataFrame holding the endpoint coordinates.
        lon_col: name of the longitude column in ``trips``.
        lat_col: name of the latitude column in ``trips``.
        stops: DataFrame indexed by unique (longitude, latitude) pairs, with a
            ``CENTROID_NUMBER`` column.

    Returns:
        1-D numpy array with one centroid number per row of ``trips``.

    Raises:
        KeyError: if at least one endpoint has no entry in ``stops``.
    """
    keys = pd.MultiIndex.from_frame(trips[[lon_col, lat_col]])
    numbers = stops['CENTROID_NUMBER'].reindex(keys)
    missing = numbers.isna()
    if missing.any():
        # Per-row .loc lookups would raise KeyError on an unknown coordinate; keep that failure mode.
        raise KeyError(
            f"{int(missing.sum())} ({lon_col}, {lat_col}) pairs have no entry in the stop lookup"
        )
    # Restore the lookup's own dtype and drop the index so assignment is positional.
    return numbers.astype(stops['CENTROID_NUMBER'].dtype).to_numpy()


# One vectorised lookup per endpoint instead of a Python-level .loc call per trip.
df_filter_light['pickup_centroid'] = _lookup_centroids(
    df_filter_light, 'pickup_longitude', 'pickup_latitude', df_stop)
df_filter_light['dropoff_centroid'] = _lookup_centroids(
    df_filter_light, 'dropoff_longitude', 'dropoff_latitude', df_stop)

In [87]:
# Flow matrix: entry [i, j] accumulates the passenger_count of every sampled trip
# picked up in cluster i and dropped off in cluster j.
count_link_between_clusters = np.zeros((max_num_cluster + 1, max_num_cluster + 1), dtype=np.uint64)

pickup_idx = df_filter_light['pickup_centroid'].to_numpy().astype(np.intp)
dropoff_idx = df_filter_light['dropoff_centroid'].to_numpy().astype(np.intp)
# Cast explicitly so the addition into the uint64 matrix needs no implicit signed-to-unsigned cast.
passengers = df_filter_light['passenger_count'].to_numpy().astype(np.uint64)

# np.add.at accumulates unbuffered, so trips sharing the same (i, j) pair are all counted
# (plain fancy-index `+=` would count each repeated pair only once).
np.add.at(count_link_between_clusters, (pickup_idx, dropoff_idx), passengers)