In [1]:
from utils import data_from_url
import os
import pandas as pd

# Source CSV of yellow-taxi trips for January 2016 (see the file name in the URL).
URL_YELLOW_TAXI_201601 = "https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2016-01.csv"
# The original constant name says 2017-01 although the file is 2016-01; it is kept
# as an alias so that every cell referring to the old name keeps working.
URL_YELLOW_TAXI_201701 = URL_YELLOW_TAXI_201601
# Directory (relative to the notebook) where the CSV is stored and read from.
DATA_DIRECTORY = "../data"

### Téléchargement des données si elles ne sont pas présentes en local

In [2]:
# Fetch the CSV behind the URL into DATA_DIRECTORY.
# NOTE(review): judging by the section title and the recorded output
# ("File already downloaded"), the helper presumably skips the download when the
# file already exists locally — confirm in utils.data_from_url.
data_from_url(URL_YELLOW_TAXI_201701, DATA_DIRECTORY)

File already downloaded


### Chargement des données

In [3]:
# The CSV is read from DATA_DIRECTORY under the last path segment of the URL.
name = URL_YELLOW_TAXI_201701.rpartition('/')[2]

# Only these columns are read from the CSV (timestamps, passenger count,
# distance, pickup/dropoff coordinates and fare total).
columns_selected = [
    'tpep_pickup_datetime',
    'tpep_dropoff_datetime',
    'passenger_count',
    'trip_distance',
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'total_amount',
]

csv_path = os.path.join(DATA_DIRECTORY, name)
dataframe = pd.read_csv(csv_path, usecols=columns_selected)

In [4]:
f"Nombre de lignes : {dataframe.shape[0]}"

'Nombre de lignes : 10906858'

### Filtrage des données selon une bounding box autour de la ville de New York

In [5]:
# Bounding box used to keep only trips around New York City.
# Each tuple is (lower bound, upper bound); both ends are passed to Series.between below.
LATITUDE_BOX = (40.492615, 40.920618)
LONGITUDE_BOX = (-74.256196, -73.676230)

In [6]:
# Display the columns that were actually loaded (they match columns_selected).
dataframe.columns

Index(['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count',
       'trip_distance', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'total_amount'],
      dtype='object')

In [7]:
# Keep a trip only when BOTH its pickup and its dropoff lie inside the box.
west, east = LONGITUDE_BOX
south, north = LATITUDE_BOX

pickup_in_box = (dataframe.pickup_longitude.between(west, east)
                 & dataframe.pickup_latitude.between(south, north))
dropoff_in_box = (dataframe.dropoff_longitude.between(west, east)
                  & dataframe.dropoff_latitude.between(south, north))

df_filter = dataframe[pickup_in_box & dropoff_in_box]

In [8]:
# Preview the first rows that passed the bounding-box filter.
df_filter.head()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,total_amount
0,2016-01-01 00:00:00,2016-01-01 00:00:00,2,1.1,-73.990372,40.734695,-73.981842,40.732407,8.8
1,2016-01-01 00:00:00,2016-01-01 00:00:00,5,4.9,-73.980782,40.729912,-73.944473,40.716679,19.3
2,2016-01-01 00:00:00,2016-01-01 00:00:00,1,10.54,-73.98455,40.679565,-73.950272,40.788925,34.3
3,2016-01-01 00:00:00,2016-01-01 00:00:00,1,4.75,-73.993469,40.71899,-73.962242,40.657333,17.3
4,2016-01-01 00:00:00,2016-01-01 00:00:00,3,1.76,-73.960625,40.78133,-73.977264,40.758514,8.8


In [9]:
# Drop the name of the unfiltered frame; the following cells only use df_filter.
# After this, the filter cell cannot be re-run without reloading the CSV first.
del dataframe

### On enlève quelques données aberrantes

In [10]:
# Inspect the extreme quantiles of the fare total to spot aberrant values
# (the recorded output shows a negative amount at the low end).
# NOTE(review): the thresholds are only displayed; no filtering on them is
# applied in the cells visible here.
df_filter.total_amount.quantile([.0001, .9999])

0.0001     -5.80000
0.9999    154.16641
Name: total_amount, dtype: float64

In [11]:
# Same inspection for the trip distance.
# NOTE(review): the lower quantile is .001 here versus .0001 for total_amount —
# confirm whether the difference is intentional.
df_filter.trip_distance.quantile([.001, .9999])

0.0010     0.00
0.9999    33.32
Name: trip_distance, dtype: float64

On récupère un échantillon des trajets

In [12]:
# Work on a 0.1 % random sample of the filtered trips.
# The seed is fixed so that "Restart & Run All" draws the same rows every time;
# without it every downstream count, centroid and map changes from run to run.
SAMPLE_SEED = 42
df_filter_light = df_filter.sample(frac=0.001, random_state=SAMPLE_SEED)

In [13]:
# (rows, columns) of the sample.
df_filter_light.shape

(10712, 9)

In [14]:
# Display the sample; the recorded output is an abbreviated view (first and last rows).
# The row labels are those of the original CSV rows, not 0..n-1.
df_filter_light

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,total_amount
6614947,2016-01-19 08:24:49,2016-01-19 08:54:09,6,6.48,-73.959732,40.762829,-74.006348,40.706108,29.50
4465191,2016-01-26 08:07:07,2016-01-26 08:20:25,1,2.20,-73.993210,40.736568,-73.975197,40.755402,12.98
10563699,2016-01-09 13:09:12,2016-01-09 13:15:35,2,1.44,-73.960419,40.766186,-73.975250,40.751842,7.80
9125411,2016-01-22 21:06:03,2016-01-22 21:26:14,2,5.90,-73.995682,40.732834,-73.969330,40.797783,22.30
8964664,2016-01-22 14:09:34,2016-01-22 14:13:48,2,0.57,-73.986122,40.743423,-73.980789,40.750774,6.80
...,...,...,...,...,...,...,...,...,...
4289534,2016-01-25 23:45:01,2016-01-25 23:57:42,1,2.10,-73.974625,40.750549,-73.986267,40.770714,14.75
9613904,2016-01-05 12:48:50,2016-01-05 12:58:56,5,1.42,-73.987404,40.741631,-73.978416,40.758144,11.00
3877119,2016-01-13 20:23:17,2016-01-13 20:59:15,1,18.00,-73.783241,40.648613,-73.952995,40.814522,68.60
9253982,2016-01-23 08:46:08,2016-01-23 08:55:11,1,0.92,-73.963142,40.774231,-73.952682,40.780720,7.80


On observe que ...

In [15]:
# Stack pickups and dropoffs into one table of "stops" with shared column names.
# Row labels of the sample are kept as-is, so each label appears twice
# (once for the pickup, once for the dropoff).
pickup_stops = df_filter_light[['pickup_longitude', 'pickup_latitude']].rename(
    columns={"pickup_longitude": "LONGITUDE", "pickup_latitude": "LATITUDE"}
)
dropoff_stops = df_filter_light[['dropoff_longitude', 'dropoff_latitude']].rename(
    columns={"dropoff_longitude": "LONGITUDE", "dropoff_latitude": "LATITUDE"}
)
df_stop = pd.concat([pickup_stops, dropoff_stops])

In [16]:
# Column-wise maximum of the stop coordinates.
df_stop.max()

LONGITUDE   -73.686111
LATITUDE     40.908741
dtype: float64

In [17]:
# Twice as many rows as the sample: one pickup and one dropoff per trip.
df_stop.shape

(21424, 2)

In [18]:
import sys

# Make the directory two levels above the working directory importable.
# NOTE(review): presumably this is where the `aggofmassivemvtdata` package lives — confirm.
module_path = os.path.abspath('../..')
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)

In [19]:

import pyhaversine
import numpy as np

In [20]:
# Column-wise mean of all stops (a Series indexed by LONGITUDE / LATITUDE).
# Used below as the reference point for the distance check and as the map centre.
centroid = df_stop.mean()

In [21]:
# Build one (reference, stop) coordinate pair per stop, in the layout that
# pyhaversine.bulk_haversine is fed with, e.g.
#     [((45.1, 2.3), (50.2, 3.9)), ((32.4, 15.7), (12.5, 84.1))]
# The reference point is the mean stop location; every point is (latitude, longitude).
reference = (centroid['LATITUDE'], centroid['LONGITUDE'])

# Iterate the two columns directly instead of DataFrame.iterrows(), which builds
# a full Series object for every row and is needlessly slow here.
pairs = [(reference, (lat, lon))
         for lat, lon in zip(df_stop['LATITUDE'], df_stop['LONGITUDE'])]

# NOTE(review): the division by 1000 presumably converts metres to kilometres —
# confirm the unit returned by bulk_haversine.
distances = np.array(pyhaversine.bulk_haversine(pairs)) / 1000

In [22]:
# Largest distance between the mean stop location and any stop
# (same unit as `distances`, i.e. after the division by 1000).
distances.max()

25.719949849607325

In [23]:
# Replace the duplicated row labels inherited from the sample with a clean 0..n-1 index.
df_stop = df_stop.reset_index(drop=True)

## Calcul des Centroids

Un seul paramètre est à définir : le rayon maximal (`maxRadius`).

In [24]:
# The single tunable parameter ("max radius"); it is passed both to algo_2 and
# to build_voronoi_map_from_centroids below.
# NOTE(review): the unit is not visible from this notebook (presumably kilometres) — confirm.
maxRadius = 5

In [25]:
from aggofmassivemvtdata.clustering.part2_algo_2 import algo_2

**Attention** : l'ordre `LATITUDE`, `LONGITUDE` est important lors de la conversion du `pd.DataFrame` en `np.ndarray`, puis lors du passage de ce dernier en argument de la fonction `algo_2`.

In [26]:
# Column order matters: algo_2 is given (LATITUDE, LONGITUDE) rows.
stop_coordinates = df_stop[['LATITUDE', 'LONGITUDE']].to_numpy()
grille = algo_2(stop_coordinates, maxRadius, redistribute_point=False)


In [27]:
# Check the shape of df_stop after the algo_2 call (same value as recorded before it).
print(df_stop.shape)

(21424, 2)


In [28]:
# Collect every centroid held by the object returned by algo_2.
# Later cells call .shape, .tolist() and [0] on the result.
# NOTE(review): presumably an (n, 2) array of (latitude, longitude) rows — confirm.
centroids = grille.getAllCentroids()

In [29]:
f"Nombre de centroids : {centroids.shape[0]}"

'Nombre de centroids : 28'

In [30]:
import folium

In [31]:
# Mean stop location, reordered as (LATITUDE, LONGITUDE) — the order used for the map centre.
centroid[['LATITUDE', 'LONGITUDE']]

LATITUDE     40.751948
LONGITUDE   -73.972721
dtype: float64

In [32]:
# Centre the base map on the mean stop location.
# The Series is converted to a plain [lat, lon] list: folium documents `location`
# as a tuple or list, and passing a label-indexed Series relies on positional
# integer indexing of that Series, which pandas has deprecated.
# NOTE: `map` shadows the Python builtin; the name is kept because later cells use it.
map = folium.Map(location=centroid[['LATITUDE', 'LONGITUDE']].tolist())

In [33]:
# [[x[1], x[0]] for x in centroids]

In [34]:
# for marker in df_test.values.tolist():
#     folium.Circle(
#         radius=100,
#         location=marker,
#         color="#3186cc",
#     ).add_to(map)

# Draw one unfilled crimson circle per centroid on the map.
for centroid_location in centroids.tolist():
    centroid_circle = folium.Circle(
        location=centroid_location,
        radius=50,
        color="crimson",
        fill=False,
    )
    centroid_circle.add_to(map)

### Visualisation des centroids

In [35]:
# Render the folium map (centroids only) as the cell output.
map

### Calcul des zones 

In [36]:
from aggofmassivemvtdata.voronoi_map.part3_voronoi import build_voronoi_map_from_centroids

In [37]:
# compute border limit
lat_min = df_stop.LATITUDE.min()
lat_max = df_stop.LATITUDE.max()
lon_min = df_stop.LONGITUDE.min()
lon_max = df_stop.LONGITUDE.max()

voronoi = build_voronoi_map_from_centroids(centroids, maxRadius, 
                                           lat_min, lat_max, lon_min, lon_max)

voronoi_rver = [[tuple(voronoi.vertices[rv[0]]), tuple(voronoi.vertices[rv[1]])]
                for rv in voronoi.ridge_vertices if rv[0] != -1 and rv[1] != -1]

In [38]:
# Start a fresh map centred on the first centroid. `map` is rebound, so the
# circles added to the previous map object are not carried over.
# NOTE: `map` shadows the Python builtin.
map = folium.Map(location=centroids[0])

In [39]:
# Overlay three layers on the map: a 10 % sample of the stops (blue circles),
# the ridges kept in voronoi_rver (thin blue lines) and the centroids (crimson circles).
# random_state pins the sample so the rendered map is the same on every run;
# previously the displayed stops changed at each execution.
stop_sample = df_stop[['LATITUDE', 'LONGITUDE']].sample(frac=0.1, random_state=42)
for marker in stop_sample.values.tolist():
    folium.Circle(
        radius=100,
        location=marker,
        color="#3186cc",
    ).add_to(map)

for rver in voronoi_rver:
    folium.PolyLine(locations=rver,
                    weight=1.0,
                    color='blue').add_to(map)

for c in centroids.tolist():
    folium.Circle(
        radius=50,
        location=c,
        color="crimson",
        fill=False,
    ).add_to(map)
    

In [40]:
# Render the folium map with stops, ridges and centroids as the cell output.
map

Seules les zones avec un centroid (point rouge) sont à considérer.

### Calcul des flux entre les différentes zones

In [41]:
from aggofmassivemvtdata.clustering.part2_algo_2 import assign_centroid_to_each_point

In [42]:
# Label every stop with a centroid; the recorded output below shows a new
# CENTROID_NUMBER column on the returned frame.
# NOTE(review): df_stop is rebound to the helper's return value, so re-running this
# cell feeds the already-labelled frame back in — confirm the helper tolerates that.
df_stop = assign_centroid_to_each_point(df_stop, centroids)

In [43]:
# Preview the first five stops together with their assigned centroid number.
df_stop.head(5)

Unnamed: 0,LONGITUDE,LATITUDE,CENTROID_NUMBER
0,-73.959732,40.762829,20
1,-73.99321,40.736568,15
2,-73.960419,40.766186,20
3,-73.995682,40.732834,15
4,-73.986122,40.743423,14
