In [361]:
import pandas as pd 
import numpy as np 

from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import  silhouette_score

import plotly.express as px
import plotly.io as pio
pio.renderers.default = "vscode"

In [362]:
data = pd.read_csv("uber-raw-data-jun14.csv")
data.head()

Unnamed: 0,Date/Time,Lat,Lon,Base
0,6/1/2014 0:00:00,40.7293,-73.992,B02512
1,6/1/2014 0:01:00,40.7131,-74.0097,B02512
2,6/1/2014 0:04:00,40.3461,-74.661,B02512
3,6/1/2014 0:04:00,40.7555,-73.9833,B02512
4,6/1/2014 0:07:00,40.688,-74.1831,B02512


In [363]:
data_sample = data.sample(20000)

In [364]:
# Basic stats
print("Number of rows : {}".format(data_sample.shape[0]))
print("Number of columns : {}".format(data_sample.shape[1]))
print()


print("Percentage of missing values: ")
display(100*data_sample.isnull().sum()/data_sample.shape[0])

Number of rows : 20000
Number of columns : 4

Percentage of missing values: 


Date/Time    0.0
Lat          0.0
Lon          0.0
Base         0.0
dtype: float64

In [365]:
data_sample['Date/Time']= pd.to_datetime(data_sample['Date/Time'])
data_sample.dtypes

Date/Time    datetime64[ns]
Lat                 float64
Lon                 float64
Base                 object
dtype: object

In [366]:
# split Date column into 2 parts (day and day of week) and drop Date column
data_sample['DayOfWeek'] =data_sample['Date/Time'].dt.dayofweek
data_sample['Day'] =data_sample['Date/Time'].dt.day
data_sample['Hour'] =data_sample['Date/Time'].dt.hour
data_sample.drop(['Date/Time', 'Base'], axis=1, inplace = True)

In [367]:
data_sample.head()

Unnamed: 0,Lat,Lon,DayOfWeek,Day,Hour
424671,40.7742,-73.873,3,26,0
583140,40.7456,-73.9991,3,19,11
538372,40.7581,-73.9624,3,12,16
507963,40.7443,-74.0066,6,8,3
440526,40.7023,-74.0129,4,27,21


In [368]:
print("Number of rows : {}".format(data_sample.shape[0]))
print("Number of columns : {}".format(data_sample.shape[1]))
print()

Number of rows : 20000
Number of columns : 5



In [369]:
## map without clustering
fig = px.scatter_mapbox(
        data_sample, 
        lat="Lat", 
        lon="Lon",
        color="DayOfWeek",
        mapbox_style="carto-positron"
)

fig.show()

In [370]:
data_sample.head()

Unnamed: 0,Lat,Lon,DayOfWeek,Day,Hour
424671,40.7742,-73.873,3,26,0
583140,40.7456,-73.9991,3,19,11
538372,40.7581,-73.9624,3,12,16
507963,40.7443,-74.0066,6,8,3
440526,40.7023,-74.0129,4,27,21


In [371]:
numeric_features = ["Lat", "Lon", "DayOfWeek"] # Positions des colonnes quantitatives dans X
numeric_transformer = StandardScaler()

# On combine les transformers dans un ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
    ])

# Preprocessings sur le dataset
print("Preprocessing sur le train set...")
print(data_sample.head())
X = preprocessor.fit_transform(data_sample) # fit_transform !!
print('...Terminé.')
print(X[0:5, :])
print()

Preprocessing sur le train set...
            Lat      Lon  DayOfWeek  Day  Hour
424671  40.7742 -73.8730          3   26     0
583140  40.7456 -73.9991          3   19    11
538372  40.7581 -73.9624          3   12    16
507963  40.7443 -74.0066          6    8     3
440526  40.7023 -74.0129          4   27    21
...Terminé.
[[ 0.89071109  1.7447877   0.05194887]
 [ 0.14663024 -0.44039866  0.05194887]
 [ 0.4718404   0.19557548  0.05194887]
 [ 0.11280838 -0.57036613  1.61903396]
 [-0.97989776 -0.6795388   0.57431057]]



### DBSCAN algorithm

In [372]:
#travailler l'epsylon
db = DBSCAN(eps=0.3, min_samples=20, metric="manhattan")

db.fit(X)

DBSCAN(eps=0.3, metric='manhattan', min_samples=20)

### Find out how many clusters DBSCAN created

In [373]:
np.unique(db.labels_)

array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26])

In [374]:
data_sample["cluster"] = db.labels_
data_sample.head()

Unnamed: 0,Lat,Lon,DayOfWeek,Day,Hour,cluster
424671,40.7742,-73.873,3,26,0,0
583140,40.7456,-73.9991,3,19,11,1
538372,40.7581,-73.9624,3,12,16,1
507963,40.7443,-74.0066,6,8,3,2
440526,40.7023,-74.0129,4,27,21,3


In [375]:
data_sample['cluster'].value_counts()

 1     3118
 3     2869
 6     2602
 5     2491
 4     2358
 7     2121
 2     1876
-1     1421
 10     109
 16     103
 9       98
 0       97
 8       73
 12      68
 19      68
 24      61
 14      59
 15      57
 17      56
 11      55
 25      46
 13      38
 18      33
 20      28
 21      25
 26      25
 23      23
 22      22
Name: cluster, dtype: int64

In [376]:
fig = px.scatter_mapbox(
        data_sample[data_sample.cluster != -1], 
        lat="Lat", 
        lon="Lon",
        color="cluster",
        mapbox_style="carto-positron"
)

fig.show()

In [377]:
px.scatter_mapbox(
    data_sample.loc[data_sample.cluster != -1, :],
    lat="Lat",
    lon="Lon",
    animation_frame="DayOfWeek",
    mapbox_style="carto-positron"
)

In [378]:
numeric_features = ["Lat", "Lon", "Hour"] # Positions des colonnes quantitatives dans X
numeric_transformer = StandardScaler()

# On combine les transformers dans un ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
    ])

# Preprocessings sur le dataset
print("Preprocessing sur le train set...")
print(data_sample.head())
X = preprocessor.fit_transform(data_sample) # fit_transform !!
print('...Terminé.')
print(X[0:5, :])
print()

Preprocessing sur le train set...
            Lat      Lon  DayOfWeek  Day  Hour  cluster
424671  40.7742 -73.8730          3   26     0        0
583140  40.7456 -73.9991          3   19    11        1
538372  40.7581 -73.9624          3   12    16        1
507963  40.7443 -74.0066          6    8     3        2
440526  40.7023 -74.0129          4   27    21        3
...Terminé.
[[ 0.89071109  1.7447877  -2.40013389]
 [ 0.14663024 -0.44039866 -0.53048023]
 [ 0.4718404   0.19557548  0.31936234]
 [ 0.11280838 -0.57036613 -1.89022835]
 [-0.97989776 -0.6795388   1.16920491]]



In [379]:
#travailler l'epsylon
db = DBSCAN(eps=0.3, min_samples=50, metric="manhattan")

db.fit(X)

DBSCAN(eps=0.3, metric='manhattan', min_samples=50)

In [380]:
data_sample["cluster_2"] = db.labels_
data_sample.head()

Unnamed: 0,Lat,Lon,DayOfWeek,Day,Hour,cluster,cluster_2
424671,40.7742,-73.873,3,26,0,0,-1
583140,40.7456,-73.9991,3,19,11,1,0
538372,40.7581,-73.9624,3,12,16,1,0
507963,40.7443,-74.0066,6,8,3,2,1
440526,40.7023,-74.0129,4,27,21,3,0


In [381]:
np.unique(db.labels_)

array([-1,  0,  1,  2,  3,  4,  5,  6])

In [382]:
fig = px.scatter_mapbox(
        data_sample[data_sample.cluster != -1], 
        lat="Lat", 
        lon="Lon",
        color="cluster_2",
        mapbox_style="carto-positron"
)

fig.show()

In [383]:
fig = px.scatter_mapbox(
        data_sample[data_sample.cluster != -1], 
        lat="Lat", 
        lon="Lon",
        animation_frame="Hour",
         mapbox_style="carto-positron"
)

fig.show()

### KMEAN algorithm

In [384]:
numeric_features = ["Lat", "Lon", "DayOfWeek", "Day", "Hour"] # Positions des colonnes quantitatives dans X
numeric_transformer = StandardScaler()

# On combine les transformers dans un ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
    ])

# Preprocessings sur le dataset
print("Preprocessing sur le train set...")
print(data_sample.head())
X = preprocessor.fit_transform(data_sample) # fit_transform !!
print('...Terminé.')
print(X[0:5, :])
print()

Preprocessing sur le train set...
            Lat      Lon  DayOfWeek  Day  Hour  cluster  cluster_2
424671  40.7742 -73.8730          3   26     0        0         -1
583140  40.7456 -73.9991          3   19    11        1          0
538372  40.7581 -73.9624          3   12    16        1          0
507963  40.7443 -74.0066          6    8     3        2          1
440526  40.7023 -74.0129          4   27    21        3          0
...Terminé.
[[ 0.89071109  1.7447877   0.05194887  1.24267678 -2.40013389]
 [ 0.14663024 -0.44039866  0.05194887  0.41400856 -0.53048023]
 [ 0.4718404   0.19557548  0.05194887 -0.41465966  0.31936234]
 [ 0.11280838 -0.57036613  1.61903396 -0.88818436 -1.89022835]
 [-0.97989776 -0.6795388   0.57431057  1.36105796  1.16920491]]



In [385]:
wcss =  []
for i in range (2,11): 
    kmeans = KMeans(n_clusters= i)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
    
print(wcss)

[84749.70235336671, 73331.6243546101, 63674.3046470172, 56897.16280228199, 52122.46881277285, 48360.12132230245, 45348.08188441339, 42390.72143177033, 40149.03036967822]


In [386]:
fig = px.line(x = range(2,11), y = wcss)
fig.show()

In [387]:
# Utilisation du silhouette_score pour déterminer le nombre optimal de clusters
s_score = []
for i in range (2,11): 
    kmeans = KMeans(n_clusters= i)
    kmeans.fit(X)
    s_score.append(silhouette_score(X, kmeans.predict(X)))

print(s_score)

[0.18031043549614967, 0.19310819814902944, 0.1965335095961214, 0.1961811149366666, 0.19554545526853176, 0.19795744255614353, 0.19671808296095838, 0.20067826859141777, 0.20087442659093108]


In [388]:
# Affichage de scores en fonction du nombre de clusters
fig = px.bar(x = range(2,11), y = s_score)
fig.show()

In [389]:
# On ré-entraîne un KMeans avec le nombre optimal de clusters
kmeans = KMeans(n_clusters= 4)
kmeans.fit(X)

KMeans(n_clusters=4)

In [390]:
data_sample.loc[:,'Cluster_KMeans'] = kmeans.predict(X)
data_sample.head()

Unnamed: 0,Lat,Lon,DayOfWeek,Day,Hour,cluster,cluster_2,Cluster_KMeans
424671,40.7742,-73.873,3,26,0,0,-1,2
583140,40.7456,-73.9991,3,19,11,1,0,2
538372,40.7581,-73.9624,3,12,16,1,0,1
507963,40.7443,-74.0066,6,8,3,2,1,2
440526,40.7023,-74.0129,4,27,21,3,0,0


In [391]:
fig = px.scatter_mapbox(
        data_sample, 
        lat="Lat", 
        lon="Lon",
        color="Cluster_KMeans",
        mapbox_style="carto-positron"
)

fig.show()

In [392]:
fig = px.scatter_mapbox(
        data_sample, 
        lat="Lat", 
        lon="Lon",
        color="DayOfWeek",
        mapbox_style="carto-positron"
)

fig.show()