In [217]:
import pandas as pd 
import numpy as np 

from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import  silhouette_score

import plotly.express as px
import plotly.io as pio
# Make "vscode" the default Plotly renderer for every fig.show() in this notebook.
# NOTE(review): figures may not display when the notebook is opened in another
# front end (JupyterLab, nbviewer, ...) — confirm this is intended.
pio.renderers.default = "vscode"

In [218]:
# Read the pickup records from the CSV file and preview the first rows.
data = pd.read_csv(filepath_or_buffer="uber-raw-data-jun14.csv")
data.head(n=5)

Unnamed: 0,Date/Time,Lat,Lon,Base
0,6/1/2014 0:00:00,40.7293,-73.992,B02512
1,6/1/2014 0:01:00,40.7131,-74.0097,B02512
2,6/1/2014 0:04:00,40.3461,-74.661,B02512
3,6/1/2014 0:04:00,40.7555,-73.9833,B02512
4,6/1/2014 0:07:00,40.688,-74.1831,B02512


In [219]:
# Work on a random subset of 20,000 rows (presumably to keep clustering tractable).
# random_state pins the draw so that Restart & Run All selects the same rows,
# and therefore reproduces the same clusters and figures, on every run.
data_sample = data.sample(n=20000, random_state=42)

In [220]:
# Overview of the sample: its dimensions, then the share of missing values per column.
n_rows, n_cols = data_sample.shape
print("Number of rows : {}".format(n_rows))
print("Number of columns : {}".format(n_cols))
print()

print("Percentage of missing values: ")
missing_counts = data_sample.isnull().sum()
display(100 * missing_counts / n_rows)

Number of rows : 20000
Number of columns : 4

Percentage of missing values: 


Date/Time    0.0
Lat          0.0
Lon          0.0
Base         0.0
dtype: float64

In [221]:
# Convert the 'Date/Time' column to datetime values, then list the column dtypes.
parsed_pickups = pd.to_datetime(data_sample['Date/Time'])
data_sample['Date/Time'] = parsed_pickups
data_sample.dtypes

Date/Time    datetime64[ns]
Lat                 float64
Lon                 float64
Base                 object
dtype: object

In [222]:
# Derive three calendar features from the pickup timestamp (day of week,
# day of month, hour), then drop the raw 'Date/Time' column.
# The guard keeps the cell idempotent: once 'Date/Time' has been dropped,
# running the cell again is a no-op instead of raising KeyError.
if 'Date/Time' in data_sample.columns:
    pickup_time = data_sample['Date/Time'].dt
    data_sample = data_sample.assign(
        DayOfWeek=pickup_time.dayofweek,  # pandas convention: Monday=0 ... Sunday=6
        Day=pickup_time.day,
        Hour=pickup_time.hour,
    ).drop(columns='Date/Time')

In [223]:
# Preview the sample now that the date features replace the 'Date/Time' column.
data_sample.head()

Unnamed: 0,Lat,Lon,Base,DayOfWeek,Day,Hour
625432,40.7558,-73.9922,B02682,3,26,7
215640,40.7434,-73.9769,B02598,0,23,7
279284,40.7295,-74.0071,B02617,0,2,1
559640,40.7424,-73.9818,B02682,6,15,11
359394,40.7473,-73.9739,B02617,6,15,22


In [224]:
# Re-check the dimensions of the sample after the feature extraction step.
n_rows, n_cols = data_sample.shape
print("Number of rows : {}".format(n_rows))
print("Number of columns : {}".format(n_cols))
print()

Number of rows : 20000
Number of columns : 6



In [225]:
# Plot the raw pickups on a map (no clustering yet), coloured by day of week.
raw_map_options = dict(
    lat="Lat",
    lon="Lon",
    color="DayOfWeek",
    mapbox_style="carto-positron",
)
fig = px.scatter_mapbox(data_sample, **raw_map_options)

fig.show()

In [226]:
# Preview the sample again before preprocessing (same call as a few cells earlier).
data_sample.head()

Unnamed: 0,Lat,Lon,Base,DayOfWeek,Day,Hour
625432,40.7558,-73.9922,B02682,3,26,7
215640,40.7434,-73.9769,B02598,0,23,7
279284,40.7295,-74.0071,B02617,0,2,1
559640,40.7424,-73.9818,B02682,6,15,11
359394,40.7473,-73.9739,B02617,6,15,22


In [227]:
# Cast the 'Base' column to the pandas 'string' dtype, then list the column dtypes.
data_sample['Base'] = data_sample['Base'].astype('string')
data_sample.dtypes

Lat          float64
Lon          float64
Base          string
DayOfWeek      int64
Day            int64
Hour           int64
dtype: object

In [228]:
# --- Feature preprocessing for DBSCAN ---
# Quantitative columns (selected by name) are standardised.
numeric_features = ["Lat", "Lon", "DayOfWeek"]
numeric_transformer = StandardScaler()

# Categorical columns (selected by name) are one-hot encoded, dropping the first level.
categorical_features = ["Base"]
categorical_transformer = OneHotEncoder(drop='first')

# Bundle both transformers so that each one is applied to its own subset of columns.
column_steps = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

# Fit the preprocessor on the sample and transform it in a single pass,
# printing the first rows before and after for a visual check.
print("Preprocessing sur le train set...")
print(data_sample.head())
X = preprocessor.fit_transform(data_sample)
print('...Terminé.')
print(X[:5, :])
print()

Preprocessing sur le train set...
            Lat      Lon    Base  DayOfWeek  Day  Hour
625432  40.7558 -73.9922  B02682          3   26     7
215640  40.7434 -73.9769  B02598          0   23     7
279284  40.7295 -74.0071  B02617          0    2     1
559640  40.7424 -73.9818  B02682          6   15    11
359394  40.7473 -73.9739  B02617          6   15    22
...Terminé.
[[ 0.41232702 -0.32229145  0.02916381  0.          0.          1.
   0.        ]
 [ 0.08608084 -0.0581061  -1.53178963  1.          0.          0.
   0.        ]
 [-0.2796306  -0.57956999 -1.53178963  0.          1.          0.
   0.        ]
 [ 0.05977067 -0.14271448  1.59011726  0.          0.          1.
   0.        ]
 [ 0.18869053 -0.00630505  1.59011726  0.          1.          0.
   0.        ]]



### DBSCAN algorithm

In [229]:
# Density-based clustering on the preprocessed features.
# TODO: eps still needs tuning.
dbscan_params = {"eps": 0.2, "min_samples": 20, "metric": "manhattan"}
db = DBSCAN(**dbscan_params)

db.fit(X)

DBSCAN(eps=0.2, metric='manhattan', min_samples=20)

### Find out how many clusters DBSCAN created

In [230]:
# Distinct labels assigned by DBSCAN. Per the scikit-learn documentation,
# -1 marks noise points rather than a cluster.
np.unique(db.labels_)

array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
       33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
       50])

In [231]:
# Attach each row's DBSCAN label to the sample and preview the result.
data_sample = data_sample.assign(cluster=db.labels_)
data_sample.head()

Unnamed: 0,Lat,Lon,Base,DayOfWeek,Day,Hour,cluster
625432,40.7558,-73.9922,B02682,3,26,7,0
215640,40.7434,-73.9769,B02598,0,23,7,1
279284,40.7295,-74.0071,B02617,0,2,1,2
559640,40.7424,-73.9818,B02682,6,15,11,3
359394,40.7473,-73.9739,B02617,6,15,22,21


In [232]:
# Number of rows per DBSCAN label; the -1 row counts the points that the
# maps below filter out.
data_sample['cluster'].value_counts()

-1     5593
 15    1030
 16     877
 8      854
 0      814
 1      814
 14     770
 4      748
 13     693
 10     649
 17     644
 9      634
 24     608
 11     596
 12     576
 20     540
 2      537
 5      515
 6      454
 7      442
 3      390
 21     380
 36      53
 30      49
 26      47
 19      41
 29      37
 23      37
 32      32
 43      32
 48      30
 28      30
 42      27
 22      26
 40      24
 37      24
 35      24
 41      24
 18      24
 38      23
 46      23
 49      22
 34      22
 45      22
 25      22
 27      22
 39      22
 31      22
 33      21
 44      20
 47      20
 50      20
Name: cluster, dtype: int64

In [233]:
# Map the DBSCAN clusters, leaving out the rows labelled -1.
clustered_points = data_sample[data_sample["cluster"] != -1]
fig = px.scatter_mapbox(
    data_frame=clustered_points,
    lat="Lat",
    lon="Lon",
    color="cluster",
    mapbox_style="carto-positron",
)

fig.show()

In [234]:
# Animated map of the clustered pickups (label -1 excluded), one frame per day of week.
# Plotly Express creates animation frames in the order the values first appear
# in the data; the sample is shuffled, so sort by DayOfWeek first to get the
# slider running 0, 1, ..., 6 instead of a random order.
px.scatter_mapbox(
    data_sample.loc[data_sample.cluster != -1, :].sort_values("DayOfWeek"),
    lat="Lat",
    lon="Lon",
    animation_frame="DayOfWeek",
    mapbox_style="carto-positron"
)

In [246]:
# Animated map of the clustered pickups (label -1 excluded), one frame per hour.
# Plotly Express creates animation frames in the order the values first appear
# in the data; the sample is shuffled, so sort by Hour first to get the slider
# running chronologically instead of in a random order.
px.scatter_mapbox(
    data_sample.loc[data_sample.cluster != -1, :].sort_values("Hour"),
    lat="Lat",
    lon="Lon",
    animation_frame="Hour",
    mapbox_style="carto-positron"
)

### KMeans algorithm

In [236]:
# --- Feature preprocessing for KMeans ---
# Same pipeline as for DBSCAN, with 'Day' and 'Hour' added to the scaled columns.
numeric_features = ["Lat", "Lon", "DayOfWeek", "Day", "Hour"]  # column names, standardised
categorical_features = ["Base"]                                 # column names, one-hot encoded

numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(drop='first')

# Route each group of columns to its transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

# Fit and transform in one pass, printing the first rows before and after.
print("Preprocessing sur le train set...")
sample_preview = data_sample.head()
print(sample_preview)
X = preprocessor.fit_transform(data_sample)
print('...Terminé.')
first_rows = X[0:5, :]
print(first_rows)
print()

Preprocessing sur le train set...
            Lat      Lon    Base  DayOfWeek  Day  Hour  cluster
625432  40.7558 -73.9922  B02682          3   26     7        0
215640  40.7434 -73.9769  B02598          0   23     7        1
279284  40.7295 -74.0071  B02617          0    2     1        2
559640  40.7424 -73.9818  B02682          6   15    11        3
359394  40.7473 -73.9739  B02617          6   15    22       21
...Terminé.
[[ 0.41232702 -0.32229145  0.02916381  1.22889997 -1.21986802  0.
   0.          1.          0.        ]
 [ 0.08608084 -0.0581061  -1.53178963  0.87539533 -1.21986802  1.
   0.          0.          0.        ]
 [-0.2796306  -0.57956999 -1.53178963 -1.59913717 -2.23975349  0.
   1.          0.          0.        ]
 [ 0.05977067 -0.14271448  1.59011726 -0.06728372 -0.53994437  0.
   0.          1.          0.        ]
 [ 0.18869053 -0.00630505  1.59011726 -0.06728372  1.32984567  0.
   1.          0.          0.        ]]



In [237]:
# Elbow method: fit KMeans for k = 2..10 and record the within-cluster sum of
# squares (inertia_) of each fit.
# random_state pins the centroid initialisation so the curve is reproducible.
wcss = []
for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, random_state=0)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

print(wcss)

[97849.27107926384, 86817.41742741913, 76621.93671131735, 70132.12823631648, 65187.44950252326, 60972.35440464641, 57899.57673249682, 54974.304336337926, 52768.24083688311]


In [238]:
# Elbow plot: inertia versus number of clusters. Axis labels and a title are set
# explicitly so the figure can be read on its own (the defaults are just "x"/"y").
fig = px.line(
    x=list(range(2, 11)),
    y=wcss,
    labels={"x": "Number of clusters (k)", "y": "WCSS (inertia)"},
    title="Elbow method for KMeans",
)
fig.show()

In [239]:
# Use the silhouette score to help choose the number of clusters (k = 2..10).
# random_state pins the centroid initialisation so the scores are reproducible.
# fit_predict returns the training labels directly, avoiding a second pass over
# X with predict() right after fit().
s_score = []
for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, random_state=0)
    cluster_labels = kmeans.fit_predict(X)
    s_score.append(silhouette_score(X, cluster_labels))

print(s_score)

[0.14876573730849738, 0.1583570297234134, 0.15781802700193567, 0.15494019266664963, 0.1553342504158505, 0.15573924540823184, 0.15355261341549684, 0.15224003122808968, 0.16008018103673616]


In [240]:
# Bar chart of the silhouette score for each number of clusters. Axis labels and
# a title are set explicitly so the figure can be read on its own.
fig = px.bar(
    x=list(range(2, 11)),
    y=s_score,
    labels={"x": "Number of clusters (k)", "y": "Silhouette score"},
    title="Silhouette score by number of clusters",
)
fig.show()

In [241]:
# Refit KMeans with the number of clusters retained for the final model (k = 3).
# random_state pins the initialisation so cluster ids stay stable between runs.
# NOTE(review): in the recorded run the silhouette score was marginally higher
# at k=10 than at k=3 — confirm that 3 is the intended choice.
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans.fit(X)

KMeans(n_clusters=3)

In [242]:
# Store each row's KMeans label in a new 'Cluster_KMeans' column and preview the result.
kmeans_labels = kmeans.predict(X)
data_sample['Cluster_KMeans'] = kmeans_labels
data_sample.head()

Unnamed: 0,Lat,Lon,Base,DayOfWeek,Day,Hour,cluster,Cluster_KMeans
625432,40.7558,-73.9922,B02682,3,26,7,0,1
215640,40.7434,-73.9769,B02598,0,23,7,1,1
279284,40.7295,-74.0071,B02617,0,2,1,2,2
559640,40.7424,-73.9818,B02682,6,15,11,3,1
359394,40.7473,-73.9739,B02617,6,15,22,21,1


In [243]:
# Map the KMeans clusters.
# Cluster ids are categories, not quantities: as integers Plotly would draw them
# on a continuous colour scale, so plot a copy whose labels are strings to get
# one discrete colour (and one legend entry) per cluster. data_sample itself is
# left unchanged.
kmeans_map_df = data_sample.assign(
    Cluster_KMeans=data_sample["Cluster_KMeans"].astype(str)
)
fig = px.scatter_mapbox(
    kmeans_map_df,
    lat="Lat",
    lon="Lon",
    color="Cluster_KMeans",
    # Keep the legend in numeric order rather than order of first appearance.
    category_orders={
        "Cluster_KMeans": sorted(kmeans_map_df["Cluster_KMeans"].unique(), key=int)
    },
    mapbox_style="carto-positron"
)

fig.show()

In [244]:
# Same map coloured by day of week, for visual comparison with the KMeans clusters.
dayofweek_map_options = {
    "lat": "Lat",
    "lon": "Lon",
    "color": "DayOfWeek",
    "mapbox_style": "carto-positron",
}
fig = px.scatter_mapbox(data_sample, **dayofweek_map_options)

fig.show()

In [245]:
# Same map coloured by the 'Base' column.
fig = px.scatter_mapbox(
    data_frame=data_sample,
    mapbox_style="carto-positron",
    color="Base",
    lat="Lat",
    lon="Lon",
)

fig.show()