In [108]:
import pandas as pd 
import numpy as np 

from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import  silhouette_score

import plotly.express as px
import plotly.io as pio
pio.renderers.default = "vscode"

In [109]:
data = pd.read_csv("uber-raw-data-jun14.csv")
data.head()

Unnamed: 0,Date/Time,Lat,Lon,Base
0,6/1/2014 0:00:00,40.7293,-73.992,B02512
1,6/1/2014 0:01:00,40.7131,-74.0097,B02512
2,6/1/2014 0:04:00,40.3461,-74.661,B02512
3,6/1/2014 0:04:00,40.7555,-73.9833,B02512
4,6/1/2014 0:07:00,40.688,-74.1831,B02512


In [110]:
data_sample = data.sample(20000)

In [111]:
# Basic stats
print("Number of rows : {}".format(data_sample.shape[0]))
print("Number of columns : {}".format(data_sample.shape[1]))
print()


print("Percentage of missing values: ")
display(100*data_sample.isnull().sum()/data_sample.shape[0])

Number of rows : 20000
Number of columns : 4

Percentage of missing values: 


Date/Time    0.0
Lat          0.0
Lon          0.0
Base         0.0
dtype: float64

In [112]:
data_sample['Date/Time']= pd.to_datetime(data_sample['Date/Time'])
data_sample.dtypes

Date/Time    datetime64[ns]
Lat                 float64
Lon                 float64
Base                 object
dtype: object

In [113]:
# split Date column into 2 parts (day and day of week) and drop Date column
data_sample['DayOfWeek'] =data_sample['Date/Time'].dt.dayofweek
data_sample['Day'] =data_sample['Date/Time'].dt.day
data_sample['Hour'] =data_sample['Date/Time'].dt.hour
data_sample.drop('Date/Time', axis=1, inplace = True)

In [114]:
data_sample.head()

Unnamed: 0,Lat,Lon,Base,DayOfWeek,Day,Hour
184938,40.6708,-73.9931,B02598,3,19,10
101308,40.7755,-73.9501,B02598,0,9,11
457069,40.7505,-73.9738,B02617,0,30,15
188287,40.757,-73.9782,B02598,3,19,16
114074,40.7876,-73.8369,B02598,1,10,21


In [115]:
print("Number of rows : {}".format(data_sample.shape[0]))
print("Number of columns : {}".format(data_sample.shape[1]))
print()

Number of rows : 20000
Number of columns : 6



In [116]:
## map without clustering
fig = px.scatter_mapbox(
        data_sample, 
        lat="Lat", 
        lon="Lon",
        color="DayOfWeek",
        mapbox_style="carto-positron"
)

fig.show()

In [117]:
data_sample.head()

Unnamed: 0,Lat,Lon,Base,DayOfWeek,Day,Hour
184938,40.6708,-73.9931,B02598,3,19,10
101308,40.7755,-73.9501,B02598,0,9,11
457069,40.7505,-73.9738,B02617,0,30,15
188287,40.757,-73.9782,B02598,3,19,16
114074,40.7876,-73.8369,B02598,1,10,21


In [118]:
# convert Base column to string

data_sample= data_sample.astype({'Base':'string'})
data_sample.dtypes

Lat          float64
Lon          float64
Base          string
DayOfWeek      int64
Day            int64
Hour           int64
dtype: object

In [119]:
numeric_features = ["Lat", "Lon", "DayOfWeek"] # Positions des colonnes quantitatives dans X
numeric_transformer = StandardScaler()

# Création du transformer pour les variables catégorielles
categorical_features = ["Base"] # Positions des colonnes catégorielles dans X
categorical_transformer = OneHotEncoder(drop='first')

# On combine les transformers dans un ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Preprocessings sur le dataset
print("Preprocessing sur le train set...")
print(data_sample.head())
X = preprocessor.fit_transform(data_sample) # fit_transform !!
print('...Terminé.')
print(X[0:5, :])
print()

Preprocessing sur le train set...
            Lat      Lon    Base  DayOfWeek  Day  Hour
184938  40.6708 -73.9931  B02598          3   19    10
101308  40.7755 -73.9501  B02598          0    9    11
457069  40.7505 -73.9738  B02617          0   30    15
188287  40.7570 -73.9782  B02598          3   19    16
114074  40.7876 -73.8369  B02598          1   10    21
...Terminé.
[[-1.75085089 -0.33621483  0.0286627   1.          0.          0.
   0.        ]
 [ 0.89647338  0.40702043 -1.53191992  1.          0.          0.
   0.        ]
 [ 0.26435202 -0.00262319 -1.53191992  0.          1.          0.
   0.        ]
 [ 0.42870357 -0.07867517  0.0286627   1.          0.          0.
   0.        ]
 [ 1.20242012  2.36363047 -1.01172572  1.          0.          0.
   0.        ]]



### DBSCAN algorithm

In [120]:
db = DBSCAN(eps=0.2, min_samples=100, metric="manhattan")

db.fit(X)

DBSCAN(eps=0.2, metric='manhattan', min_samples=100)

### Find out how many clusters DBSCAN created

In [121]:
np.unique(db.labels_)

array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13])

In [122]:
data_sample["cluster"] = db.labels_
data_sample.head()

Unnamed: 0,Lat,Lon,Base,DayOfWeek,Day,Hour,cluster
184938,40.6708,-73.9931,B02598,3,19,10,-1
101308,40.7755,-73.9501,B02598,0,9,11,-1
457069,40.7505,-73.9738,B02617,0,30,15,-1
188287,40.757,-73.9782,B02598,3,19,16,0
114074,40.7876,-73.8369,B02598,1,10,21,-1


In [123]:
data_sample['cluster'].value_counts()

-1     16846
 0       513
 1       344
 9       266
 5       252
 7       241
 4       214
 3       213
 6       207
 8       205
 2       198
 10      171
 12      119
 11      111
 13      100
Name: cluster, dtype: int64

In [124]:
fig = px.scatter_mapbox(
        data_sample[data_sample.cluster != -1], 
        lat="Lat", 
        lon="Lon",
        color="cluster",
        mapbox_style="carto-positron"
)

fig.show()

In [125]:
px.scatter_mapbox(
    data_sample.loc[data_sample.cluster != -1, :],
    lat="Lat",
    lon="Lon",
    color="DayOfWeek",
    mapbox_style="carto-positron"
)

In [126]:
numeric_features = ["Lat", "Lon", "Hour"] # Positions des colonnes quantitatives dans X
numeric_transformer = StandardScaler()

# Création du transformer pour les variables catégorielles
categorical_features = ["Base"] # Positions des colonnes catégorielles dans X
categorical_transformer = OneHotEncoder(drop='first')

# On combine les transformers dans un ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Preprocessings sur le dataset
print("Preprocessing sur le train set...")
print(data_sample.head())
X = preprocessor.fit_transform(data_sample) # fit_transform !!
print('...Terminé.')
print(X[0:5, :])
print()

Preprocessing sur le train set...
            Lat      Lon    Base  DayOfWeek  Day  Hour  cluster
184938  40.6708 -73.9931  B02598          3   19    10       -1
101308  40.7755 -73.9501  B02598          0    9    11       -1
457069  40.7505 -73.9738  B02617          0   30    15       -1
188287  40.7570 -73.9782  B02598          3   19    16        0
114074  40.7876 -73.8369  B02598          1   10    21       -1
...Terminé.
[[-1.75085089 -0.33621483 -0.70096329  1.          0.          0.
   0.        ]
 [ 0.89647338  0.40702043 -0.53253336  1.          0.          0.
   0.        ]
 [ 0.26435202 -0.00262319  0.1411864   0.          1.          0.
   0.        ]
 [ 0.42870357 -0.07867517  0.30961633  1.          0.          0.
   0.        ]
 [ 1.20242012  2.36363047  1.15176602  1.          0.          0.
   0.        ]]



### KMEAN algorithm

In [128]:
numeric_features = ["Lat", "Lon", "DayOfWeek", "Day", "Hour"] # Positions des colonnes quantitatives dans X
numeric_transformer = StandardScaler()

# Création du transformer pour les variables catégorielles
categorical_features = ["Base"] # Positions des colonnes catégorielles dans X
categorical_transformer = OneHotEncoder(drop='first')

# On combine les transformers dans un ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Preprocessings sur le dataset
print("Preprocessing sur le train set...")
print(data_sample.head())
X = preprocessor.fit_transform(data_sample) # fit_transform !!
print('...Terminé.')
print(X[0:5, :])
print()

Preprocessing sur le train set...
            Lat      Lon    Base  DayOfWeek  Day  Hour  cluster
184938  40.6708 -73.9931  B02598          3   19    10       -1
101308  40.7755 -73.9501  B02598          0    9    11       -1
457069  40.7505 -73.9738  B02617          0   30    15       -1
188287  40.7570 -73.9782  B02598          3   19    16        0
114074  40.7876 -73.8369  B02598          1   10    21       -1
...Terminé.
[[-1.75085089 -0.33621483  0.0286627   0.42307396 -0.70096329  1.
   0.          0.          0.        ]
 [ 0.89647338  0.40702043 -1.53191992 -0.7590601  -0.53253336  1.
   0.          0.          0.        ]
 [ 0.26435202 -0.00262319 -1.53191992  1.72342143  0.1411864   0.
   1.          0.          0.        ]
 [ 0.42870357 -0.07867517  0.0286627   0.42307396  0.30961633  1.
   0.          0.          0.        ]
 [ 1.20242012  2.36363047 -1.01172572 -0.6408467   1.15176602  1.
   0.          0.          0.        ]]



In [129]:
wcss =  []
for i in range (2,11): 
    kmeans = KMeans(n_clusters= i)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
    
print(wcss)

[97839.11422098713, 86636.5763916203, 77077.26063026098, 70346.7890214338, 65574.04780281223, 61976.10380109231, 58882.13896592005, 55756.41151066355, 53563.827784065885]


In [130]:
fig = px.line(x = range(2,11), y = wcss)
fig.show()

In [131]:
# Utilisation du silhouette_score pour déterminer le nombre optimal de clusters
s_score = []
for i in range (2,11): 
    kmeans = KMeans(n_clusters= i)
    kmeans.fit(X)
    s_score.append(silhouette_score(X, kmeans.predict(X)))

print(s_score)

[0.15231263177054677, 0.16619999340397165, 0.15936467330814574, 0.15770821872793891, 0.1537921341712154, 0.15529692296350678, 0.1515047854569283, 0.15519901821006202, 0.1516411047090534]


In [132]:
# Affichage de scores en fonction du nombre de clusters
fig = px.bar(x = range(2,11), y = s_score)
fig.show()

In [138]:
# On ré-entraîne un KMeans avec le nombre optimal de clusters
kmeans = KMeans(n_clusters= 3)
kmeans.fit(X)

KMeans(n_clusters=3)

In [139]:
data_sample.loc[:,'Cluster_KMeans'] = kmeans.predict(X)
data_sample.head()

Unnamed: 0,Lat,Lon,Base,DayOfWeek,Day,Hour,cluster,Cluster_KMeans
184938,40.6708,-73.9931,B02598,3,19,10,-1,1
101308,40.7755,-73.9501,B02598,0,9,11,-1,2
457069,40.7505,-73.9738,B02617,0,30,15,-1,1
188287,40.757,-73.9782,B02598,3,19,16,0,1
114074,40.7876,-73.8369,B02598,1,10,21,-1,2


In [140]:
fig = px.scatter_mapbox(
        data_sample, 
        lat="Lat", 
        lon="Lon",
        color="Cluster_KMeans",
        mapbox_style="carto-positron"
)

fig.show()

In [141]:
fig = px.scatter_mapbox(
        data_sample, 
        lat="Lat", 
        lon="Lon",
        color="DayOfWeek",
        mapbox_style="carto-positron"
)

fig.show()

In [142]:
fig = px.scatter_mapbox(
        data_sample, 
        lat="Lat", 
        lon="Lon",
        color="Base",
        mapbox_style="carto-positron"
)

fig.show()