In [288]:
import pandas as pd 
import numpy as np 

from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import  silhouette_score

import plotly.express as px
import plotly.io as pio
pio.renderers.default = "vscode"

In [289]:
data = pd.read_csv("uber-raw-data-jun14.csv")
data.head()

Unnamed: 0,Date/Time,Lat,Lon,Base
0,6/1/2014 0:00:00,40.7293,-73.992,B02512
1,6/1/2014 0:01:00,40.7131,-74.0097,B02512
2,6/1/2014 0:04:00,40.3461,-74.661,B02512
3,6/1/2014 0:04:00,40.7555,-73.9833,B02512
4,6/1/2014 0:07:00,40.688,-74.1831,B02512


In [290]:
data_sample = data.sample(30000)

In [291]:
# Basic stats
print("Number of rows : {}".format(data_sample.shape[0]))
print("Number of columns : {}".format(data_sample.shape[1]))
print()


print("Percentage of missing values: ")
display(100*data_sample.isnull().sum()/data_sample.shape[0])

Number of rows : 30000
Number of columns : 4

Percentage of missing values: 


Date/Time    0.0
Lat          0.0
Lon          0.0
Base         0.0
dtype: float64

In [292]:
data_sample['Date/Time']= pd.to_datetime(data_sample['Date/Time'])
data_sample.dtypes

Date/Time    datetime64[ns]
Lat                 float64
Lon                 float64
Base                 object
dtype: object

In [293]:
# split Date column into 2 parts (day and day of week) and drop Date column
data_sample['DayOfWeek'] =data_sample['Date/Time'].dt.dayofweek
data_sample['Day'] =data_sample['Date/Time'].dt.day
data_sample['Hour'] =data_sample['Date/Time'].dt.hour
data_sample.drop(['Date/Time', 'Base'], axis=1, inplace = True)

In [294]:
data_sample.head()

Unnamed: 0,Lat,Lon,DayOfWeek,Day,Hour
236451,40.74,-74.0023,2,25,21
313554,40.7768,-73.9597,6,8,13
168260,40.7644,-73.9688,1,17,12
205216,40.7307,-73.983,5,21,16
84549,40.7627,-73.9822,5,7,5


In [295]:
print("Number of rows : {}".format(data_sample.shape[0]))
print("Number of columns : {}".format(data_sample.shape[1]))
print()

Number of rows : 30000
Number of columns : 5



In [296]:
## map without clustering
fig = px.scatter_mapbox(
        data_sample, 
        lat="Lat", 
        lon="Lon",
        color="DayOfWeek",
        mapbox_style="carto-positron"
)

fig.show()

In [297]:
data_sample.head()

Unnamed: 0,Lat,Lon,DayOfWeek,Day,Hour
236451,40.74,-74.0023,2,25,21
313554,40.7768,-73.9597,6,8,13
168260,40.7644,-73.9688,1,17,12
205216,40.7307,-73.983,5,21,16
84549,40.7627,-73.9822,5,7,5


In [298]:
data_sample.dtypes

Lat          float64
Lon          float64
DayOfWeek      int64
Day            int64
Hour           int64
dtype: object

In [299]:
data_sample_0 = data_sample.loc[data_sample['DayOfWeek'] == 0]
data_sample_1 = data_sample.loc[data_sample['DayOfWeek'] == 1]
data_sample_2 = data_sample.loc[data_sample['DayOfWeek'] == 2]
data_sample_3 = data_sample.loc[data_sample['DayOfWeek'] == 3]
data_sample_4 = data_sample.loc[data_sample['DayOfWeek'] == 4]
data_sample_5 = data_sample.loc[data_sample['DayOfWeek'] == 5]
data_sample_6 = data_sample.loc[data_sample['DayOfWeek'] == 6]

In [306]:
data_sample_0.head()

Unnamed: 0,Lat,Lon,DayOfWeek,Day,Hour
611019,40.7686,-73.9819,0,23,22
104203,40.7583,-73.9816,0,9,17
517676,40.7305,-73.9863,0,9,18
43860,40.7046,-74.0098,0,2,18
215716,40.6867,-73.9624,0,23,8


In [307]:
numeric_features = ["Lat", "Lon", "DayOfWeek", "Hour"] # Positions des colonnes quantitatives dans X
numeric_transformer = StandardScaler()

# On combine les transformers dans un ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
    ])

# Preprocessings sur le dataset
print("Preprocessing sur le train set...")
print(data_sample_0.head())
X = preprocessor.fit_transform(data_sample_0) # fit_transform !!
print('...Terminé.')
print(X[0:5, :])
print()

Preprocessing sur le train set...
            Lat      Lon  DayOfWeek  Day  Hour
611019  40.7686 -73.9819          0   23    22
104203  40.7583 -73.9816          0    9    17
517676  40.7305 -73.9863          0    9    18
43860   40.7046 -74.0098          0    2    18
215716  40.6867 -73.9624          0   23     8
...Terminé.
[[ 0.75160637 -0.1855614   0.          1.51612692]
 [ 0.47738923 -0.18047019  0.          0.63574802]
 [-0.26273082 -0.26023257  0.          0.8118238 ]
 [-0.95226713 -0.65904449  0.          0.8118238 ]
 [-1.42881925  0.14536763  0.         -0.94893399]]



### DBSCAN algorithm

In [308]:
#travailler l'epsylon
db = DBSCAN(eps=0.3, min_samples=10, metric="manhattan")

db.fit(X)

DBSCAN(eps=0.3, metric='manhattan', min_samples=10)

### Find out how many clusters DBSCAN created

In [309]:
np.unique(db.labels_)

array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9])

In [310]:
data_sample_0["cluster_0"] = db.labels_
data_sample_0.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Lat,Lon,DayOfWeek,Day,Hour,cluster_0
611019,40.7686,-73.9819,0,23,22,0
104203,40.7583,-73.9816,0,9,17,0
517676,40.7305,-73.9863,0,9,18,0
43860,40.7046,-74.0098,0,2,18,0
215716,40.6867,-73.9624,0,23,8,-1


In [311]:
data_sample_0['cluster_0'].value_counts()

 0    3195
-1     782
 1     109
 3      71
 7      47
 5      37
 4      33
 6      33
 2      11
 8      10
 9      10
Name: cluster_0, dtype: int64

In [313]:
fig = px.scatter_mapbox(
        data_sample_0[data_sample_0.cluster_0 != -1], 
        lat="Lat", 
        lon="Lon",
        color="cluster_0",
        mapbox_style="carto-positron"
)

fig.show()

In [315]:
px.scatter_mapbox(
    data_sample_0.loc[data_sample_0.cluster_0 != -1, :],
    lat="Lat",
    lon="Lon",
    animation_frame="Hour",
    mapbox_style="carto-positron"
)

In [316]:
numeric_features = ["Lat", "Lon", "DayOfWeek", "Hour"] # Positions des colonnes quantitatives dans X
numeric_transformer = StandardScaler()

# On combine les transformers dans un ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
    ])

# Preprocessings sur le dataset
print("Preprocessing sur le train set...")
print(data_sample_1.head())
X = preprocessor.fit_transform(data_sample_1) # fit_transform !!
print('...Terminé.')
print(X[0:5, :])
print()

Preprocessing sur le train set...
            Lat      Lon  DayOfWeek  Day  Hour
168260  40.7644 -73.9688          1   17    12
573146  40.7148 -73.9639          1   17    21
573211  40.7740 -73.8720          1   17    21
523682  40.7335 -73.9898          1   10    17
52782   40.7644 -73.9988          1    3    19
...Terminé.
[[ 0.60534902  0.13948988  0.         -0.44874559]
 [-0.84044852  0.23743459  0.          1.22695113]
 [ 0.8851808   2.07439777  0.          1.22695113]
 [-0.29535953 -0.2802732   0.          0.48219703]
 [ 0.60534902 -0.46017166  0.          0.85457408]]



In [317]:
db.fit(X)

DBSCAN(eps=0.3, metric='manhattan', min_samples=10)

In [318]:
data_sample_1["cluster_1"] = db.labels_
fig = px.scatter_mapbox(
        data_sample_1[data_sample_1.cluster_1 != -1], 
        lat="Lat", 
        lon="Lon",
        color="cluster_1",
        mapbox_style="carto-positron"
)

fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [320]:
fig = px.scatter_mapbox(
        data_sample_1[data_sample_1.cluster_1 != -1], 
        lat="Lat", 
        lon="Lon",
        animation_frame="Hour",
        mapbox_style="carto-positron"
)

fig.show()

### KMEAN algorithm

In [None]:
numeric_features = ["Lat", "Lon", "DayOfWeek", "Day", "Hour"] # Positions des colonnes quantitatives dans X
numeric_transformer = StandardScaler()

# On combine les transformers dans un ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
    ])

# Preprocessings sur le dataset
print("Preprocessing sur le train set...")
print(data_sample.head())
X = preprocessor.fit_transform(data_sample) # fit_transform !!
print('...Terminé.')
print(X[0:5, :])
print()

Preprocessing sur le train set...
            Lat      Lon  DayOfWeek  Day  Hour
316394  40.7371 -74.0003          0    9     5
487310  40.7056 -74.0068          3    5    10
26578   40.7804 -73.9818          2   25     7
136209  40.6881 -73.9606          4   13     6
388080  40.7639 -73.9734          4   20    11
...Terminé.
[[-0.07404942 -0.45836441 -1.52070168 -0.75614111 -1.55442356]
 [-0.90455158 -0.57391843  0.04643962 -1.22679765 -0.70370703]
 [ 1.06756149 -0.12947988 -0.47594081  1.12648506 -1.21413695]
 [-1.36594167  0.24740401  0.56882005 -0.28548457 -1.38428025]
 [ 0.63253655  0.01985147  0.56882005  0.53816438 -0.53356373]]



In [None]:
wcss =  []
for i in range (2,11): 
    kmeans = KMeans(n_clusters= i)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
    
print(wcss)

[127115.93420085032, 110082.9775682237, 95673.59543771183, 85825.8917279382, 78596.7318767508, 72808.98034272448, 68099.51749445609, 63099.25888544457, 59789.713321692834]


In [None]:
fig = px.line(x = range(2,11), y = wcss)
fig.show()

In [None]:
# Utilisation du silhouette_score pour déterminer le nombre optimal de clusters
s_score = []
for i in range (2,11): 
    kmeans = KMeans(n_clusters= i)
    kmeans.fit(X)
    s_score.append(silhouette_score(X, kmeans.predict(X)))

print(s_score)

[0.1793360492278141, 0.19724474101593742, 0.1921017768645788, 0.1932276456346268, 0.19400270912394185, 0.1976070625365733, 0.1971838158746964, 0.2038045482570396, 0.20312485879688089]


In [None]:
# Affichage de scores en fonction du nombre de clusters
fig = px.bar(x = range(2,11), y = s_score)
fig.show()

In [None]:
# On ré-entraîne un KMeans avec le nombre optimal de clusters
kmeans = KMeans(n_clusters= 4)
kmeans.fit(X)

KMeans(n_clusters=4)

In [None]:
data_sample.loc[:,'Cluster_KMeans'] = kmeans.predict(X)
data_sample.head()

Unnamed: 0,Lat,Lon,DayOfWeek,Day,Hour,Cluster_KMeans
316394,40.7371,-74.0003,0,9,5,1
487310,40.7056,-74.0068,3,5,10,1
26578,40.7804,-73.9818,2,25,7,1
136209,40.6881,-73.9606,4,13,6,1
388080,40.7639,-73.9734,4,20,11,1


In [None]:
fig = px.scatter_mapbox(
        data_sample, 
        lat="Lat", 
        lon="Lon",
        color="Cluster_KMeans",
        mapbox_style="carto-positron"
)

fig.show()

In [None]:
fig = px.scatter_mapbox(
        data_sample, 
        lat="Lat", 
        lon="Lon",
        color="DayOfWeek",
        mapbox_style="carto-positron"
)

fig.show()