In [290]:
import pandas as pd 
import numpy as np 

from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

import plotly.express as px
import plotly.io as pio
# Make Plotly render figures with its "vscode" renderer by default.
# NOTE(review): this is hard-coded for VS Code; presumably another renderer
# (e.g. "notebook" or "browser") is needed in other front ends — confirm.
pio.renderers.default = "vscode"

In [291]:
# Load the raw Uber pickup records and preview the first rows.
data = pd.read_csv(filepath_or_buffer="uber-raw-data-may14.csv")
data.head(n=5)

Unnamed: 0,Date/Time,Lat,Lon,Base
0,5/1/2014 0:02:00,40.7521,-73.9914,B02512
1,5/1/2014 0:06:00,40.6965,-73.9715,B02512
2,5/1/2014 0:15:00,40.7464,-73.9838,B02512
3,5/1/2014 0:17:00,40.7463,-74.0011,B02512
4,5/1/2014 0:17:00,40.7594,-73.9734,B02512


In [292]:
# Work on a random sample of at most 20000 rows to keep clustering and
# plotting fast.
# random_state pins the draw so that Restart & Run All reproduces the same rows
# (and therefore the same clusters); min() avoids a ValueError when the file
# holds fewer rows than the requested sample size.
data_sample = data.sample(n=min(20000, len(data)), random_state=42)

In [293]:
# Basic stats: dimensions of the sample, then the share of missing values
# (in percent) for every column.
print(f"Number of rows : {data_sample.shape[0]}")
print(f"Number of columns : {data_sample.shape[1]}")
print()


print("Percentage of missing values: ")
display(data_sample.isna().sum().mul(100).div(data_sample.shape[0]))

Number of rows : 20000
Number of columns : 4

Percentage of missing values: 


Date/Time    0.0
Lat          0.0
Lon          0.0
Base         0.0
dtype: float64

In [294]:
# Parse the 'Date/Time' strings into real timestamps, then show the resulting
# column dtypes.
data_sample = data_sample.assign(
    **{"Date/Time": pd.to_datetime(data_sample["Date/Time"])}
)
data_sample.dtypes

Date/Time    datetime64[ns]
Lat                 float64
Lon                 float64
Base                 object
dtype: object

In [295]:
# Derive three calendar features from the pickup timestamp: day of week,
# day of month and hour. The 'Date/Time' column is left in place by this cell.
data_sample = data_sample.assign(
    DayOfWeek=data_sample["Date/Time"].dt.dayofweek,
    Day=data_sample["Date/Time"].dt.day,
    Hour=data_sample["Date/Time"].dt.hour,
)

In [296]:
# Preview the sample with the newly added calendar columns.
data_sample.head(n=5)

Unnamed: 0,Date/Time,Lat,Lon,Base,DayOfWeek,Day,Hour
316387,2014-05-06 15:29:00,40.9122,-73.8396,B02617,1,6,15
324228,2014-05-08 14:15:00,40.7713,-73.9663,B02617,3,8,14
621205,2014-05-29 06:27:00,40.6466,-73.79,B02682,3,29,6
412662,2014-05-30 16:23:00,40.7233,-73.9827,B02617,4,30,16
74768,2014-05-05 19:18:00,40.7598,-73.9792,B02598,0,5,19


In [297]:
# Report the sample dimensions after feature engineering.
print(f"Number of rows : {data_sample.shape[0]}")
print(f"Number of columns : {data_sample.shape[1]}")
print()

Number of rows : 20000
Number of columns : 7



In [298]:
# Map of the sampled pickups before any clustering, coloured by day of week.
fig = px.scatter_mapbox(
    data_frame=data_sample,
    lat="Lat",
    lon="Lon",
    color="DayOfWeek",
    mapbox_style="carto-positron",
)
fig.show()

In [299]:
# Remove the raw timestamp now that DayOfWeek / Day / Hour have been derived.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps the
# cell idempotent: re-running it no longer raises KeyError once the column is gone.
data_sample = data_sample.drop(columns="Date/Time", errors="ignore")

In [300]:
# Preview the sample once the raw timestamp column has been removed.
data_sample.head(n=5)

Unnamed: 0,Lat,Lon,Base,DayOfWeek,Day,Hour
316387,40.9122,-73.8396,B02617,1,6,15
324228,40.7713,-73.9663,B02617,3,8,14
621205,40.6466,-73.79,B02682,3,29,6
412662,40.7233,-73.9827,B02617,4,30,16
74768,40.7598,-73.9792,B02598,0,5,19


In [301]:
# Map of the sampled pickups before any clustering, coloured by day of month.
fig = px.scatter_mapbox(
    data_frame=data_sample,
    lat="Lat",
    lon="Lon",
    color="Day",
    mapbox_style="carto-positron",
)
fig.show()

In [302]:
# Store the 'Base' codes with pandas' dedicated string dtype, then list the
# column dtypes.
data_sample["Base"] = data_sample["Base"].astype("string")
data_sample.dtypes

Lat          float64
Lon          float64
Base          string
DayOfWeek      int64
Day            int64
Hour           int64
dtype: object

In [303]:
# Quantitative columns (selected by name) are standardised.
numeric_features = ["Lat", "Lon", "DayOfWeek"]
numeric_transformer = StandardScaler()

# Categorical columns (selected by name) are one-hot encoded, with the first
# level of each feature dropped.
categorical_features = ["Base"]
categorical_transformer = OneHotEncoder(drop="first")

# Combine both transformers so each column group goes through its own step.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Fit the preprocessing on the sample and build the feature matrix X;
# the input head and the first five transformed rows are printed as a check.
print("Preprocessing sur le train set...")
print(data_sample.head())
X = preprocessor.fit_transform(data_sample)
print('...Terminé.')
print(X[0:5, :])
print()

Preprocessing sur le train set...
            Lat      Lon    Base  DayOfWeek  Day  Hour
316387  40.9122 -73.8396  B02617          1    6    15
324228  40.7713 -73.9663  B02617          3    8    14
621205  40.6466 -73.7900  B02682          3   29     6
412662  40.7233 -73.9827  B02617          4   30    16
74768   40.7598 -73.9792  B02598          0    5    19
...Terminé.
[[ 4.66228609  2.58110224 -1.19571244  0.          1.          0.
   0.        ]
 [ 0.83831078  0.16558575 -0.05653184  0.          1.          0.
   0.        ]
 [-2.54600235  3.52671879 -0.05653184  0.          0.          1.
   0.        ]
 [-0.46439195 -0.14707779  0.51305846  0.          1.          0.
   0.        ]
 [ 0.52620492 -0.08035081 -1.76530274  1.          0.          0.
   0.        ]]



### Let's start using DBSCAN: import the module and fit the model

In [304]:
from sklearn.cluster import DBSCAN

# Fit DBSCAN on the preprocessed matrix X using Manhattan distance:
# eps is the neighbourhood radius and min_samples the number of neighbours
# required for a core point.
db = DBSCAN(metric="manhattan", eps=0.2, min_samples=100).fit(X)
db

DBSCAN(eps=0.2, metric='manhattan', min_samples=100)

### Find out how many clusters DBSCAN created (label -1 marks noise points, not a cluster)

In [305]:
# Distinct labels produced by the fit; -1 is DBSCAN's label for noise points.
np.unique(ar=db.labels_)

array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9])

In [306]:
# Attach the DBSCAN label of every row as a new 'cluster' column and preview.
data_sample = data_sample.assign(cluster=db.labels_)
data_sample.head(n=5)

Unnamed: 0,Lat,Lon,Base,DayOfWeek,Day,Hour,cluster
316387,40.9122,-73.8396,B02617,1,6,15,-1
324228,40.7713,-73.9663,B02617,3,8,14,-1
621205,40.6466,-73.79,B02682,3,29,6,-1
412662,40.7233,-73.9827,B02617,4,30,16,-1
74768,40.7598,-73.9792,B02598,0,5,19,-1


In [307]:
# Number of points per label (the -1 row counts the noise points).
data_sample.cluster.value_counts()

-1    16341
 1     1016
 4      666
 2      428
 0      300
 5      260
 7      259
 3      220
 6      220
 8      149
 9      141
Name: cluster, dtype: int64

In [308]:
# Map only the points that DBSCAN assigned to a cluster (noise label -1 is
# filtered out), coloured by cluster id.
fig = px.scatter_mapbox(
    data_frame=data_sample.loc[data_sample["cluster"] != -1],
    lat="Lat",
    lon="Lon",
    color="cluster",
    mapbox_style="carto-positron",
)
fig.show()

In [309]:
# Same clustered points (noise label -1 excluded), this time coloured by day of week.
px.scatter_mapbox(
    data_frame=data_sample.query("cluster != -1"),
    lat="Lat",
    lon="Lon",
    color="DayOfWeek",
    mapbox_style="carto-positron",
)

In [310]:
# Quantitative columns (selected by name) are standardised; this run uses the
# pickup hour as its time feature.
numeric_features = ["Lat", "Lon", "Hour"]
numeric_transformer = StandardScaler()

# Categorical columns (selected by name) are one-hot encoded, with the first
# level of each feature dropped.
categorical_features = ["Base"]
categorical_transformer = OneHotEncoder(drop="first")

# Combine both transformers so each column group goes through its own step.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Fit the preprocessing on the sample and rebuild the feature matrix X;
# the input head and the first five transformed rows are printed as a check.
print("Preprocessing sur le train set...")
print(data_sample.head())
X = preprocessor.fit_transform(data_sample)
print('...Terminé.')
print(X[0:5, :])
print()

Preprocessing sur le train set...
            Lat      Lon    Base  DayOfWeek  Day  Hour  cluster
316387  40.9122 -73.8396  B02617          1    6    15       -1
324228  40.7713 -73.9663  B02617          3    8    14       -1
621205  40.6466 -73.7900  B02682          3   29     6       -1
412662  40.7233 -73.9827  B02617          4   30    16       -1
74768   40.7598 -73.9792  B02598          0    5    19       -1
...Terminé.
[[ 4.66228609  2.58110224  0.0834814   0.          1.          0.
   0.        ]
 [ 0.83831078  0.16558575 -0.08894739  0.          1.          0.
   0.        ]
 [-2.54600235  3.52671879 -1.46837771  0.          0.          1.
   0.        ]
 [-0.46439195 -0.14707779  0.25591019  0.          1.          0.
   0.        ]
 [ 0.52620492 -0.08035081  0.77319656  1.          0.          0.
   0.        ]]



In [311]:
# Re-fit DBSCAN on the feature matrix X rebuilt in the previous cell.
# Without this step `db` still holds the labels of the earlier fit, so the
# labels inspected (and stored) below would just repeat the first clustering.
db = DBSCAN(eps=0.2, min_samples=100, metric="manhattan")
db.fit(X)

# Distinct labels of the new fit; -1 is DBSCAN's label for noise points.
np.unique(db.labels_)

array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9])

In [312]:
# Store the labels currently held by `db` in a separate 'cluster_2' column
# and preview the result.
data_sample = data_sample.assign(cluster_2=db.labels_)
data_sample.head(n=5)

Unnamed: 0,Lat,Lon,Base,DayOfWeek,Day,Hour,cluster,cluster_2
316387,40.9122,-73.8396,B02617,1,6,15,-1,-1
324228,40.7713,-73.9663,B02617,3,8,14,-1,-1
621205,40.6466,-73.79,B02682,3,29,6,-1,-1
412662,40.7233,-73.9827,B02617,4,30,16,-1,-1
74768,40.7598,-73.9792,B02598,0,5,19,-1,-1


In [313]:
# Map the clustered points of the second run, coloured by pickup hour.
# Noise is filtered on 'cluster_2' (the labels stored for this run) rather than
# on 'cluster', which belongs to the earlier DayOfWeek-based clustering.
px.scatter_mapbox(
    data_sample.loc[data_sample.cluster_2 != -1, :],
    lat="Lat",
    lon="Lon",
    color="Hour",
    mapbox_style="carto-positron"
)