In [1]:
import plotly.io as pio
import pandas as pd 
import numpy as np
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from datetime import datetime
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

## EDA

In [2]:
# Load the raw pickup records (columns: Date/Time, Lat, Lon, Base) from the
# CSV file located next to the notebook.
raw_csv_path = 'uber-raw-data-apr14.csv'
df = pd.read_csv(raw_csv_path)

In [3]:

# Quick overview of the raw frame: dimensions, first rows, summary statistics
# and the share of missing values per column.
n_rows, n_cols = df.shape
print(f"Number of rows : {n_rows}")
print(f"Number of columns : {n_cols}")
print()

print("Display of dataset: ")
display(df.head())
print()

# include='all' so that non-numeric columns are summarised as well
# (count / unique / top / freq).
print("Basics statistics: ")
data_desc = df.describe(include='all')
display(data_desc)
print()

print("Percentage of missing values: ")
missing_pct = 100 * df.isnull().sum() / n_rows
display(missing_pct)

Number of rows : 564516
Number of columns : 4

Display of dataset: 


Unnamed: 0,Date/Time,Lat,Lon,Base
0,4/1/2014 0:11:00,40.769,-73.9549,B02512
1,4/1/2014 0:17:00,40.7267,-74.0345,B02512
2,4/1/2014 0:21:00,40.7316,-73.9873,B02512
3,4/1/2014 0:28:00,40.7588,-73.9776,B02512
4,4/1/2014 0:33:00,40.7594,-73.9722,B02512



Basics statistics: 


Unnamed: 0,Date/Time,Lat,Lon,Base
count,564516,564516.0,564516.0,564516
unique,41999,,,5
top,4/7/2014 20:21:00,,,B02682
freq,97,,,227808
mean,,40.740005,-73.976817,
std,,0.036083,0.050426,
min,,40.0729,-74.7733,
25%,,40.7225,-73.9977,
50%,,40.7425,-73.9848,
75%,,40.7607,-73.97,



Percentage of missing values: 


Date/Time    0.0
Lat          0.0
Lon          0.0
Base         0.0
dtype: float64

## Preprocessing

In [4]:
# Derive the two temporal features used downstream (day of week, hour of day)
# from the pickup timestamp, then drop the columns that are not clustered on.
#
# The guard makes the cell safe to re-run: once 'Date/Time' has been removed,
# an unguarded second execution would raise KeyError. A frame that lacks
# 'Date/Time' on the first run still fails loudly, as before.
if not {'dayofweek', 'hour'}.issubset(df.columns):
    # Explicit format: no per-row format inference, no month/day ambiguity.
    pickup_time = pd.to_datetime(df['Date/Time'], format='%m/%d/%Y %H:%M:%S')
    df = (
        df
        .assign(
            dayofweek=pickup_time.dt.dayofweek,  # Monday=0 ... Sunday=6
            hour=pickup_time.dt.hour,
        )
        .drop(columns=['Date/Time', 'Base'])
    )

df.head()

Unnamed: 0,Lat,Lon,dayofweek,hour
0,40.769,-73.9549,1,0
1,40.7267,-74.0345,1,0
2,40.7316,-73.9873,1,0
3,40.7588,-73.9776,1,0
4,40.7594,-73.9722,1,0


In [5]:
# Keep a single hour of a single weekday for a first K-Means test:
# pickups between 16:00 and 16:59 on Fridays (dayofweek 4, Monday being 0).
# .copy() makes df_sample an independent frame, so adding the cluster column
# to it later does not trigger pandas' SettingWithCopyWarning.
is_test_slot = (df['hour'] == 16) & (df['dayofweek'] == 4)
df_sample = df[is_test_slot].copy()
df_sample.head()

Unnamed: 0,Lat,Lon,dayofweek,hour
4655,40.7404,-73.9961,4,16
4656,40.774,-73.8715,4,16
4657,40.7502,-73.9733,4,16
4658,40.7446,-73.9852,4,16
4659,40.7574,-73.9691,4,16


In [6]:
# Standardise the coordinates before clustering.
# Only Lat/Lon are used: 'dayofweek' and 'hour' are constant in df_sample (it
# was filtered on both), so they carry no information for the clustering.
# Selecting the columns explicitly also keeps 'cluster_km' out of the features
# if this cell is re-run after the labels have been attached to df_sample.
scaler = StandardScaler()
X = scaler.fit_transform(df_sample[['Lat', 'Lon']])

## K-Means

In [7]:
# Elbow method: fit K-Means for K = 1..10 and record the within-cluster sum of
# squares (inertia) of each fit.
k = list(range(1, 11))
wcss = []
for n_clusters in k:
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
    wcss.append(kmeans.inertia_)
    print(f"WCSS for K={n_clusters} --> {wcss[-1]}")



WCSS for K=1 --> 12518.000000000002




WCSS for K=2 --> 8301.830891324567
WCSS for K=3 --> 5450.825507432668




WCSS for K=4 --> 4380.997907001366
WCSS for K=5 --> 3443.93331192659




WCSS for K=6 --> 2817.5189367561793




WCSS for K=7 --> 2426.279795532438




WCSS for K=8 --> 2001.3376915412628




WCSS for K=9 --> 1775.747852264773




WCSS for K=10 --> 1562.0072216857852


In [8]:
# Elbow plot: inertia (WCSS) as a function of the number of clusters.
# The x axis is rebuilt from `wcss` itself rather than read from the shared
# list `k`, which the silhouette cell re-assigns to 2..10; re-running this
# cell after that one would otherwise pair 10 inertias with 9 x values.
wcss_frame = pd.DataFrame({
    "n_clusters": range(1, len(wcss) + 1),  # the elbow loop starts at K=1
    "inertia": wcss,
})

# Create figure
fig = px.line(wcss_frame, x="n_clusters", y="inertia")

# Create title and axis labels
fig.update_layout(
    yaxis_title="Inertia",
    xaxis_title="# Clusters",
    title="Inertia per cluster"
)

fig.show()

In [9]:
# Silhouette analysis: fit K-Means for K = 2..10 and record the mean
# silhouette score of each partition.
from sklearn.metrics import silhouette_score

k = list(range(2, 11))
sil = []
for n_clusters in k:
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
    score = silhouette_score(X, kmeans.predict(X))
    sil.append(score)
    print(f"Silhouette score for K={n_clusters} is {score}")





Silhouette score for K=2 is 0.766244556556993






Silhouette score for K=3 is 0.4511715915585703






Silhouette score for K=4 is 0.5054879930219334






Silhouette score for K=5 is 0.4729080141391269






Silhouette score for K=6 is 0.4898110781940099






Silhouette score for K=7 is 0.41428237324746386






Silhouette score for K=8 is 0.426271757825489






Silhouette score for K=9 is 0.44206379900183546






Silhouette score for K=10 is 0.44375506332002007


In [10]:
# Silhouette score for each candidate number of clusters.
# The x axis is rebuilt from `sil` itself instead of the shared list `k`,
# which the elbow cell also assigns (1..10): re-running that cell would
# otherwise leave 10 x values for 9 scores.
cluster_scores = pd.DataFrame({
    "n_clusters": range(2, len(sil) + 2),  # the silhouette loop starts at K=2
    "silhouette": sil,
})

# Create figure
fig = px.bar(cluster_scores, x="n_clusters", y="silhouette")

# Add title and axis labels
fig.update_layout(
    yaxis_title="Silhouette Score",
    xaxis_title="# Clusters",
    title="Silhouette Score per cluster"
)

# Render
fig.show()

In [11]:
# Final K-Means model on the scaled sample, with 6 clusters.
# NOTE(review): the silhouette scores printed above peak at K=2; the reason
# for picking 6 is not recorded in the notebook — worth documenting.
n_clusters_final = 6
kmeans = KMeans(n_clusters=n_clusters_final, random_state=0)

# Fit on the scaled features; as the last expression, the fitted estimator is
# also displayed by the notebook.
kmeans.fit(X)





In [12]:
# Attach each pickup's K-Means label to the sample.
# assign() returns a new frame instead of writing into what pandas may treat
# as a slice of df, which is what raises SettingWithCopyWarning.
df_sample = df_sample.assign(cluster_km=kmeans.labels_)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [13]:
# Visualisation on a map: one point per pickup, coloured by K-Means cluster.
# K-Means assigns every point to a cluster (there is no noise label), so no
# filtering on the label is needed. df_sample holds a single 'dayofweek'
# value, so animating over that column would only add a one-step slider.
fig = px.scatter_mapbox(
    df_sample,
    lat="Lat",
    lon="Lon",
    color="cluster_km",
    zoom=10,
    mapbox_style="carto-positron",
)
fig.show()

This map is not easy to interpret: K-Means cuts New York in the middle of Manhattan. Let's try DBSCAN instead.

## DBSCAN

In [15]:
# Let's focus on Wednesdays (dayofweek 2, Monday being 0).
df_wen = df[df['dayofweek'] == 2]

# Create a dataset with only the information useful for DBSCAN.
# .copy() makes it an independent frame, so the cluster labels can be added
# to it later without a SettingWithCopyWarning.
df_wen_db = df_wen[['Lat', 'Lon', 'hour']].copy()

In [16]:
# Standardise the DBSCAN features so that latitude, longitude and hour are on
# comparable scales for the distance computation.
# Columns are selected explicitly so that the 'cluster' column added to
# df_wen_db further down is not scaled and fed back in when this cell is
# re-run.
scaler = StandardScaler()
X_wen = scaler.fit_transform(df_wen_db[['Lat', 'Lon', 'hour']])

In [17]:
# Density-based clustering on the scaled (Lat, Lon, hour) features.
# eps is expressed in standardised units because X_wen has been scaled.
dbscan_params = {"eps": 0.3, "min_samples": 90, "metric": "manhattan"}
db = DBSCAN(**dbscan_params)

# As the last expression, the fitted estimator is also displayed.
db.fit(X_wen)

In [18]:
# Attach the DBSCAN label of each pickup. assign() returns a new frame rather
# than writing into a possible slice of df (SettingWithCopyWarning).
df_wen_db = df_wen_db.assign(cluster=db.labels_)

# DBSCAN labels noise points -1: keep clustered pickups only. Sorting by hour
# (stable sort) makes the animation run chronologically, since Plotly Express
# creates frames in order of first appearance in the data.
clustered = df_wen_db[df_wen_db['cluster'] >= 0].sort_values('hour', kind='mergesort')

fig = px.scatter_mapbox(
    clustered,
    animation_frame='hour',
    lat="Lat",
    lon="Lon",
    color="cluster",
    zoom=10,
    mapbox_style="carto-positron",
)
fig.show()

We can see a really interesting cluster at the beginning of the day (7 to 9 am) around an airport to the north.
It then reappears at the end of the day. This is a really useful piece of information for cabs: it tells them when to focus on Manhattan and when to focus on the airport.

We can see the same phenomenon around Brooklyn and Williamsburg at the beginning and at the end of the day.
I'm quite sure that these are residential areas rather than business districts, so people leave for work in the morning and come back at the end of the day.