# Uber pickups
The objective of this project is to build algorithms that identify the hot zones where Uber drivers should position themselves.

This was done for New York City with data from April 2014.

In [4]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN, MiniBatchKMeans
from sklearn.metrics import  silhouette_score
import seaborn as sns

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

import plotly.express as px
import plotly.graph_objects as go


In [5]:
# Raw Uber pickup records (columns: Date/Time, Lat, Lon, Base).
# The CSV is read from the current working directory.
data = pd.read_csv("uber-raw-data-apr14.csv")

In [8]:
# Quick overview of the raw data: row count, first rows, summary statistics, dtypes
print(data.shape[0])
display(data.head())
display(data.describe(include='all'))
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
data.info()

564516


Unnamed: 0,Date/Time,Lat,Lon,Base
0,4/1/2014 0:11:00,40.769,-73.9549,B02512
1,4/1/2014 0:17:00,40.7267,-74.0345,B02512
2,4/1/2014 0:21:00,40.7316,-73.9873,B02512
3,4/1/2014 0:28:00,40.7588,-73.9776,B02512
4,4/1/2014 0:33:00,40.7594,-73.9722,B02512


Unnamed: 0,Date/Time,Lat,Lon,Base
count,564516,564516.0,564516.0,564516
unique,41999,,,5
top,4/7/2014 20:21:00,,,B02682
freq,97,,,227808
mean,,40.740005,-73.976817,
std,,0.036083,0.050426,
min,,40.0729,-74.7733,
25%,,40.7225,-73.9977,
50%,,40.7425,-73.9848,
75%,,40.7607,-73.97,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 564516 entries, 0 to 564515
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   Date/Time  564516 non-null  object 
 1   Lat        564516 non-null  float64
 2   Lon        564516 non-null  float64
 3   Base       564516 non-null  object 
dtypes: float64(2), object(2)
memory usage: 17.2+ MB
None


It seems that we have no missing data ! However, we need to transform dates into datetime objects.

In [10]:
# Parse the pickup timestamps (loaded as strings) into datetimes so that the
# .dt accessor (hour, weekday) can be used in the following cells.
data['Date/Time'] = pd.to_datetime(data['Date/Time'])

### To find hot spots in New York at different times, I will compare two clustering methods: K-means and DBSCAN.

### K-mean hyperparameter optimization

In [15]:
# Draw a reproducible 15% sample of the pickups for the (expensive) search of k
sample = int(.15 * data.shape[0])
data_sample = data.sample(n=sample, random_state=0)
# The pickup hour is used as a third clustering feature next to Lat / Lon
data_sample = data_sample.assign(hour=data_sample['Date/Time'].dt.hour)
# Keep only the numeric features for clustering
X = data_sample.drop(columns=["Date/Time", "Base"])

The optimal number of clusters is estimated here using the elbow and silhouette methods on a sample of the original data. I am aware that the result may vary with the sample drawn, but this methodology is used nonetheless to reduce computation time.

The estimation was made using a sample of about 85,000 observations, or 15% of the dataset.

In [16]:
# Standardise each feature (zero mean, unit variance) before clustering.
# Note: X is rebound from a DataFrame to the scaled NumPy array; re-running this
# cell on its own would refit `scaler` on already-scaled values.
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [17]:
# Scan k = 2..10 and record, for each k, the within-cluster sum of squares (inertia)
# and the silhouette score of a MiniBatchKMeans fit on the scaled sample.
k_scores = []
for k in range(2, 11):
    kmeans = MiniBatchKMeans(n_clusters=k, random_state=0, n_init='auto')
    kmeans.fit(X)
    k_scores.append({
        'k': k,
        'WCSS': kmeans.inertia_,
        'Silhouette Score': silhouette_score(X, kmeans.predict(X)),
    })

# One row per k, indexed by k
k_choice = pd.DataFrame(k_scores).set_index('k')
k_choice

Unnamed: 0_level_0,WCSS,Silhouette Score
k,Unnamed: 1_level_1,Unnamed: 2_level_1
2,192555.246149,0.350027
3,146307.059632,0.382319
4,120002.174832,0.329177
5,106684.724558,0.30406
6,98972.850932,0.269287
7,93871.365007,0.234114
8,80417.271402,0.26947
9,69141.659251,0.283979
10,65109.037947,0.279612


In [18]:
# Combined chart for choosing k: WCSS as a line on the left axis and the
# silhouette score as bars on a secondary right-hand axis.
figs = go.Figure()

# WCSS curve, drawn against the primary y axis
figs.add_trace(go.Scatter(
    x = k_choice.index, 
    y = k_choice['WCSS'], 
    name = 'WCSS', 
    mode = 'lines', 
    xaxis = 'x', 
    yaxis = 'y',
))

# Silhouette bars, drawn against the secondary y axis ('y2')
figs.add_trace(go.Bar(
    x = k_choice.index, 
    y = k_choice['Silhouette Score'], 
    name = 'Silhouette Score', 
    xaxis = 'x', 
    yaxis = 'y2',
    opacity = 0.7
))

# yaxis2 overlays the primary axis and sits on the right side of the plot
figs.update_layout(
    xaxis = go.layout.XAxis(title_text = 'K', tickvals = k_choice.index),
    yaxis = go.layout.YAxis(title_text = 'WCSS', color = 'blue'),
    yaxis2 = go.layout.YAxis(title_text = 'Silhouette Score', anchor = 'x', side = 'right', overlaying = 'y', color = 'red'),
    showlegend = True,
    legend_xanchor = 'left',
    legend_x = 1.2,
    legend_font_size = 14,
    autosize = False,
    title_text = 'Choice of optimal K value',
    title_x = 0.5,
    height = 500, 
    width = 900
)

figs.show()

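WCSS always decreases as k grows, so we look for an elbow in the curve rather than its minimum, combined with a high silhouette score. The silhouette score is highest for k = 3, but among the finer partitions k = 9 combines a marked drop in WCSS with a local maximum of the silhouette score, so 9 clusters are used here.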

In [19]:
# Final K-means model on the scaled sample with the retained k = 9
# (fixed random_state for reproducibility, 10 initialisations).
kmeans = KMeans(n_clusters = 9, random_state = 0, n_init=10)
kmeans.fit(X)

In [20]:
# Attach the cluster label of every sampled pickup (rows of X are in the same
# order as the rows of data_sample).
data_sample.loc[:,'clusters'] = kmeans.predict(X)
data_sample.head()

Unnamed: 0,Date/Time,Lat,Lon,Base,hour,clusters
63031,2014-04-06 05:53:00,40.78,-73.9486,B02598,5,5
100482,2014-04-13 11:56:00,40.7495,-73.9917,B02598,11,6
239000,2014-04-05 23:41:00,40.7475,-74.0089,B02617,23,0
216098,2014-04-30 20:09:00,40.7555,-73.9917,B02598,20,2
160769,2014-04-24 13:06:00,40.645,-73.7819,B02598,13,3


In [21]:
# Map of the sampled pickups coloured by K-means cluster label.
# 'clusters' is numeric, so plotly colours the points with a continuous colour scale.
fig = px.scatter_mapbox(data_sample, lat='Lat', lon='Lon', color='clusters')

fig.update_layout(mapbox_style="carto-positron",
                  margin={"r": 5, "t": 5, "l": 5, "b": 5},
                  autosize=False,
                  width=1000,
                  height=800,
                  mapbox_zoom=10)


fig.show()

K-means clustering displays interesting results. We can clearly see a rough separation by neighborhoods such as Manhattan, the Bronx and Brooklyn. Other clusters overlap with each other because the hour of the pickup was added as a third clustering feature. The purple and dark yellow clusters, for example, correspond to the same neighborhoods but at different hours of the day.

The only drawback of this clustering method is that every data point is included, outliers too, which can shift the coordinates of the cluster centroids.

# DBSCAN

With DBSCAN there is no automated hyperparameter optimization. Therefore, we need to choose the parameters by hand.

For this case study, the best kind of clustering would be one with dense clusters and a low sensitivity to outliers.

Dense clusters to really locate important zones for Uber drivers, and low outlier sensitivity so as to not ignore the outskirts of the city.

This would be achieved with high values for both epsilon and min_samples.

In [22]:
# DBSCAN on the scaled sample X, using Manhattan distance in the standardised
# feature space; eps and min_samples were chosen by hand.
db = DBSCAN(eps=0.4, min_samples=100, metric="manhattan")
db.fit(X)
data_sample.loc[:,'dbclusters'] = db.labels_

# DBSCAN labels noise points as -1: they are left out of the map
fig = px.scatter_mapbox(data_sample[data_sample.dbclusters != -1], lat='Lat', lon='Lon', color='dbclusters')

fig.update_layout(mapbox_style="carto-positron",
                  margin={"r": 5, "t": 5, "l": 5, "b": 5},
                  autosize=False,
                  width=1000,
                  height=800,
                  mapbox_zoom=10)


fig.show()

We obtain four clusters: a massive one for Manhattan and its surroundings, and three smaller ones near John F. Kennedy International Airport, Newark Liberty International Airport and LaGuardia Airport.

Although DBscan does manage to cluster important locations, there are two important drawbacks to this method:

- First, the central cluster is too big. Changing the hyperparameters only leads to too many clusters or to a single one in central Manhattan. Given the heavy traffic in this area and the size of the cluster, a driver placed at its centroid would take too long to reach the outermost points, such as those in central Brooklyn. It would be preferable to at least split Uber drivers between uptown and downtown Manhattan.

- Second, DBSCAN does not return centroid coordinates, and although these can easily be calculated for Euclidean distances, the task is more complex for latitudes and longitudes.

Given these results I believe k-means is better suited for this case study, and will be used in the following.

# Clustering visualization per weekday

First I will retrieve the cluster centroid coordinates for each weekday.

In [33]:
# Derive the weekday (Monday == 0) and the hour of every pickup on the full dataset
data['day'] = data['Date/Time'].dt.weekday
data['hour'] = data['Date/Time'].dt.hour
weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekday_data = {}
plot_data = {}
# For each weekday: fit K-means (k = 9) on the scaled (Lat, Lon, hour) features and
# summarise every cluster by its centroid coordinates and its number of pickups.
# Dedicated loop-local names are used so that the sample-level `X`, `scaler` and
# `kmeans` objects of the previous sections are not silently overwritten.
for d, day_name in enumerate(weekday_names):
    day_rows = data[data['day'] == d]
    day_scaler = StandardScaler()
    day_features = day_scaler.fit_transform(day_rows[['Lat', 'Lon', 'hour']])
    day_kmeans = KMeans(n_clusters = 9, random_state = 0, n_init=10)
    day_kmeans.fit(day_features)
    labels = day_kmeans.predict(day_features)  # predicted once and reused below
    # Centroids back in original units: column 0 is Lat, column 1 is Lon
    centers = day_scaler.inverse_transform(day_kmeans.cluster_centers_)
    # .assign() builds a new frame, so no chained assignment on a filtered slice
    # (and no global pandas option to silence the related warning) is needed.
    weekday_data[day_name] = day_rows.assign(
        cluster=labels,
        centroid_lat=centers[labels, 0],
        centroid_lon=centers[labels, 1],
    )
    # Cluster sizes are counted from the labels, so lat, lon and count of one row
    # always belong to the same cluster. Pairing the indexes of separate
    # value_counts() calls relied on both being ordered identically.
    sizes = np.bincount(labels, minlength=len(centers))
    plot_df = pd.DataFrame(data={'lat' : centers[:, 0], 'lon' : centers[:, 1], 'count' : sizes})
    # Drop empty clusters and keep the busiest first, as value_counts() did
    plot_data[day_name] = plot_df[plot_df['count'] > 0]\
        .sort_values('count', ascending=False)\
        .reset_index(drop=True)

Then I will arbitrarily class the clusters by the number of requests. This will give me a proxy of priority: many requests in a hotspot means more priority !

In [34]:
def daily_requests(x):
    """Map a daily pickup count of a cluster to a marker size.

    Returns 10 for fewer than 10000 pickups, 30 for more than 20000 and
    20 for everything in between (both bounds included).
    """
    if x < 10000:
        return 10
    return 30 if x > 20000 else 20
    
# Add to every weekday table the marker size and the matching priority label
for day_table in plot_data.values():
    day_table['requests'] = day_table['count'].apply(daily_requests)
    day_table['request_nb'] = day_table['requests'].map({30: "High", 20: "Normal", 10: "Low"})


Now I can plot the hotspots for Uber drivers. In this map, drivers can choose the day of the week to see where they should position themselves for optimal pickup speed.

In [35]:
# One colour per weekday
colors = sns.color_palette("hls", 7).as_hex()
weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
fig = go.Figure()
# One trace per weekday so that days can be toggled from the legend;
# marker size encodes the request level and the hover text shows its label.
for i, d in enumerate(weekday_names):
        fig.add_trace(go.Scattermapbox(
              lat=plot_data[d]['lat'], 
              lon=plot_data[d]['lon'],
              mode='markers',
              name = d,
              text=plot_data[d]['request_nb'],
              hoverinfo='text',
              hovertemplate="Number of requests : %{text}",
              marker=go.scattermapbox.Marker(
              size=plot_data[d]['requests'],
                      color=colors[i])))
# Fixed-size map with a hard-coded centre (lat 40.730610, lon -73.935242)
fig.update_layout(mapbox_style="carto-positron",
                  title=dict(text="Uber pickup hotspots per weekday", 
                             font=dict(size=40)),
                  legend_title="Weekday",
                  autosize=False,
                  width=1000,
                  height=800,
                  mapbox_zoom=10,
                  mapbox=dict(
                      center=dict(
                          lat=40.730610,
                          lon=-73.935242)
                  ))
fig.show()

However, it would be nice to have a more detailed information by the hour, once we have chosen the day that suits us

In [37]:
# Weekday (Monday == 0) and hour of every pickup; recomputed here so that the
# cell does not depend on the weekday section having been run.
data['day'] = data['Date/Time'].dt.weekday
data['hour'] = data['Date/Time'].dt.hour
# Pickups of the selected weekday (1 == Tuesday)
day = data[data['day'] == 1]

weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
hours = {}
plot_data={}
# For each hour of the selected day: fit K-means (k = 9) on the scaled coordinates
# and summarise every cluster by its centroid and its number of pickups.
for h in range(24): 
    hour_rows = day[day['hour'] == h]\
        .drop(["Date/Time", "Base", "hour"], axis=1)\
        .reset_index(drop=True)
    # Only Lat / Lon are clustered: 'day' is constant within this subset and
    # carries no information (it previously went through scaling and K-means
    # as an all-zero column).
    hour_scaler = StandardScaler()
    hour_features = hour_scaler.fit_transform(hour_rows[['Lat', 'Lon']])
    hour_kmeans = KMeans(n_clusters = 9, random_state = 0, n_init=10)
    hour_kmeans.fit(hour_features)
    labels = hour_kmeans.predict(hour_features)  # predicted once and reused below
    # Centroids back in original units: column 0 is Lat, column 1 is Lon
    centers = hour_scaler.inverse_transform(hour_kmeans.cluster_centers_)
    hours[h] = hour_rows.assign(
        cluster=labels,
        centroid_lat=centers[labels, 0],
        centroid_lon=centers[labels, 1],
    )
    # Cluster sizes are counted from the labels, so lat, lon and count of one row
    # always belong to the same cluster. Pairing the indexes of separate
    # value_counts() calls relied on both being ordered identically.
    sizes = np.bincount(labels, minlength=len(centers))
    plot_df = pd.DataFrame(data={'lat' : centers[:, 0], 'lon' : centers[:, 1], 'count' : sizes})
    # Drop empty clusters and keep the busiest first, as value_counts() did
    plot_data[h] = plot_df[plot_df['count'] > 0]\
        .sort_values('count', ascending=False)\
        .reset_index(drop=True)


In [38]:
def hourly_requests(x):
    """Map an hourly pickup count of a cluster to a marker size.

    Returns 10 for fewer than 100 pickups, 30 for more than 1000 and
    20 for everything in between (both bounds included).
    """
    if x < 100:
        return 10
    return 30 if x > 1000 else 20
    
# Add to every hourly table the marker size and the matching priority label
for hour_table in plot_data.values():
    hour_table['requests'] = hour_table['count'].apply(hourly_requests)
    hour_table['request_nb'] = hour_table['requests'].map({30: "High", 20: "Normal", 10: "Low"})

In [39]:
# One colour per hour of the day, taken from a sequential palette
colors = sns.color_palette("flare", 24).as_hex()
# Legend labels: '0h', '1h', ..., '23h'
hour_names = [str(x) + 'h' for x in list(hours.keys())]

fig = go.Figure()

# One trace per hour so that hours can be toggled from the legend;
# marker size encodes the request level and the hover text shows its label.
for h in range(24):
        fig.add_trace(go.Scattermapbox(
              lat=plot_data[h]['lat'], 
              lon=plot_data[h]['lon'],
              mode='markers',
              name = hour_names[h],
              text=plot_data[h]['request_nb'],
              hoverinfo='text',
              hovertemplate="Number of requests : %{text}",
              marker=go.scattermapbox.Marker(
              size=plot_data[h]['requests'],
                      color=colors[h])))

# The weekday index in the title (1) must match the day filtered in the clustering cell
fig.update_layout(mapbox_style="carto-positron",
                  title=dict(text=f"Hourly Uber pickup hotspots on {weekday_names[1]}s", 
                             font=dict(size=40)),
                  legend_title="Hour",
                  autosize=False,
                  width=1000,
                  height=800,
                  mapbox_zoom=10,
                  mapbox=dict(
                      center=dict(
                          lat=40.730610,
                          lon=-73.935242)
                  ))
fig.show()

Now this is very useful information, but changing the day of the week in the code is not very user-friendly. Therefore, I need to generalize the approach.

## Generalize the approach

All of the code above is integrated into a class Uber with several methods 

In [41]:
class Uber():
    """K-means hot-spot analysis of a table of Uber pickups.

    The input frame must provide the columns ``Date/Time`` (anything
    ``pd.to_datetime`` can parse), ``Lat`` and ``Lon``.

    Attributes
    ----------
    data : pandas.DataFrame
        Private copy of the input with ``Date/Time`` parsed; the hotspot
        methods add the ``day`` (Monday == 0) and ``hour`` columns to it.
    sample : int
        Number of rows drawn for the search of k (15% of the dataset).
    data_sample : pandas.DataFrame
        Reproducible random sample of ``data`` used by ``hyperparameter``.
    """

    WEEKDAY_NAMES = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

    def __init__(self,data):
        """Store a parsed copy of ``data`` and draw the sample used to tune k."""
        # Work on a private copy so that the caller's DataFrame is never modified
        self.data = data.copy()
        # Convert column to date-time object
        self.data['Date/Time'] = pd.to_datetime(self.data['Date/Time'])
        # Calculate sample size as 15% of the initial dataset
        self.sample = int(.15*self.data.shape[0])
        # Sample the dataset for optimal K calculation
        self.data_sample = self.data.sample(self.sample, random_state=0)

    # ------------------------------------------------------------------ helpers

    def _add_time_columns(self):
        """Add the ``day`` (Monday == 0) and ``hour`` columns to ``self.data``."""
        self.data['day'] = self.data['Date/Time'].dt.weekday
        self.data['hour'] = self.data['Date/Time'].dt.hour

    def _centroid_table(self, features, k):
        """Cluster ``features`` with k-means and summarise the clusters.

        ``features`` is a DataFrame whose first two columns are latitude and
        longitude (further columns are used as extra clustering features).
        Returns a DataFrame with one row per non-empty cluster and the columns
        ``lat``, ``lon`` (centroid in original units) and ``count`` (number of
        rows in the cluster), sorted by decreasing ``count``.
        """
        if len(features) == 0:
            # Nothing to cluster (e.g. an hour without any pickup)
            return pd.DataFrame({'lat': pd.Series(dtype=float),
                                 'lon': pd.Series(dtype=float),
                                 'count': pd.Series(dtype=int)})

        scaler = StandardScaler()
        X = scaler.fit_transform(features)

        # K-means cannot build more clusters than there are observations
        n_clusters = min(k, len(features))
        kmeans = KMeans(n_clusters = n_clusters, random_state = 0, n_init=10)
        kmeans.fit(X)
        labels = kmeans.predict(X)

        # Centroids back in original units: column 0 is latitude, column 1 longitude
        centers = scaler.inverse_transform(kmeans.cluster_centers_)
        # Sizes are counted from the labels so that lat, lon and count of one row
        # always describe the same cluster
        sizes = np.bincount(labels, minlength=n_clusters)

        table = pd.DataFrame(data={'lat' : centers[:, 0], 'lon' : centers[:, 1], 'count' : sizes})
        return table[table['count'] > 0]\
            .sort_values('count', ascending=False)\
            .reset_index(drop=True)

    def _add_request_levels(self, table, size_function):
        """Add marker sizes (``requests``) and their labels (``request_nb``) to ``table``."""
        table['requests'] = table['count'].apply(size_function)
        table['request_nb'] = table['requests'].map({30: "High", 20: "Normal", 10: "Low"})
        return table

    def _show_hotspot_map(self, traces, title, legend_title):
        """Display one map trace per ``(name, table, color)`` tuple of ``traces``."""
        fig = go.Figure()

        for name, table, color in traces:
            fig.add_trace(go.Scattermapbox(
                lat=table['lat'],
                lon=table['lon'],
                mode='markers',
                name=name,
                text=table['request_nb'],
                hoverinfo='text',
                hovertemplate="Number of requests : %{text}",
                marker=go.scattermapbox.Marker(
                    size=table['requests'],
                    color=color)))

        fig.update_layout(mapbox_style="carto-positron",
                          title=dict(text=title,
                                     font=dict(size=40)),
                          legend_title=legend_title,
                          autosize=False,
                          width=1000,
                          height=800,
                          mapbox_zoom=10,
                          mapbox=dict(
                              center=dict(
                                  lat=40.730610,
                                  lon=-73.935242)
                          ))
        fig.show()

    # --------------------------------------------------------------- public API

    def hyperparameter(self):  # This method allows for selection on the ideal number of clusters
        """Plot WCSS and silhouette score of MiniBatchKMeans for k = 2..10.

        The scores are computed on ``self.data_sample`` with the features
        Lat, Lon and pickup hour, i.e. the features ``daily_hotspots``
        clusters on.
        """
        # The hour must be computed on the sample itself. It was previously
        # written to self.data, so the sample was clustered on Lat / Lon only
        # and k was tuned on different features than the ones used afterwards.
        features = self.data_sample[['Lat', 'Lon']]\
            .assign(hour=self.data_sample['Date/Time'].dt.hour)
        # Scale data
        scaler = StandardScaler()
        X = scaler.fit_transform(features)

        # Calculate within-cluster sum of squares and silhouette scores for different values of k
        wcss_list = []
        sil_list = []
        k_list = []
        for k in range(2,10+1):
            kmeans = MiniBatchKMeans(n_clusters = k, random_state = 0, n_init='auto')
            kmeans.fit(X)
            wcss_list.append(kmeans.inertia_)
            sil_list.append(silhouette_score(X, kmeans.predict(X)))
            k_list.append(k)

        k_choice = pd.DataFrame({'k':k_list, 'WCSS':wcss_list, 'Silhouette Score':sil_list}).set_index('k')

        # Plot the results: WCSS as a line (left axis), silhouette as bars (right axis)
        figs = go.Figure()

        figs.add_trace(go.Scatter(
            x = k_choice.index,
            y = k_choice['WCSS'],
            name = 'WCSS',
            mode = 'lines',
            xaxis = 'x',
            yaxis = 'y',
        ))

        figs.add_trace(go.Bar(
            x = k_choice.index,
            y = k_choice['Silhouette Score'],
            name = 'Silhouette Score',
            xaxis = 'x',
            yaxis = 'y2',
            opacity = 0.7
        ))

        figs.update_layout(
            xaxis = go.layout.XAxis(title_text = 'K', tickvals = k_choice.index),
            yaxis = go.layout.YAxis(title_text = 'WCSS', color = 'blue'),
            yaxis2 = go.layout.YAxis(title_text = 'Silhouette Score', anchor = 'x', side = 'right', overlaying = 'y', color = 'red'),
            showlegend = True,
            legend_xanchor = 'left',
            legend_x = 1.2,
            legend_font_size = 14,
            autosize = False,
            title_text = 'Choice of optimal K value',
            title_x = 0.5,
            height = 500,
            width = 900
        )
        figs.show()

    def daily_requests(self, x): # Defines sizes for points to be plotted in the daily_hotspots method
        """Return the marker size (10, 20 or 30) for a daily pickup count ``x``."""
        if x > 20000: 
            return 30
        if x < 10000:
            return 10
        else : 
            return 20

    def daily_hotspots(self, k): # This method clusters data for each weekday with k-means
        """Cluster the pickups of each weekday into ``k`` hot spots and map them.

        Clustering uses the scaled features Lat, Lon and pickup hour. A weekday
        without any pickup is shown as an empty trace.
        """
        self._add_time_columns()
        colors = sns.color_palette("hls", 7).as_hex()

        traces = []
        # Calculate k-means clusters for each weekday and retrieve the centroids
        for d, day_name in enumerate(self.WEEKDAY_NAMES):
            day_rows = self.data[self.data['day'] == d]
            table = self._centroid_table(day_rows[['Lat', 'Lon', 'hour']], k)
            self._add_request_levels(table, self.daily_requests)
            traces.append((day_name, table, colors[d]))

        self._show_hotspot_map(traces, "Uber pickup hotspots per weekday", "Weekday")

    def hourly_requests(self, x): # Defines sizes for points to be plotted in the hourly_hotspots method
        """Return the marker size (10, 20 or 30) for an hourly pickup count ``x``."""
        if x > 1000: 
            return 30
        if x < 100:
            return 10
        else : 
            return 20

    def hourly_hotspots(self, k, day_of_week=0):
        """Cluster the pickups of one weekday, hour by hour, into ``k`` hot spots and map them.

        Parameters
        ----------
        k : int
            Number of clusters per hour (capped at the number of pickups of that hour).
        day_of_week : int
            Weekday to analyse, Monday == 0 ... Sunday == 6.

        Raises
        ------
        ValueError
            If ``day_of_week`` is not an integer between 0 and 6.
        """
        if day_of_week not in range(7):
            raise ValueError(
                f"day_of_week must be an integer between 0 (Monday) and 6 (Sunday), got {day_of_week!r}")

        self._add_time_columns()
        day = self.data[self.data['day'] == day_of_week]
        colors = sns.color_palette("flare", 24).as_hex()

        traces = []
        for h in range(24):
            # Only the coordinates are clustered: within one weekday and one hour
            # the 'day' and 'hour' columns are constant and carry no information
            hour_rows = day[day['hour'] == h]
            table = self._centroid_table(hour_rows[['Lat', 'Lon']], k)
            self._add_request_levels(table, self.hourly_requests)
            traces.append((str(h) + 'h', table, colors[h]))

        self._show_hotspot_map(
            traces,
            f"Hourly Uber pickup hotspots on {self.WEEKDAY_NAMES[day_of_week]}s",
            "Hour")

### Initialize Uber class with april dataset

In [42]:
# Load the April 2014 pickups from the working directory and wrap them in the
# Uber helper class defined in the previous cell.
april_data = pd.read_csv("uber-raw-data-apr14.csv")
april = Uber(april_data)

### Find best K for the dataset (this can take a few minutes)

In [43]:
# Plot WCSS and silhouette score for k = 2..10 on the 15% sample
april.hyperparameter()

We can see that the best number of clusters in this range is 7 (this may vary depending on the sample taken for the analysis). This is indicated by the combination of a low WCSS (curve) and a high silhouette score (bars).

### Visualize results

In [44]:
# Hot spots per weekday, then hour by hour for one weekday, both with k = 7
april.daily_hotspots(k=7)
april.hourly_hotspots(k=7, day_of_week=0)  # day_of_week is an integer, with Monday == 0

The Uber class is fully functional and can readily be deployed into a dashboard!