# Find best places to pick up customers in New-York City
This project is for Uber, taxi, and other for-hire vehicle (FHV) drivers.
This app allows drivers to target the best pickup areas in New-York City according to the date and the time.

The user picks a date and one or more hours, and the app shows the areas with the most customers.

---

*Currently, this application only works with the month of April and the machine learning model is based on data from April 2014.*

**New version:**
- the user can enter a list of hours in order to analyse how customer density evolves across New-York City's boroughs

**Improvements:**
- add other months 
- deploy application with Flask

##### Step 1: Import libraries and data

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler

from bokeh.io import output_file, show
from bokeh.models import ColumnDataSource, GMapOptions
from bokeh.plotting import gmap

from sklearn.cluster import KMeans
import random

from sklearn.metrics import silhouette_score

In [2]:
# Load the Uber pickup records for April 2014 (CSV expected in the working directory)
data_apr = pd.read_csv('uber-raw-data-apr14.csv')

# Taxi-zone lookup table: currently disabled, kept for reference
#zone_lookup = pd.read_csv('taxi-zone-lookup.csv')
#zone_lookup.drop([55, 102, 103, 263], inplace=True)

##### Step 2: Define all the needed functions

In [3]:
# Choose a date and hour
def choose_date_hour(date, hour):
    """Return the pickups recorded on a given day of the month and hour.

    Reads the module-level ``data_apr`` frame. Its ``Date/Time`` column is
    parsed as ``month/day/year hour:minute:second`` (for example
    ``4/1/2014 0:11:00``).

    Rows are selected by *value* (``Day == date`` and ``Hours == hour``).
    Selecting by position in a list of ``groupby`` frames returns the wrong
    slice as soon as one day or hour has no pickups at all.

    Args:
        date: Day of the month to keep (1 is the first day of the month).
        hour: Hour of the day to keep, on a 24-hour clock (0-23).

    Returns:
        A DataFrame holding every ``data_apr`` column except ``Date/Time``
        and ``Base``, followed by integer ``Day``, ``Hours`` and ``Minutes``
        columns. The index is each row's position among the pickups of the
        selected day.

    Raises:
        ValueError: If no pickup exists for the requested day or hour.
    """
    # One vectorised parse replaces the successive string splits.
    timestamps = pd.to_datetime(data_apr['Date/Time'], format='%m/%d/%Y %H:%M:%S')

    pickups = data_apr.drop(['Date/Time', 'Base'], axis=1).assign(
        Day=timestamps.dt.day.astype(int),
        Hours=timestamps.dt.hour.astype(int),
        Minutes=timestamps.dt.minute.astype(int),
    )

    # Re-number the rows of the selected day from 0 before filtering on the hour.
    day_pickups = pickups[pickups['Day'] == date].reset_index(drop=True)
    if day_pickups.empty:
        raise ValueError("No pickup recorded on day {} of the month".format(date))

    hour_pickups = day_pickups[day_pickups['Hours'] == hour]
    if hour_pickups.empty:
        raise ValueError("No pickup recorded on day {} at hour {}".format(date, hour))

    return hour_pickups

In [4]:
# Normalize dataset
def normalize(X):
    """Standardise the first two columns of ``X`` with a fresh StandardScaler.

    Args:
        X: DataFrame whose first two columns are the features to scale.

    Returns:
        The array produced by ``StandardScaler.fit_transform`` on those
        two columns.
    """
    first_two_columns = X.iloc[:, 0:2]
    return StandardScaler().fit_transform(first_two_columns)

In [6]:
# Search the best number of KMeans clusters by Silhouette method
def silhouette_method(dataset_scaled, min_clusters=3, max_clusters=14):
    """Pick the KMeans cluster count with the highest silhouette score.

    One KMeans model is fitted per candidate cluster count on the first two
    columns of ``dataset_scaled``; the model with the best silhouette score
    is returned. On ties the smallest cluster count wins.

    ``n_jobs`` is no longer passed to ``KMeans``: the parameter was
    deprecated in scikit-learn 0.23 and removed in 1.0.

    Args:
        dataset_scaled: 2-D array; only columns 0 and 1 are clustered.
        min_clusters: Smallest cluster count to try. The default of 3 keeps
            the original behaviour, which discarded the 2-cluster score.
        max_clusters: Largest cluster count to try (inclusive). 14 is a
            subjective upper bound.

    Returns:
        A tuple ``(kmeans_dataset_best, best_n_clusters_silhouette)``: the
        fitted KMeans model and its number of clusters.

    Raises:
        ValueError: If there are too few samples to evaluate any candidate.
    """
    features = dataset_scaled[:, 0:2]

    # silhouette_score is only defined for 2 <= n_clusters <= n_samples - 1.
    lowest = max(2, min_clusters)
    highest = min(max_clusters, features.shape[0] - 1)
    if highest < lowest:
        raise ValueError(
            "Need at least {} samples to compare cluster counts, got {}".format(
                lowest + 1, features.shape[0]))

    kmeans_dataset_best = None
    best_n_clusters_silhouette = None
    best_score = -np.inf

    for n_clusters in range(lowest, highest + 1):
        candidate = KMeans(n_clusters=n_clusters, max_iter=300, n_init=10, random_state=0)
        candidate.fit(features)
        score = silhouette_score(features, candidate.labels_)

        # Strict comparison keeps the first (smallest) count on ties. The
        # winning model is kept as-is: refitting with the same random_state
        # would only reproduce it.
        if score > best_score:
            best_score = score
            best_n_clusters_silhouette = n_clusters
            kmeans_dataset_best = candidate

    return kmeans_dataset_best, best_n_clusters_silhouette

In [7]:
# Display all KMeans clusters on GMap
def display_cluster_gmap(date, hour, best_n_clusters_silhouette, dataset, kmeans_dataset_best,
                         api_key=None):
    """Plot every KMeans cluster on a Google map of New-York City.

    Each cluster is drawn with its own randomly chosen colour, then the
    figure is shown with bokeh's ``show``.

    Args:
        date: Day of the month, used in the figure title.
        hour: Hour of the day, used in the figure title.
        best_n_clusters_silhouette: Number of clusters to draw.
        dataset: DataFrame with ``Lat`` and ``Lon`` columns, in the same row
            order as the data ``kmeans_dataset_best`` was fitted on.
        kmeans_dataset_best: Fitted model exposing ``labels_``.
        api_key: Google Maps API key. When omitted, the ``GOOGLE_API_KEY``
            environment variable is used, then the key historically
            embedded in this notebook.
    """
    import os

    if api_key is None:
        # NOTE(review): a key committed to source control should be rotated;
        # the literal is kept only so existing callers keep working.
        api_key = os.environ.get("GOOGLE_API_KEY", "AIzaSyB-E81VWnuGE2A9iXgHHI3lz5ZOvbRIN3A")

    coordinates = dataset[['Lat', 'Lon']]
    labels = kmeans_dataset_best.labels_

    #output_file("gmap.html")

    map_options = GMapOptions(lat=40.7128, lng=-74.0060, map_type="roadmap", zoom=10)
    p = gmap(api_key,
             map_options,
             title="April {}, 2014 from {}H00 to {}H59, number of clusters: {}".format(
                 date, hour, hour, best_n_clusters_silhouette))

    for cluster_id in range(best_n_clusters_silhouette):
        # Select the cluster's rows once instead of once per coordinate.
        members = coordinates[labels == cluster_id]
        source = ColumnDataSource(data=dict(lat=members['Lat'], lon=members['Lon']))

        # One random hex colour per cluster; no need to rebuild a whole
        # palette on every iteration.
        colour = "#" + ''.join(random.choice('0123456789ABCDEF') for _ in range(6))

        p.circle(x="lon", y="lat", size=6, color=colour, fill_alpha=1, source=source)

    show(p)

##### Step 3:  Create application

In [8]:
# Ask for an integer until the answer is a whole number inside [low, high].
# A bound left as None is not checked.
def _ask_int(prompt, low=None, high=None):
    while True:
        try:
            value = int(input(prompt))
        except ValueError:
            print("Please enter a whole number.")
            continue
        if (low is None or value >= low) and (high is None or value <= high):
            return value
        print("Value out of range, please try again.")


# User chooses a date (validated so that 0 or 31 cannot select a wrong day)
date = _ask_int("Choose a date between 1 and 30: ", low=1, high=30)

# User chooses the number of hours to analyse
nb_hours = _ask_int("How many hours do you want to analyze? ", low=0)

# User chooses each hour (validated so that a negative hour cannot wrap around)
list_hour = [_ask_int("Choose a hour between 0 and 23: ", low=0, high=23)
             for _ in range(nb_hours)]

for hour in list_hour:

    # Pickups of the chosen day and hour
    dataset = choose_date_hour(date, hour)

    # Standardised coordinates
    dataset_scaled = normalize(dataset)

    # Best KMeans model and its number of clusters
    kmeans_dataset_best, best_n_clusters_silhouette = silhouette_method(dataset_scaled)

    # Map and clusters display
    display_cluster_gmap(date, hour, best_n_clusters_silhouette, dataset, kmeans_dataset_best)

Choose a date between 1 and 30: 2
How many hours do you want to analyze? 3
Choose a hour between 0 and 23: 0
Choose a hour between 0 and 23: 10
Choose a hour between 0 and 23: 17
