# Exercise: Geographical Cluster Analysis of Taxi Rides
Using the NY Taxi data set (see Use Case Block I) and the use case from the lecture...

In [1]:
import pandas as pd
import numpy as np
import folium
from scipy.spatial import distance

In [2]:
# we load the data we have saved after wrangling and pre-processing in block I
train=pd.read_csv('../../DATA/train_cleaned.csv')
train

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,...,is_dropoff_JFK,is_pickup_EWR,is_dropoff_EWR,is_pickup_la_guardia,is_dropoff_la_guardia,trip_distance,pickup_borough,dropoff_borough,is_pickup_lower_manhattan,is_dropoff_lower_manhattan
0,0,0,0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21,-73.844311,40.721319,-73.841610,40.712278,...,0,0,0,0,0,0.640487,queens,queens,0,0
1,1,1,1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16,-74.016048,40.711303,-73.979268,40.782004,...,0,0,0,0,0,5.250670,manhattan,manhattan,1,0
2,2,2,2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00,-73.982738,40.761270,-73.991242,40.750562,...,0,0,0,0,0,0.863411,manhattan,manhattan,0,0
3,3,3,3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42,-73.987130,40.733143,-73.991567,40.758092,...,0,0,0,0,0,1.739386,manhattan,manhattan,1,0
4,4,4,4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00,-73.968095,40.768008,-73.956655,40.783762,...,0,0,0,0,0,1.242218,manhattan,manhattan,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399995,399995,408700,408700,2013-09-01 12:39:00.000000111,8.5,2013-09-01 12:39:00,-73.986585,40.746032,-73.990865,40.724077,...,0,0,0,0,0,1.533405,manhattan,manhattan,0,1
399996,399996,408701,408701,2015-04-25 13:26:44.0000007,9.0,2015-04-25 13:26:44,-73.992882,40.742359,-73.972649,40.762318,...,0,0,0,0,0,1.738724,manhattan,manhattan,1,0
399997,399997,408702,408702,2012-10-07 19:54:35.0000001,3.0,2012-10-07 19:54:35,-73.985598,40.731558,-73.987657,40.728738,...,0,0,0,0,0,0.222679,manhattan,manhattan,1,1
399998,399998,408703,408703,2014-06-14 02:21:00.00000071,11.5,2014-06-14 02:21:00,-74.007692,40.740735,-73.988455,40.722847,...,0,0,0,0,0,1.594361,manhattan,manhattan,1,1


In [3]:
#select only the culumns with the ride coordinates
coordinates = train[ ['pickup_latitude','pickup_longitude','dropoff_latitude' , 'dropoff_longitude' ] ]

## Clustering approach from the lecture
we will be using simple K-Means:
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

In [4]:
from sklearn.cluster import KMeans

In [5]:
#define number of clusters and create instance
clusters=100
myKMeans=KMeans(n_clusters=clusters, n_jobs=-1)#parallelize to all cores

In [6]:
%%time
#train model
myKMeans.fit(coordinates.to_numpy()[:100000,:])#use only subset of the data to make it faster



Wall time: 58.3 s


KMeans(n_clusters=100, n_jobs=-1)

In [7]:
#get cluster centers
centers=myKMeans.cluster_centers_

In [8]:
coordinates

Unnamed: 0,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude
0,40.721319,-73.844311,40.712278,-73.841610
1,40.711303,-74.016048,40.782004,-73.979268
2,40.761270,-73.982738,40.750562,-73.991242
3,40.733143,-73.987130,40.758092,-73.991567
4,40.768008,-73.968095,40.783762,-73.956655
...,...,...,...,...
399995,40.746032,-73.986585,40.724077,-73.990865
399996,40.742359,-73.992882,40.762318,-73.972649
399997,40.731558,-73.985598,40.728738,-73.987657
399998,40.740735,-74.007692,40.722847,-73.988455


In [9]:
#draw map: green: start, red: end
cluster_map = folium.Map(location = [40.730610,-73.935242],zoom_start = 12,)
for i in range(clusters):
    folium.CircleMarker([centers[i,0], centers[i,1]], radius=3,                
                        color="green", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.CircleMarker([centers[i,2], centers[i,3]], radius=3,                
                        color="red", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.PolyLine([ [centers[i,0],centers[i,1]] , [centers[i,2],centers[i,3]]  ], color="black", weight=2.5, opacity=1).add_to(cluster_map)

In [10]:
cluster_map

## Exercise 1
Write a function ```show_cluster(cluster_number,...)``` that draws the cluster centers and all start and end points of a given cluster in the map.
* use the ```predict()``` method to map all data in ```train_data``` to a cluster center
* use ```folium.CircleMarker``` to draw all members of a given cluster


In [18]:

def show_cluster(cluster_number,predict):
    #erstelle eine Karte:
    cluster_map2 = folium.Map(location = [40.730610,-73.935242],zoom_start = 12,)
    
    #print the cluster center of the start and end point:
    folium.CircleMarker([centers[cluster_number,0], centers[cluster_number,1]], radius=3,                
                        color="green", 
                        fill_opacity=0.9
                       ).add_to(cluster_map2)
    folium.CircleMarker([centers[cluster_number,2], centers[cluster_number,3]], radius=3,                
                        color="red", 
                        fill_opacity=0.9
                       ).add_to(cluster_map2)
    folium.PolyLine([ [centers[cluster_number,0],centers[cluster_number,1]] , [centers[cluster_number,2],centers[cluster_number,3]]  ], color="black", weight=2.5, opacity=1).add_to(cluster_map2)
    
    #print the coordinates of the different pick ups and drop offs
    for i in range(len(myKMeans_predict)):
        if predict[i]==cluster_number:
            folium.CircleMarker([coordinates.iloc[i]['pickup_latitude'], coordinates.iloc[i]['pickup_longitude']], radius=1,                
                                color="green", 
                                fill_opacity=0.9
                               ).add_to(cluster_map2)
            folium.CircleMarker([coordinates.iloc[i]['dropoff_latitude'], coordinates.iloc[i]['dropoff_longitude']], radius=1,                
                                color="red", 
                                fill_opacity=0.9
                               ).add_to(cluster_map2)
            #folium.PolyLine([ [centers[i,0],centers[i,1]] , [centers[i,2],centers[i,3]]  ], color="black", weight=2.5, opacity=1).add_to(cluster_map2)
    return cluster_map2

In [30]:
#predicts all data to a cluster center
myKMeans_predict = myKMeans.predict(coordinates)
myKMeans_predict

array([ 4, 58, 13, ..., 53, 94, 87])

In [None]:
#Shows all points of a cluster center
center_number =7
mymap = show_cluster(center_number,myKMeans_predict)
##Show map
mymap

## Exercise 2
Write a function ```cluster_var(cluster_number,...)``` that computes the intra- and extra cluster variance for a given cluster. Apply it to all clusters and compare the results for k=100 and k=10.

In [52]:
def cluster_var(predict, k):
    #transform
    coordinates_np = coordinates.to_numpy()
    
     #compute min and max coordinates for normalization
    min_coord_dist = np.min(coordinates_np, axis =0)
    max_coord_dist = np.max(coordinates_np, axis =0)
    dist = max_coord_dist-min_coord_dist
    print("Maximal Distnaz:", dist)
    
    varianz_all = np.zeros((k,4))
    
    #compute the varinaz for the whole cluster
    for i in range(k):
        cluster_coordinates = coordinates_np[predict==i]   #all vars with predict =i will be written in cluster coordinates
        #compute the varianz for 1 cluster, in cluster coordinates are all the Koordinates for one cluster
        varianz = np.var(cluster_coordinates, axis = 0)
        varianz_all[i]=varianz
        
    return varianz_all
   

In [57]:
k = 100 #number of clusters
varianz_all = cluster_var( myKMeans_predict,k)
print("Mean:", varianz_all.mean())

Maximal Distnaz: [1.073698 1.177225 0.970017 1.169471]
Mean: 0.0008540035815727044


In [50]:
varianz_all

array([[2.98099720e-05, 1.52409942e-05, 3.23343335e-05, 4.55347780e-05],
       [2.58657311e-05, 3.81001375e-05, 3.17685050e-05, 3.68916205e-05],
       [3.24191733e-05, 1.01083108e-04, 1.15564968e-03, 6.70342794e-04],
       [9.79454088e-05, 2.88233672e-03, 7.75491726e-05, 1.09113459e-03],
       [4.26730144e-04, 5.65772256e-04, 5.38177156e-04, 7.41113684e-04],
       [5.21220643e-05, 6.11646025e-05, 1.50749783e-04, 1.58588326e-04],
       [7.50498925e-05, 6.89430832e-05, 1.84640793e-05, 6.32435757e-05],
       [7.39545302e-05, 7.53325987e-05, 7.35051851e-05, 1.00692202e-04],
       [3.54828005e-05, 2.96488860e-05, 3.34282993e-05, 2.82741623e-05],
       [3.33723406e-04, 1.68632439e-04, 1.11548578e-04, 1.54874626e-04],
       [6.30020285e-05, 6.92299336e-05, 4.20658839e-05, 3.46955857e-05],
       [5.62487404e-04, 2.49411159e-04, 2.19681029e-04, 3.43084085e-04],
       [3.70740873e-05, 2.25164895e-05, 6.16018577e-05, 6.61068340e-05],
       [3.19939507e-05, 3.37366089e-05, 3.48942240e

In [29]:
myKMeans_predict[cluster_number]

95

In [None]:
[coordinates.iloc[i]['pickup_latitude'], coordinates.iloc[i]['pickup_longitude']]