# Exercise: Geographical Cluster Analysis of Taxi Rides
Using the NY Taxi data set (see Use Case Block I) and the use case from the lecture...

In [11]:
import pandas as pd
import numpy as np
import folium


In [12]:
#check if notebook runs in colab
import sys
IN_COLAB = 'google.colab' in sys.modules
print('running in Colab:',IN_COLAB)
path='..'
if IN_COLAB:
  #in colab, we need to clone the data from the repo
  !git clone https://github.com/keuperj/DataScienceSS20.git
  path='DataScienceSS20'

running in Colab: False


In [13]:
# we load the data we have saved after wrangling and pre-processing in block I
train=pd.read_csv(path+'/DATA/train_cleaned.csv')

In [14]:
#select only the culumns with the ride coordinates
coordinates = train[ ['pickup_latitude','pickup_longitude','dropoff_latitude' , 'dropoff_longitude' ] ]

## Clustering approach from the lecture
we will be using simple K-Means:
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

In [15]:
from sklearn.cluster import KMeans

In [16]:
#define number of clusters and create instance
clusters=100
myKMeans=KMeans(n_clusters=clusters, n_jobs=-1)#parallelize to all cores

In [17]:
#train model
myKMeans.fit(coordinates.to_numpy()[:100000,:])#use only subset of the data to make it faster

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=100, n_init=10, n_jobs=-1, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [18]:
#get cluster centers
centers=myKMeans.cluster_centers_
    

In [19]:
#draw map: green: start, red: end
cluster_map = folium.Map(location = [40.730610,-73.935242],zoom_start = 12,)
for i in range(clusters):
    folium.CircleMarker([centers[i,0], centers[i,1]], radius=3,                
                        color="green", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.CircleMarker([centers[i,2], centers[i,3]], radius=3,                
                        color="red", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.PolyLine([ [centers[i,0],centers[i,1]] , [centers[i,2],centers[i,3]]  ], color="black", weight=2.5, opacity=1).add_to(cluster_map)

In [20]:
cluster_map

## Exercise 1
Write a function ```show_cluster(cluster_number,...)``` that draws the cluster centers and all start and end points of a given cluster in the map.
* use the ```predict()``` method to map all data in ```train_data``` to a cluster center
* use ```folium.CircleMarker``` to draw all members of a given cluster


In [28]:
#TODO nochmal anschauen ... wieso linie der Centers? - nochmal komplett eigene Lösung erstellen
prediction = myKMeans.predict(coordinates.to_numpy())

coordinates.to_numpy()[prediction==5]
def show_cluster(cluster_number, preds, c):
    
    clustCoord = coordinates.to_numpy()[predictions==cluster_number]
    
    entries = np.shape(clustCoord)[0]
    
    print("number of cluster enries:" , entries)
    
    cluster_map = folium.Map(location = [40.730610,-73.935242],zoom_start = 12,)
    
    #plot centers
    folium.PolyLine([ [centers[cluster_number,0],centers[cluster_number,1]] , [centers[cluster_number,2],centers[cluster_number,3]]  ], color="black", weight=2.5, opacity=1).add_to(cluster_map)
    
    for i in range(entries):
        folium.CircleMarker([clustCoord[i,0], clustCoord[i,1]], color="blue", radius=1).add_to(cluster_map)
        folium.CircleMarker([clustCoord[i,2], clustCoord[i,3]], color="green", radius=1 ).add_to(cluster_map)
    return cluster_map
    

In [30]:
show_cluster(75,predictions,centers)

number of cluster enries: 3595


## Exercise 2
Write a function ```cluster_var(cluster_number,...)``` that computes the intra- and extra cluster variance for a given cluster. Apply it to all clusters and compare the results for k=100 and k=10.

In [36]:
def cluster_var(cluster_number, k):
    npCoord = coordinates.to_numpy()
    
    maxNpCoord = np.max(npCoord, axis=0)
    minNpCoord = np.min(npCoord, axis=0)
    diff = maxNpCoord - minNpCoord
    print ("max diff:", diff)
    for index in range(k):
        clustCoord = npCoord[predictions==index]
        varClustCoord = np.var(clustCoord, axis=0)
        print(i, varClustCoord)

In [37]:
cluster_var(predictions,100)

max diff: [1.073698 1.177225 0.970017 1.169471]
99 [8.34494160e-05 7.99681505e-05 6.42428163e-05 4.47927699e-05]
99 [8.98138833e-05 1.06811405e-04 3.08432310e-04 8.90784525e-05]
99 [0.00039914 0.00058028 0.00050281 0.00073951]
99 [5.84424305e-05 1.30515435e-04 1.83312547e-04 3.14899784e-04]
99 [4.60663458e-05 2.93433135e-05 3.63403726e-05 2.71935533e-05]
99 [3.69436948e-05 3.90024408e-05 2.67382919e-05 2.72477267e-05]
99 [0.00018193 0.00014    0.00015893 0.00011957]
99 [4.73109709e-05 4.21617163e-05 2.98373543e-05 4.40555721e-05]
99 [0.00458897 0.02903146 0.00455274 0.02168158]
99 [0.00034175 0.00016253 0.00016198 0.00014778]
99 [9.68515389e-05 7.90329567e-04 1.18990733e-06 4.81022113e-04]
99 [5.19599553e-05 6.24488196e-05 7.93472152e-05 8.54267188e-05]
99 [3.77938724e-05 2.23349473e-05 6.28028425e-05 6.72324201e-05]
99 [0.00027922 0.00018435 0.00013692 0.00011499]
99 [0.00293911 0.00454012 0.00098063 0.00242336]
99 [2.45257123e-05 2.23012975e-05 2.70837350e-05 2.73259988e-05]
99 [0.00