In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import folium
from folium import FeatureGroup

import ast

import sys

import os

%matplotlib inline


## Read data

In [3]:
# Load the k-means cluster assignments and preview the first five rows.
kmeans_cluster_df = pd.read_csv(
    filepath_or_buffer='data/kmean_clusters_withFlags_8PCA_upto3miles_selectedVariable.csv'
)
kmeans_cluster_df.head(5)

Unnamed: 0.1,Unnamed: 0,tripDuration,tripDistance_miles,StartTime_AMPeak,StartTime_Day,StartTime_PMPeak,StartTime_Night,origin_emp_density_perMile,destination_emp_density_perMile,destination_pop_density_perMile,...,kmean_cluster_10,kmean_cluster_11,kmean_cluster_12,kmean_cluster_13,kmean_cluster_14,kmean_cluster_15,kmean_cluster_16,kmean_cluster_17,kmean_cluster_18,kmean_cluster_19
0,0,4.863267,0.659896,0,0,0,1,66028.995549,4651.938119,345.957386,...,0,4,4,4,6,6,6,6,4,4
1,1,2.301083,0.239849,0,0,0,1,171759.500659,17769.861925,9298.818799,...,4,2,11,1,2,2,2,2,15,15
2,2,1.5266,0.079536,0,0,0,1,76697.996907,317589.557663,4883.546206,...,6,7,8,11,8,8,8,16,14,14
3,3,5.6644,0.259733,0,0,0,1,93878.933828,93878.933828,5061.155631,...,5,4,0,4,11,11,11,11,12,12
4,4,6.711383,0.449873,0,0,0,1,130038.135042,47729.808467,0.0,...,5,4,0,4,11,11,11,11,12,12


In [5]:
# Load the trip/route table and keep only the trip id, the start and end
# coordinates, and the recorded route; then preview the first five rows.
route_df = pd.read_csv('data/MarchMonthData_withRouteData.csv')[
    ['trip_id',
     'startLatitude', 'startLongitude',
     'endLatitude', 'endLongitude',
     'tripRoute']
]
route_df.head(5)


Unnamed: 0,trip_id,startLatitude,startLongitude,endLatitude,endLongitude,tripRoute
0,255573,36.16011,-86.7784,36.16386,-86.76912,"[[36.16011,-86.7784],[36.16004,-86.77838],[36...."
1,255633,36.13976,-86.80105,36.13824,-86.79793,"[[36.13976,-86.80105],[36.13975,-86.80098],[36..."
2,255634,36.16331,-86.77609,36.16342,-86.77631,"[[36.16331,-86.77609],[36.16338,-86.77616],[36..."
3,255635,36.15596,-86.78063,36.15565,-86.78042,"[[36.15596,-86.78063],[36.15596,-86.78049],[36..."
4,255636,36.16141,-86.77571,36.15677,-86.77593,"[[36.16141,-86.77571],[36.16128,-86.77566],[36..."


# Plot maps of clusters

In [6]:
# Count the rows labelled 6 in 'kmean_cluster_10', then show that count capped at 500.
a = int((kmeans_cluster_df['kmean_cluster_10'] == 6).sum())
min(a, 500)

500

In [10]:
def plot_clusters(cluster_name, cluster_df, trip_df, suffix, max_trips=500):
    """Save one interactive folium map per cluster, each showing a sample of its trips.

    For every distinct (non-missing) label in ``cluster_df[cluster_name]`` a
    random sample of at most ``max_trips`` trips is drawn with a fixed seed, so
    re-running the function picks the same trips. Each sample is written to
    ``results/<cluster_name><suffix>/plot_data_sampled_<cluster_name>_<label>.html``.
    Every map has three toggleable layers: routes, start points and end points.

    Parameters
    ----------
    cluster_name : str
        Name of the cluster-label column in ``cluster_df``
        (for example ``'kmean_cluster_7'``).
    cluster_df : pandas.DataFrame
        Cluster assignments; must contain ``'trip_id'`` and ``cluster_name``.
    trip_df : pandas.DataFrame
        Trip data; must contain ``'trip_id'``, ``'startLatitude'``,
        ``'startLongitude'``, ``'endLatitude'``, ``'endLongitude'`` and
        ``'tripRoute'`` (a string holding a Python-literal list of
        ``[lat, lon]`` pairs).
    suffix : str
        Text appended to ``cluster_name`` to form the output folder name.
    max_trips : int, optional
        Upper bound on the number of trips plotted per cluster (default 500).

    Returns
    -------
    None
        The maps are written to disk as HTML files.
    """
    # Keep only the trip id and the requested cluster labels.
    labels = cluster_df.loc[:, ['trip_id', cluster_name]]

    # Attach coordinates and routes. The trip_df argument is used here so the
    # function does not depend on a notebook-level variable being defined.
    trips = pd.merge(labels, trip_df, on=['trip_id'])

    # Create the output folder; exist_ok avoids a separate existence check.
    out_dir = os.path.join('results', cluster_name + suffix)
    os.makedirs(out_dir, exist_ok=True)

    # groupby skips rows whose cluster label is missing, so no empty map is
    # written for NaN labels; sort=False keeps first-appearance order.
    for label, members in trips.groupby(cluster_name, sort=False):
        # Sample without replacement; small clusters are plotted in full.
        sampled = members.sample(n=min(len(members), max_trips),
                                 replace=False,
                                 random_state=0)

        folium_map_route = folium.Map(location=[36.165096, -86.778367],
                                      zoom_start=14)

        feature_start = FeatureGroup(name='Start point')
        feature_end = FeatureGroup(name='End point')
        feature_route = FeatureGroup(name='Route')

        # Start points (red).
        for lat, lon in zip(sampled['startLatitude'], sampled['startLongitude']):
            folium.Circle(location=(lat, lon),
                          radius=0.5,
                          fill=True,
                          color='red').add_to(feature_start)

        # End points (blue).
        for lat, lon in zip(sampled['endLatitude'], sampled['endLongitude']):
            folium.Circle(location=(lat, lon),
                          radius=0.5,
                          fill=True,
                          color='blue').add_to(feature_end)

        # Routes. The column stores each route as text, so it is parsed with
        # ast.literal_eval. Trips whose route is missing (NaN is not a string)
        # or empty are skipped here but keep their start/end markers.
        for raw_route in sampled['tripRoute']:
            if not isinstance(raw_route, str):
                continue
            route_points = ast.literal_eval(raw_route)
            if not route_points:
                continue
            folium.PolyLine(route_points,
                            color='#008080',
                            weight=0.5,
                            opacity=0.7).add_to(feature_route)

        folium_map_route.add_child(feature_route)
        folium_map_route.add_child(feature_start)
        folium_map_route.add_child(feature_end)

        # Layer control lets the viewer toggle the three layers.
        folium_map_route.add_child(folium.LayerControl())

        file_name = 'plot_data_sampled_' + cluster_name + '_' + str(label) + '.html'
        folium_map_route.save(os.path.join(out_dir, file_name))



In [11]:
# Render the sampled-trip maps for the 'kmean_cluster_14' assignments.
plot_clusters(
    cluster_name='kmean_cluster_14',
    cluster_df=kmeans_cluster_df,
    trip_df=route_df,
    suffix='_withFlags_8PCA_selectedVariable_',
)