# Neighbourhood Clustering
Now that we have collected all the data required, we can attempt to cluster the hexes in our grid into groups with similar characteristics. The attributes we will use to cluster hexes are:
- Population Density
- Real Estate Costs
- Venue Density
- Venue Category

In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import folium

from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

from tqdm.notebook import tqdm

## Data Consolidation
All the data needed for this project has been collected from different sources. We now need to combine these into one or two datasets for further exploration.

In [2]:
# Read the three source datasets from disk
df_wards = gpd.read_file('../data/BBMP_wards.geojson')  # Ward-level population data
df_hex = gpd.read_feather('../data/bangalore_hex_costs.feather')  # Hex locations and real estate prices
df_venues = pd.read_feather('../data/bangalore_foursquare_data.feather')  # Venue Data

# Report the (rows, columns) shape of every dataframe
for frame_name, frame in [('df_wards', df_wards), ('df_hex', df_hex), ('df_venues', df_venues)]:
    print(f'Shape of {frame_name}: {frame.shape}')

# Column names of each dataframe as positionally indexed Series
cols_wards, cols_hex, cols_venues = (
    source.columns.to_series().reset_index(drop=True)
    for source in (df_wards, df_hex, df_venues)
)

# Side-by-side table of the column names; shorter columns are padded with ''
pd.DataFrame({
    'df_wards': cols_wards,
    'df_hex': cols_hex,
    'df_venues': cols_venues,
}).fillna('')

Shape of df_wards: (198, 7)
Shape of df_hex: (942, 8)
Shape of df_venues: (19985, 7)


Unnamed: 0,df_wards,df_hex,df_venues
0,ward_no,id,venue_id
1,ward_name,ward_no,name
2,pop_total,centre_lat,lat
3,area_sq_km,centre_lon,lon
4,lat,resolution,address
5,lon,geometry,category
6,geometry,address,hex_id
7,,cost_sqft,


We will use one-hot encoding to convert the category column into usable data for clustering. We then group by hex ID and sum the one-hot columns, which gives the number of venues of each category in every hex - this gives us a way to build a profile of each hex.

In [3]:
# Build one row per hex holding the number of venues of each category.
#
# Every non-category column except the grouping key is dropped explicitly.
# 'venue_id' and 'address' are string columns: older pandas silently discarded
# them inside groupby().sum(), but pandas >= 2.0 aggregates them as well, which
# would leave non-numeric columns in the clustering table.
df_cluster = (
    pd.get_dummies(df_venues, columns=['category'], prefix='', prefix_sep='')
    .drop(columns=['venue_id', 'name', 'lat', 'lon', 'address'])
    .groupby('hex_id')
    .sum()  # sum of one-hot indicators == venue count per category
)

df_cluster.describe()

Unnamed: 0,ATM,Arts & Entertainment,Asian Restaurant,Athletics & Sports,Automotive Shop,Bakery & Dessert,Bank,Cafeteria,Clothing & Jewelry,Coffee & Tea,...,Quick Bites,Residence,Restaurant,Salon,School,Shop & Service,Shopping Mall,Spiritual Center,Travel & Transport,Vegetarian / Vegan Restaurant
count,876.0,876.0,876.0,876.0,876.0,876.0,876.0,876.0,876.0,876.0,...,876.0,876.0,876.0,876.0,876.0,876.0,876.0,876.0,876.0,876.0
mean,0.065068,0.423516,0.343607,0.723744,0.593607,0.808219,0.651826,0.142694,0.63242,0.950913,...,0.410959,1.875571,1.303653,0.438356,0.243151,1.586758,0.092466,0.568493,0.030822,0.14726
std,0.268947,0.775242,0.660367,1.015956,1.14409,1.16902,0.988394,0.505019,1.381361,1.399138,...,0.769456,2.31996,1.612532,0.75568,0.529376,2.049225,0.366466,0.956715,0.185681,0.432935
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
75%,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,...,1.0,3.0,2.0,1.0,0.0,2.0,0.0,1.0,0.0,0.0
max,2.0,5.0,4.0,7.0,9.0,7.0,6.0,7.0,13.0,10.0,...,6.0,16.0,9.0,4.0,3.0,13.0,4.0,7.0,2.0,4.0


In [4]:
# Check the dimensions of the per-hex table and preview its first rows
print(df_cluster.shape)
df_cluster.head()

(876, 34)


Unnamed: 0_level_0,ATM,Arts & Entertainment,Asian Restaurant,Athletics & Sports,Automotive Shop,Bakery & Dessert,Bank,Cafeteria,Clothing & Jewelry,Coffee & Tea,...,Quick Bites,Residence,Restaurant,Salon,School,Shop & Service,Shopping Mall,Spiritual Center,Travel & Transport,Vegetarian / Vegan Restaurant
hex_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8860145101fffff,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8860145105fffff,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
8860145107fffff,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
886014510dfffff,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8860145111fffff,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


Now, we need to map the population density for each hex into this table.

In [5]:
# Look up the ward population and area for every hex via its ward number
hex_columns = ['id', 'ward_no', 'address', 'cost_sqft', 'geometry']
ward_columns = ['ward_no', 'pop_total', 'area_sq_km']

hex_pop_density = pd.merge(
    df_hex[hex_columns],
    df_wards[ward_columns],
    on='ward_no',
    how='left',  # keep every hex, even if its ward has no match
)

# Population density of the ward the hex belongs to (pop_total per area_sq_km)
hex_pop_density['pop_density'] = hex_pop_density['pop_total'].div(hex_pop_density['area_sq_km'])
hex_pop_density.head()

Unnamed: 0,id,ward_no,address,cost_sqft,geometry,pop_total,area_sq_km,pop_density
0,8860169665fffff,1,"Yelahanka, Kempegowda, Yelahanka Zone, Bengalu...",5084.65027,"POLYGON ((77.60526 13.11063, 77.60535 13.11561...",21866,10.47,2088.443171
1,886016975bfffff,1,"Canadian International School, 4, Bellary Road...",4844.626311,"POLYGON ((77.60117 13.11812, 77.60126 13.12311...",21866,10.47,2088.443171
2,8860169751fffff,1,"Kempegowda, Yelahanka Zone, Bengaluru, Bangalo...",5018.27293,"POLYGON ((77.60552 13.12559, 77.60560 13.13058...",21866,10.47,2088.443171
3,8860169669fffff,1,"Yelahanka Sante, Shivanahalli Main Road, Gandh...",5135.624541,"POLYGON ((77.60501 13.09567, 77.60510 13.10065...",21866,10.47,2088.443171
4,8860169759fffff,1,"Kempegowda, Yelahanka Zone, Bengaluru, Bangalo...",4877.673619,"POLYGON ((77.60961 13.11810, 77.60969 13.12308...",21866,10.47,2088.443171


In [6]:
# Merge the per-hex venue counts (indexed by hex id) with the address,
# population density, cost and geometry of each hex
df_cluster = df_cluster.merge(
    hex_pop_density[['id', 'address', 'pop_density', 'cost_sqft', 'geometry']],
    left_index=True,
    right_on='id',
    how='left',
).reset_index(drop=True)

# Move the descriptive columns to the front, in this order
for position, column in enumerate(['id', 'geometry', 'address', 'pop_density', 'cost_sqft']):
    df_cluster.insert(position, column, df_cluster.pop(column))

# NOTE: 'id' intentionally stays a regular column. The previous
# `df_cluster.set_index('id', drop = True)` call discarded its result and so
# had no effect; the following cells rely on 'id' being a column.

# View table
print(df_cluster.shape)
pd.options.display.max_colwidth = 20
df_cluster.head()

(876, 39)


Unnamed: 0,id,geometry,address,pop_density,cost_sqft,ATM,Arts & Entertainment,Asian Restaurant,Athletics & Sports,Automotive Shop,...,Quick Bites,Residence,Restaurant,Salon,School,Shop & Service,Shopping Mall,Spiritual Center,Travel & Transport,Vegetarian / Vegan Restaurant
0,8860145101fffff,POLYGON ((77.517...,"Ganakkal, Hemmig...",850.332284,4740.392847,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,8860145105fffff,POLYGON ((77.521...,"NICE Expressway,...",2051.277533,4564.352886,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,8860145107fffff,POLYGON ((77.513...,"Ganakkal, Hemmig...",850.332284,4449.431938,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,886014510dfffff,POLYGON ((77.525...,Banashankari 6th...,2429.922136,4756.632313,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,8860145111fffff,POLYGON ((77.496...,"Hemmigepura, Hem...",850.332284,4281.226169,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


# Clustering

This data is now ready to use for clustering the locations into (hopefully) distinct segments. Before that, the data should be scaled to ensure consistent distance measurements. The MaxAbsScaler is used in this case.

In [7]:
# Scale every numeric feature by its maximum absolute value; the identifier,
# address and geometry columns are not features and are left out
scaler = MaxAbsScaler()
numeric_features = df_cluster.drop(columns=['id', 'address', 'geometry'])
cluster_data = scaler.fit_transform(numeric_features)

# Check array size is correct
print(f'Input data shape: {cluster_data.shape}')

Input data shape: (876, 36)


In [8]:
# Label the scaled array with exactly the columns that were passed to the
# scaler (everything except id/address/geometry) instead of relying on those
# three columns being the first three of df_cluster
scaled_columns = df_cluster.columns.drop(['id', 'address', 'geometry'])
pd.DataFrame(cluster_data, columns=scaled_columns).describe()

Unnamed: 0,pop_density,cost_sqft,ATM,Arts & Entertainment,Asian Restaurant,Athletics & Sports,Automotive Shop,Bakery & Dessert,Bank,Cafeteria,...,Quick Bites,Residence,Restaurant,Salon,School,Shop & Service,Shopping Mall,Spiritual Center,Travel & Transport,Vegetarian / Vegan Restaurant
count,876.0,876.0,876.0,876.0,876.0,876.0,876.0,876.0,876.0,876.0,...,876.0,876.0,876.0,876.0,876.0,876.0,876.0,876.0,876.0,876.0
mean,0.076789,0.439794,0.032534,0.084703,0.085902,0.103392,0.065956,0.11546,0.108638,0.020385,...,0.068493,0.117223,0.14485,0.109589,0.08105,0.122058,0.023116,0.081213,0.015411,0.036815
std,0.106958,0.107111,0.134474,0.155048,0.165092,0.145137,0.127121,0.167003,0.164732,0.072146,...,0.128243,0.144997,0.17917,0.18892,0.176459,0.157633,0.091617,0.136674,0.092841,0.108234
min,9e-06,0.281397,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.016339,0.367416,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0292,0.407142,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0625,0.111111,0.0,0.0,0.076923,0.0,0.0,0.0,0.0
75%,0.097299,0.487123,0.0,0.2,0.25,0.142857,0.111111,0.142857,0.166667,0.0,...,0.166667,0.1875,0.222222,0.25,0.0,0.153846,0.0,0.142857,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## K Means
The K-Means algorithm is a good starting point for this purpose. The variables must be properly scaled first since K-Means assumes all the clusters have similar variance and a spherical distribution.

The selection of the number of clusters is crucial in this algorithm. Models with a range of k-values are trained and evaluated using two metrics - inertia (the elbow method) and the Silhouette Score. This process is repeated for multiple iterations to add an additional evaluation parameter - consistency of results.

In initial tests, there was a great deal of variation in results on the exact same dataset between iterations. Some degree of variance is expected with K-Means; however, too much variation indicates that the clusters formed are likely influenced more by the random starting centroids than by any actual innate property of the data.

In [9]:
k_range = list(range(2, 10))  # Candidate numbers of clusters: k = 2 .. 9
iterations = 3  # Number of repeated sweeps, to check for consistency

# One entry per iteration; each entry holds one value per k in k_range
inertia_list = []
sil_score_list = []
cluster_count_list = []
for i in tqdm(range(iterations)):
    inertia = []
    sil_score = []
    cluster_count = []
    for k in k_range:
        # random_state is left unset so that every iteration starts from
        # different random centroids - that is what the consistency check needs
        kmeans = KMeans(n_clusters=k, n_init=100, max_iter=500)
        kmeans.fit(cluster_data)
        inertia.append(kmeans.inertia_)
        sil_score.append(silhouette_score(cluster_data, kmeans.labels_))
        # Points per cluster, largest first, as plain ints for a clean display
        _, counts = np.unique(kmeans.labels_, return_counts=True)
        cluster_count.append(sorted(counts.tolist(), reverse=True))
    inertia_list.append(inertia)
    sil_score_list.append(sil_score)
    cluster_count_list.append(cluster_count)

print('Table of cluster counts')
pd.options.display.max_colwidth = 200
# One column per iteration (however many were run), one row per k
pd.DataFrame(
    {
        f'Iteration {run + 1}': counts_per_k
        for run, counts_per_k in enumerate(cluster_count_list)
    },
    index=k_range,
)

  0%|          | 0/3 [00:00<?, ?it/s]

Table of cluster counts


Unnamed: 0,Iteration 1,Iteration 2,Iteration 3
2,"[460, 416]","[460, 416]","[460, 416]"
3,"[429, 273, 174]","[432, 264, 180]","[432, 265, 179]"
4,"[436, 180, 139, 121]","[435, 168, 141, 132]","[435, 168, 144, 129]"
5,"[439, 155, 123, 105, 54]","[430, 159, 131, 115, 41]","[438, 140, 140, 104, 54]"
6,"[421, 185, 89, 79, 58, 44]","[430, 132, 103, 101, 72, 38]","[418, 124, 121, 105, 65, 43]"
7,"[369, 154, 102, 77, 68, 54, 52]","[351, 156, 120, 86, 70, 50, 43]","[378, 140, 115, 82, 69, 51, 41]"
8,"[355, 134, 102, 89, 69, 47, 40, 40]","[401, 137, 87, 63, 61, 51, 40, 36]","[347, 141, 97, 68, 63, 62, 52, 46]"
9,"[381, 133, 64, 62, 59, 56, 44, 39, 38]","[375, 117, 83, 63, 60, 53, 45, 41, 39]","[341, 139, 86, 76, 72, 54, 42, 39, 27]"


In [10]:
# One sub-plot per K-Means iteration, each with a secondary y-axis so that
# inertia and silhouette score can share the same x-axis (k)
fig = make_subplots(
    rows=iterations,
    cols=1,
    subplot_titles=[f'Iteration {run + 1}' for run in range(iterations)],
    specs=[[{'secondary_y': True}] for _ in range(iterations)],
)

for i, inertia, sil_score in zip(range(iterations), inertia_list, sil_score_list):
    # Create plots
    inertia_plot = go.Scatter(x=k_range, y=inertia, name='Inertia')
    sil_score_plot = go.Scatter(x=k_range, y=sil_score, name='Sil Score {}'.format(i + 1))
    # Add to figure: inertia on the primary axis, silhouette on the secondary
    fig.add_trace(inertia_plot, row=i + 1, col=1, secondary_y=False)
    fig.add_trace(sil_score_plot, row=i + 1, col=1, secondary_y=True)
    # Set x-axis title
    fig.update_xaxes(title_text='Number of Clusters (k)', dtick=1, row=i + 1, col=1)
    # Set y-axes titles
    fig.update_yaxes(title_text='Inertia', showgrid=False, secondary_y=False, row=i + 1, col=1)
    fig.update_yaxes(title_text='Silhouette Score', showgrid=False, secondary_y=True, row=i + 1, col=1)

# Update figure; the height scales with the number of sub-plots (300 px each)
fig.update_layout(
    title_text='k vs Inertia/Silhouette Score',
    template='plotly',
    width=900,
    height=300 * iterations,
    legend=dict(
        orientation='v',
        xanchor='left',
        yanchor='bottom',
        x=1,
        y=1,
    ),
)

fig.show()

With these parameters, the graphs indicate that a k-value of around 5 or 6 might be suitable in this case. The final model is built with k = 5 and used to cluster the hexes in our set.

In [11]:
# Final model with the chosen number of clusters
kmeans = KMeans(n_clusters=5, n_init=100, max_iter=500)
kmeans.fit(cluster_data)

# Attach the cluster label of every hex, placed right after the 'id' column
df_kmeans = df_cluster.assign(cluster=kmeans.labels_)
df_kmeans.insert(1, 'cluster', df_kmeans.pop('cluster'))

# Per-cluster size and per-cluster mean of every numeric column.
# numeric_only=True skips 'id', 'address' and 'geometry'; without it
# pandas >= 2.0 raises instead of silently ignoring those columns.
cluster_counts = df_kmeans['cluster'].value_counts()
df_kmeans_clusters = df_kmeans.groupby('cluster').mean(numeric_only=True)
df_kmeans_clusters.insert(0, 'count', cluster_counts)  # aligned on the cluster label

pd.options.display.max_columns = 50  # Display all columns
df_kmeans_clusters

Unnamed: 0_level_0,count,pop_density,cost_sqft,ATM,Arts & Entertainment,Asian Restaurant,Athletics & Sports,Automotive Shop,Bakery & Dessert,Bank,Cafeteria,Clothing & Jewelry,Coffee & Tea,College & University,Electronics Store,Factory,Fast Food,Gas Station,Groceries,Indian Restaurant,Medical Center,Medical Store,Movie Theater,Nightlife Spot,Office,Outdoors & Recreation,Professional & Other Places,Quick Bites,Residence,Restaurant,Salon,School,Shop & Service,Shopping Mall,Spiritual Center,Travel & Transport,Vegetarian / Vegan Restaurant
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1
0,118,14352.05488,6334.466858,0.050847,0.525424,1.144068,0.847458,0.737288,2.279661,0.974576,0.161017,2.186441,2.042373,0.805085,0.754237,0.084746,1.932203,0.313559,2.025424,5.161017,1.279661,0.449153,0.483051,1.601695,1.584746,0.855932,1.135593,0.70339,1.59322,3.09322,0.957627,0.169492,3.440678,0.381356,0.694915,0.059322,0.279661
1,437,4664.683646,5310.274772,0.01373,0.162471,0.064073,0.329519,0.199085,0.210526,0.17849,0.034325,0.10984,0.187643,0.553776,0.064073,0.292906,0.112128,0.130435,0.354691,0.599542,0.187643,0.05492,0.045767,0.125858,0.354691,0.340961,0.583524,0.130435,1.183066,0.306636,0.105263,0.114416,0.363844,0.002288,0.279176,0.009153,0.032037
2,45,5843.017452,5720.667813,0.066667,0.377778,0.622222,1.222222,0.8,0.688889,0.555556,1.311111,0.422222,2.777778,0.422222,0.4,0.133333,0.888889,0.244444,1.044444,4.222222,0.6,0.066667,0.155556,0.955556,10.866667,0.755556,2.844444,0.533333,2.288889,2.777778,0.466667,0.2,1.6,0.155556,0.288889,0.044444,0.222222
3,138,9567.792159,5824.685805,0.137681,0.565217,0.442029,1.550725,0.826087,1.485507,1.253623,0.101449,0.601449,1.246377,0.847826,0.311594,0.210145,0.623188,0.355072,2.543478,2.652174,1.362319,0.818841,0.137681,0.565217,1.630435,1.23913,1.702899,0.818841,4.623188,1.804348,1.130435,0.775362,2.34058,0.050725,1.007246,0.036232,0.144928
4,138,16851.953601,6689.341375,0.166667,1.036232,0.355072,0.876812,1.42029,0.804348,1.304348,0.130435,1.057971,1.543478,2.057971,0.514493,0.181159,0.594203,0.543478,1.043478,3.210145,1.565217,0.26087,0.253623,0.971014,2.862319,1.07971,2.775362,0.601449,1.427536,1.949275,0.347826,0.195652,3.115942,0.152174,1.028986,0.065217,0.376812


## Cluster Visualization
Next, we need a profile for each cluster - a way to visualize which venue categories are most prominent in each. We will again scale the data to achieve this. This way, we can show each cluster's top venue categories relative to the other clusters.

In [12]:
# Standardise every venue category across the clusters so that a value shows
# how prominent the category is in a cluster relative to the other clusters
category_means = df_kmeans_clusters.drop(columns=['pop_density', 'count', 'cost_sqft'])
cluster_scaler = StandardScaler()
scaled_clusters = cluster_scaler.fit_transform(category_means)

# Keep the cluster labels as the index and take the column names from the data
# that was actually scaled, so later `.loc[cluster]` lookups do not depend on
# the labels matching a default 0..n-1 index or on the column order
df_kmeans_profiles = pd.DataFrame(
    scaled_clusters,
    columns=category_means.columns,
    index=category_means.index,
)
df_kmeans_profiles

Unnamed: 0,ATM,Arts & Entertainment,Asian Restaurant,Athletics & Sports,Automotive Shop,Bakery & Dessert,Bank,Cafeteria,Clothing & Jewelry,Coffee & Tea,College & University,Electronics Store,Factory,Fast Food,Gas Station,Groceries,Indian Restaurant,Medical Center,Medical Store,Movie Theater,Nightlife Spot,Office,Outdoors & Recreation,Professional & Other Places,Quick Bites,Residence,Restaurant,Salon,School,Shop & Service,Shopping Mall,Spiritual Center,Travel & Transport,Vegetarian / Vegan Restaurant
0,-0.641101,-0.027747,1.727743,-0.288539,-0.152971,1.648445,0.282311,-0.386008,1.811342,0.561471,-0.227293,1.517823,-1.356073,1.820674,-0.028172,0.798496,1.288606,0.540419,0.419404,1.794137,1.54941,-0.495067,0.005448,-0.754778,0.622706,-0.501929,1.140846,0.928446,-0.497717,1.146016,1.784201,0.106654,0.832156,0.584516
1,-1.297164,-1.286478,-1.288792,-1.55621,-1.542225,-1.22779,-1.571128,-0.648016,-1.058107,-1.59529,-0.659075,-1.515399,1.593194,-1.18615,-1.37218,-1.342459,-1.662174,-1.562047,-0.969299,-1.134205,-1.468003,-0.81984,-1.670331,-1.374141,-1.822682,-0.828725,-1.731018,-1.294118,-0.723342,-1.633792,-1.119309,-1.157581,-1.706002,-1.527583
2,-0.361491,-0.539788,0.270175,0.628711,0.008906,-0.562834,-0.693248,1.992476,-0.626456,1.416632,-0.885101,-0.039025,-0.667672,0.097084,-0.535428,-0.458577,0.681308,-0.768135,-0.92792,-0.398986,0.22835,1.955657,-0.321189,1.162371,-0.103099,0.052354,0.81575,-0.351748,-0.372734,-0.51697,0.054658,-1.128045,0.079462,0.094595
3,0.893714,0.110259,-0.233123,1.432732,0.076243,0.544518,0.931984,-0.509198,-0.3788,-0.364148,-0.153858,-0.427562,0.420613,-0.341862,0.276506,1.462356,-0.334343,0.699561,1.721647,-0.518685,-0.569714,-0.483003,1.252422,-0.118322,1.115453,1.91224,-0.187471,1.379047,1.984338,0.152117,-0.748306,1.056432,-0.336032,-0.564686
4,1.406042,1.743754,-0.476002,-0.216694,1.610047,-0.402339,1.050081,-0.449254,0.252022,-0.018665,1.925327,0.464163,0.009939,-0.389747,1.659274,-0.459815,0.026603,1.090202,-0.243832,0.257739,0.259956,-0.157747,0.733649,1.084869,0.187621,-0.63394,-0.038108,-0.661628,-0.390545,0.852629,0.028756,1.12254,1.130416,1.413157


In [13]:
# Subplot grid: fixed number of columns, as many rows as needed (rounded up)
num_clusters = len(df_kmeans_clusters.index)
cols = 2
rows, leftover = divmod(num_clusters, cols)
if leftover:
    rows += 1

# One title per cluster, summarising its size, mean population density and mean cost
title_template = 'Cluster {} [Hexes: {:d} | Population {:.0f} | Cost: {:.0f}]'
subplot_titles = [
    title_template.format(
        cluster_id,
        df_kmeans_clusters.loc[cluster_id, 'count'],
        df_kmeans_clusters.loc[cluster_id, 'pop_density'],
        df_kmeans_clusters.loc[cluster_id, 'cost_sqft'],
    )
    for cluster_id in df_kmeans_clusters.index
]

In [14]:
fig = make_subplots(rows=rows, cols=cols, subplot_titles=subplot_titles)

cmap = px.colors.qualitative.G10[:num_clusters]  # Color map to use

fig.update_layout(
    title=dict(text='Cluster Profiles', x=0.08, xanchor='left', font_size=24),
    width=1000,
    height=1200,
    showlegend=False,
    template='plotly',
    colorway=cmap,
)

# Fill the grid row by row: one bar chart per cluster showing its ten
# highest-scoring venue categories
for position, cluster_id in enumerate(df_kmeans_clusters.index):
    grid_row, grid_col = divmod(position, cols)
    top_categories = df_kmeans_profiles.loc[cluster_id].sort_values(ascending=False).head(10)
    bars = go.Bar(
        x=top_categories.index,
        y=top_categories.values,
        orientation='v',
        opacity=0.8,
    )
    fig.add_trace(bars, row=grid_row + 1, col=grid_col + 1)

# Resize the annotations present so far (done before the note below is added,
# which carries its own font size)
fig.update_annotations(font_size=14)

# Explanatory note placed at the top-right corner of the figure
note_lines = [
    '<i>',
    '<br>Population in persons/ sq. km',
    '<br>Cost in Rs./sq. ft.',
    '<br>Higher values indicate higher frequency <br>relative to other clusters.',
    '</i>',
]
note = go.layout.Annotation(
    text=''.join(note_lines),
    font=dict(size=12),
    align='left',
    xref='paper',
    yref='paper',
    xanchor='right',
    yanchor='top',
    x=1.1,
    y=1.1,
    bordercolor='black',
    borderwidth=1,
    borderpad=2,
    showarrow=False,
)
fig.add_annotation(note)

fig.show()

In [15]:
# Turn the clustered table into a GeoDataFrame with CRS EPSG:4326
df_kmeans = gpd.GeoDataFrame(df_kmeans, geometry=df_kmeans.geometry).set_crs(epsg=4326)

# To lookup cluster of each hex
hex_clusters = df_kmeans.set_index('id')['cluster']

# Colour legend built from the same colours as the profile charts
cmap_folium = folium.StepColormap(cmap, vmin=0, vmax=num_clusters)
cmap_folium.caption = 'Cluster Labels'

map_centre = (12.9792, 77.5916)
m = folium.Map(location=map_centre, zoom_start=11)


def style_func(feature):
    """Return the style dict for one hex feature, filled with its cluster's colour."""
    hex_id = feature['properties']['id']
    return {
        'fillColor': cmap[hex_clusters.get(hex_id)],
        'fillOpacity': 0.8,
        'color': '#000000',
        'weight': 1,
        'opacity': 1.0,
    }


# Popup shown when a hex is clicked
hex_popup = folium.GeoJsonPopup(
    fields=['id', 'cluster', 'address'],
    aliases=['Hex ID', 'Cluster', 'Address'],
    style=('max-width: 500px; overflow: hidden'),
)

# Hex layer, coloured per cluster
hex_layer = folium.GeoJson(
    data=df_kmeans,
    name='kmeans_clusters',
    style_function=style_func,
    popup=hex_popup,
)
hex_layer.add_to(m)

m.add_child(cmap_folium)

m  # Display map

In [16]:
# Persist the clustered hexes (df_kmeans, including the cluster label column) as a Feather file
df_kmeans.to_feather('../data/bangalore_clustered.feather') # Save to disk


this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.


