# Neighbourhood Clustering
Now that we have collected all the data required, we can attempt to cluster the hexes in our grid into groups with similar characteristics. The attributes we will use to cluster hexes are:
- Population Density
- Real Estate Costs
- Venue Density
- Venue Category

In [1]:
import folium
import geopandas as gpd
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MaxAbsScaler, StandardScaler
from tqdm.notebook import tqdm

## Data Consolidation
All the data needed for this project has been collected from different sources. We now need to combine these into one or two datasets for further exploration.

In [2]:
# Read the two inputs: the hex grid (locations and real estate prices) and the venue list
df_hex = gpd.read_feather('../data/bangalore_hex_costs.feather')
df_venues = pd.read_feather('../data/bangalore_foursquare_data.feather')

# Report the dimensions of each frame
print('Shape of df_hex: {}'.format(df_hex.shape))
print('Shape of df_venues: {}'.format(df_venues.shape))

# Column names of each frame as positionally indexed Series
cols_hex = pd.Series(df_hex.columns)
cols_venues = pd.Series(df_venues.columns)

# Lay both lists side by side; the shorter one is padded with empty strings
column_table = pd.DataFrame({'df_hex': cols_hex, 'df_venues': cols_venues})
column_table.fillna('')

Shape of df_hex: (942, 11)
Shape of df_venues: (19985, 7)


Unnamed: 0,df_hex,df_venues
0,id,venue_id
1,hex_id,name
2,ward_no,lat
3,centre_lat,lon
4,centre_lon,address
5,resolution,category
6,pop_total,hex_id
7,ward_name,
8,geometry,
9,address,


We will use one-hot encoding to split and convert the category column into usable data for clustering. We then group by hex ID and sum the category columns - this gives the number of venues of each category in every hex, which lets us build a profile of each hex.

In [3]:
# Build the per-hex venue profile: one indicator column per venue category,
# summed within each hex to give the number of venues of that category.
#
# Only `hex_id` and `category` are kept before encoding. The remaining venue
# columns (venue_id, name, lat, lon, address) are identifiers, text or
# coordinates; on pandas >= 2.0 `groupby().sum()` no longer silently discards
# non-numeric columns, so leaving any of them in would pollute or break the
# aggregation.
df_onehot = (
    pd.get_dummies(
        df_venues[['hex_id', 'category']],
        columns = ['category'],
        prefix = '',
        prefix_sep = '',
    )
    .groupby('hex_id')
    .sum()
)

df_onehot.describe()

Unnamed: 0,ATM,Arts & Entertainment,Asian Restaurant,Athletics & Sports,Automotive Shop,Bakery & Dessert,Bank,Cafeteria,Clothing & Jewelry,Coffee & Tea,...,Quick Bites,Residence,Restaurant,Salon,School,Shop & Service,Shopping Mall,Spiritual Center,Travel & Transport,Vegetarian / Vegan Restaurant
count,877.0,877.0,877.0,877.0,877.0,877.0,877.0,877.0,877.0,877.0,...,877.0,877.0,877.0,877.0,877.0,877.0,877.0,877.0,877.0,877.0
mean,0.064994,0.423033,0.343216,0.722919,0.59293,0.807298,0.651083,0.142531,0.631699,0.949829,...,0.41049,1.873432,1.302166,0.437856,0.242873,1.584949,0.09236,0.567845,0.030787,0.147092
std,0.264522,0.779338,0.649633,1.021274,1.139613,1.166716,0.981118,0.51372,1.364102,1.397892,...,0.780925,2.317038,1.629118,0.750847,0.529137,2.067061,0.36627,0.958746,0.185578,0.432716
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
75%,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,...,1.0,3.0,2.0,1.0,0.0,2.0,0.0,1.0,0.0,0.0
max,2.0,5.0,4.0,7.0,9.0,7.0,5.0,7.0,13.0,10.0,...,6.0,14.0,9.0,4.0,3.0,13.0,4.0,7.0,2.0,4.0


In [4]:
# Dimensions and first rows of the per-hex venue counts
onehot_shape = df_onehot.shape
print(onehot_shape)
df_onehot.head(n = 5)

(877, 34)


Unnamed: 0_level_0,ATM,Arts & Entertainment,Asian Restaurant,Athletics & Sports,Automotive Shop,Bakery & Dessert,Bank,Cafeteria,Clothing & Jewelry,Coffee & Tea,...,Quick Bites,Residence,Restaurant,Salon,School,Shop & Service,Shopping Mall,Spiritual Center,Travel & Transport,Vegetarian / Vegan Restaurant
hex_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8860145101fffff,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8860145105fffff,0,0,0,0,0,0,0,0,0,0,...,1,1,1,0,1,0,0,0,0,0
8860145107fffff,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
886014510dfffff,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8860145111fffff,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


Now, we need to map the population and real estate cost for each hex into this table.

In [5]:
# Attach population and cost to the venue counts.
# The inner merge keeps only hexes that appear in both tables.
hex_attributes = df_hex[['id', 'pop_total', 'cost_sqft']]
merged = hex_attributes.merge(df_onehot, how = 'inner', left_on = 'id', right_index = True)
df_cluster = merged.set_index('id', drop = True)

# Preview the combined table with narrow display columns
print(df_cluster.shape)
pd.set_option('display.max_colwidth', 20)
df_cluster.head()

(877, 36)


Unnamed: 0_level_0,pop_total,cost_sqft,ATM,Arts & Entertainment,Asian Restaurant,Athletics & Sports,Automotive Shop,Bakery & Dessert,Bank,Cafeteria,...,Quick Bites,Residence,Restaurant,Salon,School,Shop & Service,Shopping Mall,Spiritual Center,Travel & Transport,Vegetarian / Vegan Restaurant
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8861892db3fffff,1413.220043,4896.586595,0,1,0,2,2,0,0,0,...,0,4,1,0,0,2,0,2,0,0
886016975dfffff,1762.379434,4970.4214,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8860169759fffff,1786.718829,4877.673619,0,0,0,0,1,0,0,0,...,0,1,2,0,1,0,0,0,0,0
8860169645fffff,2635.117082,5134.151371,0,2,3,1,3,0,0,1,...,0,0,1,0,0,9,1,0,0,0
886016962dfffff,1853.947643,4827.231325,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# K-Means Clustering

This data is now ready to use for clustering the locations into (hopefully) distinct segments. Before that, the data should be scaled to ensure consistent distance measurements. The MaxAbsScaler is used in this case.

In [6]:
# Scale every column by its largest absolute value so all features share a comparable range
scaler = MaxAbsScaler().fit(df_cluster)
cluster_data = scaler.transform(df_cluster)

# Confirm the scaled array has the expected dimensions
print('Input data shape: {}'.format(cluster_data.shape))

Input data shape: (877, 36)


In [7]:
# Summary statistics of the scaled features, labelled with the original column names
scaled_summary = pd.DataFrame(cluster_data, columns = df_cluster.columns).describe()
scaled_summary

Unnamed: 0,pop_total,cost_sqft,ATM,Arts & Entertainment,Asian Restaurant,Athletics & Sports,Automotive Shop,Bakery & Dessert,Bank,Cafeteria,...,Quick Bites,Residence,Restaurant,Salon,School,Shop & Service,Shopping Mall,Spiritual Center,Travel & Transport,Vegetarian / Vegan Restaurant
count,877.0,877.0,877.0,877.0,877.0,877.0,877.0,877.0,877.0,877.0,...,877.0,877.0,877.0,877.0,877.0,877.0,877.0,877.0,877.0,877.0
mean,0.122115,0.439715,0.032497,0.084607,0.085804,0.103274,0.065881,0.115328,0.130217,0.020362,...,0.068415,0.133817,0.144685,0.109464,0.080958,0.121919,0.02309,0.081121,0.015393,0.036773
std,0.151771,0.107068,0.132261,0.155868,0.162408,0.145896,0.126624,0.166674,0.196224,0.073389,...,0.130154,0.165503,0.181013,0.187712,0.176379,0.159005,0.091568,0.136964,0.092789,0.108179
min,0.005117,0.281397,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.028074,0.367463,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.049522,0.407094,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.071429,0.111111,0.0,0.0,0.076923,0.0,0.0,0.0,0.0
75%,0.165546,0.487066,0.0,0.2,0.25,0.142857,0.111111,0.142857,0.2,0.0,...,0.166667,0.214286,0.222222,0.25,0.0,0.153846,0.0,0.142857,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


The selection of the number of clusters is crucial in this algorithm. Models with a range of k-values are trained and evaluated using two metrics - the elbow method (inertia) and the Silhouette Score. This process is repeated for multiple iterations to add an additional evaluation parameter - consistency of results.

In initial tests, there was a great deal of variation in results on the exact same dataset between iterations. Some degree of variance is expected with K-Means; however, too much variation indicates that the clusters formed are likely influenced more by the random starting centroids than by any innate property of the data.

In [8]:
k_range = list(range(2, 10)) # Candidate numbers of clusters: 2 to 9 inclusive
iterations = 3 # Number of times the whole sweep is repeated, to check for consistency
inertia_list = []        # One list of inertias (per k) for every iteration
sil_score_list = []      # One list of silhouette scores (per k) for every iteration
cluster_count_list = []  # One list of cluster sizes (per k) for every iteration
for _ in tqdm(range(iterations)):
    inertia = []
    sil_score = []
    cluster_count = []
    for k in k_range:
        # random_state is deliberately left unset: the point of repeating the
        # sweep is to see how much the result changes between random starts.
        kmeans = KMeans(n_clusters = k, n_init = 100, max_iter = 500)
        kmeans.fit(cluster_data)
        inertia.append(kmeans.inertia_)
        sil_score.append(silhouette_score(cluster_data, kmeans.labels_))
        # Points per cluster, largest first. Converted to plain ints so the
        # table below shows numbers rather than numpy scalar reprs.
        _, counts = np.unique(kmeans.labels_, return_counts = True)
        cluster_count.append(sorted(counts.tolist(), reverse = True))
    inertia_list.append(inertia)
    sil_score_list.append(sil_score)
    cluster_count_list.append(cluster_count)

print('Table of cluster counts')
pd.options.display.max_colwidth = 200
# One column per iteration, however many iterations were run
pd.DataFrame(
    {
        'Iteration {}'.format(run + 1): run_counts
        for run, run_counts in enumerate(cluster_count_list)
    },
    index = k_range,
)

  0%|          | 0/3 [00:00<?, ?it/s]

Table of cluster counts


Unnamed: 0,Iteration 1,Iteration 2,Iteration 3
2,"[463, 414]","[463, 414]","[462, 415]"
3,"[429, 253, 195]","[431, 257, 189]","[432, 255, 190]"
4,"[432, 163, 151, 131]","[431, 163, 154, 129]","[434, 157, 150, 136]"
5,"[422, 125, 120, 117, 93]","[422, 128, 114, 111, 102]","[418, 127, 126, 111, 95]"
6,"[351, 154, 124, 100, 88, 60]","[397, 152, 109, 99, 77, 43]","[398, 132, 131, 104, 76, 36]"
7,"[363, 147, 106, 85, 65, 62, 49]","[347, 157, 102, 94, 64, 64, 49]","[343, 149, 109, 89, 69, 63, 55]"
8,"[344, 138, 94, 76, 73, 59, 54, 39]","[337, 149, 88, 79, 64, 55, 53, 52]","[341, 145, 98, 85, 60, 59, 49, 40]"
9,"[347, 131, 98, 62, 61, 48, 46, 46, 38]","[341, 138, 83, 70, 64, 55, 50, 45, 31]","[346, 152, 72, 68, 64, 59, 48, 35, 33]"


In [9]:
# Create one sub-plot per iteration, each with a secondary y-axis.
# Rows, titles and specs are derived from `iterations` so the figure stays
# correct when the number of repeated sweeps is changed.
fig = make_subplots(
    rows = iterations,
    cols = 1,
    subplot_titles = ['Iteration {}'.format(n + 1) for n in range(iterations)],
    specs = [[{'secondary_y': True}] for _ in range(iterations)],
)

for i, (inertia, sil_score) in enumerate(zip(inertia_list, sil_score_list)):
    row = i + 1  # plotly subplot rows are 1-based
    # Create plots
    inertia_plot = go.Scatter(x = k_range, y = inertia, name = 'Inertia')
    sil_score_plot = go.Scatter(x = k_range, y = sil_score, name = 'Sil Score {}'.format(i+1))
    # Inertia on the primary axis, silhouette score on the secondary axis
    fig.add_trace(inertia_plot, row = row, col = 1, secondary_y = False)
    fig.add_trace(sil_score_plot, row = row, col = 1, secondary_y = True)
    # Set x-axis title
    fig.update_xaxes(title_text='Number of Clusters (k)', dtick = 1, row = row, col = 1)
    # Set y-axes titles
    fig.update_yaxes(title_text='Inertia', showgrid = False, secondary_y=False, row = row, col = 1)
    fig.update_yaxes(title_text='Silhouette Score', showgrid = False, secondary_y=True, row = row, col = 1)

# Update figure
fig.update_layout(
    title_text='k vs Inertia/Silhouette Score',
    template = 'plotly',
    width = 900,
    height = 300 * iterations,  # 300 px per sub-plot (900 px for three iterations)
    legend = dict(
        orientation = 'v',
        xanchor = 'left',
        yanchor = 'bottom',
        x = 1,
        y = 1,
    )
)

fig.show()

With these parameters, the graphs indicate that a k-value of around 4 or 5 might be suitable in this case. The final model will be built with k = 4 and used to cluster the hexes in our set.

In [10]:
# Fit the final model with k = 4.
# A fixed random_state makes the fit - and therefore the cluster numbering -
# reproducible across reruns, so cluster labels written out later keep the
# same meaning every time the notebook is executed.
final_k = 4
kmeans = KMeans(n_clusters = final_k, n_init = 100, max_iter = 500, random_state = 42)
kmeans.fit(cluster_data)

# Attach each hex's label to its features, then bring back address and geometry.
# cluster_data was produced from df_cluster, so labels_ lines up with it row for row.
df_kmeans = df_hex[['id', 'address', 'geometry']].merge(
    df_cluster.assign(cluster = kmeans.labels_),
    how = 'inner',
    left_on='id',
    right_index = True
)

pd.options.display.max_colwidth = 20
df_kmeans.insert(1, 'cluster', df_kmeans.pop('cluster')) # Move 'cluster' to sit right after 'id'
print(df_kmeans.shape)
df_kmeans.head()

(877, 40)


Unnamed: 0,id,cluster,address,geometry,pop_total,cost_sqft,ATM,Arts & Entertainment,Asian Restaurant,Athletics & Sports,...,Quick Bites,Residence,Restaurant,Salon,School,Shop & Service,Shopping Mall,Spiritual Center,Travel & Transport,Vegetarian / Vegan Restaurant
0,8861892db3fffff,1,"Yelahanka, Kempe...",POLYGON ((77.613...,1413.220043,4896.586595,0,1,0,2,...,0,4,1,0,0,2,0,2,0,0
1,886016975dfffff,1,"Kempegowda, Yela...",POLYGON ((77.613...,1762.379434,4970.4214,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,8860169759fffff,1,"Kempegowda, Yela...",POLYGON ((77.609...,1786.718829,4877.673619,0,0,0,0,...,0,1,2,0,1,0,0,0,0,0
3,8860169645fffff,2,"Bellary Road, Am...",POLYGON ((77.600...,2635.117082,5134.151371,0,2,3,1,...,0,0,1,0,0,9,1,0,0,0
4,886016962dfffff,1,Chowdeswari Ward...,POLYGON ((77.592...,1853.947643,4827.231325,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Number of hexes assigned to each cluster
cluster_counts = df_kmeans['cluster'].value_counts()

# Per-cluster mean of every numeric feature.
# numeric_only=True is required because df_kmeans also carries 'id', 'address'
# and 'geometry'; pandas >= 2.0 raises on those instead of silently skipping them.
df_kmeans_clusters = df_kmeans.groupby('cluster').mean(numeric_only = True)
df_kmeans_clusters.insert(0, 'count', cluster_counts)  # aligned on the cluster label

pd.options.display.max_columns = 50 # Show every column of the summary table
df_kmeans_clusters

Unnamed: 0_level_0,count,pop_total,cost_sqft,ATM,Arts & Entertainment,Asian Restaurant,Athletics & Sports,Automotive Shop,Bakery & Dessert,Bank,Cafeteria,Clothing & Jewelry,Coffee & Tea,College & University,Electronics Store,Factory,Fast Food,Gas Station,Groceries,Indian Restaurant,Medical Center,Medical Store,Movie Theater,Nightlife Spot,Office,Outdoors & Recreation,Professional & Other Places,Quick Bites,Residence,Restaurant,Salon,School,Shop & Service,Shopping Mall,Spiritual Center,Travel & Transport,Vegetarian / Vegan Restaurant
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1
0,148,6475.118764,5775.596147,0.114865,0.594595,0.466216,1.594595,0.885135,1.533784,1.263514,0.121622,0.682432,1.216216,0.871622,0.310811,0.175676,0.614865,0.364865,2.466216,2.648649,1.351351,0.797297,0.128378,0.621622,1.614865,1.175676,1.756757,0.804054,4.621622,1.851351,1.114865,0.72973,2.222973,0.060811,0.97973,0.040541,0.148649
1,431,3355.542976,5313.282752,0.020882,0.174014,0.060325,0.324826,0.187935,0.192575,0.157773,0.034803,0.113689,0.194896,0.554524,0.062645,0.290023,0.099768,0.12993,0.345708,0.584687,0.183295,0.051044,0.046404,0.132251,0.359629,0.338747,0.547564,0.12065,1.155452,0.299304,0.106729,0.113689,0.361949,0.00232,0.269142,0.006961,0.030162
2,131,10810.336688,6217.98983,0.068702,0.564885,1.076336,0.870229,0.725191,2.10687,0.824427,0.198473,2.145038,2.160305,0.854962,0.725191,0.091603,1.80916,0.320611,1.946565,5.145038,1.099237,0.396947,0.473282,1.610687,1.908397,0.763359,1.183206,0.687023,1.618321,3.114504,0.954198,0.160305,3.763359,0.389313,0.641221,0.076336,0.328244
3,167,11565.954175,6574.304974,0.131737,0.802395,0.389222,0.862275,1.275449,0.730539,1.245509,0.39521,0.736527,1.712575,1.658683,0.48503,0.209581,0.682635,0.461078,1.0,3.305389,1.443114,0.221557,0.221557,0.832335,4.832335,1.101796,2.898204,0.592814,1.491018,1.982036,0.287425,0.209581,2.467066,0.11976,0.916168,0.047904,0.305389


## Cluster Visualization
Next, we need a profile for each cluster - a way to visualize what venue categories are most prominent in each. We will again scale the data to achieve this. This way, we can show each cluster's top venue categories relative to the other clusters.

In [12]:
# Standardise the per-cluster venue means column by column, so each value
# expresses how a cluster compares with the other clusters for that category.
venue_means = df_kmeans_clusters.drop(columns = ['pop_total', 'count', 'cost_sqft'])

cluster_scaler = StandardScaler()
scaled_clusters = cluster_scaler.fit_transform(venue_means)

# Take labels from the frame that was actually scaled: column names no longer
# depend on the position of the three dropped columns, and the rows keep the
# cluster labels as their index so label-based lookups stay valid.
df_kmeans_profiles = pd.DataFrame(
    scaled_clusters,
    columns = venue_means.columns,
    index = venue_means.index,
)
df_kmeans_profiles

Unnamed: 0,ATM,Arts & Entertainment,Asian Restaurant,Athletics & Sports,Automotive Shop,Bakery & Dessert,Bank,Cafeteria,Clothing & Jewelry,Coffee & Tea,College & University,Electronics Store,Factory,Fast Food,Gas Station,Groceries,Indian Restaurant,Medical Center,Medical Store,Movie Theater,Nightlife Spot,Office,Outdoors & Recreation,Professional & Other Places,Quick Bites,Residence,Restaurant,Salon,School,Shop & Service,Shopping Mall,Spiritual Center,Travel & Transport,Vegetarian / Vegan Restaurant
0,0.714139,0.266971,-0.086662,1.510115,0.298972,0.535188,0.870838,-0.494954,-0.316623,-0.14336,-0.277122,-0.351373,-0.225431,-0.299337,0.379753,1.249345,-0.167058,0.665814,1.554164,-0.55572,-0.333269,-0.344133,0.999334,0.185399,0.974409,1.719298,0.039478,1.167762,1.715853,0.015764,-0.555234,0.992715,-0.096957,-0.44845
1,-1.463676,-1.585198,-1.192511,-1.303059,-1.487059,-1.292008,-1.593718,-1.146972,-1.076476,-1.540708,-1.052538,-1.375934,1.381144,-1.125008,-1.570607,-1.331272,-1.43335,-1.67596,-1.139373,-1.067418,-1.251572,-1.110113,-1.529139,-1.212911,-1.658512,-0.763757,-1.509624,-1.191197,-0.763102,-1.517317,-0.950126,-1.543235,-1.456513,-1.424079
2,-0.355558,0.136137,1.575606,-0.094718,-0.11076,1.315933,-0.107829,0.082209,1.63745,1.148321,-0.317862,1.35941,-1.406646,1.61505,0.012367,0.616939,1.364539,0.160363,0.109132,1.597223,1.522708,-0.165011,-0.24633,-0.477855,0.523528,-0.432173,1.300233,0.791816,-0.575518,1.284709,1.662604,-0.215355,1.352293,1.03036
3,1.105096,1.182091,-0.296433,-0.112339,1.298847,-0.559113,0.830708,1.559717,-0.244351,0.535747,1.647522,0.367897,0.250934,-0.190706,1.178486,-0.535012,0.235869,0.849784,-0.523922,0.025914,0.062134,1.619257,0.776135,1.505367,0.160575,-0.523368,0.169914,-0.768381,-0.377233,0.216844,-0.157244,0.765875,0.201176,0.842169


In [13]:
# Calculate Subplot Grid
num_clusters = len(df_kmeans_clusters.index)
cols = 2
rows = (num_clusters//cols)
rows += 1 if num_clusters % cols else 0

# Prepare Subplot Titles
subplot_titles = []
for i in df_kmeans_clusters.index:
    title = 'Cluster {} [Hexes: {:d} | Population {:.0f} | Cost: {:.0f}]'.format(
        i,
        df_kmeans_clusters.loc[i, 'count'],
        df_kmeans_clusters.loc[i, 'pop_total'],
        df_kmeans_clusters.loc[i, 'cost_sqft'],
    )
    subplot_titles.append(title)

In [14]:
# Grid of bar charts, one per cluster
fig = make_subplots(rows = rows, cols = cols, subplot_titles = subplot_titles)

cmap = px.colors.qualitative.G10[0:num_clusters] # Color map to use

# Fill the grid left to right, top to bottom with each cluster's ten highest-scoring categories
for position, i in enumerate(df_kmeans_clusters.index):
    grid_row, grid_col = divmod(position, cols)
    top_categories = df_kmeans_profiles.loc[i].sort_values(ascending = False).iloc[:10]
    fig.add_trace(
        go.Bar(
            x = top_categories.index,
            y = top_categories.values,
            orientation = 'v',
            opacity = 0.8,
        ),
        row = grid_row + 1,
        col = grid_col + 1,
    )

# Figure-wide styling
fig.update_layout(
    title = dict(text = 'Cluster Profiles', x = 0.08, xanchor = 'left',  font_size = 24),
    width = 1000,
    height = 800,
    showlegend = False,
    template = 'plotly',
    colorway = cmap,
)

# Subplot titles are annotations; resize them before the explanatory note is added
fig.update_annotations(font_size = 14)

# Boxed note explaining the units used in the titles and bars
note_text = ''.join([
    '<i>',
    '<br>Population from 2011 census',
    '<br>Cost in Rs./sq. ft.',
    '<br>Higher values for venues indicate higher frequency <br>relative to other clusters.',
    '</i>',
])
note = go.layout.Annotation(
    text = note_text,
    font = dict(size = 12),
    align = 'left',
    xref = 'paper',
    yref = 'paper',
    xanchor = 'right',
    yanchor = 'top',
    x = 1.14,
    y = 1.16,
    bordercolor = 'black',
    borderwidth = 1,
    borderpad = 2,
    showarrow = False,
)
fig.add_annotation(note)

fig.show()

In [15]:
# Wrap the clustered table as a GeoDataFrame and tag it with EPSG:4326
df_kmeans = gpd.GeoDataFrame(df_kmeans, geometry = df_kmeans.geometry)
df_kmeans = df_kmeans.set_crs(epsg = 4326)

# Series indexed by hex id, used to look up the cluster of each hex
hex_clusters = df_kmeans.set_index('id')['cluster']

# Base map
map_centre = (12.9792, 77.5916)
m = folium.Map(location = map_centre, zoom_start = 11)

# Stepped colour legend spanning 0..num_clusters
cmap_folium = folium.StepColormap(cmap, vmin = 0, vmax = num_clusters)
cmap_folium.caption = 'Cluster Labels'

def style_func(feature):
    """Return the folium style for one GeoJSON feature.

    The feature's ``properties['id']`` is looked up in ``hex_clusters`` and the
    resulting cluster label selects the fill colour from ``cmap``. Every hex
    gets the same black outline and fill opacity.
    """
    hex_id = feature['properties']['id']
    cluster_label = hex_clusters.get(hex_id)
    return {
        'fillColor': cmap[cluster_label],
        'fillOpacity': 0.8,
        'color': '#000000',
        'weight': 1,
        'opacity': 1.0,
    }

# Popup listing the hex id, cluster and address of a hex
hex_popup = folium.GeoJsonPopup(
    fields = ['id', 'cluster', 'address'],
    aliases = ['Hex ID', 'Cluster', 'Address'],
    style = 'max-width: 500px; overflow: hidden',
)

# Hex layer, styled per feature by style_func
cluster_layer = folium.GeoJson(
    data = df_kmeans,
    name = 'kmeans_clusters',
    style_function = style_func,
    popup = hex_popup,
)
cluster_layer.add_to(m)

# Add the colour legend on top of the map
m.add_child(cmap_folium)

m # Display map

In [16]:
# Save the clustered hexes to disk
output_path = '../data/bangalore_clustered.feather'
df_kmeans.to_feather(output_path)


this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.


