# Clustering - Population Density


### Imports

In [116]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline

In [117]:
pop_df = pd.read_csv('./datasets/Nairobi_Area_Population_Grid.csv')
pop_df = pop_df.drop(columns='Unnamed: 0')

pop_df.head(3)

Unnamed: 0,Lat,Lon,Population
0,-1.050139,36.698194,4.18689
1,-1.050139,36.699306,4.18689
2,-1.050139,36.699583,4.18689


---
### Clean

In [118]:
# try to filter up to 4 or 5 to drop for Population
pop_gt_4 = pop_df[pop_df['Population']>4].copy()

---
### Model

In [119]:
def kmeans(X, n=8):
    
    '''
    Enter a dataframe (X_base) and amount of clusters (n) for kmeans>
    The function will 
        -fit a KMeans model will n clusters,
        -append cluster labels to X_base dataframe
        -create a centroids groupby object for graphing
    
    returns (dataframe with cluster labels, centroid groupby object)
    '''
    
    km = KMeans(n_clusters=n)
    km.fit(X)

    X_df = X
    X_df['clusters'] = km.labels_

    # create centroids for graphing
    centroids = X_df.groupby('clusters')[['Lat', 'Lon']].mean()
    return X_df, centroids, km

In [120]:
# X features from Latitude and Longitude
X = pop_df.drop("Population", axis=1)

# fit kmeans model
km = KMeans(n_clusters=28, random_state=510)
km.fit(X)

pop_df['clusters'] = km.labels_
centroids = pop_df.groupby('clusters')[['Lat', 'Lon']].mean()
    
# ## Plot scatter by cluster / color, and centroids
# colors = ['red', 'green', 'blue', 'purple', 'black', 'yellow', 'orange', 'pink']
# pop_df['color'] = pop_df['clusters'].map(lambda p: colors[p])

# ax = X.plot(    
#     kind="scatter", 
#     x='Lat', y='Lon',
#     figsize=(10,8),
#     c = pop_df['color']
# )

# centroids.plot(
#     kind="scatter", 
#     x="Lat", y="Lon", 
#     marker="*", c=["red", "green", "blue", "purple", "black", "yellow", "orange", "pink"], s=550,
#     ax=ax
# );

In [121]:
silhouette_score(X.head(20_000), km.labels_[:20_000])

0.5935634891460033

In [130]:
# X features from Latitude and Longitude
X = pop_gt_4.drop('Population', axis=1)

# fit kmeans model
km = KMeans(n_clusters=28, random_state=510)
km.fit(X)

pop_gt_4['clusters'] = km.labels_
centroids = pop_gt_4.groupby('clusters')[['Lat', 'Lon']].mean()
    
# ## Plot scatter by cluster / color, and centroids
# colors = [(x/10.0, x/20.0, 0.75) for x in range(28)]
# #pop_gt_4['color'] = pop_gt_4['clusters'].map(lambda p: colors[p])

# ax = X.plot(    
#     kind="scatter", 
#     x='Lat', y='Lon',
#     figsize=(10,8),
#     #c = pop_gt_4['color']
# )

# centroids.plot(
#     kind="scatter", 
#     x="Lat", y="Lon", 
#     marker="*", #c=[(x/10.0, x/20.0, 0.75) for x in range(len(28))], s=550,
#     ax=ax
# );

In [131]:
silhouette_score(X.head(20_000), km.labels_[:20_000])

0.9775180339134651

In [127]:
# see if scaling has an effect -- it shouldn't, but just in case
ss = StandardScaler()
Z = ss.fit_transform(X)

km = KMeans(n_clusters=20, random_state=510)
km.fit(Z)

pop_gt_4_sc = pop_gt_4.copy()
pop_gt_4_sc['clusters'] = km.labels_
centroids = pop_gt_4_sc.groupby('clusters')[['Lat', 'Lon']].mean()

silhouette_score(Z[:20_000], km.labels_[:20_000])

0.6816675879507758

>Scaling has a negative effect

In [135]:
pop_gt_4.groupby('clusters')[['Lat', 'Lon']].mean()

Unnamed: 0_level_0,Lat,Lon
clusters,Unnamed: 1_level_1,Unnamed: 2_level_1
0,-1.207695,36.875717
1,-1.162395,36.931332
2,-1.21552,36.73561
3,-1.267139,36.946447
4,-1.090535,37.007059
5,-1.257286,36.881036
6,-1.153336,36.821927
7,-1.372299,36.938266
8,-1.264971,36.683754
9,-1.08061,36.717137


In [136]:
['kasarani',
 'outside',
 'outside',
 'njiru',
 'outside',
 'kasarani']

['kasarani', 'outside', 'outside', 'njiru', 'outside', 'kasarani']

In [140]:
pop_gt_4.groupby('clusters')['Population'].mean().sort_values(ascending=False)

clusters
5     104.363850
17     49.700944
11     42.291724
3      39.948168
13     36.169043
12     33.249166
14     23.824825
0      20.531028
1      16.712763
19     14.295680
6      13.985396
8      12.801498
24     12.224217
2      11.200338
16     11.029175
10     10.966820
26     10.050672
7       9.244917
21      8.836380
25      7.703288
18      7.653544
15      6.789173
4       6.786063
23      6.375374
27      6.290614
22      6.268175
9       5.144413
20      4.889093
Name: Population, dtype: float64