In [None]:
import numpy as np
import pandas as pd

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline

In [None]:
# Notebook-wide display settings: show up to 100 columns and 100 rows,
# and render every float with exactly two decimal places.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', lambda value: '%.2f' % value)

### Reading the dataset

Source: https://www.datos.gov.co/Organismos-de-Control/Poblaci-n-Sistema-Subsidio-Familiar-Marzo-2018/gn7q-qsmp

In [None]:
# Load the family-subsidy CSV from the local data directory into a DataFrame.
subsidio_df = pd.read_csv(filepath_or_buffer = './data/subsidio_familiar_mar_2018.csv')

In [None]:
# Dimensions of the loaded table as a (rows, columns) tuple.
subsidio_df.shape

In [None]:
# Data type pandas inferred for each column when parsing the CSV.
subsidio_df.dtypes

In [None]:
# Preview the first five rows of the raw data.
subsidio_df.head(n = 5)

In [None]:
# Feature matrix: every column from the third one onward.
# The 'cluster' label column is excluded explicitly because a later cell adds it
# to subsidio_df in place; without this guard, re-running this cell after that
# one would feed the previous cluster labels back into the model as a feature.
feature_columns = [column for column in subsidio_df.columns[2:] if column != 'cluster']
X = subsidio_df[feature_columns]

In [None]:
# Preview the first five rows of the feature matrix.
X.head(n = 5)

### Training a K-Means model

In [None]:
# Number of clusters requested from K-Means; the silhouette plot further down also loops over this value.
n_clusters = 4

In [None]:
# Fix the random seed so centroid initialisation -- and therefore the cluster
# labels, the silhouette score and every figure derived from them -- is the
# same on each run of the notebook.
# n_init is set explicitly because scikit-learn's default changed between
# releases (10 -> 'auto'); pinning it keeps results stable across versions.
model = KMeans(n_clusters = n_clusters, n_init = 10, random_state = 42)

In [None]:
# Fit K-Means on the feature matrix and keep the cluster labels it returns for X.
clusters = model.fit_predict(X)

In [None]:
# Store each row's cluster label alongside the original data.
subsidio_df['cluster'] = clusters

In [None]:
# Preview the first five rows again, now including the 'cluster' column.
subsidio_df.head(n = 5)

### Evaluating the model

In [None]:
# Number of rows assigned to each cluster label.
cluster_labels = subsidio_df['cluster']
cluster_labels.value_counts()

In [None]:
# Mean silhouette coefficient over all samples; it is also drawn as the
# reference line on the silhouette plot.
silhouette_avg = silhouette_score(X, labels = clusters)
silhouette_avg

In [None]:
# Silhouette plot: one horizontal band per cluster, with every sample drawn as
# a bar whose length is its silhouette coefficient, sorted within the cluster.
band_gap = 20  # blank rows left between consecutive cluster bands

sample_silhouette_values = silhouette_samples(X, clusters)

# Silhouette coefficients lie in [-1, 1]. Keep the usual -0.1 left edge, but
# widen it (rounded down to a multiple of 0.1) when some samples score lower,
# so poorly assigned samples are not clipped out of the figure.
x_min = min(-0.1, np.floor(sample_silhouette_values.min() * 10) / 10)

fig, ax1 = plt.subplots(1, 1, figsize = (12, 8))

ax1.set_xlim([x_min, 1])
# Room for every sample plus one gap before, between and after the bands.
ax1.set_ylim([0, len(X) + (n_clusters + 1) * band_gap])

# plt.get_cmap replaces matplotlib.cm.get_cmap, which was deprecated in
# Matplotlib 3.7 and removed in 3.9.
cmap = plt.get_cmap('Spectral')

y_lower = band_gap
for i in range(n_clusters):
    # Boolean-mask indexing yields a copy, so sorting does not disturb
    # sample_silhouette_values itself.
    ith_cluster_silhouette_values = np.sort(sample_silhouette_values[clusters == i])

    size_cluster_i = ith_cluster_silhouette_values.shape[0]
    y_upper = y_lower + size_cluster_i

    color = cmap(float(i) / n_clusters)
    ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_values, facecolor = color, edgecolor = color, alpha = 0.7)

    # Label the band with its cluster number, vertically centred on the band.
    ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

    # Next band starts after the gap.
    y_lower = y_upper + band_gap

ax1.set_title('The silhouette plot for the various clusters')
ax1.set_xlabel('The silhouette coefficient values')
ax1.set_ylabel('Cluster label')

# Dashed red line marks the average silhouette score over all samples.
ax1.axvline(x = silhouette_avg, color = 'red', linestyle = '--')

# The y axis only stacks samples, so its tick values carry no meaning.
ax1.set_yticks([])
ax1.set_xticks([x_min, 0, 0.2, 0.4, 0.6, 0.8, 1])

plt.show()

### Cluster explanations

In [None]:
# Per-cluster summary statistics (with the median as the only percentile)
# for the columns between the first two and the trailing 'cluster' column.
described_columns = subsidio_df.columns.tolist()[2:-1]
cluster_summary = subsidio_df.groupby('cluster').describe(percentiles = [.5])
cluster_summary[described_columns]