In [None]:
import numpy as np
import pandas as pd

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline

In [None]:
# Raise pandas' display limits: up to 100 columns and 1000 rows are rendered before truncation.
pd.set_option( 'display.max_columns', 100 )
pd.set_option( 'display.max_rows', 1000 )

# Carga del dataset

Fuente: https://www.datos.gov.co/Organismos-de-Control/Poblaci-n-Sistema-Subsidio-Familiar-Marzo-2018/gn7q-qsmp

In [None]:
def _read_subsidio( month_tag ):
    """Read one monthly CSV from ./data.

    month_tag is the '<mes>_<año>' suffix of the file name, e.g. 'mar_2018'
    for './data/subsidio_familiar_mar_2018.csv'. Returns the DataFrame
    produced by pd.read_csv.
    """
    return pd.read_csv( './data/subsidio_familiar_{}.csv'.format( month_tag ) )

# One frame per month, newest first.
# Fix: the feb_2018 and ene_2018 frames used to be read from the mar_2018 file
# (copy/paste slip), so three months contained the same data. Building the path
# from the same tag as the variable name keeps the two in sync.
# NOTE(review): the feb_2018 / ene_2018 file names are inferred from the naming
# pattern of the other files -- confirm they exist under ./data.
subsidio_mar_2018_df = _read_subsidio( 'mar_2018' )
subsidio_feb_2018_df = _read_subsidio( 'feb_2018' )
subsidio_ene_2018_df = _read_subsidio( 'ene_2018' )
subsidio_dic_2017_df = _read_subsidio( 'dic_2017' )
subsidio_nov_2017_df = _read_subsidio( 'nov_2017' )
subsidio_oct_2017_df = _read_subsidio( 'oct_2017' )
subsidio_sep_2017_df = _read_subsidio( 'sep_2017' )
subsidio_ago_2017_df = _read_subsidio( 'ago_2017' )
subsidio_jul_2017_df = _read_subsidio( 'jul_2017' )
subsidio_jun_2017_df = _read_subsidio( 'jun_2017' )
subsidio_may_2017_df = _read_subsidio( 'may_2017' )
subsidio_abr_2017_df = _read_subsidio( 'abr_2017' )
subsidio_mar_2017_df = _read_subsidio( 'mar_2017' )
subsidio_feb_2017_df = _read_subsidio( 'feb_2017' )
subsidio_ene_2017_df = _read_subsidio( 'ene_2017' )

In [None]:
# Print the (rows, columns) shape of each monthly frame, one per line, newest month first.
for monthly_df in (
    subsidio_mar_2018_df,
    subsidio_feb_2018_df,
    subsidio_ene_2018_df,
    subsidio_dic_2017_df,
    subsidio_nov_2017_df,
    subsidio_oct_2017_df,
    subsidio_sep_2017_df,
    subsidio_ago_2017_df,
    subsidio_jul_2017_df,
    subsidio_jun_2017_df,
    subsidio_may_2017_df,
    subsidio_abr_2017_df,
    subsidio_mar_2017_df,
    subsidio_feb_2017_df,
    subsidio_ene_2017_df,
):
    print( monthly_df.shape )

In [None]:
# Tag every monthly frame with its reporting period, encoded as the integer YYYYMM,
# in a 'Periodo' column.
for monthly_df, periodo in (
    ( subsidio_mar_2018_df, 201803 ),
    ( subsidio_feb_2018_df, 201802 ),
    ( subsidio_ene_2018_df, 201801 ),
    ( subsidio_dic_2017_df, 201712 ),
    ( subsidio_nov_2017_df, 201711 ),
    ( subsidio_oct_2017_df, 201710 ),
    ( subsidio_sep_2017_df, 201709 ),
    ( subsidio_ago_2017_df, 201708 ),
    ( subsidio_jul_2017_df, 201707 ),
    ( subsidio_jun_2017_df, 201706 ),
    ( subsidio_may_2017_df, 201705 ),
    ( subsidio_abr_2017_df, 201704 ),
    ( subsidio_mar_2017_df, 201703 ),
    ( subsidio_feb_2017_df, 201702 ),
    ( subsidio_ene_2017_df, 201701 ),
):
    monthly_df[ 'Periodo' ] = periodo

In [None]:
# Stack the monthly frames (newest first) into a single frame.
# ignore_index = True gives the result a fresh 0..n-1 index; without it each
# monthly file's own row labels are kept, so the same label repeats once per
# month and label-based lookups on the combined frame become ambiguous.
subsidio_df = pd.concat(
    [
        subsidio_mar_2018_df,
        subsidio_feb_2018_df,
        subsidio_ene_2018_df,
        subsidio_dic_2017_df,
        subsidio_nov_2017_df,
        subsidio_oct_2017_df,
        subsidio_sep_2017_df,
        subsidio_ago_2017_df,
        subsidio_jul_2017_df,
        subsidio_jun_2017_df,
        subsidio_may_2017_df,
        subsidio_abr_2017_df,
        subsidio_mar_2017_df,
        subsidio_feb_2017_df,
        subsidio_ene_2017_df,
    ],
    ignore_index = True,
)

In [None]:
# (rows, columns) of the combined frame
subsidio_df.shape

In [None]:
# dtype of every column in the combined frame
subsidio_df.dtypes

In [None]:
# Preview the first rows of the combined frame
subsidio_df.head()

# Extracción de características

In [None]:
# Feature matrix: every column from the third one onward, minus the bookkeeping
# columns this notebook adds itself ('Periodo' and, later, 'cluster').
# The previous positional slice `[ 2:-1 ]` only worked while 'Periodo' was the
# last column: re-running this cell after 'cluster' had been appended silently
# put 'Periodo' into the features. Excluding the columns by name keeps the cell
# idempotent.
# NOTE(review): the first two columns are skipped as in the original; presumably
# they are identifier columns -- confirm against the CSV schema.
non_feature_columns = { 'Periodo', 'cluster' }
feature_columns = [ column for column in subsidio_df.columns[ 2: ] if column not in non_feature_columns ]
X = subsidio_df[ feature_columns ]

In [None]:
# Preview the first rows of the feature matrix
X.head()

# Construcción del modelo de clustering

In [None]:
# Number of clusters requested from KMeans (also drives the silhouette plot loop)
n_clusters = 5

In [None]:
# Create the clustering model.
# random_state is fixed because KMeans starts from randomly chosen centroids:
# without a seed the cluster labels, the cluster sizes and the silhouette
# values differ on every Restart & Run All.
model = KMeans( n_clusters = n_clusters, random_state = 42 )

In [None]:
# Fit the model to the feature matrix and get the cluster label assigned to each row
clusters = model.fit_predict( X )

In [None]:
# Add a 'cluster' column to the dataframe holding the cluster assigned to each row
subsidio_df[ 'cluster' ] = clusters

In [None]:
# Preview the first rows, now including the 'cluster' column
subsidio_df.head()

# Evaluación del modelo

In [None]:
# Number of rows assigned to each cluster
subsidio_df[ 'cluster' ].value_counts()

In [None]:
# Average silhouette value over all samples, given their cluster labels
# NOTE(review): this may be slow on a large frame -- consider the `sample_size`
# argument of silhouette_score if runtime becomes a problem.
silhouette_avg = silhouette_score( X, clusters )
silhouette_avg

In [None]:
# Silhouette plot: one horizontal band per cluster, each band built from the
# sorted per-sample silhouette values of that cluster, plus a dashed line at
# the overall average.
cluster_gap = 20  # blank rows left before the first band and between consecutive bands

fig, ax1 = plt.subplots( 1, 1, figsize = ( 12, 8 ) )

ax1.set_xlim( [ -0.1, 1 ] )
# Vertical room for one row per sample plus a gap around every cluster band
ax1.set_ylim( [ 0, len( X ) + ( n_clusters + 1 ) * cluster_gap ] )

# Silhouette value of every individual sample
sample_silhouette_values = silhouette_samples( X, clusters )

# `matplotlib.cm.get_cmap` was deprecated in Matplotlib 3.7 and removed in 3.9,
# so the original call raises AttributeError on current releases;
# `plt.get_cmap` is the equivalent lookup that is still available.
# It is resolved once here instead of on every loop iteration.
colormap = plt.get_cmap( 'Spectral' )

y_lower = cluster_gap
for i in range( n_clusters ):
    # Sorted silhouette values of the samples assigned to cluster i
    ith_cluster_silhouette_values = np.sort( sample_silhouette_values[ clusters == i ] )

    size_cluster_i = ith_cluster_silhouette_values.shape[ 0 ]
    y_upper = y_lower + size_cluster_i

    color = colormap( float( i ) / n_clusters )
    ax1.fill_betweenx( np.arange( y_lower, y_upper ), 0, ith_cluster_silhouette_values, facecolor = color, edgecolor = color, alpha = 0.7 )

    # Cluster number, vertically centred on its band
    ax1.text( -0.05, y_lower + 0.5 * size_cluster_i, str( i ) )

    # The next band starts after a gap
    y_lower = y_upper + cluster_gap

ax1.set_title( 'The silhouette plot for the various clusters' )
ax1.set_xlabel( 'The silhouette coefficient values' )
ax1.set_ylabel( 'Cluster label' )

# Dashed vertical line at the average silhouette value
ax1.axvline( x = silhouette_avg, color = 'red', linestyle = '--' )

# y positions are only band offsets, so no y ticks are shown
ax1.set_yticks( [] )
ax1.set_xticks( [ -0.1, 0, 0.2, 0.4, 0.6, 0.8, 1 ] )

plt.show()

# Explicación de los clusters

In [None]:
# Show every row of the frame whose 'Código' column equals 22
codigo_22_mask = subsidio_df[ 'Código' ].eq( 22 )
subsidio_df.loc[ codigo_22_mask ]