# K-means (K-Medias)

In [1]:
import pandas as pd
import numpy as np

In [62]:
census = pd.read_csv('../../datasets/census.csv')
census.head()

Unnamed: 0,CensusId,State,County,TotalPop,Men,Women,Hispanic,White,Black,Native,...,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
0,1001,Alabama,Autauga,55221,26745,28476,2.6,75.8,18.5,0.4,...,0.5,1.3,1.8,26.5,23986,73.6,20.9,5.5,0.0,7.6
1,1003,Alabama,Baldwin,195121,95314,99807,4.5,83.1,9.5,0.6,...,1.0,1.4,3.9,26.4,85953,81.5,12.3,5.8,0.4,7.5
2,1005,Alabama,Barbour,26932,14497,12435,4.6,46.2,46.7,0.2,...,1.8,1.5,1.6,24.1,8597,71.8,20.8,7.3,0.1,17.6
3,1007,Alabama,Bibb,22604,12073,10531,2.2,74.5,21.4,0.4,...,0.6,1.5,0.7,28.8,8294,76.8,16.1,6.7,0.4,8.3
4,1009,Alabama,Blount,57710,28512,29198,8.6,87.9,1.5,0.3,...,0.9,0.4,2.3,34.9,22189,82.0,13.5,4.2,0.4,7.7


In [66]:
# verificar si hacen falta datos
census.isna().sum().sum()

3

In [64]:
# solo hay un dato faltnte en 3 columnas por lo qe podemos eliminarlos sin afectar el modelo
census_missing = census.dropna()

In [65]:
census_missing.isna().sum().sum()

0

In [67]:
# depuramos columnas que no tienen valores relevantes para el modelo
census_columns = [col for col in census.columns.values if col not in ['CensusId', 'State', 'County']]
census_columns[:5]

['TotalPop', 'Men', 'Women', 'Hispanic', 'White']

In [51]:
# importamos Kmeans de sklearn y asignamos 4 clusters
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=4)

In [68]:
# entrenamos el modelo
census_clusters = kmeans.fit(census_missing[census_columns])
census_clusters

KMeans(n_clusters=4)

In [69]:
# asignamos un grupo a cada observación y luego agregamos este dato al dataset
census_miss = census_missing.copy()
census_miss['Cluster'] = census_clusters.fit_predict(census_miss[census_columns])

In [70]:
# revisamos el recuento de condados en cada grupo
census_miss.Cluster.value_counts()

2    3040
0     158
1      19
3       1
Name: Cluster, dtype: int64

In [72]:
# observaciones cluster 0
census_miss[census_miss.Cluster==0].State.value_counts().head()

California      15
Florida         12
New Jersey      11
Pennsylvania     9
Texas            9
Name: State, dtype: int64

In [73]:
# observaciones cluster 1
census_miss[census_miss.Cluster==1].State.value_counts().head()

California    5
Texas         4
New York      3
Florida       2
Nevada        1
Name: State, dtype: int64

In [74]:
# observaciones cluster 2
census_miss[census_miss.Cluster==2].State.value_counts().head()

Texas       240
Georgia     155
Virginia    130
Kentucky    119
Missouri    112
Name: State, dtype: int64

In [75]:
# observaciones cluster 3
census_miss[census_miss.Cluster==3].State.value_counts().head()

California    1
Name: State, dtype: int64

In [76]:
# ingreso medio para cada uno de los 4 clusters
census_miss.groupby(['Cluster'])['Income'].mean()

Cluster
0    61204.303797
1    58589.894737
2    45258.590132
3    56196.000000
Name: Income, dtype: float64

In [77]:
# tasa media de pobreza infantil para cada uno de los 4 clusters
census_miss.groupby(['Cluster'])['ChildPoverty'].mean()

Cluster
0    19.975316
1    23.584211
2    24.410526
3    25.800000
Name: ChildPoverty, dtype: float64

# Hierarchical Clustering (Agrupamiento jerarquico)

In [81]:
# tomamos una muestra del dataset de census para generar un dendrograma claro y ordenado
from sklearn.cluster import AgglomerativeClustering
census_sample = census_missing[census_columns].sample(n=1000)
hier_clust = AgglomerativeClustering(linkage='ward')
census_hier = hier_clust.fit(census_sample)

In [83]:
# para plottear un dendograma utilizamos Scipy
from scipy.cluster.hierarchy import dendrogram
import matplotlib.pyplot as plt

In [86]:
# función para plotter un dendrogram
def plot_dendrogram(model, **kwargs):
    # Children of hierarchical clustering
    children = model.children_
    # Distances between each pair of children
    # Since we don't have this information, we can use a uniform one for plotting
    distance = np.arange(children.shape[0])
    # The number of observations contained in each cluster level
    no_of_observations = np.arange(2, children.shape[0]+2)
    # Create linkage matrix and then plot the dendrogram
    linkage_matrix = np.column_stack([children, distance, no_of_observations]).astype(float)
    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)

In [None]:
# generamos el dendogram del modelo
plot_dendrogram(census_hier, labels=census_hier.labels_)