In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, silhouette_samples, jaccard_score
import plotly.express as px
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from scipy.cluster.hierarchy import dendrogram
from unsupervised_helper_functions import *

# Ejercicio 1

In [None]:
# Load the arrests data. The first CSV column is renamed to 'state', the other
# headers are lower-cased, and 'state' then becomes the row index.
df = pd.read_csv('data/usa_arrests.csv')
lowercase_headers = list(df.columns.str.lower())
df.columns = ['state', *lowercase_headers[1:]]
df = df.set_index('state')
df.head()

Calculamos los componentes principales para poder graficar los clusters:

In [None]:
# Fit a 2-component PCA on the raw (unscaled) features and project the data.
pca_model = PCA(n_components=2)
pca_model.fit(df)
var = pca_model.explained_variance_ratio_
pca = pca_model.transform(df)

# Column labels carry each component's share of explained variance so they can
# be used directly as axis titles.
labels = [f'PC{i+1} ({var[i] * 100:.2f}%)' for i in range(2)]
pca = pd.DataFrame(data=pca, index=df.index, columns=labels)

Graficamos las distribuciones de todas las features. A primera vista podemos ver que las variables tienen diferentes escalas, lo que podría generar problemas con el algoritmo, ya que depende de las distancias.

In [None]:
# Horizontal box plots of the raw features, to compare their scales.
axis_style = dict(showgrid=True, gridcolor='LightGray', showline=True, linecolor='Black', zeroline=True, zerolinecolor='LightGray')

figure = px.box(df, orientation='h')
figure.update_layout(
    title='Distributions',
    title_font=dict(size=16, family='Arial', color='black', weight='bold'),
    plot_bgcolor='white',
    xaxis=axis_style,
    yaxis=axis_style,
)
figure.show()

Normalizamos las variables y miramos las siguientes distribuciones:

In [None]:
# Standardise every feature with StandardScaler, keeping the original column
# names and row index on the resulting frame.
scaler = StandardScaler()
df_scaled = pd.DataFrame(
    scaler.fit_transform(df),
    index=df.index,
    columns=df.columns,
)

In [None]:
# Horizontal box plots of the standardised features.
axis_style = dict(showgrid=True, gridcolor='LightGray', showline=True, linecolor='Black', zeroline=True, zerolinecolor='LightGray')

figure = px.box(df_scaled, orientation='h')
figure.update_layout(
    title='Distributions',
    title_font=dict(size=16, family='Arial', color='black', weight='bold'),
    plot_bgcolor='white',
    xaxis=axis_style,
    yaxis=axis_style,
)
figure.show()

In [None]:
# One density curve per feature, laid out on a 2x2 grid of subplots.
figure = make_subplots(rows=2, cols=2, subplot_titles=df.columns)

# Feature number k lands on grid row k // 2 + 1 and grid column k % 2 + 1.
for position, feature in enumerate(df.columns):
    grid_row, grid_col = divmod(position, 2)
    kde = ff.create_distplot([df_scaled[feature]], group_labels=[feature], show_hist=False)
    for trace in kde['data']:
        figure.add_trace(trace, row=grid_row + 1, col=grid_col + 1)

# Figure-level layout
figure.update_layout(
    title_text='Density Distributions',
    title_font=dict(size=16, family='Arial', color='black', weight='bold'),
    plot_bgcolor='white',
    showlegend=False,
    width=1200,
    height=500,
)

# Apply the same grid/line styling to both axes of each of the four subplots.
axis_style = dict(showgrid=True, gridcolor='LightGray',
                  showline=True, linecolor='Black',
                  zeroline=True, zerolinecolor='LightGray')
for grid_row in (1, 2):
    for grid_col in (1, 2):
        figure.update_xaxes(row=grid_row, col=grid_col, **axis_style)
        figure.update_yaxes(row=grid_row, col=grid_col, **axis_style)

figure.show()


In [None]:
# Fit a 2-component PCA on the standardised features and project the data.
scaled_pca_model = PCA(n_components=2).fit(df_scaled)
var = scaled_pca_model.explained_variance_ratio_

# Column labels carry each component's share of explained variance.
labels = [f'PC{i+1} ({var[i] * 100:.2f}%)' for i in range(2)]
pca_scaled = pd.DataFrame(
    scaled_pca_model.transform(df_scaled),
    index=df_scaled.index,
    columns=labels,
)

## Modelo

Ajustamos un modelo aglomerativo sin escalar los datos. Para poder graficarlos fácilmente en 2D, utilizamos componentes principales.

In [None]:
# Complete-linkage agglomerative clustering into two groups, fitted on the
# raw (unscaled) features.
tree = AgglomerativeClustering(linkage='complete', n_clusters=2)
y = tree.fit(df).labels_

In [None]:
# Scatter of the observations on the two principal components of the raw data,
# coloured by cluster label. The constant `size` list gives all markers the
# same size.
pc_x = pca.columns[0]
pc_y = pca.columns[1]
axis_style = dict(showgrid=True, gridcolor='LightGray', showline=True, linecolor='Black', zeroline=True, zerolinecolor='LightGray')

figure = px.scatter(
    pca,
    x=pc_x,
    y=pc_y,
    color=y.astype(str),
    color_discrete_map={'0': '#E65983', '1': '#2D3846'},
    size=[1] * len(pca),
)
figure.update_layout(
    title='Clusters con datos no estandarizados',
    title_font=dict(size=16, family='Arial', color='black', weight='bold'),
    xaxis_title=pc_x,
    yaxis_title=pc_y,
    plot_bgcolor='white',
    yaxis=axis_style,
    xaxis=axis_style,
)
figure.show()

Si ajustamos el mismo modelo con los datos estandarizados, obtenemos:

In [None]:
# Same two-cluster, complete-linkage model, now fitted on the standardised
# features.
tree_scaled = AgglomerativeClustering(linkage='complete', n_clusters=2)
y_scaled = tree_scaled.fit(df_scaled).labels_

In [None]:
# Scatter of the observations on the two principal components of the
# standardised data, coloured by the labels fitted on the standardised data.
axis_style = dict(showgrid=True, gridcolor='LightGray', showline=True, linecolor='Black', zeroline=True, zerolinecolor='LightGray')

figure = px.scatter(
    pca_scaled,
    x=pca_scaled.columns[0],
    y=pca_scaled.columns[1],
    color=y_scaled.astype(str),
    color_discrete_map={'0': '#E65983', '1': '#2D3846'},
    # Constant marker size. The list must be as long as the frame being
    # plotted, so it is sized from pca_scaled rather than the unrelated `pca`.
    size=[1] * pca_scaled.shape[0],
)
figure.update_layout(
    title='Clusters con datos estandarizados',
    title_font=dict(size=16, family='Arial', color='black', weight='bold'),
    xaxis_title=pca_scaled.columns[0],
    yaxis_title=pca_scaled.columns[1],
    plot_bgcolor='white',
    yaxis=axis_style,
    xaxis=axis_style,
)
figure.show()

Podemos ver que los resultados encontrados en ambos casos difieren. Es por eso que, para algoritmos que dependen de las distancias entre las observaciones, como los algoritmos jerárquicos, es mejor normalizar las variables primero.

Ya habiendo normalizado los datos, buscamos el valor de $K$ óptimo (i.e., la cantidad óptima de clusters para nuestro problema). Para hacer esto, ajustamos varios modelos con distintos valores de $K$. El valor óptimo es aquel a partir del cual la reducción en la dispersión total intra-grupo comienza a ser más lenta.

In [None]:
# Candidate numbers of clusters. Defined once so the loop and the x-axis of
# the plot cannot drift apart.
k_values = np.arange(2, 11)

# Mean silhouette score of a complete-linkage model for every candidate K,
# computed on the standardised features.
scores = []
for k in k_values:
    tree = AgglomerativeClustering(n_clusters=k, linkage='complete')
    labels = tree.fit_predict(df_scaled)
    scores.append(silhouette_score(df_scaled, labels))

axis_style = dict(showgrid=True, gridcolor='LightGray', showline=True, linecolor='Black', zeroline=True, zerolinecolor='LightGray')

figure = px.line(
    x=k_values,
    y=scores
)
figure.update_layout(
    # The curve is the silhouette score, not within-cluster dispersion, so
    # the figure is titled accordingly instead of 'Elbow Method'.
    title='Silhouette Score by K',
    title_font=dict(size=16, family='Arial', color='black', weight='bold'),
    xaxis_title='K',
    yaxis_title='Silhouette Score',
    plot_bgcolor='white',
    yaxis=axis_style,
    xaxis=axis_style,
)
figure.show()

Ajustamos el modelo con el número óptimo de clusters: $k=4$.

In [None]:
# Final complete-linkage model with four clusters on the standardised features.
tree = AgglomerativeClustering(linkage='complete', n_clusters=4)
y = tree.fit(df_scaled).labels_

In [None]:
# Scatter of the four-cluster solution on the two principal components of the
# standardised data.
axis_style = dict(showgrid=True, gridcolor='LightGray', showline=True, linecolor='Black', zeroline=True, zerolinecolor='LightGray')

figure = px.scatter(
    pca_scaled,
    x=pca_scaled.columns[0],
    y=pca_scaled.columns[1],
    color=y.astype(str),
    color_discrete_map={'0': '#E65983', '1': '#4FDFEF', '2': '#3D8791', '3': '#2D3846'},
    # Constant marker size. The list must be as long as the frame being
    # plotted, so it is sized from pca_scaled rather than the unrelated `pca`.
    size=[1] * pca_scaled.shape[0],
)
figure.update_layout(
    title='Clusters con datos estandarizados',
    title_font=dict(size=16, family='Arial', color='black', weight='bold'),
    xaxis_title=pca_scaled.columns[0],
    yaxis_title=pca_scaled.columns[1],
    plot_bgcolor='white',
    yaxis=axis_style,
    xaxis=axis_style,
)
figure.show()

Miramos los índices de Silhouette para cada cluster para tener una idea de la cohesión y dispersión de los clusters.

In [None]:
# Per-sample silhouette coefficients for the labels in `y` on the standardised data.
silhouette_values = silhouette_samples(df_scaled, y)

n_clusters = len(np.unique(y))
colors = ['#E65983', '#4FDFEF', '#3D8791', '#2D3846']
yticks = []
y_lower = y_upper = 0

fig, ax = plt.subplots(figsize=(10, 8))
for cluster_id in range(n_clusters):
    # Sorted coefficients of this cluster, stacked as one band of horizontal bars.
    cluster_silhouette_vals = np.sort(silhouette_values[y == cluster_id])
    y_upper = y_lower + cluster_silhouette_vals.size
    ax.barh(np.arange(y_lower, y_upper), cluster_silhouette_vals, edgecolor='white', height=1, color=colors[cluster_id])
    # Tick at the middle of the band; the next band starts where this one ends.
    yticks.append((y_lower + y_upper) / 2)
    y_lower = y_upper

# Dashed vertical line at the overall mean silhouette coefficient.
ax.axvline(x=np.mean(silhouette_values), color="black", linestyle="--")
ax.set_yticks(yticks, [f'Cluster {i}' for i in np.arange(n_clusters)])
ax.set_xlabel('Silhouette Coefficient')
ax.set_title('Silhouette Plot', loc='left', fontdict={'fontsize': 16, 'fontweight': 'bold'})

Descartamos ahora a la variable `UrbanPop` del análisis. Primero buscamos el número óptimo de clusters para el problema.

In [None]:
# Standardised features without the 'urbanpop' column.
df_scaled_new = df_scaled.loc[:, df_scaled.columns != 'urbanpop']

# Candidate numbers of clusters. Defined once so the loop and the x-axis of
# the plot cannot drift apart.
k_values = np.arange(2, 11)

# Mean silhouette score of a complete-linkage model for every candidate K.
scores = []
for k in k_values:
    tree = AgglomerativeClustering(n_clusters=k, linkage='complete')
    labels = tree.fit_predict(df_scaled_new)
    scores.append(silhouette_score(df_scaled_new, labels))

axis_style = dict(showgrid=True, gridcolor='LightGray', showline=True, linecolor='Black', zeroline=True, zerolinecolor='LightGray')

figure = px.line(
    x=k_values,
    y=scores
)
figure.update_layout(
    # The curve is the silhouette score, not within-cluster dispersion, so
    # the figure is titled accordingly instead of 'Elbow Method'.
    title='Silhouette Score by K',
    title_font=dict(size=16, family='Arial', color='black', weight='bold'),
    xaxis_title='K',
    yaxis_title='Silhouette Score',
    plot_bgcolor='white',
    yaxis=axis_style,
    xaxis=axis_style,
)
figure.show()

In [None]:
# 2-component PCA of the reduced feature set, wrapped in a frame with fixed
# 'PC1'/'PC2' column names.
reduced_pca_model = PCA(n_components=2).fit(df_scaled_new)
pca_scaled_new = pd.DataFrame(
    reduced_pca_model.transform(df_scaled_new),
    index=df_scaled_new.index,
    columns=['PC1', 'PC2'],
)

In [None]:
# Four-cluster, complete-linkage model fitted on the reduced feature set.
tree_new = AgglomerativeClustering(linkage='complete', n_clusters=4)
y_new = tree_new.fit(df_scaled_new).labels_

In [None]:
# Scatter of the clusters found without 'urbanpop', drawn on the principal
# components of the reduced feature set. The constant `size` list gives all
# markers the same size.
axis_style = dict(showgrid=True, gridcolor='LightGray', showline=True, linecolor='Black', zeroline=True, zerolinecolor='LightGray')
cluster_palette = {'0': '#E65983', '1': '#4FDFEF', '2': '#3D8791', '3': '#2D3846'}

figure = px.scatter(
    pca_scaled_new,
    x='PC1',
    y='PC2',
    color=y_new.astype(str),
    color_discrete_map=cluster_palette,
    size=[1] * len(pca_scaled_new),
)
figure.update_layout(
    title='Clusters con datos estandarizados',
    title_font=dict(size=16, family='Arial', color='black', weight='bold'),
    xaxis_title='PC1',
    yaxis_title='PC2',
    plot_bgcolor='white',
    yaxis=axis_style,
    xaxis=axis_style,
)
figure.show()

Miramos los índices de Silhouette para cada nuevo cluster para tener una idea de su cohesión y dispersión.

In [None]:
# Per-sample silhouette coefficients for the clustering fitted WITHOUT
# 'urbanpop'. The labels must be y_new (fitted on df_scaled_new); using `y`
# would score the earlier all-feature clustering against the reduced data.
silhouette_values = silhouette_samples(df_scaled_new, y_new)

n_clusters = len(np.unique(y_new))
y_lower, y_upper = 0, 0
yticks = []
colors = ['#E65983', '#4FDFEF', '#3D8791', '#2D3846']

fig, ax = plt.subplots(figsize=(10, 8))
for i in np.arange(n_clusters):
    # Boolean indexing returns a copy, so the in-place sort below does not
    # reorder silhouette_values itself.
    cluster_silhouette_vals = silhouette_values[y_new == i]
    cluster_silhouette_vals.sort()
    y_upper += len(cluster_silhouette_vals)
    ax.barh(np.arange(y_lower, y_upper), cluster_silhouette_vals, edgecolor='white', height=1, color=colors[i])
    # Tick at the middle of this cluster's band of bars.
    yticks.append((y_lower + y_upper) / 2)
    y_lower += len(cluster_silhouette_vals)

# Dashed vertical line at the overall mean silhouette coefficient.
ax.axvline(x=np.mean(silhouette_values), color="black", linestyle="--")
ax.set_yticks(yticks, [f'Cluster {i}' for i in np.arange(n_clusters)])
ax.set_xlabel('Silhouette Coefficient')
ax.set_title('Silhouette Plot', loc='left', fontdict={'fontsize': 16, 'fontweight': 'bold'})

# Ejercicio 2

Utilizar un método de cluster jerárquico para el conjunto de datos `mall_costumers.csv`. La descripción de las variables y el contexto del problema se pueden encontrar en [Kaggle](https://www.kaggle.com/datasets/vjchoudhary7/customer-segmentation-tutorial-in-python). Probar el método divisivo DIANA.

In [None]:
# Load the mall customers dataset and preview its first rows.
mall_csv_path = 'data/mall_costumers.csv'
df = pd.read_csv(mall_csv_path)
df.head()

Miramos primero que no haya datos nulos. En caso de encontrar datos nulos, tenemos que buscar algún método de imputación.

In [None]:
# Number of missing values per column (isna is the canonical name of isnull).
df.isna().sum()

Para la variable categórica `Gender`, vamos a utilizar el método 'binary encoding' para convertirla en una variable numérica.

In [None]:
# Binary-encode Gender: 'Male' -> 1, anything else -> 0.
# The guard keeps the cell idempotent: once the column is numeric, comparing
# it with 'Male' again would silently turn every row into 0.
if not pd.api.types.is_numeric_dtype(df['Gender']):
    df['Gender'] = (df['Gender'] == 'Male').astype(int)
df.head()

Miramos las distribuciones de las variables. Vamos a tener que normalizar los datos.

In [None]:
# Horizontal box plots of every column except the first one.
axis_style = dict(showgrid=True, gridcolor='LightGray', showline=True, linecolor='Black', zeroline=True, zerolinecolor='LightGray')

figure = px.box(df.iloc[:, 1:], orientation='h')
figure.update_layout(
    title='Distributions',
    title_font=dict(size=16, family='Arial', color='black', weight='bold'),
    plot_bgcolor='white',
    xaxis=axis_style,
    yaxis=axis_style,
)
figure.show()

In [None]:
# Standardise every column except the first one, keeping column names and
# the row index on the resulting frame.
features = df.iloc[:, 1:]
df_scaled = pd.DataFrame(
    StandardScaler().fit_transform(features),
    index=features.index,
    columns=features.columns,
)

In [None]:
# One density curve per standardised feature, laid out on a 2x2 grid.
figure = make_subplots(rows=2, cols=2, subplot_titles=df_scaled.columns)

# Feature number k lands on grid row k // 2 + 1 and grid column k % 2 + 1.
for position, feature in enumerate(df_scaled.columns):
    grid_row, grid_col = divmod(position, 2)
    kde = ff.create_distplot([df_scaled[feature]], group_labels=[feature], show_hist=False)
    for trace in kde['data']:
        figure.add_trace(trace, row=grid_row + 1, col=grid_col + 1)

# Figure-level layout
figure.update_layout(
    title_text='Density Distributions',
    title_font=dict(size=16, family='Arial', color='black', weight='bold'),
    plot_bgcolor='white',
    showlegend=False,
    width=1200,
    height=500,
)

# Apply the same grid/line styling to both axes of each of the four subplots.
axis_style = dict(showgrid=True, gridcolor='LightGray',
                  showline=True, linecolor='Black',
                  zeroline=True, zerolinecolor='LightGray')
for grid_row in (1, 2):
    for grid_col in (1, 2):
        figure.update_xaxes(row=grid_row, col=grid_col, **axis_style)
        figure.update_yaxes(row=grid_row, col=grid_col, **axis_style)

figure.show()

In [None]:
# 2-component PCA of the standardised features, wrapped in a frame with fixed
# 'PC1'/'PC2' column names.
pca_fit = PCA(n_components=2).fit(df_scaled)
pca = pd.DataFrame(
    pca_fit.transform(df_scaled),
    index=df_scaled.index,
    columns=['PC1', 'PC2'],
)

## Modelo

Antes de ajustar el modelo, vamos a calcular la cantidad de clusters óptima para este problema. Para eso utilizamos el método de Mean-Max Silhouette.

In [None]:
# Preview the standardised features instead of rendering the whole frame.
df_scaled.head()

In [None]:
# Feature matrix used for the K search.
# NOTE(review): df_scaled was already built without the first raw column, so
# iloc[:, 1:] leaves out one more column (presumably Gender) — confirm that
# excluding it is intended.
k_search_features = df_scaled.iloc[:, 1:]

# Candidate numbers of clusters. Defined once so the loop and the x-axis of
# the plot cannot drift apart.
k_values = np.arange(2, 11)

# Mean silhouette score of a complete-linkage model for every candidate K.
scores = []
for k in k_values:
    tree = AgglomerativeClustering(n_clusters=k, linkage='complete')
    labels = tree.fit_predict(k_search_features)
    scores.append(silhouette_score(k_search_features, labels))

axis_style = dict(showgrid=True, gridcolor='LightGray', showline=True, linecolor='Black', zeroline=True, zerolinecolor='LightGray')

figure = px.line(
    x=k_values,
    y=scores
)
figure.update_layout(
    # The curve is the silhouette score, not within-cluster dispersion, so
    # the figure is titled accordingly instead of 'Elbow Method'.
    title='Silhouette Score by K',
    title_font=dict(size=16, family='Arial', color='black', weight='bold'),
    xaxis_title='K',
    yaxis_title='Silhouette Score',
    plot_bgcolor='white',
    yaxis=axis_style,
    xaxis=axis_style,
)
figure.show()

In [None]:
# Fit the five-cluster model on the same standardised feature matrix that the
# silhouette search used to choose K (df_scaled.iloc[:, 1:]). The PCA frame is
# kept for plotting only, as in Ejercicio 1; fitting on the 2-D PCA scores
# would cluster a different representation than the one K was selected on.
# NOTE(review): iloc[:, 1:] leaves out the first scaled column (presumably
# Gender) — confirm that excluding it is intended.
tree = AgglomerativeClustering(n_clusters=5, linkage='complete')
y = tree.fit_predict(df_scaled.iloc[:, 1:])

In [None]:
# Draw the cluster assignment on the PCA projection with the project helper
# (plot_clusters comes from the unsupervised_helper_functions star import).
plot_clusters(
    title='Clusters with Agglomerative Clustering',
    labels=y,
    data=pca,
)

In [None]:
# One colour per cluster label. The model above is fitted with n_clusters=5,
# so the map must cover labels '0'..'4'; a two-entry map would leave the other
# clusters on Plotly's default colours.
cluster_palette = ['#E65983', '#4FDFEF', '#3D8791', '#2D3846', '#F2B134']
cluster_colors = {str(label): color for label, color in enumerate(cluster_palette)}

axis_style = dict(showgrid=True, gridcolor='LightGray', showline=True, linecolor='Black', zeroline=True, zerolinecolor='LightGray')

# Scatter of the customers on the two principal components, coloured by
# cluster. The constant `size` list gives all markers the same size.
figure = px.scatter(
    pca,
    x='PC1',
    y='PC2',
    color=y.astype(str),
    color_discrete_map=cluster_colors,
    size=[1] * pca.shape[0],
)
figure.update_layout(
    title='Clusters con datos estandarizados',
    title_font=dict(size=16, family='Arial', color='black', weight='bold'),
    xaxis_title='PC1',
    yaxis_title='PC2',
    plot_bgcolor='white',
    yaxis=axis_style,
    xaxis=axis_style,
)
figure.show()