# Taller de repaso: Aprendizaje no supervisado

## Preparación del notebook

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import silhouette_score, calinski_harabasz_score
#!pip install kneed
from kneed import KneeLocator

from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA, FactorAnalysis

#!pip install factor_analyzer
from factor_analyzer import FactorAnalyzer

In [None]:
import warnings
# Silence every warning for the rest of the session; note that this also hides
# deprecation notices emitted by the libraries used below.
warnings.filterwarnings('ignore')

## Lectura del dataset

Este conjunto de datos contiene una encuesta sobre la satisfacción de los pasajeros de una compañía aérea. ¿Qué factores están altamente correlacionados con un pasajero satisfecho (o insatisfecho)?

In [None]:
# Load the passenger satisfaction survey from the course repository and preview it
DATA_URL = 'https://raw.githubusercontent.com/mlondono-oc/LEA2/main/Modulo-3/data/train.csv'
df = pd.read_csv(DATA_URL)
df.head()

In [None]:
# General information about the columns of the raw frame
df.info()

In [None]:
# Drop the non-informative columns in place
df.drop(columns=['Unnamed: 0', 'id'], inplace=True)

In [None]:
# Work on a smaller, reproducible random sample of 2500 rows with a fresh 0..n-1 index
df_sample = df.sample(n=2500, random_state=123).reset_index(drop=True)
df_sample.shape

## EDA

In [None]:
# Count missing values per column
df_sample.isna().sum()

In [None]:
# Correlation matrix
# The sample still contains string-valued columns (they are only encoded further down).
# pandas >= 2.0 raises on non-numeric columns instead of skipping them, so the
# correlation is computed on the numeric columns explicitly.
plt.figure(figsize=(15,8))
c = df_sample.select_dtypes(include='number').corr()
sns.heatmap(c, annot=True)

In [None]:
# Drop the highly correlated column in place
df_sample.drop(columns='Arrival Delay in Minutes', inplace=True)

In [None]:
# Plot styling used from here on
sns.set_theme(style='whitegrid', palette='Blues')

# Distribution of the satisfaction variable
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='satisfaction', data=df_sample, ax=ax)
ax.set_title('Barplot of Satisfaction (Target)', fontsize=14)
ax.set_xlabel('Satisfaction (Target)', fontsize=13)
ax.set_ylabel('Count', fontsize=13)
plt.show()

In [None]:
# Overall mean score per travel class across the 14 surveyed variables
# (selected by position: columns 6 to 19 of df_sample)
survey_cols = df_sample.columns[6:20]
eco, eco_plus, business = (
    df_sample.loc[df_sample['Class'] == class_name, survey_cols].mean().mean()
    for class_name in ('Eco', 'Eco Plus', 'Business')
)
print(eco, eco_plus, business)

In [None]:
# Mean of each surveyed variable (columns 6 to 19, selected by position) per travel class
df_sample.groupby('Class')[df_sample.columns[6:20]].mean()

In [None]:
# Categorical variables: one count plot per variable, split by satisfaction
cat_cols = ['Gender', 'Customer Type', 'Type of Travel', 'Class']

fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(14, 10))
axs = axs.ravel()

for i, col in enumerate(cat_cols):
    # Draw on the explicit Axes instead of relying on the pyplot "current axes" state
    sns.countplot(x=col, hue='satisfaction', data=df_sample, ax=axs[i])

    # Keep a single legend (second panel); remove the automatic one everywhere else
    if i != 1:
        panel_legend = axs[i].get_legend()
        if panel_legend is not None:
            panel_legend.remove()

# Shared legend, placed on the second panel
axs[1].legend(loc='upper right', fontsize=16)

plt.tight_layout()
plt.show()

In [None]:
# Encode the satisfaction variable as 0/1
satisfaction_codes = {'neutral or dissatisfied': 0, 'satisfied': 1}
df_sample['satisfaction'] = df_sample['satisfaction'].map(satisfaction_codes)

In [None]:
# Eco class review
# eco_proportion: share of all passengers that travel in Eco.
# bad_proportion: share of the dissatisfied passengers (satisfaction == 0) that travel in Eco.
is_eco = df_sample['Class'] == 'Eco'
is_dissatisfied = df_sample['satisfaction'] == 0

eco_proportion = is_eco.sum() / len(df_sample)
# Count rows that are BOTH Eco and dissatisfied; taking len() of the boolean comparison
# would count every Eco row regardless of satisfaction.
bad_proportion = (is_eco & is_dissatisfied).sum() / is_dissatisfied.sum()
print(eco_proportion*100, bad_proportion*100)

## Pre procesamiento de los datos

In [None]:
# Approach 1: integer codes for every categorical column (Class coded 1..3)
encodings_1 = {
    'Gender': {'Female':0, 'Male':1},
    'Customer Type': {'disloyal Customer':0, 'Loyal Customer':1},
    'Type of Travel': {'Personal Travel':0, 'Business travel':1},
    'Class': {'Eco':1, 'Eco Plus':2, 'Business':3},
}

train_df1 = df_sample.copy()
for column_name, codes in encodings_1.items():
    train_df1[column_name] = train_df1[column_name].map(codes)

train_df1.head(3)

In [None]:
# Approach 2: binary codes for the two-level columns, frequency encoding for Class
encodings_2 = {
    'Gender': {'Female':0, 'Male':1},
    'Customer Type': {'disloyal Customer':0, 'Loyal Customer':1},
    'Type of Travel': {'Personal Travel':0, 'Business travel':1},
}

train_df2 = df_sample.copy()
for column_name, codes in encodings_2.items():
    train_df2[column_name] = train_df2[column_name].map(codes)

# Relative frequency of each Class category ...
frecuencia = train_df2['Class'].value_counts(normalize=True)
# ... used as the encoded value of that category
train_df2['Class'] = train_df2['Class'].map(frecuencia)

train_df2.head(3)

In [None]:
# Dimensions of the encoded frame (approach 1)
train_df1.shape

In [None]:
# Histogram of every column of the encoded frame, 15 bins each
train_df1.hist(figsize=(14,10), bins=15)
plt.tight_layout()
plt.show()

In [None]:
# Standardise the numeric variables
numeric_columns = ['Age', 'Flight Distance', 'Departure Delay in Minutes']
train_cols = train_df1[numeric_columns]

scaler = StandardScaler()
scaled_values = scaler.fit_transform(train_cols)
scaled_train_df1 = pd.DataFrame(scaled_values, columns=train_cols.columns)
scaled_train_df1.head(3)

In [None]:
# Write the scaled numeric columns back into both encoded frames
cols = ['Age', 'Flight Distance', 'Departure Delay in Minutes']
for encoded_frame in (train_df1, train_df2):
    encoded_frame[cols] = scaled_train_df1
train_df1.head(3)

In [None]:
# Preview of the frame encoded with approach 2 after scaling
train_df2.head(3)

In [None]:
# Outlier inspection: this cell copies the frame and draws one boxplot per column
train_df1_out = train_df1.copy()

# Row/column position of each panel in a 6x4 boxplot grid (enough for 22 variables)
box_locs = pd.DataFrame({
    "vert": [row for row in range(6) for _ in range(4)],
    "horiz": [col for _ in range(6) for col in range(4)],
})

fig, axs = plt.subplots(ncols=4, nrows=6, figsize=(20, 20))

for position, column_name in enumerate(train_df1_out.columns):
    grid_row, grid_col = box_locs.iloc[position]
    sns.boxplot(y=train_df1_out[column_name], ax=axs[grid_row][grid_col])

In [None]:
# Dimensions of the copy used for the outlier inspection
train_df1_out.shape

In [None]:
# Dimensions of the source frame, for comparison with train_df1_out
train_df1.shape

## Reducción dimensionalidad

In [None]:
# Fit a PCA that keeps as many components as needed to explain 85% of the input variance
pca = PCA(n_components=0.85)
pca.fit(train_df1_out)

# Number of components retained by the PCA
pca.n_components_

In [None]:
# --- Explained variance ---
PCA_variance = pd.DataFrame({'Varianza explicada (%)':
                             pca.explained_variance_ratio_*100})

# One label per retained component. The count comes from the fitted PCA rather than a
# hard-coded range, so the plot still works if the number of components changes.
pc_labels = ['PC ' + str(i) for i in range(1, len(PCA_variance) + 1)]

fig, ax = plt.subplots(1, 1, figsize = (7, 5))

bar = sns.barplot(x = pc_labels,
                  y = PCA_variance['Varianza explicada (%)'],
                  linewidth = 1.5, edgecolor = 'k', color = '#4bafb8',
                  alpha = 0.8, ax = ax)

plt.show()

In [None]:
# Explained variance (%) of each retained component, as a table
PCA_variance

In [None]:
# Finally, project "train_df1_out" onto the retained principal components (X_pca).
# Note: fit_transform fits `pca` again on the same data before transforming it.
X_pca = pca.fit_transform(train_df1_out)
X_pca

## Clustering Algorithm: K-Means

### Dataset  Original Escalado

In [None]:
# Optimal number of clusters: record the K-means inertia for k = 1..10
inertia_list = []

for n_clusters in range(1, 11):
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=3).fit(train_df1_out)
    inertia_list.append(kmeans.inertia_)

In [None]:
# Inertia curve: line plus markers at each evaluated k
k_values = range(1, 11)
plt.plot(k_values, inertia_list)
plt.scatter(k_values, inertia_list)
plt.title("Different Inertia Values for Different Number of Clusters", size=12)
plt.xlabel("Number of Clusters", size=10)
plt.ylabel("Inertia Value", size=10)
plt.show()

In [None]:
# Automatic selection of k: locate the elbow of the convex, decreasing inertia curve
kl = KneeLocator(
    x=range(1, 11),
    y=inertia_list,
    curve="convex",
    direction="decreasing",
)
kl.elbow

In [None]:
# Hyper-parameters shared by the final K-means models
kmeans_constants = dict(init="k-means++", n_init=100, max_iter=500, random_state=42)

# --- K-means model (4 clusters) on the full feature set ---
model_kmeans = KMeans(n_clusters=4, **kmeans_constants)
model_kmeans.fit(train_df1_out)

**Evaluación del algoritmo**

**Calinski Harabasz Score:**

El índice Calinski-Harabasz se basa en la relación ponderada entre la suma de cuadrados entre clústeres (la medida de la separación entre clústeres) y la suma de cuadrados dentro de los clústeres (la medida de qué tan estrechamente se agrupan los puntos dentro de cada clúster).

Las soluciones de clúster con valores más grandes del índice corresponden a soluciones "mejores" que las soluciones de clúster con valores más pequeños.

In [None]:
# --- K-means model evaluation (inertia plus two internal validity indices) ---
kmeans_labels = model_kmeans.labels_

print(" ### K-MEANS ###")
print('Inertia: ', model_kmeans.inertia_)
for caption, score_fn in (('Silhouette Score: ', silhouette_score),
                          ('Calinski harabasz score: ', calinski_harabasz_score)):
    print(caption, score_fn(train_df1_out, kmeans_labels))

In [None]:
# Cluster assignment of every row.
# The model is already fitted on train_df1_out, so its labels are read directly;
# fit_predict would repeat the whole 100-initialisation fit only to return the same labels.
Cluster_1 = model_kmeans.labels_
Cluster_1

In [None]:
# Copy 'train_df1_out' and attach 'Cluster_1' as a new column
df_final = train_df1_out.assign(Cluster_1=Cluster_1)

# From here on, the cluster labels are collected in "df_final"
df_final.head()

In [None]:
# Distinct cluster labels produced by K-means on the full feature set
df_final.Cluster_1.unique()

### Dataset Reducido

In [None]:
# Optimal number of clusters on the PCA-reduced data: K-means inertia for k = 1..10
inertia_list = []

for n_clusters in range(1, 11):
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=3).fit(X_pca)
    inertia_list.append(kmeans.inertia_)

In [None]:
# Inertia curve for the reduced data: line plus markers at each evaluated k
k_values = range(1, 11)
plt.plot(k_values, inertia_list)
plt.scatter(k_values, inertia_list)
plt.title("Different Inertia Values for Different Number of Clusters", size=12)
plt.xlabel("Number of Clusters", size=10)
plt.ylabel("Inertia Value", size=10)
plt.show()

In [None]:
# Automatic selection of k: locate the elbow of the convex, decreasing inertia curve
kl = KneeLocator(
    x=range(1, 11),
    y=inertia_list,
    curve="convex",
    direction="decreasing",
)
kl.elbow

In [None]:
# Hyper-parameters shared by the final K-means models
kmeans_constants = dict(init="k-means++", n_init=100, max_iter=500, random_state=42)

# --- K-means model (4 clusters) on the PCA-reduced data ---
model_kmeans_pca = KMeans(n_clusters=4, **kmeans_constants)
model_kmeans_pca.fit(X_pca)

In [None]:
# --- Evaluation of the K-means model fitted on the PCA-reduced data ---
kmeans_pca_labels = model_kmeans_pca.labels_

print(" ### K-MEANS ###")
print('Inertia: ', model_kmeans_pca.inertia_)
for caption, score_fn in (('Silhouette Score: ', silhouette_score),
                          ('Calinski harabasz score: ', calinski_harabasz_score)):
    print(caption, score_fn(X_pca, kmeans_pca_labels))

In [None]:
# Cluster assignment on the PCA-reduced data.
# The labels come from the model that was fitted on X_pca. Calling fit_predict on
# `model_kmeans` here would refit the model trained on the original features and
# silently overwrite its state.
Cluster_2 = model_kmeans_pca.labels_

df_final['Cluster_2']= Cluster_2

# The cluster labels of every experiment are collected in "df_final"
df_final.head(3)

In [None]:
# 3D scatter plot of the first three principal components, coloured by K-means cluster

fig = plt.figure(figsize = (15,15))
ax = fig.add_subplot(111, projection='3d')
for cluster_id, colour in enumerate(('blue', 'orange', 'green', 'purple')):
    members = Cluster_2 == cluster_id
    ax.scatter(X_pca[members, 0], X_pca[members, 1], X_pca[members, 2],
               s = 40, color = colour, label = "cluster " + str(cluster_id))
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_zlabel('PC3')
ax.legend()
plt.show()

## Clustering Algorithm: Hierarchical Clustering

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage
# Plot styling
sns.set_theme()

# Ward linkage matrix: merges observations according to their similarity
Z = linkage(train_df1_out, method='ward')

dendrogram_options = dict(
    orientation='right',
    distance_sort='descending',
    show_leaf_counts=False,
    leaf_font_size=16,
)

plt.figure(figsize=(13, 12))
dendrogram(Z, **dendrogram_options)
plt.show()

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Agglomerative clustering with Ward linkage and four clusters
model_hc = AgglomerativeClustering(linkage = 'ward', metric = 'euclidean', n_clusters = 4)
model_hc.fit(train_df1_out)

# --- Evaluation of the hierarchical model ---
hc_labels = model_hc.labels_

print(" ### HIERARCHICAL CLUSTERING ###")
for caption, score_fn in (('Silhouette Score: ', silhouette_score),
                          ('Calinski harabasz score: ', calinski_harabasz_score)):
    print(caption, score_fn(train_df1_out, hc_labels))

In [None]:
# Cluster assignment from the hierarchical model (fit_predict fits the model again)
Cluster_3 = model_hc.fit_predict(train_df1_out)

# Store the labels next to the previous experiments
df_final['Cluster_3']= Cluster_3

# The cluster labels of every experiment are collected in "df_final"
df_final.head(3)

## Clustering Algorithm: DBSCAN

### Dataset Original

In [None]:
from sklearn.neighbors import NearestNeighbors

# --- Selection of the eps hyper-parameter ---

# --- K-distance curve: sorted distance from each point to its nearest other point ---
neighbors = NearestNeighbors(n_neighbors = 2).fit(train_df1_out)

distances, *other = neighbors.kneighbors(train_df1_out)
# Column 0 is the first neighbour returned for each query row; column 1 is the second
distances = np.sort(distances[:, 1], axis = 0)
print(distances.shape)

In [None]:
# Matplotlib 3.6 renamed the bundled 'seaborn' style to 'seaborn-v0_8' and 3.8 removed
# the old name, so pick whichever one this installation provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
plt.figure(figsize = (8, 5))
# Sorted nearest-neighbour distances; the knee of this curve is used as eps
plt.plot(distances, color = 'red')
plt.ylabel('Distancia al K vecino más cercano')
plt.show()

In [None]:
# --- Point of maximum curvature of the sorted k-distance curve ---
sample_positions = range(len(distances))
kl = KneeLocator(x = sample_positions, y = distances, curve = 'convex', direction = 'increasing')
kl.elbow

In [None]:
# --- Optimal epsilon: the k-distance value at the detected elbow ---
# NOTE(review): presumably kl.elbow is None when KneeLocator finds no knee, in which
# case this indexing would not return a scalar — confirm against the kneed docs.
epsilon = distances[kl.elbow]
epsilon

In [None]:
# --- Selection of the min_samples hyper-parameter ---
# Candidate values, defined once and reused for the sweep and for the plot's x axis
min_samples_grid = [5, 8, 10, 15, 20, 30]
silhoutte_coefs = []

for min_points in min_samples_grid:
    model_d = DBSCAN(eps = epsilon, min_samples = min_points)
    model_d.fit(train_df1_out)

    # silhouette_score needs between 2 and n_samples - 1 distinct labels and raises
    # otherwise (e.g. when DBSCAN marks every point as noise). Record NaN for such
    # settings so the sweep keeps going; NaN simply leaves a gap in the curve.
    n_labels = len(set(model_d.labels_))
    if 2 <= n_labels <= len(model_d.labels_) - 1:
        silhoutte_coefs.append(silhouette_score(train_df1_out, model_d.labels_))
    else:
        silhoutte_coefs.append(np.nan)

plt.figure(figsize = (12, 8))
plt.plot(min_samples_grid, silhoutte_coefs, color = 'r', marker = '^')
plt.xlabel('Muestras mínimas de cada cluster')
plt.ylabel('Silhoutte Score')
plt.title('Silhoutte Score - Min Samples de Clusters')
plt.show()

In [None]:
# --- DBSCAN model on the full feature set ---
# eps is the value estimated from the k-distance curve; min_samples is fixed at 5
model_db = DBSCAN(eps = epsilon, min_samples = 5, n_jobs=-1)
model_db.fit(train_df1_out)

In [None]:
# --- DBSCAN model evaluation ---
dbscan_labels = model_db.labels_

print(" ### DBSCAN ###")
for caption, score_fn in (('Silhouette Score: ', silhouette_score),
                          ('Calinski harabasz score: ', calinski_harabasz_score)):
    print(caption, score_fn(train_df1_out, dbscan_labels))

In [None]:
# Cluster assignment from DBSCAN on the full feature set (fit_predict fits the model again)
Cluster_4 = model_db.fit_predict(train_df1_out)

# Store the labels next to the previous experiments
df_final['Cluster_4']= Cluster_4

# The cluster labels of every experiment are collected in "df_final"
df_final.head(3)

In [None]:
# Number of rows per DBSCAN label
df_final['Cluster_4'].value_counts()

### Dataset reducido

In [None]:
# --- Selection of the eps hyper-parameter (PCA-reduced data) ---

# --- K-distance curve: sorted distance from each point to its nearest other point ---
neighbors = NearestNeighbors(n_neighbors = 2).fit(X_pca)

distances, *other = neighbors.kneighbors(X_pca)
# Column 0 is the first neighbour returned for each query row; column 1 is the second
distances = np.sort(distances[:, 1], axis = 0)
print(distances.shape)
distances

In [None]:
# Matplotlib 3.6 renamed the bundled 'seaborn' style to 'seaborn-v0_8' and 3.8 removed
# the old name, so pick whichever one this installation provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
plt.figure(figsize = (8, 5))
# Sorted nearest-neighbour distances; the knee of this curve is used as eps
plt.plot(distances, color = 'red')
plt.ylabel('Distancia al K vecino más cercano')
plt.show()

In [None]:
# --- Point of maximum curvature of the sorted k-distance curve ---
sample_positions = range(len(distances))
kl = KneeLocator(x = sample_positions, y = distances, curve = 'convex', direction = 'increasing')
kl.elbow

In [None]:
# --- Optimal epsilon: the k-distance value at the detected elbow ---
# NOTE(review): presumably kl.elbow is None when KneeLocator finds no knee, in which
# case this indexing would not return a scalar — confirm against the kneed docs.
epsilon = distances[kl.elbow]
epsilon

In [None]:
# --- Selection of the min_samples hyper-parameter (PCA-reduced data) ---
# Candidate values, defined once and reused for the sweep and for the plot's x axis
min_samples_grid = [5, 8, 10, 15, 20, 30]
silhoutte_coefs = []

for min_points in min_samples_grid:
    model_d = DBSCAN(eps = epsilon, min_samples = min_points)
    model_d.fit(X_pca)

    # silhouette_score needs between 2 and n_samples - 1 distinct labels and raises
    # otherwise (e.g. when DBSCAN marks every point as noise). Record NaN for such
    # settings so the sweep keeps going; NaN simply leaves a gap in the curve.
    n_labels = len(set(model_d.labels_))
    if 2 <= n_labels <= len(model_d.labels_) - 1:
        silhoutte_coefs.append(silhouette_score(X_pca, model_d.labels_))
    else:
        silhoutte_coefs.append(np.nan)

plt.figure(figsize = (12, 8))
plt.plot(min_samples_grid, silhoutte_coefs, color = 'r', marker = '^')
plt.xlabel('Muestras mínimas de cada cluster')
plt.ylabel('Silhoutte Score')
plt.title('Silhoutte Score - Min Samples de Clusters')
plt.show()

In [None]:
# --- DBSCAN model on the PCA-reduced data ---
# NOTE(review): eps and min_samples are hard-coded here; the `epsilon` estimated from the
# k-distance curve above is not used — confirm that these values are intentional.
model_db_pca = DBSCAN(eps = 2.65, min_samples = 7)
model_db_pca.fit(X_pca)

In [None]:
# --- Evaluation of the DBSCAN model fitted on the PCA-reduced data ---
dbscan_pca_labels = model_db_pca.labels_

print(" ### DBSCAN ###")
for caption, score_fn in (('Silhouette Score: ', silhouette_score),
                          ('Calinski harabasz score: ', calinski_harabasz_score)):
    print(caption, score_fn(X_pca, dbscan_pca_labels))

In [None]:
# Cluster assignment from DBSCAN on the PCA-reduced data (fit_predict fits the model again)
Cluster_5 = model_db_pca.fit_predict(X_pca)

# Store the labels next to the previous experiments
df_final['Cluster_5']= Cluster_5

# The cluster labels of every experiment are collected in "df_final"
df_final.head(3)

In [None]:
# Number of rows per DBSCAN label on the reduced data
df_final['Cluster_5'].value_counts()

In [None]:
# 3D scatter plot of the first three principal components, coloured by the DBSCAN labels
# obtained on the PCA-reduced data (Cluster_5). Every label present is drawn, so the plot
# does not assume a particular number of clusters; label -1 is shown as "outliers".

fig = plt.figure(figsize = (15,15))
ax = fig.add_subplot(111, projection='3d')

cluster_ids = [label for label in np.unique(Cluster_5) if label != -1]
cluster_colours = sns.color_palette('tab10', n_colors = max(len(cluster_ids), 1))

for cluster_id, colour in zip(cluster_ids, cluster_colours):
    members = Cluster_5 == cluster_id
    ax.scatter(X_pca[members, 0], X_pca[members, 1], X_pca[members, 2],
               s = 40, color = colour, label = "cluster " + str(cluster_id))

noise = Cluster_5 == -1
if noise.any():
    ax.scatter(X_pca[noise, 0], X_pca[noise, 1], X_pca[noise, 2],
               s = 40, color = 'gray', label = "outliers")

ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_zlabel('PC3')
ax.legend()
plt.show()

## Clustering Algorithm: Gaussian Mixture Model

### Dataset Original

In [None]:
# --- Hyper-parameter estimation: BIC and AIC for 2..11 mixture components ---

n_components = np.arange(2, 12)

models_g = [GaussianMixture(n_components=n, random_state=123).fit(train_df1_out) for n in n_components]

bic_scores = [fitted.bic(train_df1_out) for fitted in models_g]
aic_scores = [fitted.aic(train_df1_out) for fitted in models_g]

plt.plot(n_components, bic_scores, label="BIC")
plt.plot(n_components, aic_scores, label="AIC")
plt.legend()
plt.xlabel("Número de Clusters")

In [None]:
# --- GMM model on the full feature set ---
# NOTE(review): n_components=6 is fixed here, presumably read off the BIC/AIC plot — confirm
model_gmm = GaussianMixture(n_components=6, random_state=123, covariance_type = 'full').fit(train_df1_out)

In [None]:
# --- GMM model evaluation ---
# Most probable component of each row, used as its cluster label
labels_ = model_gmm.predict(train_df1_out)

# The banner names the model actually being evaluated (it previously said DBSCAN)
print(" ### GMM ###")
print('Silhouette Score: ', silhouette_score(train_df1_out, labels_))
print('Calinski harabasz score: ', calinski_harabasz_score(train_df1_out, labels_))

In [None]:
# Cluster assignment from the GMM on the full feature set (fit_predict fits the model again)
Cluster_6 = model_gmm.fit_predict(train_df1_out)

# Store the labels next to the previous experiments
df_final['Cluster_6']= Cluster_6

# The cluster labels of every experiment are collected in "df_final"
df_final.head(3)

In [None]:
# Number of rows per GMM component
df_final['Cluster_6'].value_counts()

### Dataset reducido

In [None]:
# --- Hyper-parameter estimation on the PCA-reduced data: BIC and AIC for 2..19 components ---

n_components = np.arange(2, 20)

models_g = [GaussianMixture(n_components=n, random_state=123).fit(X_pca) for n in n_components]

bic_scores = [fitted.bic(X_pca) for fitted in models_g]
aic_scores = [fitted.aic(X_pca) for fitted in models_g]

plt.plot(n_components, bic_scores, label="BIC")
plt.plot(n_components, aic_scores, label="AIC")
plt.legend()
plt.xlabel("Número de Clusters")

In [None]:
# --- GMM model on the PCA-reduced data ---
# NOTE(review): n_components=5 is fixed here, presumably read off the BIC/AIC plot — confirm
model_gmm_pca = GaussianMixture(n_components=5, random_state=123, covariance_type = 'full').fit(X_pca)

In [None]:
# --- Evaluation of the GMM fitted on the PCA-reduced data ---
# Most probable component of each row, used as its cluster label
labels_ = model_gmm_pca.predict(X_pca)

# The banner names the model actually being evaluated (it previously said DBSCAN)
print(" ### GMM ###")
print('Silhouette Score: ', silhouette_score(X_pca, labels_))
print('Calinski harabasz score: ', calinski_harabasz_score(X_pca, labels_))

In [None]:
# Cluster assignment from the GMM on the PCA-reduced data (fit_predict fits the model again)
Cluster_7 = model_gmm_pca.fit_predict(X_pca)

# Store the labels next to the previous experiments
df_final['Cluster_7'] = Cluster_7

# The cluster labels of every experiment are collected in "df_final"
df_final.head(3)

In [None]:
# Number of rows per GMM component on the reduced data
df_final['Cluster_7'].value_counts()

In [None]:
# 3D scatter plot of the first three principal components, coloured by GMM component

fig = plt.figure(figsize = (15,15))
ax = fig.add_subplot(111, projection='3d')
for cluster_id, colour in enumerate(('blue', 'orange', 'green', 'purple', 'red')):
    members = Cluster_7 == cluster_id
    ax.scatter(X_pca[members, 0], X_pca[members, 1], X_pca[members, 2],
               s = 40, color = colour, label = "cluster " + str(cluster_id))
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_zlabel('PC3')
ax.legend()
plt.show()

In [None]:
# 2D scatter plot of the first two principal components, coloured by GMM component.
# Only PC1 and PC2 are plotted, so regular 2D axes are used; on 3D axes the points
# would all be drawn on the z = 0 plane of an otherwise empty cube.

fig, ax = plt.subplots(figsize = (15,15))
for cluster_id, colour in enumerate(('blue', 'orange', 'green', 'purple', 'red')):
    members = Cluster_7 == cluster_id
    ax.scatter(X_pca[members, 0], X_pca[members, 1],
               s = 40, color = colour, label = "cluster " + str(cluster_id))
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.legend()
plt.show()