Analytics & Data Science

Universidad de Antioquia - ML2

Febrero 2024

Melissa Ortega Alzate CC.1036964792

# Libraries

In [46]:
import numpy as np

# Paso 1

In [47]:
# Step 1. Randomly initializing K centroid by picking K samples from X
def initialize_random_centroids(K, X):
    """Initializes and returns k random centroids"""
    
    m, n = np.shape(X)
    
    # Initializate empty centroids variable with (K, n) shape
    centroids = np.empty((K, n))

    for i in range(K):
        # pick a random data point from X as the centroid
        centroids[i] =  X[np.random.choice(range(m))] 
    return centroids

In [48]:
np.random.seed(42)

X = np.random.rand(8,5)
K = 4
X

array([[0.37454012, 0.95071431, 0.73199394, 0.59865848, 0.15601864],
       [0.15599452, 0.05808361, 0.86617615, 0.60111501, 0.70807258],
       [0.02058449, 0.96990985, 0.83244264, 0.21233911, 0.18182497],
       [0.18340451, 0.30424224, 0.52475643, 0.43194502, 0.29122914],
       [0.61185289, 0.13949386, 0.29214465, 0.36636184, 0.45606998],
       [0.78517596, 0.19967378, 0.51423444, 0.59241457, 0.04645041],
       [0.60754485, 0.17052412, 0.06505159, 0.94888554, 0.96563203],
       [0.80839735, 0.30461377, 0.09767211, 0.68423303, 0.44015249]])

In [49]:
centroids = initialize_random_centroids(K, X)
centroids

array([[0.60754485, 0.17052412, 0.06505159, 0.94888554, 0.96563203],
       [0.18340451, 0.30424224, 0.52475643, 0.43194502, 0.29122914],
       [0.61185289, 0.13949386, 0.29214465, 0.36636184, 0.45606998],
       [0.80839735, 0.30461377, 0.09767211, 0.68423303, 0.44015249]])

# Paso 2

In [50]:
# Calculate the distance between two vectors
def euclidean_distance(x1, x2):
    """Calculates and returns the euclidean distance between two vectors x1 and x2"""
    return np.linalg.norm(x1 - x2)

In [51]:
# Step 2. 
def closest_centroid(x, centroids, K):
    """Finds and returns the index of the closest centroid for a given vector x"""
    distances = np.empty(K)

    for i in range(K):
        distances[i] = euclidean_distance(centroids[i], x)
    return np.argmin(distances) # return the index of the lowest distance

In [52]:
closest_centroid = closest_centroid(X[1], centroids, K)
closest_centroid

1

In [53]:
centroids[closest_centroid]

array([0.18340451, 0.30424224, 0.52475643, 0.43194502, 0.29122914])

In [54]:
def create_clusters(centroids, K, X):
    """Returns an array of cluster indices for all the data samples"""
    m, _ = np.shape(X)
    cluster_idx = np.empty(m)
    
    def closest_centroid(x, centroids, K):
        """Finds and returns the index of the closest centroid for a given vector x"""
        distances = np.empty(K)
        for i in range(K):
            distances[i] = euclidean_distance(centroids[i], x)
        return np.argmin(distances) # return the index of the lowest distance

    for i in range(m):
        cluster_idx[i] = closest_centroid(X[i], centroids, K)
    return cluster_idx

In [55]:
clusters = create_clusters(centroids, K, X)
clusters

array([1., 1., 1., 1., 2., 2., 0., 3.])

In [56]:
from unsupervised.clusters.kmeans import KMEANS

# Create a KMeans object
kmeans = KMEANS()

# Create clusters
cluster_indices = kmeans.fit(K, X)

Initial centroids:
 [[0.61185289 0.13949386 0.29214465 0.36636184 0.45606998]
 [0.80839735 0.30461377 0.09767211 0.68423303 0.44015249]
 [0.15599452 0.05808361 0.86617615 0.60111501 0.70807258]
 [0.18340451 0.30424224 0.52475643 0.43194502 0.29122914]]

Final centroids:
 [[0.69851443 0.16958382 0.40318954 0.47938821 0.2512602 ]
 [0.7079711  0.23756895 0.08136185 0.81655928 0.70289226]
 [0.15599452 0.05808361 0.86617615 0.60111501 0.70807258]
 [0.19284304 0.74162213 0.69639767 0.4143142  0.20969092]]



In [57]:
cluster_indices


array([3., 2., 3., 3., 0., 0., 1., 1.])

cluster_indices es un array de NumPy que contiene los índices de clúster para todas las muestras de datos en X. Cada elemento del array corresponde a una muestra (fila) de datos en X y contiene el índice del clúster al que pertenece esa muestra. Por lo tanto, cluster_indices es un array unidimensional. Cada valor en este array representa el índice del clúster asignado a la muestra de datos correspondiente en X.

entonces a la fila 1, le toco el cluster 0
a la fila 2

In [58]:
np.random.seed(42)

X = np.random.rand(5,5)
K = 4
X

array([[0.37454012, 0.95071431, 0.73199394, 0.59865848, 0.15601864],
       [0.15599452, 0.05808361, 0.86617615, 0.60111501, 0.70807258],
       [0.02058449, 0.96990985, 0.83244264, 0.21233911, 0.18182497],
       [0.18340451, 0.30424224, 0.52475643, 0.43194502, 0.29122914],
       [0.61185289, 0.13949386, 0.29214465, 0.36636184, 0.45606998]])

In [59]:
cluster_idx = kmeans.predict(X)

In [60]:
cluster_idx

array([3., 2., 3., 3., 0.])