In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_blobs

In [2]:
# Discrete colour palette (hex strings). plot_dataframe hands this list to
# px.scatter as color_discrete_sequence.
COLORS = [
    "#f7dc6f",
    "#82e0aa",
    "#f1948a",
    "#499cef",
    "#f5b041",
    "#a569bd",
    "#e74c3c"
]

In [3]:
def twospirals(n_points, noise=.5, seed=None):
    """Generate two interleaved 2-D spirals with class labels.

    A spiral parameter is drawn per point as ``sqrt(u) * 780`` degrees
    (converted to radians) with ``u`` uniform in [0, 1). The first spiral is
    ``(-cos(n) * n, sin(n) * n)`` plus uniform jitter in ``[0, noise)`` on each
    coordinate; the second spiral is the exact negation of the first
    (jitter included).

    Parameters
    ----------
    n_points : int
        Number of points per spiral.
    noise : float, default 0.5
        Upper bound of the uniform jitter added to each coordinate.
    seed : int or None, default None
        When given, a private ``np.random.RandomState(seed)`` is used so the
        output is reproducible and the global NumPy RNG is left untouched.
        When ``None`` the global ``np.random`` state is used, exactly as
        before this parameter existed.

    Returns
    -------
    points : ndarray of shape (2 * n_points, 2)
        First spiral in the first ``n_points`` rows, second spiral after it.
    labels : ndarray of shape (2 * n_points,), dtype int
        ``0`` for the first spiral, ``1`` for the second.
    """
    # Both np.random and RandomState expose the same .rand API, so the draw
    # order (angle, x jitter, y jitter) is identical on either path.
    rng = np.random if seed is None else np.random.RandomState(seed)

    n = np.sqrt(rng.rand(n_points, 1)) * 780 * (2 * np.pi) / 360
    d1x = -np.cos(n) * n + rng.rand(n_points, 1) * noise
    d1y = np.sin(n) * n + rng.rand(n_points, 1) * noise

    first_spiral = np.hstack((d1x, d1y))
    second_spiral = np.hstack((-d1x, -d1y))
    labels = np.hstack((np.zeros(n_points, dtype=int), np.ones(n_points, dtype=int)))
    return (np.vstack((first_spiral, second_spiral)), labels)

In [4]:
import plotly.express as px

def plot_dataframe(df, title='', x='x', y='y', label='label'):
    """Build a Plotly Express scatter of ``df`` coloured and symbolised by ``label``.

    The label column is converted to strings before plotting (presumably so
    Plotly treats it as a discrete category rather than a numeric scale).
    The conversion is done on a copy, so the caller's DataFrame keeps its
    original label dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the ``x``, ``y`` and ``label`` columns.
    title : str, default ''
        Figure title.
    x, y : str
        Names of the columns used for the horizontal and vertical axes.
    label : str, default 'label'
        Name of the column used for both marker colour and marker symbol.

    Returns
    -------
    The figure returned by ``px.scatter``, with marker size set to 5.
    """
    # Work on a copy: assigning into ``df`` would silently change the dtype of
    # the caller's label column (e.g. 0.0 -> '0.0') for every later cell.
    plot_frame = df.copy()
    plot_frame[label] = plot_frame[label].astype(str)

    fig = px.scatter(
        plot_frame,
        x=x,
        y=y,
        symbol=label,
        color=label,
        color_discrete_sequence=COLORS,
        title=title
    )
    fig.update_traces(marker=dict(size=5))

    return fig

In [5]:
# Sample the two-spirals points; note that `constraints` holds the 0/1 class
# labels returned by twospirals here, not pairwise constraints.
data, constraints = twospirals(1000, noise=0.5)

# Attach the labels as a third column, shuffle the rows in place so the two
# classes are interleaved, then expose the result as a DataFrame.
data = np.column_stack((data, constraints))
np.random.shuffle(data)
df = pd.DataFrame(data, columns=['x', 'y', 'label'])

In [6]:
# Scatter the two-spirals frame, coloured and symbolised by its label column.
fig = plot_dataframe(df, title='Two Spirals Dataset', label='label')
fig.show()

In [7]:
def plot_centroid(fig, centroid, colors=None, marker_size=10, symbol='star', name='Centroid'):
    """Add a single marker for ``centroid`` to ``fig`` and return the figure.

    Parameters
    ----------
    fig : figure object exposing ``add_scatter``
        The figure to draw on; it is modified in place and also returned.
    centroid : sequence
        Only the first two entries are used, as the x and y coordinates.
    colors : list of str or None, default None
        Marker colour(s) forwarded as ``marker.color``. ``None`` selects the
        default ``['#a569bd']``; a fresh list is created on every call so the
        default is never shared between calls.
    marker_size : int, default 10
        Marker size.
    symbol : str, default 'star'
        Marker symbol name.
    name : str, default 'Centroid'
        Name given to the added trace.

    Returns
    -------
    The same ``fig`` object that was passed in.
    """
    # A list literal as the default would be one shared object for all calls;
    # build it per call instead.
    if colors is None:
        colors = ['#a569bd']

    marker_style = dict(
        size=marker_size,
        color=colors,
        symbol=symbol,
    )
    fig.add_scatter(
        x=[centroid[0]],
        y=[centroid[1]],
        mode='markers',
        marker=marker_style,
        name=name
    )
    return fig

In [8]:
# Overlay two illustrative centroid markers on the figure from the previous
# cell: one with the default style, one as an orange cross.
# NOTE(review): `fig` is reused and modified, so re-running this cell
# presumably adds the markers a second time — confirm.
fig = plot_centroid(fig, [0, 0])  # Example centroids
fig = plot_centroid(fig, [1, 1], colors=['#f5b041'], marker_size=10, symbol='cross')
fig.show()

In [9]:
# Draw 300 samples around four fixed centres, one per quadrant at (+-1, +-1).
# cluster_std=2 equals the spacing between neighbouring centres, and
# random_state=0 pins the sample.
X, y = make_blobs(n_samples=300, centers=[[-1, 1], [1, 1], [1, -1], [-1, -1]], cluster_std=2, random_state=0)

In [10]:
# Put the blob coordinates and their generating labels side by side in one
# frame; `df` is bound directly to the DataFrame, never to a bare array.
df = pd.DataFrame(np.column_stack((X, y)), columns=['x', 'y', 'label'])

In [11]:
# Scatter of the blobs, coloured by the label they were generated from.
fig = plot_dataframe(df, title='Blobs Dataset', label='label')

# One entry per generating centre: (position, colour, symbol, trace name).
centre_markers = [
    ([-1, 1], "#f5a016", 'star', 'Centroid 1'),
    ([1, 1], "#3bf335", 'cross', 'Centroid 2'),
    ([1, -1], "#f83a25", 'triangle-up', 'Centroid 3'),
    ([-1, -1], "#2286f8", 'triangle-down', 'Centroid 4'),
]
for centre, colour, marker_symbol, trace_name in centre_markers:
    fig = plot_centroid(
        fig,
        centre,
        colors=[colour],
        marker_size=10,
        symbol=marker_symbol,
        name=trace_name,
    )

# Final titles for the figure, axes and legend.
layout_options = dict(
    title='Blobs Dataset with Centroids',
    xaxis_title='X-axis',
    yaxis_title='Y-axis',
    legend_title='Label',
)
fig.update_layout(**layout_options)
fig.show()

In [12]:
def create_constraints(labels, probability=0.01, seed=0):
    """Sample a symmetric pairwise-constraint matrix from ground-truth labels.

    Every unordered pair ``(i, j)`` with ``i < j`` is selected independently
    with the given probability. A selected pair gets ``1`` when both labels
    are equal and ``-1`` when they differ; unselected pairs and the diagonal
    stay ``0``.

    Parameters
    ----------
    labels : array-like of shape (n_points,)
        Label of each point; any array-like (list, ndarray, ...) is accepted.
    probability : float, default 0.01
        Chance that a given pair receives a constraint.
    seed : int, default 0
        Seed for the private ``np.random.RandomState`` used for sampling.

    Returns
    -------
    ndarray of shape (n_points, n_points), dtype int
        Symmetric matrix with entries in ``{-1, 0, 1}``.
    """
    labels = np.asarray(labels)
    n_points = len(labels)
    constraints = np.zeros((n_points, n_points), dtype=int)
    state = np.random.RandomState(seed=seed)

    # Upper-triangle pairs in row-major order: the same order in which a
    # nested ``for i / for j > i`` loop would visit them, so one batched draw
    # consumes the random stream exactly like one scalar draw per pair.
    rows, cols = np.triu_indices(n_points, k=1)
    selected = state.rand(rows.size) < probability
    rows, cols = rows[selected], cols[selected]

    # 1 = same label (must-link style), -1 = different label (cannot-link style).
    values = np.where(labels[rows] == labels[cols], 1, -1)
    constraints[rows, cols] = values
    constraints[cols, rows] = values
    return constraints

In [13]:
# np.mean(np.sum(np.abs(create_constraints(labels=y, probability=0.15, seed=42)), axis=1) / 2)

In [14]:
import logging
import sys

# Configure the root logger: INFO and above, timestamped records, written to
# sys.stdout through a single StreamHandler.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout)
    ]
)

In [15]:
#del sys.modules['clustlib.kmean.copkmeans']
from clustlib.kmean.copkmeans import COPKMeans

logging.info("Generating synthetic data for COP-KMeans clustering...")

# Sample pairwise constraints from the blob labels `y`: each pair is selected
# with probability 0.15 and marked 1 (same label) or -1 (different label).
constraints = create_constraints(labels=y, probability=0.15, seed=42)

# NOTE(review): COPKMeans comes from the project-local clustlib package; the
# meaning of its constructor arguments and what fit() / .centroids provide are
# not visible in this notebook — confirm against the library.
cop = COPKMeans(constraints, n_clusters=4, max_iter=100, tol=1e-4)
cop.fit(X)
centroids = cop.centroids

2025-07-15 13:35:10,771 - INFO - Generating synthetic data for COP-KMeans clustering...


In [32]:
# Re-draw the blobs from scratch and overlay each centroid held in `centroids`.
fig = plot_dataframe(df, title='Blobs Dataset', label='label')
for i, centroid in enumerate(centroids):
    logging.info(f"Centroid {i}: {centroid}")
    # Positions where cop._labels equals i; only their count is logged below.
    # NOTE(review): _labels is a private attribute of COPKMeans — presumably a
    # 1-D per-point cluster assignment; confirm its shape and meaning.
    labels = np.argwhere(cop._labels == i).flatten()
    logging.info(f"Labels for centroid {i}: {len(labels)}")
    # Every centroid is drawn with the same colour and symbol; only the trace
    # name (1-based) differs.
    fig = plot_centroid(fig, centroid, colors=["#34495e"], marker_size=10, symbol='star', name=f'Centroid {i+1}')

2025-07-15 13:36:24,519 - INFO - Centroid 0: [-0.15443205 -0.50856417]
2025-07-15 13:36:24,519 - INFO - Labels for centroid 0: 124
2025-07-15 13:36:24,520 - INFO - Centroid 1: [0.50306931 1.0729136 ]
2025-07-15 13:36:24,521 - INFO - Labels for centroid 1: 29
2025-07-15 13:36:24,522 - INFO - Centroid 2: [ 0.77986079 -1.93514541]
2025-07-15 13:36:24,522 - INFO - Labels for centroid 2: 55
2025-07-15 13:36:24,523 - INFO - Centroid 3: [-0.97402704  1.34476667]
2025-07-15 13:36:24,523 - INFO - Labels for centroid 3: 92


In [33]:
# Hide the legend, retitle the figure built in the previous cell, and render it.
fig.update_layout(showlegend=False, title='COP-KMeans')
fig.show()

In [18]:
# cumulative = np.cumsum(np.sum(res, 0))
# aggregate = np.sum(res, 0)

In [19]:
# gamma[:-1, 0] = aggregate[:-1] + 1

In [20]:
# gamma[:-1, 1] = np.sum(cumulative[-1:] - cumulative, initial = 1.2)

In [21]:
# (cumulative[-1:] - cumulative) + 1.2

In [22]:
# gamma[:, 1]

In [23]:
# res[0,:]

In [24]:
# np.sum(np.array(res), 0)

In [25]:
# np.cumsum(np.sum(res, 0))

In [26]:
# np.finfo(float).eps

## Verosimilitud de los datos

In [27]:
# def concentration(self, cluster, p, __nu, __cov_inverse):
#     """
#     This measure is used to determine how "concentrated" a Gaussian component (cluster) is around its mean.
#     It is calculated as the sum of the squared Mahalanobis distances between each data point and the mean of the 
#     cluster, weighted by the probability of each data point belonging to that cluster.
#     """
#     sum_phi = np.sum(phi((__nu[cluster] + 1 - np.arange(1, p + 1)) / 2))
#     determinant = np.linalg.det(__cov_inverse[cluster])

#     return sum_phi + p * m.log(2) + m.log(determinant)

# def verosimilitude(cluster, instance, dataset, __responsabilities):
#     x = dataset[instance]
#     p = dataset.shape[1]
#     distance = np.dot(__responsabilities, __responsabilities.T)

#     if distance[cluster] < 1e-20:
#         return 0

#     concentration = concentration(cluster)
#     x_bar_k = np.sum(np.multiply(x, __responsabilities[:, cluster]), 0) / distance[cluster]
#     diff = x - x_bar_k

#     S_k = np.dot(diff.T, np.multiply(diff, __responsabilities[:, cluster])) / distance[cluster]
#     return 0.5 * distance[cluster] * (concentration - expected_distance(x, cluster))

In [28]:

    #     if (is_keep_l == 1) & (iter <= max_iter):
    #         term71 = 0
    #         term72 = 0
    #         term73 = 0
    #         term74 = 0
    #         term76 = 0
    #         term77 = 0

    #         for k in range(K):
    #             if N[k] > 10**(-10):
    #                 E_ln_lambda_k = np.sum(
    #                     sps.digamma((nuQ[k] + 1 - np.array(range(1, p + 1), dtype=np.float)) / 2)) 
    #                 + m.log(np.linalg.det(WQ[k, :, :]))
    #                 x_bar_k = np.sum(np.multiply(X, rQ[:, k]), 0) / N[k]
    #                 S_k = np.dot((X - x_bar_k).T, np.multiply(X - x_bar_k, rQ[:, k])) / N[k]
    #                 term71 = 
    #                     term71 + 
    #                     0.5 * N[k] * 
    #                 (E_ln_lambda_k - 
    #                  p / betaQ[k] - 
    #                  nuQ[k] * np.sum(np.diag(np.dot(S_k, WQ[k, :, :]) ) ) - 
    #                  nuQ[k] * np.dot(np.dot((x_bar_k - muQ[k, :, :].T), WQ[k, :, :]), ((x_bar_k - muQ[k, :, :].T)).T)
    #                 )

    #             if k < K - 1:
    #                 tmp = N[k] * (sps.digamma(gammaQ[k, 0]) - sps.digamma(np.sum(gammaQ[k, :])))
    #                 for j in range(k+1, K):
    #                     tmp = tmp + N[j] * (sps.digamma(gammaQ[k, 1]) - sps.digamma(np.sum(gammaQ[k, :])))

    #                 term72 = term72 + tmp
    #                 term73 = term73 + (alpha0 - 1) * (sps.digamma(gammaQ[k, 1]) - sps.digamma(np.sum(gammaQ[k, :])))

    #             tmp2 = (-p * beta0 / betaQ[k] - beta0 * nuQ[k] * np.dot(np.dot((muQ[k, :, :].T - mu0), WQ[k, :, :]), (((muQ[k, :,:].T - mu0).T) ))) / 2
    #             tmp3 = nuQ[k] * sum(np.diag(np.dot(np.linalg.inv(W0), WQ[k, :, :]))) / 2
    #             term74 = term74 + E_ln_lambda_k * (nu0 - p) / 2 + tmp2 - tmp3

    #             if k < K - 1:
    #                 term76 = term76 + (gammaQ[k, 0] - 1) * (sps.digamma(gammaQ[k, 0]) - sps.digamma(np.sum(gammaQ[k, :]))) + (gammaQ[k, 1] - 1) * (sps.digamma(gammaQ[k, 1]) - sps.digamma(np.sum(gammaQ[k, :]))) - sps.betaln(gammaQ[k, 0], gammaQ[k, 1])

    #             term77 = term77 + E_ln_lambda_k / 2 + p / 2 * m.log(betaQ[k]) - H(WQ[k, :, :], nuQ[k])

    #         ElnP = sps.digamma(alpha_p) - sps.digamma(alpha_p + beta_p)
    #         ElnMinusP = sps.digamma(beta_p) - sps.digamma(alpha_p + beta_p)
    #         ElnQ = sps.digamma(alpha_q) - sps.digamma(alpha_q + beta_q)
    #         ElnMinusQ = sps.digamma(beta_q) - sps.digamma(alpha_q + beta_q)
    #         tmpRQ = np.dot(rQ, rQ.T)
    #         termE_lnPez = np.sum(np.multiply(np.multiply(constraints, checked), tmpRQ)) * ElnP + np.sum(np.multiply(np.multiply(1 - constraints, checked), tmpRQ)) * ElnMinusP + np.sum(np.multiply(np.multiply(constraints, checked), 1 - tmpRQ)) * ElnMinusQ + np.sum(np.multiply(np.multiply(1 - constraints, checked), 1 - tmpRQ)) * ElnQ
    #         termE_ln_pq = (alpha_p_0 - alpha_p) * ElnP + (beta_p_0 - beta_p) * ElnMinusP + (alpha_q_0 - alpha_q) * ElnQ + (beta_q_0 - beta_q) * ElnMinusQ + sps.betaln(alpha_p, beta_p) + sps.betaln(alpha_q, beta_q)

    #         L[iter] = term71 + term72 + term73 + term74 - term75 - term76 - term77 + termE_lnPez + termE_ln_pq

    #         pImprove = 1
    #         if iter > 0:
    #             pImprove = (L[iter] - L[iter - 1]) / np.absolute(L[iter - 1])

    #     if pImprove < stop_thr:
    #         nRun = iter
    #         iter = max_iter + 1

    #     iter += 1

    # membership_vector = np.array([np.argmax(rQ[i, :]) for i in range(np.shape(rQ)[0])], dtype=np.uint8)

    # return membership_vector

In [29]:
# np.dot(array.T, array)

In [30]:
# array

In [31]:
# np.sum(array * array)