In [38]:
import numpy as np
import math as m
import scipy as sc
import pandas as pd
from sklearn.datasets import make_blobs
import plotly.express as px

import logging
import sys

# Route every log record from DEBUG upwards to stdout so that library debug
# messages show up in the cell output. Passing ``stream=`` makes basicConfig
# build the single StreamHandler itself.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

In [39]:
# Discrete colour palette (18 hex colours) handed to plotly as
# ``color_discrete_sequence`` by plot_dataframe; order matters because plotly
# assigns colours to labels in sequence.
COLORS = [
    "#f7dc6f", "#82e0aa", "#f1948a", "#499cef", "#f5b041", "#a569bd",
    "#e74c3c", "#2ecc71", "#3498db", "#e67e22", "#9b59b6", "#1abc9c",
    "#34495e", "#d35400", "#c0392b", "#16a085", "#2980b9", "#8e44ad",
]

def twospirals(n_points, noise=.5, seed=None):
    """Generate two 2-D spirals that are point reflections of each other.

    Parameters
    ----------
    n_points : int
        Number of points per spiral; the result holds ``2 * n_points`` rows.
    noise : float, default 0.5
        Scale of the jitter added to each coordinate. The jitter is drawn
        uniformly from ``[0, noise)``, so it is one-sided (never negative).
    seed : int or None, default None
        When given, a private ``np.random.RandomState(seed)`` is used so the
        data set is reproducible. When ``None`` the global NumPy random state
        is used, exactly as before this parameter existed.

    Returns
    -------
    points : ndarray of shape (2 * n_points, 2)
        First spiral in rows ``[:n_points]``, its negation in the remaining rows.
    labels : ndarray of shape (2 * n_points,), dtype int
        ``0`` for the first spiral, ``1`` for the mirrored one.
    """
    # ``np.random`` and ``RandomState`` both expose ``rand``, so one code path
    # serves the seeded and the unseeded case.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # The same value is used as angle (radians, up to 780 degrees) and as
    # radius; the square root spreads points more evenly along the arm.
    theta = np.sqrt(rng.rand(n_points, 1)) * 780 * (2 * np.pi) / 360
    d1x = -np.cos(theta) * theta + rng.rand(n_points, 1) * noise
    d1y = np.sin(theta) * theta + rng.rand(n_points, 1) * noise

    first_arm = np.hstack((d1x, d1y))
    second_arm = np.hstack((-d1x, -d1y))
    points = np.vstack((first_arm, second_arm))
    labels = np.hstack((np.zeros(n_points, dtype=int), np.ones(n_points, dtype=int)))
    return (points, labels)

def plot_dataframe(df, title='', x='x', y='y', label='label'):
    """Scatter-plot a labelled 2-D data set with one colour per label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the coordinate columns and the label column. It is not
        modified; the string conversion of the label column happens on a copy.
    title : str, default ''
        Figure title.
    x, y : str
        Names of the columns plotted on the horizontal / vertical axis.
    label : str, default 'label'
        Name of the column that selects the colour of each point.

    Returns
    -------
    The figure object returned by ``px.scatter``.
    """
    # Labels are cast to str so plotly treats them as discrete categories.
    # The cast is done on a copy: writing it back into ``df`` would silently
    # change the dtype of the caller's label column.
    plot_df = df.copy()
    plot_df[label] = plot_df[label].astype(str)

    fig = px.scatter(
        plot_df,
        x=x,
        y=y,
        symbol=label,
        color=label,
        color_discrete_sequence=COLORS,
        title=title,
    )
    # Forces every trace to a small circle, overriding the per-label symbols
    # that ``symbol=label`` assigned above.
    fig.update_traces(marker=dict(size=5, symbol='circle'))

    return fig

def plot_centroid(fig, centroid, colors = ['#a569bd'], marker_size=10, symbol = 'star', name = 'Centroid'):
    """Add one marker for a centroid to an existing figure.

    Parameters
    ----------
    fig
        Object exposing ``add_scatter`` (e.g. the figure built by
        plot_dataframe). The trace is added to it in place.
    centroid : sequence
        Only ``centroid[0]`` (x) and ``centroid[1]`` (y) are read.
    colors, marker_size, symbol
        Forwarded as the marker's ``color``, ``size`` and ``symbol``.
    name : str
        Name of the added trace.

    Returns
    -------
    The same ``fig`` that was passed in.
    """
    trace_options = {
        'x': [centroid[0]],
        'y': [centroid[1]],
        'mode': 'markers',
        'marker': {
            'size': marker_size,
            'color': colors,
            'symbol': symbol,
        },
        'name': name,
    }
    fig.add_scatter(**trace_options)
    return fig

def create_constraints(labels, probability=0.01, seed=0):
    """Sample a symmetric pairwise constraint matrix from known labels.

    Every unordered pair ``(i, j)`` with ``i < j`` is selected independently
    with the given probability. A selected pair gets ``1`` when both labels
    are equal and ``-1`` when they differ; unselected pairs and the diagonal
    stay ``0``.

    Parameters
    ----------
    labels : sequence
        One label per instance; only ``len`` and item comparison are used.
    probability : float, default 0.01
        Chance that a given pair receives a constraint.
    seed : int, default 0
        Seed for the private ``np.random.RandomState`` used for sampling.

    Returns
    -------
    ndarray of shape (len(labels), len(labels)), dtype int
    """
    size = len(labels)
    rng = np.random.RandomState(seed=seed)
    matrix = np.zeros((size, size), dtype=int)

    for row in range(size):
        for col in range(row + 1, size):
            # Exactly one draw per pair, in row-major order, so a given seed
            # always yields the same matrix.
            if not (rng.rand() < probability):
                continue
            if labels[row] == labels[col]:
                link = 1
            elif labels[row] != labels[col]:
                link = -1
            else:
                continue
            # Mirror the entry to keep the matrix symmetric.
            matrix[row, col] = link
            matrix[col, row] = link
    return matrix

In [62]:
# Synthetic benchmark: 300 points drawn around 5 blob centres.
X, y = make_blobs(
    n_samples=300,
    centers=5,
    cluster_std=0.5,
    random_state=42,
)

# Pairwise constraints sampled from the ground-truth blob labels
# (1 = same blob, -1 = different blobs, 0 = no constraint).
constraints = create_constraints(labels=y, probability=0.01, seed=42)

# Optional preview of the ground-truth blobs and their overall mean (disabled):
# df = pd.DataFrame(X, columns=['x', 'y'])
# df['label'] = y
# fig = plot_dataframe(df, title='Data with Constraints', x='x', y='y', label='label')
# fig = plot_centroid(fig, np.mean(X, axis=0), colors=['#a569bd'], marker_size=10, symbol='star', name='Centroid')
# fig.show()

In [63]:
from clustlib.kmean.rdpmean import RDPM
from clustlib.utils.simpleconstraints import SimpleConstraints

# Fit RDPM on the blobs using the sampled pairwise constraints.
# NOTE(review): the fit log reports clusters being created when an instance
# "exceeds limit", so the final cluster count can differ from n_clusters --
# confirm the exact meaning of `limit` and `rate` against clustlib.
rdpm = RDPM(
    constraints=SimpleConstraints(constraints),
    n_clusters=4,
    max_iter=1000,
    tol=1e-10,
    limit=2,
    rate=2,
)
rdpm.fit(X)


2025-07-15 13:14:16,094 - DEBUG - Fitting RDPM model
2025-07-15 13:14:16,095 - DEBUG - Delta is None, convergence cannot be checked.
2025-07-15 13:14:16,099 - DEBUG - Instance 0 exceeds limit, creating new cluster
2025-07-15 13:14:16,101 - DEBUG - Instance 1 exceeds limit, creating new cluster
2025-07-15 13:14:16,105 - DEBUG - Instance 5 exceeds limit, creating new cluster
2025-07-15 13:14:16,109 - DEBUG - Instance 7 exceeds limit, creating new cluster
2025-07-15 13:14:16,153 - DEBUG - Instance 92 exceeds limit, creating new cluster
2025-07-15 13:14:16,233 - DEBUG - Iteration 1 completed with clusters: 9
2025-07-15 13:14:16,234 - DEBUG - Centroid 2 is empty, marking for removal
2025-07-15 13:14:16,234 - DEBUG - Centroid 3 is empty, marking for removal
2025-07-15 13:14:16,234 - DEBUG - Checking convergence with delta: [0. 0.]
2025-07-15 13:14:16,235 - DEBUG - Tolerance: 1e-10
2025-07-15 13:14:16,235 - DEBUG - Convergence reached, stopping criteria met.


In [66]:
# Keep only the centroids whose index occurs among the assigned labels, in
# ascending label order (np.unique sorts), and show them as one array.
labels = rdpm._labels
rdpm_centroids = np.array(
    [rdpm.centroids[cluster_id] for cluster_id in np.unique(labels)]
)
rdpm_centroids

array([[ 4.60012508,  0.64132159],
       [ 3.88827482,  2.41811828],
       [-1.56610467,  9.10157503],
       [ 1.28300711,  4.73332858],
       [-8.89449178,  7.21303812],
       [-6.84172491, -7.21869045],
       [-3.81907018,  9.42523738]])

In [67]:
# Plot the RDPM assignment together with the centroid of each cluster.
rdpm_centroids = rdpm.centroids
cluster_labels = rdpm._labels

df = pd.DataFrame(X, columns=['x', 'y'])
df['label'] = cluster_labels
fig = plot_dataframe(df, title='RDPM Clustering with Constraints', x='x', y='y', label='label')

# Draw a star only for centroids that own at least one point. Iterating over
# every entry of rdpm.centroids would also mark clusters that ended up empty
# (the fit log reports such centroids), which puts stars where no cluster is.
# NOTE(review): assumes label values index into rdpm.centroids -- confirm.
for cluster_id in np.unique(cluster_labels):
    fig = plot_centroid(
        fig,
        rdpm_centroids[cluster_id],
        colors=["#34495e"],
        marker_size=10,
        symbol='star',
        name=f'Centroid {cluster_id + 1}',
    )
fig.update_layout(showlegend=False)
fig.show()