In [2]:
import numpy as np
from sklearn.datasets import make_blobs
import plotly.express as px

import logging
import sys

# Root logger: emit everything from DEBUG upwards to stdout so that log
# records show up in the notebook cell output, prefixed with time and level.
logging.basicConfig(
    handlers=[logging.StreamHandler(sys.stdout)],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)

# Palette of 18 hex colours used for cluster labels. plot_dataframe maps the
# stringified label "i" to COLORS[i], so labels 0..17 get a fixed colour.
COLORS = [
    "#f7dc6f",
    "#82e0aa",
    "#f1948a",
    "#499cef",
    "#f5b041",
    "#a569bd",
    "#e74c3c",
    "#2ecc71",
    "#3498db",
    "#e67e22",
    "#9b59b6",
    "#1abc9c",
    "#34495e",
    "#d35400",
    "#c0392b",
    "#16a085",
    "#2980b9",
    "#8e44ad",
]

def twospirals(n_points, noise=.5):
    """Generate a two-class, two-spiral 2-D dataset.

    Parameters
    ----------
    n_points : int
        Number of samples drawn for *each* spiral.
    noise : float
        Scale of the uniform jitter (drawn from ``[0, noise)``) added to the
        coordinates of the first spiral.

    Returns
    -------
    tuple
        ``(points, labels)`` where ``points`` has shape ``(2 * n_points, 2)``
        and ``labels`` holds ``n_points`` zeros followed by ``n_points`` ones.
        The second spiral is the point reflection of the first through the
        origin.

    Notes
    -----
    Uses the global ``np.random`` state; seed it beforehand for
    reproducible output.
    """
    # Angle (and radius) of every sample: sqrt of a uniform draw, scaled to
    # at most 780 degrees expressed in radians.
    theta = np.sqrt(np.random.rand(n_points, 1)) * 780 * (2 * np.pi) / 360

    # Jitter is drawn x first, then y, right after the angles.
    jitter_x = np.random.rand(n_points, 1) * noise
    jitter_y = np.random.rand(n_points, 1) * noise

    first_x = -np.cos(theta) * theta + jitter_x
    first_y = np.sin(theta) * theta + jitter_y

    first_spiral = np.concatenate((first_x, first_y), axis=1)
    second_spiral = np.concatenate((-first_x, -first_y), axis=1)

    points = np.concatenate((first_spiral, second_spiral), axis=0)
    labels = np.concatenate(
        (np.zeros(n_points, dtype=int), np.ones(n_points, dtype=int))
    )
    return (points, labels)

def plot_dataframe(df, title='', x='x', y='y', label='label'):
    """Scatter-plot ``df`` with one colour per label.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; must contain the columns named by ``x``, ``y`` and
        ``label``. It is not modified.
    title : str
        Figure title.
    x, y : str
        Names of the columns holding the coordinates.
    label : str
        Name of the column holding the cluster label of each row.

    Returns
    -------
    The figure produced by ``px.scatter``.
    """
    # Work on a copy: the label column is converted to strings so plotly
    # treats it as a discrete category, and the caller's frame must keep its
    # original label values.
    plot_df = df.copy()
    plot_df[label] = plot_df[label].map(str)

    fig = px.scatter(
        plot_df.sort_values(by=label),
        x=x,
        y=y,
        color=label,
        # Stringified label "i" -> COLORS[i]; labels outside the palette are
        # simply absent from this map.
        color_discrete_map={str(i): colour for i, colour in enumerate(COLORS)},
        title=title,
    )
    fig.update_traces(marker=dict(size=5, symbol='circle', opacity = 0.6))

    return fig

def plot_centroid(fig, centroid, colors = ['#a569bd'], marker_size=10, symbol = 'star', name = 'Centroid'):
    """Add a single marker trace for ``centroid`` to ``fig``.

    ``centroid[0]`` and ``centroid[1]`` are used as the x and y coordinates.
    ``colors``, ``marker_size`` and ``symbol`` style the marker and ``name``
    is the trace name. The same ``fig`` object is returned.
    """
    centroid_x, centroid_y = centroid[0], centroid[1]
    marker_style = {
        'size': marker_size,
        'color': colors,
        'symbol': symbol,
    }
    fig.add_scatter(
        x=[centroid_x],
        y=[centroid_y],
        mode='markers',
        marker=marker_style,
        name=name,
    )
    return fig

def create_constraints(labels, probability=0.01, seed=0):
    """Sample a symmetric pairwise-constraint matrix from ground-truth labels.

    Every unordered pair ``(i, j)`` with ``i < j`` is selected independently
    with the given ``probability``. A selected pair gets ``1`` when both
    points share a label and ``-1`` otherwise; unselected pairs and the
    diagonal stay ``0``.

    Parameters
    ----------
    labels : array-like of length n
        Label of each point (read positionally).
    probability : float
        Chance that any given pair receives a constraint.
    seed : int
        Seed for the private ``np.random.RandomState`` used for sampling.

    Returns
    -------
    numpy.ndarray
        Integer matrix of shape ``(n, n)`` with entries in ``{-1, 0, 1}``.
    """
    labels = np.asarray(labels)
    n_points = len(labels)
    constraints = np.zeros((n_points, n_points), dtype=int)
    state = np.random.RandomState(seed=seed)

    # Upper-triangle pairs in row-major order: (0,1), (0,2), ..., (1,2), ...
    rows, cols = np.triu_indices(n_points, k=1)

    # One uniform draw per pair, taken in that same order, so the result for
    # a given seed matches drawing pair by pair in a nested loop.
    selected = state.rand(rows.size) < probability
    rows, cols = rows[selected], cols[selected]

    values = np.where(labels[rows] == labels[cols], 1, -1)
    constraints[rows, cols] = values
    constraints[cols, rows] = values  # mirror to keep the matrix symmetric
    return constraints

def _draw_constraint_lines(fig, constraints, points, sign, line):
    """Add one line trace per pair ``i < j`` whose constraint has ``sign``.

    Only the first ``len(points)`` rows and columns of ``constraints`` are
    read. Pairs are visited in row-major order and each one becomes its own
    ``fig.add_scatter`` call joining ``points[i]`` and ``points[j]`` with the
    given ``line`` style.
    """
    n_points = len(points)
    # Keep only entries strictly above the diagonal so every pair is drawn once.
    upper = np.triu(np.asarray(constraints)[:n_points, :n_points], k=1)
    for i, j in np.argwhere(np.sign(upper) == sign):
        fig.add_scatter(
            x=[points[i, 0], points[j, 0]],
            y=[points[i, 1], points[j, 1]],
            mode='lines',
            line=dict(line),  # fresh dict per trace
        )
    return fig


def draw_cl_constraints(fig, constraints, points):
    """Draw every cannot-link constraint as a thin dashed red segment.

    A pair ``(i, j)`` with ``i < j`` is drawn when ``constraints[i, j] < 0``.
    ``points`` is indexed as ``points[i, 0]`` / ``points[i, 1]`` for the x and
    y coordinates. Returns the same ``fig``.
    """
    return _draw_constraint_lines(
        fig, constraints, points, sign=-1,
        line=dict(color='red', width=0.5, dash='dash'),
    )


def draw_ml_constraints(fig, constraints, points):
    """Draw every must-link constraint as a thin solid green segment.

    A pair ``(i, j)`` with ``i < j`` is drawn when ``constraints[i, j] > 0``.
    ``points`` is indexed as ``points[i, 0]`` / ``points[i, 1]`` for the x and
    y coordinates. Returns the same ``fig``.
    """
    return _draw_constraint_lines(
        fig, constraints, points, sign=1,
        line=dict(color='green', width=0.5),
    )

In [2]:
# Synthetic benchmark: 300 points in 10 Gaussian blobs, plus randomly sampled
# pairwise constraints derived from the true blob labels.
X, y = make_blobs(
    n_samples=300,
    centers=10,
    cluster_std=1.0,
    random_state=42,
)
constraints = create_constraints(
    y,
    probability=0.01,
    seed=42,
)

In [None]:
# Fit clustlib's DILS model on the blob data, passing the sampled pairwise
# constraints matrix built above.
from clustlib.sin.dils import DILS
dils = DILS(
    n_clusters=10,  # same count as the `centers=10` used to generate X
    constraints=constraints,
    # NOTE(review): presumably an iteration cap and a convergence tolerance;
    # confirm the exact meaning against the clustlib documentation.
    max_iter=10000,
    tol=1e-4,
)
dils.fit(X)

In [3]:
# Assign every sample to one of 10 clusters uniformly at random.
# NOTE: the global NumPy RNG is not seeded here, so the printed values change
# from run to run.
random_labels = np.random.randint(0, 10, size=len(X))

# Hand-rolled mean pairwise Euclidean distance inside each random cluster.
for j in range(10):
    clust = X[random_labels == j]

    # At least two members are needed: with a single member the pair count in
    # the denominator below is zero and the average is undefined (0/0).
    if clust.shape[0] > 1:
        tot = 0.0
        for k in range(clust.shape[0]):
            # Distances from member k to every later member, so each
            # unordered pair is counted exactly once.
            tot += ((((clust[k+1 :] - clust[k]) ** 2).sum(1)) ** 0.5).sum()

        # Divide by the number of unordered pairs, n * (n - 1) / 2.
        avg = tot / ((clust.shape[0] - 1) * (clust.shape[0]) / 2.0)
        print(avg)

10.683283932922992
11.137392153849365
10.538894214771286
10.84309126081509
10.841408574633723
8.465295620745753
8.569982434270864
10.370822025732691
10.730643180191164
10.618926586151083


In [4]:
from scipy.spatial.distance import pdist

# Cross-check: per-cluster mean pairwise distance computed with SciPy's
# condensed distance vector instead of the manual loop.
for j in range(10):
    in_cluster = random_labels == j
    clust = X[in_cluster]
    pdistances = pdist(clust)
    print(np.mean(pdistances))

10.683283932922992
11.137392153849369
10.538894214771284
10.843091260815093
10.841408574633725
8.465295620745753
8.569982434270864
10.370822025732691
10.730643180191162
10.618926586151087


In [5]:
# Draw two uniform values in [0, 10) and display the absolute gap between them.
random = 10 * np.random.rand(2)
print(random)
abs(random[1] - random[0])

[8.60461481 3.70757689]


4.897037923917564