In [1]:
import numpy as np
import plotly.io as pio
import plotly.express as px
from plotly.subplots import make_subplots
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs
from itertools import combinations

# Use plotly's built-in 'ggplot2' template as the default theme for figures
# created in this notebook.
pio.templates.default = 'ggplot2'

In [183]:
def calculate_distance(x_train_scaled, true_labels, n_clusters):
    """Return the mean Euclidean distance between every pair of cluster centroids.

    Parameters
    ----------
    x_train_scaled : array
        Sample coordinates, one row per sample; it is indexed with a boolean
        mask built from ``true_labels``.
    true_labels : array
        Cluster label of each sample. Rows labelled ``c`` make up cluster ``c``.
    n_clusters : int
        Clusters ``0 .. n_clusters - 1`` are taken into account.

    Returns
    -------
    float or int
        The average centroid-to-centroid distance. When there are fewer than
        two clusters no pair exists and the integer ``0`` is returned.
    """
    cluster_centers = [
        x_train_scaled[true_labels == label].mean(axis=0)
        for label in range(n_clusters)
    ]
    center_pairs = list(combinations(cluster_centers, 2))
    n_pairs = len(center_pairs)

    # Every pair contributes an equal 1 / n_pairs share of its distance.
    return sum(
        np.linalg.norm(first - second) / n_pairs
        for first, second in center_pairs
    )

In [236]:
# Seed NumPy's global RNG so the randomly drawn test configurations in the
# following cells are repeatable when the notebook is re-run from here.
# scikit-learn's make_blobs also draws from this global RNG when it is called
# without an explicit random_state.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Number of synthetic clustering scenarios to generate.
n_tests = 20
# 1-based test ids; the plotting loop uses each id as its subplot row.
tests = range(1, n_tests + 1)

In [237]:
# Sample count per test: 100 with probability 0.80, 1000 with 0.15,
# 10000 with 0.04 and 100000 with 0.01.
_n_samples = np.random.choice(
    a=[100, 1000, 10000, 100000],
    size=n_tests,
    p=[0.8, 0.15, 0.04, 0.01],
)
_n_samples

array([1000,  100,  100,  100,  100,  100, 1000,  100,  100,  100,  100,
        100,  100,  100,  100, 1000,  100,  100,  100, 1000])

In [238]:
# Cluster count per test, drawn from 3..20 with linearly decreasing weights
# 37/360, 35/360, ..., 3/360 (18 weights that sum to 1), so small cluster
# counts are the most likely.
_n_clusters = np.random.choice(
    a=range(3, 21),
    size=n_tests,
    p=np.arange(37, 2, -2) / 360,
)
_n_clusters

array([ 7, 16, 12,  7,  6,  3,  9, 14, 12,  5,  9,  3, 14, 14,  7,  3, 14,
        8, 10,  5])

In [239]:
# Half-width of the bounding box for cluster centers, one value per test,
# uniform in [1.5, 20). The plotting loop passes it to make_blobs as
# center_box=[-box_size, box_size].
boxes_size = np.random.uniform(low=1.5, high=20, size=n_tests)
boxes_size

array([ 7.02226103, 12.04717198,  1.98906162, 14.51410065,  3.60503013,
        4.84384429, 18.87015512, 15.466119  ,  8.85061458, 15.55030627,
        2.6558965 ,  6.99090029, 13.91235506, 10.26408431, 12.90602782,
        8.18145759, 16.95236204, 15.34410019,  1.53064875,  7.93438107])

In [240]:
# Noise proportion per test: one of 1 %, 5 % or 10 %, each equally likely.
_noise_proportion = np.random.choice(
    a=[0.01, 0.05, 0.1],
    size=n_tests,
)
_noise_proportion

array([0.01, 0.05, 0.05, 0.01, 0.05, 0.05, 0.05, 0.1 , 0.01, 0.1 , 0.05,
       0.05, 0.01, 0.05, 0.01, 0.01, 0.05, 0.01, 0.1 , 0.01])

In [241]:
# Cluster standard deviation per test (equally likely 0.5, 1, 2 or 3); the
# plotting loop forwards it to make_blobs as cluster_std.
_std = np.random.choice(
    a=[0.5, 1, 2, 3],
    size=n_tests,
)
_std

array([1. , 0.5, 3. , 2. , 0.5, 0.5, 1. , 2. , 2. , 3. , 1. , 2. , 2. ,
       2. , 1. , 3. , 1. , 2. , 0.5, 1. ])

In [242]:
# Kind of noise added to each test, chosen with equal probability:
#   'random'   -> points scattered uniformly over the center box,
#   'specific' -> a handful of points in a small blob placed beyond the box.
_noise_type = np.random.choice(
    a=['specific', 'random'],
    size=n_tests,
)
_noise_type

array(['specific', 'specific', 'specific', 'random', 'specific',
       'specific', 'specific', 'random', 'random', 'specific', 'random',
       'specific', 'specific', 'specific', 'specific', 'random', 'random',
       'specific', 'specific', 'random'], dtype='<U8')

In [243]:
# Cluster shape per test: 'standard' with probability 0.7; 'anisotropic',
# 'variance_diff' and 'unbalanced' with probability 0.1 each.
_cluster_type = np.random.choice(
    a=['standard', 'anisotropic', 'variance_diff', 'unbalanced'],
    size=n_tests,
    p=[0.7, 0.1, 0.1, 0.1],
)
_cluster_type

array(['variance_diff', 'standard', 'unbalanced', 'standard', 'standard',
       'standard', 'anisotropic', 'unbalanced', 'standard', 'standard',
       'standard', 'standard', 'standard', 'variance_diff', 'standard',
       'standard', 'standard', 'unbalanced', 'anisotropic', 'standard'],
      dtype='<U13')

In [244]:
# Per-test parameter columns. The plotting loop iterates over
# zip(*variations) and unpacks positionally, so the order here is significant:
# test id, sample count, cluster count, center-box half-width, cluster std,
# noise type, cluster type.
variations = [
    tests,
    _n_samples,
    _n_clusters,
    boxes_size,
    _std,
    _noise_type,
    _cluster_type
]

In [261]:
def _scatter_traces(points, labels):
    """Return plotly scatter traces for 2-D ``points`` coloured by ``labels``."""
    return px.scatter(
        x=points[:, 0], y=points[:, 1],
        color=labels.astype(str), color_discrete_sequence=px.colors.qualitative.Dark24
    ).data


# One subplot row per test: the left column shows the clean clusters, the
# right column shows the same data with noise appended. Titles start as
# placeholders and are filled in once the centroid distance is known.
fig = make_subplots(rows=n_tests, cols=2, subplot_titles=['placeholder'] * n_tests * 2)

# `_noise_proportion` is zipped in next to `variations` so that the proportion
# sampled and displayed earlier in the notebook is the one actually used,
# instead of drawing a fresh, unrecorded value inside the loop.
for (i, n_samples, n_clusters, box_size, std, noise_type, cluster_type,
     noise_proportion) in zip(*variations, _noise_proportion):
    if cluster_type == 'variance_diff':
        # Replace the shared std by an independent std for each cluster.
        std = np.random.uniform(0.5, 2.5, n_clusters)

    x_train, true_labels = make_blobs(
        n_samples=n_samples,
        centers=n_clusters,
        cluster_std=std,
        center_box=[-box_size, box_size],
    )

    if cluster_type == 'anisotropic':
        # Shear the blobs along one randomly chosen axis by tan(theta),
        # with theta an integer number of degrees in [20, 50).
        theta = np.radians(np.random.randint(20, 50))
        t = np.tan(theta)
        if np.random.choice([True, False]):
            shear = np.array(((1, t), (0, 1))).T
        else:
            shear = np.array(((1, 0), (t, 1))).T
        x_train = x_train.dot(shear)

    elif cluster_type == 'unbalanced':
        # Keep a shrinking share of each cluster: cluster c retains at most
        # int(n_samples / n_clusters / (c + 1)) + 2 of its points.
        kept_points, kept_labels = [], []
        for cluster_id in range(n_clusters):
            in_cluster = true_labels == cluster_id
            quota = int(n_samples / n_clusters / (cluster_id + 1)) + 2
            kept_points.append(x_train[in_cluster][:quota])
            kept_labels.append(true_labels[in_cluster][:quota])
        x_train = np.vstack(kept_points)
        true_labels = np.concatenate(kept_labels)

    # Left column: clean data, standardised for plotting.
    x_train_scaled = StandardScaler().fit_transform(x_train)
    fig.add_traces(_scatter_traces(x_train_scaled, true_labels), cols=1, rows=i)

    # The distance is computed on the clean data and shown on both subplots
    # of the row (annotations are stored row by row, two per row).
    avg_distance = round(calculate_distance(x_train_scaled, true_labels, n_clusters), 3)
    title = f'Avg distance: {avg_distance} - {cluster_type}'
    for column_offset in (0, 1):
        fig.layout.annotations[(i - 1) * 2 + column_offset].update(text=title)

    if noise_type == 'random':
        # Uniform noise over the whole center box, with random cluster labels.
        n_noisy_samples = int(n_samples * noise_proportion)
        noise = (np.random.random((n_noisy_samples, 2)) - 0.5) * box_size * 2
        noise_labels = np.random.randint(0, n_clusters, n_noisy_samples)

    else:
        # 'specific': 1 to 4 points in a single blob centred beyond the box.
        # NOTE(review): with a single center make_blobs labels every point 0,
        # so this noise is attributed to cluster 0 - confirm this is intended.
        noise, noise_labels = make_blobs(
            n_samples=np.random.randint(1, 5),
            centers=[np.random.uniform(box_size * 1.2, box_size * 2, 2)]
        )

    # Right column: data plus noise, re-standardised together.
    x_train = np.append(x_train, noise, axis=0)
    true_labels = np.append(true_labels, noise_labels)
    x_train_scaled = StandardScaler().fit_transform(x_train)
    fig.add_traces(_scatter_traces(x_train_scaled, true_labels), cols=2, rows=i)

In [262]:
# Allot 200 px of height to every test row and hide the legend (each subplot
# carries its own set of cluster labels, so a shared legend is not useful).
# An earlier variant hid the colour-axis scale instead, via
# .update_coloraxes(showscale=False).
fig.update_layout(showlegend=False, height=n_tests * 200)
# To keep a standalone copy, export with fig.write_html('test.html').
fig.show()