In [1]:
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
import os
import numpy as np
from tqdm import tqdm

In [2]:
def plot_generated_data(Xc, Xn, N, noise_ratio, title=None):
    """
    Plot clustered points and uniform noise, then save the figure as a PDF.

    The figure is written to ``plots/data_{N}_{noise_ratio}.pdf`` (any '.' in
    ``noise_ratio`` is replaced by '_'); the ``plots`` directory is created if
    it does not exist. The figure is not shown and nothing is returned.

    Parameters:
        Xc: ndarray of clustered points (N x 2)
        Xn: ndarray of uniform noise points (M x 2)
        N: total number of points; only used to build the output filename
        noise_ratio: noise fraction; only used to build the output filename
        title: optional plot title
    """
    fig, ax = plt.subplots(figsize=(6, 6))
    try:
        if len(Xc) > 0:
            ax.scatter(Xc[:, 0], Xc[:, 1], s=10, color="C0", label="Clustered points")
        if len(Xn) > 0:
            ax.scatter(Xn[:, 0], Xn[:, 1], s=10, color="C1", label="Uniform noise")

        # Only draw a legend when something labeled was plotted; calling
        # legend() on an empty axes makes matplotlib emit a warning.
        handles, _ = ax.get_legend_handles_labels()
        if handles:
            ax.legend()
        ax.set_xlabel("x")
        ax.set_ylabel("y")
        ax.set_title(title or "Generated Data (Clusters + Noise)")
        ax.axis("equal")
        ax.grid(True)
        fig.tight_layout()

        os.makedirs('plots', exist_ok=True)
        noise_ratio_file_title = str(noise_ratio).replace('.', '_') # No '.' in the filename just to be sure
        fig.savefig(f'plots/data_{N}_{noise_ratio_file_title}.pdf')
    finally:
        # Always release the figure, even if saving fails, so repeated calls
        # in a loop do not accumulate open figures.
        plt.close(fig)

In [3]:
# Not referenced in this cell; kept for any later cells that rely on them.
eps = 0.5
min_pts = 5

# One seed shared by the blob generator and the noise generator so that
# Restart & Run All reproduces exactly the same plots.
random_seed = 42

# Sizes and noise levels to test.
# Full grid (slower): grid_N = [1_000, 10_000, 100_000, 500_000]
grid_N = [1_000, 10_000]
noise_ratios = [0.0, 0.2, 0.5]    # fraction of points uniformly at random

results = []  # not filled in this cell
total = len(grid_N) * len(noise_ratios)

# Seeded generator: the original drew noise from the unseeded global
# np.random state, so the noise points differed on every run.
rng = np.random.default_rng(random_seed)

# The context manager closes the progress bar even if an iteration raises.
with tqdm(desc='Generating plots...', total=total) as p_bar:
    for N in grid_N:
        for noise_frac in noise_ratios:
            p_bar.set_postfix_str(f'N={N}; noise={noise_frac}')
            # Round before truncating so float error in the product
            # (e.g. 0.29 * 100 == 28.999...) cannot drop a point.
            n_noise = int(round(N * noise_frac))
            n_clusters = N - n_noise
            # Generate clustered data
            Xc, _ = make_blobs(n_samples=n_clusters,
                               centers=[(-5,-5),(5,5),(5,-5),(-5,5)],
                               cluster_std=1.0, random_state=random_seed)
            # Generate uniform noise
            Xn = rng.uniform(low=-10, high=10, size=(n_noise, 2))
            # Combined dataset; not used in this cell, kept for later cells.
            X = np.vstack([Xc, Xn])

            plot_generated_data(Xc, Xn, N, noise_frac, title=f'Generated data for N={N} and noise ratio={noise_frac}')
            # Advance only after the plot has actually been written.
            p_bar.update(1)

print('Plots saved to /plots folder!')

Generating plots...: 100%|██████| 6/6 [00:00<00:00,  7.88it/s, N=10000; noise=0.5]

Plots saved to /plots folder!
