### **K-means**

- Assign each data point to its nearest cluster. This requires computing the distance between every data point and every centroid - these distance calculations are handled on the GPU.
- Update the cluster centroids. Recalculate each centroid as the mean of the points assigned to it, using parallel reduction algorithms.
- Check and repeat until convergence.

Perform K-means clustering using cuML, then reduce dimensionality using cuML UMAP.

#### GPU K-Means

In [None]:
#%load_ext cuml.accel
from sklearn.datasets import make_blobs
from cuml.cluster import KMeans # Changed to import KMeans from cuML
from time import time
import cupy as cp

# Benchmark configuration. These names are reused by later cells, so keep them stable.
n_samples = 1_000_000
n_features = 10
n_clusters = 5
random_state = 42
n_init = 10
# generate synthetic data
X, _ = make_blobs(n_samples = n_samples, centers = n_clusters, n_features = n_features, random_state = random_state)
print(f'Data shape {X.shape}')
# convert NumPy array to a CuPy array for GPU processing
X_gpu = cp.asarray(X)
# time only the clustering itself; data generation and the host-to-device copy are excluded
start_time = time()
# initialize cuML's KMeans
kmeans = KMeans(n_clusters = n_clusters, random_state = random_state, n_init = n_init)
kmeans.fit(X_gpu)
cluster_labels = kmeans.labels_
end_time = time()
# convert the cluster centers from a CuPy array back to a NumPy array
cluster_centers = cp.asnumpy(kmeans.cluster_centers_)
inertia = kmeans.inertia_
# report the measurements; previously the timing and inertia were computed but never shown
print(f'cuML KMeans fit time: {end_time - start_time:.3f} s')
print(f'Inertia: {float(inertia):.2f}')

Data shape (1000000, 10)


#### cuML K-Means with cuML UMAP

In [None]:
# import UMAP here: the cells above only import KMeans from cuML, so UMAP would
# otherwise be undefined when this cell runs
from cuml.manifold import UMAP

# generate the data
X, labels = make_blobs(n_samples=n_samples, n_features=n_features, centers=n_clusters, cluster_std=0.5, random_state=random_state)
# convert data to a GPU array using CuPy
X_gpu = cp.asarray(X)
# use the shared n_clusters / random_state so the model matches the generated blobs
kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=50)
# fit on the GPU-resident array directly; X_gpu.get() would copy it back to host memory first
kmeans.fit(X_gpu)
cluster_labels = kmeans.labels_
# reduce dimensionality to 3D with cuML UMAP
umap3D = UMAP(n_neighbors=15, min_dist=0.1, n_components=3)
embedding_3d = umap3D.fit_transform(X_gpu)
# convert GPU arrays back to NumPy for visualization
embedding_3d = cp.asnumpy(embedding_3d)
cluster_labels_cpu = cp.asnumpy(cluster_labels)
# NOTE(review): every sample is embedded and later plotted; with n_samples = 1_000_000
# the interactive 3D figure may be very heavy - consider subsampling before plotting.

# Interactive 3D scatter plot of the UMAP embedding, coloured by K-Means cluster.
import plotly.express as px

# String labels so the clusters are coloured as discrete categories rather than on a continuous scale.
cluster_names = cluster_labels_cpu.astype(str)

# Axis captions for the 3D scene, one per UMAP component.
scene_axes = {
    'xaxis_title': 'UMAP Dimension 1',
    'yaxis_title': 'UMAP Dimension 2',
    'zaxis_title': 'UMAP Dimension 3',
}

fig = px.scatter_3d(
    x=embedding_3d[:, 0],
    y=embedding_3d[:, 1],
    z=embedding_3d[:, 2],
    color=cluster_names,
    title="3D UMAP Visualization with K-Means Clusters",
    labels={'color': 'Cluster'},
)

# Apply one marker size to every trace, then label the axes.
fig.update_traces(marker={'size': 5})
fig.update_layout(scene=scene_axes)

fig.show()