In [1]:
import numpy as np
import time
from sklearn.cluster import KMeans

In [2]:
# Algorithm & pseudocode for the k-means algorithm.
# step 1: randomly initialize K cluster centers, initialize iteration to 0
# step 2: repeat until convergence or until the iteration count reaches the maximum number of iterations
#     increment iteration by 1
#     step 2-1: for each data point x in X: assign x to the nearest cluster center
#     step 2-2: for each cluster center: update the center to the mean of assigned data points
#     step 2-3: check for convergence
# end

In [3]:
# Implementation of the custom KMeans function:
def Kmeans(X, K, iter_max = 300):
	"""Cluster the rows of X into K groups with Lloyd's k-means algorithm.

	The initial centers are K distinct rows of X drawn with a fixed seed (0),
	so repeated calls on the same data return the same result.

	Parameters
	----------
	X : array-like of shape (n_samples, n_features)
		Data points, one per row. Converted to a float ndarray.
	K : int
		Number of clusters; must satisfy 1 <= K <= n_samples.
	iter_max : int, optional
		Maximum number of center-update iterations (default 300).

	Returns
	-------
	labels : ndarray of shape (n_samples,)
		Index of the nearest returned center for each row of X.
	centers : ndarray of shape (K, n_features)
		Final cluster centers.

	Raises
	------
	ValueError
		If X is not 2-D, or if K is outside [1, n_samples].
	"""
	# Work in float so that cluster means are never truncated for integer input.
	X = np.asarray(X, dtype=float)
	if X.ndim != 2:
		raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
	if not 1 <= K <= len(X):
		raise ValueError("K must be between 1 and the number of data points")

	def nearest_center(current_centers):
		# Broadcasting (n, 1, d) - (K, d) -> (n, K, d); the norm over the last
		# axis gives the (n, K) matrix of point-to-center distances.
		distances = np.linalg.norm(X[:, np.newaxis] - current_centers, axis=2)
		return np.argmin(distances, axis=1)

	# step 1:
	# A private RandomState seeded with 0 makes the same draw that
	# np.random.seed(0) + np.random.choice would, but leaves the caller's
	# global NumPy random state untouched.
	rng = np.random.RandomState(0)
	centers = X[rng.choice(len(X), size=K, replace=False)]
	# Assign once up front so labels are defined even when iter_max is 0.
	labels = nearest_center(centers)

	# step 2:
	for _ in range(iter_max):
		# step 2-2:
		new_centers = centers.copy()
		for k in range(K):
			members = X[labels == k]
			# An empty cluster keeps its previous center; taking the mean of
			# zero rows would yield NaN and prevent convergence.
			if len(members) > 0:
				new_centers[k] = members.mean(axis=0)
		# step 2-3:
		if np.array_equal(centers, new_centers):
			break
		centers = new_centers
		# step 2-1:
		# Re-assign against the updated centers so the returned labels always
		# match the returned centers, even when iter_max is exhausted.
		labels = nearest_center(centers)
	return labels, centers

In [8]:
# Comparing execution time between sklearn's kmeans and the custom kmeans above.
# Create a dataset for testing. A dedicated seeded generator keeps the data
# identical on every Restart & Run All without touching the global NumPy RNG.
dataset = np.random.RandomState(42).rand(100, 3)
K = 3
iter_max = 300


# Record the execution time for custom kmeans.
# time.perf_counter() is the high-resolution monotonic clock meant for timing
# short intervals; time.time() is a wall clock that can be coarse or adjusted.
# NOTE: each timing below is a single run, so expect run-to-run noise.
start_time_custom = time.perf_counter()
labels,centers = Kmeans(dataset, K, iter_max)
custom_kmeans_execution_time = time.perf_counter() - start_time_custom


# Record the execution time for sklearn's kmeans.
# n_init=1 and init='random' mirror the custom implementation: one run, started
# from randomly chosen data points.
start_time_sklearn = time.perf_counter()
kmeans_sklearn = KMeans(n_clusters=K, max_iter = iter_max, random_state=0, n_init=1, init='random').fit(dataset)
sklearn_kmeans_execution_time = time.perf_counter() - start_time_sklearn

print("Custom kmeans execution time: ", custom_kmeans_execution_time, " seconds;")
print("sklearn kmeans execution time: ", sklearn_kmeans_execution_time, " seconds;")


Custom kmeans execution time:  0.0031287670135498047  seconds;
sklearn kmeans execution time:  0.008269071578979492  seconds;
