In [255]:
from sklearn.datasets import load_iris
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import pairwise_distances

import cProfile


def getPermutation(totalRange,numberElements):
    random_seed = 10312003
    rng = np.random.RandomState(random_seed)
    permutation = rng.permutation(totalRange)
    return permutation[:numberElements]


def onlineKmeans(X,k=3,b=30,maxiter=1000):
    #print X
    centroids = X[getPermutation(len(X),k)]
    pointsPerClusters = np.zeros([k,1])
    for i in range(maxiter):
        M=X[getPermutation(len(X),b)]
        distances = pairwise_distances(M, centroids, metric='euclidean')
        nearestCenters = np.argmin(distances, axis=1)
        for iter, x in enumerate(M):
            centerIndex = nearestCenters[iter]
            pointsPerClusters[centerIndex] = pointsPerClusters[centerIndex] + 1
            eta = 1/pointsPerClusters[centerIndex]
            centroids[centerIndex] = (1 - eta)*centroids[centerIndex] + eta * x

    return centroids

In [256]:
from timeit import Timer
from functools import partial
from memory_profiler import memory_usage
import timeit

def profile_memory_and_time(function, *args, **kwargs):
    #print "args is ",args
    start_time = timeit.default_timer()
    memory, return_val = memory_usage((function, (args), kwargs), max_usage=True, retval=True)
    elapsed = timeit.default_timer() - start_time
    return memory[0], elapsed,return_val
  

In [263]:
iris = load_iris()
X=iris.data
random_seed = 10312003
rng = np.random.RandomState(random_seed)
permutation = rng.permutation(len(X))
X = X[permutation]
ourMemory,ourTime,ourCentroids= profile_memory_and_time(onlineKmeans, X, k=3, b=33, maxiter=1000)
print ourMemory,ourTime,ourCentroids

41.65625 0.524281978607 [[ 6.57765957  3.0259482   5.43008458  1.98821197]
 [ 6.04293513  2.8614828   4.35799162  1.38494351]
 [ 4.88071879  3.35022525  1.440905    0.18031835]]


In [264]:
from sklearn.cluster import MiniBatchKMeans
minibatch=MiniBatchKMeans(n_clusters=3,max_iter=100,batch_size=33)
memory, time, rval = profile_memory_and_time(minibatch.fit,X)
print memory, time, rval.cluster_centers_

41.65625 0.115921974182 [[ 5.01335505  3.41433225  1.46091205  0.247557  ]
 [ 5.88854167  2.73880208  4.35651042  1.40989583]
 [ 6.87467811  3.08798283  5.74892704  2.03562232]]


In [281]:
distances=pairwise_distances(rval.cluster_centers_, np.array(ourCentroids), metric='euclidean')
np.amin(distances,axis=0)
#print distances
print np.amin(distances,axis=1)

[ 0.1631668   0.19877804  0.4426924 ]
