In [71]:
from sklearn.datasets import make_regression
from sklearn.cross_validation import train_test_split
from sklearn.kernel_ridge import KernelRidge
from sklearn.cluster import KMeans
import sys
import numpy as np
from numpy.linalg import norm 
import random
import math
import time 

def mykmeans(p,k,iter,centroids=None):
    y = np.zeros(len(p))
    if centroids == None:
        centroids = train_test_split(p.copy(), y, train_size=k, 
                                     random_state=10051999)[0]
    tol = 0.000001
    movement = tol + 1
    counter = 0
    err = 0
    while movement > tol and counter < iter:
        counter += 1
        
        C=[] # to hold samples near each centroid
        for i in range(k):
            C.append([])
        
        # for each sample
        for i in range(len(p)):
            
            # find nearest centroid
            nearest_centroid = 0
            for j in range(1,k):
                if norm(p[i]-centroids[j]) < \
                            norm(p[i]-centroids[nearest_centroid]):
                    nearest_centroid = j
                    
            C[nearest_centroid].append(i) # add sample's index to cluster
            
        movement = 0.0
        for i in range(k):
            cluster_sum = np.zeros(p[0].shape)
            for j in range(len(C[i])):
                cluster_sum += p[C[i][j]]
            movement += norm(centroids[i] - (cluster_sum / len(C[i])))
            centroids[i] = cluster_sum / len(C[i])
            
        err = 0
        for i in range(k):
            for j in range(len(C[i])):
                err += math.pow(norm(centroids[i]-p[C[i][j]]),2)
                
        if iter == counter:
            print 'WARNING: Ran out of iterations'
    return (centroids, err)


def mykmeans_pp(p,k,iter):
    centroids = []
    centroids.append(p[random.randint(0, len(p)-1)])

    for centroid_counter in range(1,k):
        # construct probability distribution space
        denomin = 0.0
        for i in range(len(p)):
            denomin += math.pow(norm(centroids[0]-p[i]),2)

        prob_range = 0.0
        for i in range(len(p)):
            prob_range += math.pow(norm(centroids[0]-p[i]),2)/denomin

        # make random draw
        random_point = random.uniform(0, prob_range)

        # map back to sample
        prob_range = 0.0
        for i in range(len(p)):
            prob_range += math.pow(norm(centroids[0]-p[i]),2)/denomin
            if random_point < prob_range:
                centroids.append(p[i])
                break

    return mykmeans(p,k,iter,np.asarray(centroids))


def mykmeans_multi(p,k,iter,rep,method=None):
    runs = []
    errs = []
    for r in range(rep):
        if method == '++':
            run = mykmeans_pp(p,k,iter)
        else:
            run = mykmeans(p,k,iter)
        runs.append(run[0])
        errs.append(run[1])
    return (runs[np.argmin(errs)], errs[np.argmin(errs)])


# Make up data
X, y, true_coefficient = make_regression(n_samples=80, n_features=3, 
                                         n_informative=3, noise=10, 
                                         coef=True, random_state=20140210)


start = time.time()
print mykmeans(X.copy(),5,50)
print
end = time.time()
print 'Time:', end - start
print

start = time.time()
print mykmeans_multi(X.copy(),5,50,100)
print
end = time.time()
print 'Time:', end - start
print

start = time.time()
print mykmeans_pp(X.copy(),5,50)
print
end = time.time()
print 'Time:', end - start
print

# base line
start = time.time()
km = KMeans(n_clusters=5)
km.fit(X.copy())
print km.cluster_centers_
print
end = time.time()
print 'Time:', end - start



(array([[ 0.09528809, -0.83259786,  0.82130347],
       [-1.01973921, -0.46027844, -0.04408722],
       [-0.49424394, -0.23018475, -1.34050889],
       [-0.38944761,  1.05612647, -0.14326015],
       [ 1.31343172,  0.38054322,  0.41188043]]), 102.28056575247705)

Time: 0.0254218578339

(array([[ 0.09528809, -0.83259786,  0.82130347],
       [-1.01973921, -0.46027844, -0.04408722],
       [-0.49424394, -0.23018475, -1.34050889],
       [-0.38944761,  1.05612647, -0.14326015],
       [ 1.31343172,  0.38054322,  0.41188043]]), 102.28056575247705)

Time: 2.49410009384

(array([[-0.56856595,  0.841645  , -0.86473581],
       [-0.73251169, -0.69345415, -0.29856372],
       [ 0.06625869,  0.21004775,  2.29757081],
       [ 0.84793156,  0.24965271,  0.29491189],
       [ 0.16233406, -1.63615348,  0.63218236]]), 95.14240345029782)

Time: 0.0523109436035

[[ 1.18385811  0.42936766  0.40548184]
 [-0.36063787  0.91685031 -0.75378446]
 [ 0.26827519 -1.73544242  0.57601606]
 [-0.3138092  -0.20149768

