In [1]:
import numpy as np
import pandas as pd
import math
import random
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import functools # import function tools working with reduce()
from ipynb.fs.full.vector import *


class KMeans:
    '''Performs k-means clustering.

    Attributes:
        k: number of clusters to fit.
        means: list of the k cluster means; None until train() is called.
    '''
    def __init__(self, k):
        self.k = k # number of clusters
        self.means = None # means of the clusters, populated by train()

    def classify(self, input):
        '''Return the index of the cluster whose mean is closest to the input.

        Closeness is measured with squared_distance(); on a tie the lowest
        cluster index wins because min() keeps the first minimum it sees.
        '''
        return min(range(self.k), key = lambda i: squared_distance(input, self.means[i]))

    def train(self, inputs):
        '''Fit the k cluster means to inputs.

        Starts from k points sampled at random from inputs, then alternates
        between assigning every point to its closest mean and recomputing each
        mean from its assigned points, until the assignments stop changing.

        Args:
            inputs: iterable of points accepted by squared_distance() and
                vector_mean().

        Raises:
            ValueError: propagated from random.sample() when k is larger than
                the number of inputs.
        '''
        # snapshot the points so they can be sampled and re-iterated safely
        points = list(inputs)

        # choose k random points as the initial means
        self.means = random.sample(points, self.k)
        assignments = None

        while True:
            # find assignments; map() is a one-shot iterator in Python 3, so it
            # must be materialised: two map objects never compare equal (the
            # loop would never end) and the first zip() below would drain it
            new_assignments = list(map(self.classify, points))

            # if no assignments have changed we are done
            if assignments == new_assignments:
                return

            # otherwise keep the new assignments
            assignments = new_assignments

            # compute new means based on the new assignments
            for i in range(self.k):
                # find all the points assigned to cluster i
                i_points = [p for p, a in zip(points, assignments) if a == i]

                # an empty cluster keeps its previous mean
                if i_points:
                    self.means[i] = vector_mean(i_points)

def squared_clustering_errors(inputs, k):
    '''Return the total squared error of clustering inputs into k clusters.

    Trains a fresh KMeans(k) on inputs, then adds up the squared distance
    from every point to the mean of the cluster it is classified into.
    '''
    model = KMeans(k)
    model.train(inputs)
    centers = model.means

    total_error = 0
    for point in inputs:
        nearest = model.classify(point)
        total_error = total_error + squared_distance(point, centers[nearest])
    return total_error


[ 85  75 130]
Scalar product:  [400 500 600]
Vector mean:  [28.33333333 25.         43.33333333]
Dot product:  3200
Sum of square:  7700
Magnitude or length:  87.74964387392122
Distance between two vector:  51.96152422706632


In [None]:
random.seed(0)  # fix the RNG so the randomly sampled initial means are reproducible

# 2-D sample points. These must be bound to a real name (not left inside a
# string literal) because the error-vs-k cell below reads `inputs`.
inputs = [[-44, 5], [-16, 10], [18, 20], [-50, 0], [-45, 5], [-40, 7], [-35, -1], [-27, -10], [-22, -18], [-20, -15],
          [-18, -7], [-12, -20], [-9, -19], [-11, -10], [-10, -8], [-12, -8], [10, 10], [10, 12], [20, 20], [20, 18],
          [25, 25], [20, 25], [28, 10]]

clusterer = KMeans(3)
# NOTE(review): training is still disabled here, so `means` is None when printed below.
#clusterer.train(inputs)
print(clusterer.means)

In [None]:

# Total squared clustering error for every cluster count from 1 up to the
# number of points, plotted to look for the "elbow".
ks = range(1, len(inputs)+1)
errors = [squared_clustering_errors(inputs, k) for k in ks]
plt.plot(ks, errors)
plt.xticks(ks)  # one tick per k (pyplot has no `xtricks`; that call raised AttributeError)
plt.title("Total error vs. # of clusters")
plt.ylabel("Total squared error")
plt.xlabel("K")
plt.show()