In [280]:
import numpy as np
from matplotlib import pyplot as plt
import random
from collections import defaultdict

#Read coordinate csv.
#height,tail length,leg length,nose circumference
def read_coords(filename):    
    myFile = open(filename)
    row =0
    coords =[]
    for line in myFile:
        coords.append(line.rstrip().split(",")[:])
        #coords[row] = line.rstrip().split(",")[:]
    myFile.close()
    return coords

def compute_euclidean_distance(centroid, obj):
    dist = 0
    for i in range(len(centroid)):
        dist += (obj[i] - centroid[i])**2
    return dist

#Initialises centroids based on min/max values of datapoints
def initialise_centroids(dataset, k):
    min_x = np.amin(dataset[:,0])
    max_x = np.amax(dataset[:,0])
    min_y = np.amin(dataset[:,1])
    max_y = np.amax(dataset[:,1])
    
    centroids = []
    for i in range(1, k+1):
        coords = []
        #Randomise between min and max values for both columns
        x = np.random.uniform(min_x, max_x)
        y = np.random.uniform(min_y, max_y)
        #Add to array, convert
        coords.append(x)
        coords.append(y)
        coords = np.asarray(coords)
        centroids.append(coords)
    centroids = np.asarray(centroids)
    return centroids

def find_nearest_centroid(centroids, point):
    closest = None
    winner = 50000000000000000
    #For each centroid compute distance to point, if value is smaller than
    #the previous smallest, then replace and carry on until end
    for center in centroids:
        val = compute_euclidean_distance(center, point)
        if val < winner:
            winner = val
            closest = center
    return closest

#compare parameters of vector 
def compare(p1, p2):
    #if components are the same return true
    if(p1[0] == p2[0]) and (p1[1] == p2[1]):
        return True
    else:
        return False
#take 2d array and search for point in it
#if found, remove, and return array + boolean
def searchFor(point, arr):
    for i in range(len(arr)):
        if compare(point, arr[i]) == True:
            #arr = arr.pop(i)
            del arr[i]
            return arr, True
    #pass mutated array back further up
    return arr, False
#look for value in clusters
def deleteFromCluster(point, clusters):
    for key in clusters:
        arr, found = searchFor(point, clusters[key])
        #if found, then replace contents
        if(found == True):
            #print(type(arr))
            clusters[key] = arr
    return clusters

def kmeans(dataset, k):
    #Init centroids
    centroids = initialise_centroids(dataset, k)
    #define clusters
    clusters = {}
    for cent in centroids:
        clusters[np.array2string(cent)] = []
    
    #For each item in the set
    for item in dataset:
        #Find which centroid is nearest
        closest = find_nearest_centroid(centroids, item)
        #Assign to cluster
        clusters = deleteFromCluster(item, clusters)
        key = np.array2string(closest)
        temp = clusters.get(key)
        #print(type(temp))
        temp.append(item)
        
            
    return centroids, clusters

#Read in data
data = read_coords("ML_task2_test.csv")
data = np.asarray(data)
#Get height & tail length, join as tuples and convert
height_tail = list(zip(data[:,0], data[:,1]))
height_tail = np.asarray(height_tail)
height_tail = height_tail.astype(float)

centroids, clusters = kmeans(height_tail, 2)
print(clusters)

{'[4.69697288 3.02425738]': [array([4.949, 3.03 ]), array([4.646, 3.131]), array([4.646, 3.434]), array([4.444, 2.929]), array([4.949, 3.131]), array([4.848, 3.03 ]), array([4.343, 3.03 ]), array([5.151, 3.535]), array([5.454, 3.434]), array([5.151, 3.333]), array([4.848, 3.434]), array([5.05, 3.03]), array([5.05 , 3.434]), array([5.252, 3.535]), array([5.252, 3.434]), array([4.747, 3.232])], '[4.5864944  4.03635245]': [array([5.05 , 3.636]), array([5.454, 3.737]), array([5.858, 4.04 ]), array([5.757, 4.444]), array([5.454, 3.939]), array([5.757, 3.838]), array([5.151, 3.838]), array([5.151, 3.737]), array([4.646, 3.636])]}
