In [5]:
import numpy as np
from numpy import random
import pandas as pd
from sklearn.neighbors import DistanceMetric
import functools
import operator
import matplotlib.pyplot as plt
import random
from scipy.spatial import distance

In [6]:
def simulate_data_5(N):
    X1 = np.random.uniform(low=0, high=25, size=(N))
    X2 = np.random.uniform(low=0, high=25, size=(N))
    X3 = np.random.uniform(low=0, high=25, size=(N))
    X4 = np.random.uniform(low=0, high=25, size=(N))
    X5 = np.random.uniform(low=0, high=25, size=(N))
    return X1, X2, X3, X4, X5 

In [7]:
from scipy.spatial import distance
def compute_dist(df):
    result = []
    for i in range(len(df)):
        for j in range(i+1,len(df)):
            d = distance.euclidean(df.iloc[i].to_list(),df.iloc[j].to_list())
            result.append([i, j, d])

    ordered_result = sorted(result, key=lambda t: t[::-1])
    return ordered_result

In [166]:
N = 215
X1, X2, X3, X4, X5 = simulate_data_5(N)
df = pd.DataFrame({'X1': X1, 'X2': X2,'X3': X3, 'X4': X4, 'X5': X5}, columns=['X1', 'X2','X3', 'X4', 'X5'])
#ordered_result = compute_dist(df)

In [167]:
# Randomly assign indeces of cluster centroids:
def initiate_centroids(n, df):
    centroids = []
    random.seed(42) #for reproducability  
    #generate random centroid indeces
    initial_index_centroid = random.sample(range(0, len(df)), n)
    #find the data points corresponding the the indeces:
    for i in initial_index_centroid:
        centroids.append(df.loc[i])
    return np.array(centroids)

# To find the closest centroid to each data point:
def findClosestCentroids(centroids, df):
    assigned_centroid = []
    arr = df.to_numpy()
    #iterate over every data point in the dataframe:
    for row in arr:
        distance=[]
        #find distance of data point with each cluster:
        for center in centroids:          
            distance.append(np.linalg.norm(row-center))     
        #assign data point to closest cluster:
        assigned_centroid.append(np.argmin(distance))
    return assigned_centroid


#To update the centroid of the clusters:
def calc_centroids(clusters, df): 
    #initiate empty list for new centroids of each cluster:
    new_centroids = []
    #df including each point and its respective cluster 
    arr = np.c_[df.to_numpy(), clusters]
    #iterate over the distinct clusters
    for c in np.unique(clusters):    
        #take out the data points corresponding to each cluster:
        current_arr = arr[arr[:, -1] == c][:,:-1]   
        #find the new cluster centroid which is the mean of the clusters we already assigned
        cluster_mean = current_arr.mean(axis=0)                                          
        #append the new centroid
        new_centroids.append(cluster_mean)   
    return np.vstack(new_centroids)

#Recursively find and update cluster centroids:
#n: number of clusters, df: dataframe of data points, iterations: number of iterations
def recursive_centroid_find(n, df):
    #initiate centroids:
    old_centroids = initiate_centroids(n, df)
    #initiate new centroids (do this for first while loop condition to work)
    new_centroids = calc_centroids(findClosestCentroids(old_centroids, df), df)
    # Iterate until the absoulte difference between the coordinates of all centroids 
    #does not change, ie: the k-means algorithm converges:
    while np.absolute(np.array(new_centroids) - np.array(old_centroids)).all() > 0.0005:
        #set new centroids as the old ones:
        old_centroids = new_centroids
        #find the new ones:
        new_centroids = calc_centroids(findClosestCentroids(old_centroids, df), df)
        #print(np.array(new_centroids))
    return findClosestCentroids(new_centroids, df)

In [169]:
%%time
recursive_centroid_find(100, df)

CPU times: user 256 ms, sys: 3.91 ms, total: 260 ms
Wall time: 286 ms


[73,
 35,
 51,
 42,
 90,
 20,
 2,
 17,
 16,
 42,
 43,
 51,
 85,
 95,
 87,
 48,
 12,
 62,
 78,
 89,
 56,
 10,
 13,
 18,
 46,
 68,
 9,
 96,
 1,
 88,
 66,
 54,
 82,
 15,
 94,
 7,
 54,
 71,
 85,
 40,
 36,
 73,
 95,
 79,
 85,
 2,
 29,
 72,
 49,
 61,
 25,
 93,
 49,
 76,
 34,
 19,
 31,
 6,
 64,
 20,
 56,
 22,
 5,
 91,
 20,
 76,
 81,
 50,
 77,
 84,
 4,
 34,
 98,
 24,
 97,
 58,
 70,
 84,
 86,
 89,
 91,
 47,
 30,
 86,
 31,
 72,
 42,
 38,
 48,
 68,
 75,
 47,
 59,
 72,
 74,
 70,
 55,
 45,
 2,
 87,
 94,
 56,
 37,
 91,
 30,
 89,
 3,
 30,
 15,
 7,
 51,
 50,
 99,
 39,
 32,
 64,
 71,
 52,
 82,
 68,
 68,
 35,
 69,
 54,
 28,
 84,
 48,
 30,
 49,
 21,
 6,
 67,
 99,
 4,
 4,
 65,
 10,
 53,
 25,
 12,
 18,
 57,
 65,
 24,
 98,
 83,
 32,
 60,
 12,
 9,
 33,
 14,
 54,
 29,
 22,
 90,
 51,
 81,
 66,
 19,
 65,
 6,
 8,
 0,
 42,
 6,
 27,
 94,
 45,
 69,
 36,
 76,
 5,
 10,
 33,
 70,
 11,
 89,
 92,
 28,
 39,
 73,
 79,
 26,
 11,
 79,
 90,
 15,
 8,
 14,
 97,
 90,
 49,
 10,
 67,
 41,
 44,
 91,
 95,
 37,
 71,
 87,
 29,
 26,
 