In [1]:
import pandas as pd

In [2]:
#function that creates a dataframe from a csv file with the column containing strings being indexed
def create_indexed_dataframe(csv_filename):
    initial_data = pd.read_csv(csv_filename)
    #iterate over columns to check for columns that contain anything other than numbers
    for col in initial_data.columns:
        #index the column that does not contain numbers
        if initial_data[col].dtype == object:
            indexed_data = initial_data.set_index(col)
            return indexed_data
    #if all columns contain numbers then simply return the original dataframe
    return(initial_data)

In [3]:
"""function that will create K new columns, where K = number of centroids,
these columns will contain the Manhattan distances between the indexed
seed centroid and the datapoints"""
def manhattan_distance_k_means(dataframe, *centroids):
    #set the new_row variable to 0
    new_row = 0
    #the num_cols is equal to number of columns in the dataframe
    num_cols = len(dataframe.columns)
    """the name of the columns will be related to this string,
    the dist_col_num variable will be incremented in accordance
    to the distance column number"""
    string = "DistC1"
    dist_col_num = 1
    #iterate over the centroids
    for centroid in centroids:
        #create an empty list which will be used to populate the respective distance columns
        distance = []
        #iterate over each row and column in the dataframe
        for row_num in range(len(dataframe)):
            for i in range(num_cols):
                """the value of the new row for the new column will be incremented
                by the Manhattan distance between the various datapoints of the 
                centroids and the datapoints themselves"""
                new_row += (abs(centroid[i] - dataframe.iloc[row_num][i]))
            #append the final value of new_row to the distance list
            distance.append(new_row)
            #reset the value of new_row to 0
            new_row = 0
        """the name of the new column is the string variable and is populated
        by the distance list"""
        dataframe[string] = distance
        #decrement the string variable by one character
        string = string[:-1]
        #increment the dist_col_num variable
        dist_col_num += 1
        #concatenate the str of the dist_col_num to the string
        string += str(dist_col_num)
    return dataframe

In [4]:
"""function that will create K new columns, where K = number of centroids,
these columns will contain the Euclidean distances between the indexed
seed centroid and the datapoints"""
def euclidean_distance_k_means(dataframe, *centroids):
    #set the new_row variable to 0
    new_row = 0
    #the num_cols is equal to number of columns in the dataframe
    num_cols = len(dataframe.columns)
    """the name of the columns will be related to this string,
    the dist_col_num variable will be incremented in accordance
    to the distance column number"""
    string = "DistC1"
    dist_col_num = 1
    #iterate over the centroids
    for centroid in centroids:
        #create an empty list which will be used to populate the respective distance columns
        distance = []
        #iterate over each row and column in the dataframe
        for row_num in range(len(dataframe)):
            for i in range(num_cols):
                """the value of the new row for the new column will be incremented
                by the Euclidean distance between the various datapoints of the 
                centroids and the datapoints themselves"""
                new_row += ((centroid[i] - dataframe.iloc[row_num][i])**2)
            #get the square root of the new_row value, in accordance to the Euclidean formula
            new_row**=(1/2)
            #append the final value of new_row to the distance list
            distance.append(new_row)
            #reset the value of new_row to 0
            new_row = 0
        """the name of the new column is the string variable and is populated
        by the distance list"""
        dataframe[string] = distance
        #decrement the string variable by one character
        string = string[:-1]
        #increment the dist_col_num variable
        dist_col_num += 1
        #concatenate the str of the dist_col_num to the string
        string += str(dist_col_num)
    return dataframe

In [5]:
"""function that will create K new columns, where K = number of centroids,
these columns will contain the Euclidean distances between the indexed
seed centroid and the datapoints"""
def minkowski_distance_k_means(dataframe, p, *centroids):
    #set the new_row variable to 0
    new_row = 0
    #the num_cols is equal to number of columns in the dataframe
    num_cols = len(dataframe.columns)
    """the name of the columns will be related to this string,
    the dist_col_num variable will be incremented in accordance
    to the distance column number"""
    string = "DistC1"
    dist_col_num = 1
    #iterate over the centroids
    for centroid in centroids:
        #create an empty list which will be used to populate the respective distance columns
        distance = []
        #iterate over each row and column in the dataframe
        for row_num in range(len(dataframe)):
            for i in range(num_cols):
                """the value of the new row for the new column will be incremented
                by the Minkowski distance between the various datapoints of the 
                centroids and the datapoints themselves"""
                new_row += (abs(centroid[i] - dataframe.iloc[row_num][i])**p)
            #get the pth root of the new_row value, in accordance to the Minkowski formula
            new_row**=(1/p)
            #append the final value of new_row to the distance list
            distance.append(new_row)
            #reset the value of new_row to 0
            new_row = 0
        """the name of the new column is the string variable and is populated
        by the distance list"""
        dataframe[string] = distance
        #decrement the string variable by one character
        string = string[:-1]
        #increment the dist_col_num variable
        dist_col_num += 1
        #concatenate the str of the dist_col_num to the string
        string += str(dist_col_num)
    return dataframe

In [6]:
"""function that returns the new centroid, based on the 
average of the datapoints that belong to a particular cluster"""
def new_centroid(total, n):
    return total/n

In [7]:
#function that returns the new centroids
def next_centroids(dataframe, K):
    #create a dataframe that takes the values of last K columns
    distances = dataframe.iloc[:, -K:]
    #create a list that takes the minimum values of each row of the dataframe
    smallest = distances.min(axis=1).tolist()
    #create an empty dictionary
    new_centroids = {}
    for i in range(K):
        #create new variables and set them to 0
        row_num = 0
        points = 0
        n = 0
        #iterate over each value in the list
        for value in smallest:
            """if the value belongs to a particular column, add the row from
            the original dataframe to the points variable"""
            if value == distances.iloc[row_num, i]:
                points += dataframe.iloc[row_num, : (len(dataframe.columns) - K)]
                n += 1
            #increment the row_num variable    
            row_num+=1
        #add the value returned by the new_centroid function to the dictionary
        new_centroids["NewC" + str(i+1)] = new_centroid(points, n).tolist()
    #convert the values of the dictionary to a list    
    new_centroids = list(new_centroids.values())
    #return the list
    return new_centroids

In [8]:
#function that assigns points to clusters when algorithm converges
def clusters(dataframe, K):
    #create a dataframe that takes the values of the last K columns
    distances = dataframe.iloc[:, -K:]
    #create a list that takes the minimum values of each row of the dataframe
    smallest = distances.min(axis=1).tolist()
    #create a new column called "Clusters"
    dataframe["Clusters"] = ""
    for i in range(K):
        #create new variables and set them to 0
        row_num = 0
        for value in smallest:
            """if the value belongs to a particular column, assign the cluster
            to that row in the original dataframe"""
            if value == distances.iloc[row_num, i]:
                dataframe.iloc[row_num, len(dataframe.columns) - 1] = "C"+str(i+1)
            #increment the row_num variable
            row_num += 1
    return dataframe

In [9]:
"""function that displays the centroids of each iteration based on the K Means Clustering
algorithm using Manhattan distance, when the algorithm converges, it will display the 
final centroids and the datapoints to which they belong"""
def k_means_manhattan(csv_filename, K, *centroids):
    indexed_data = create_indexed_dataframe(csv_filename)
    print("The original data:")
    display(indexed_data)
    #if no centroid arguments are passed, take the first K rows of the dataframe
    if not centroids:
        centroids = [0]*K
        for i in range(K):
            centroids[i] = list(indexed_data.iloc[i, :])
    print("The seed centroids are:")
    for centroid in centroids:
        print(centroid)
    print("")
    print("The Distances:")
    manhattan_distance_k_means(indexed_data, *centroids)
    display(indexed_data)
    print("The new centroids are:")
    new_centroids = next_centroids(indexed_data, K)
    print(new_centroids)
    """in the case of having no centroid arguments, the centroids will be tuples,
    so in order to compare them to the new centroids, they must be converted 
    to lists"""
    if not isinstance(centroids, list):
        centroids = list(centroids)
    """if the new centroids are equal to the current centroids, the algorithm has
    converged and as such, the function can terminate"""
    if(centroids == new_centroids):
        print("The algorithm has converged")
        clusters(indexed_data, K)
        display(indexed_data)
        return
    #otherwise, set the centroids to be equal to the new ones and call the function again
    else:
        for i in range(K):
            centroids[i] = new_centroids[i]
    k_means_manhattan(csv_filename, K, *centroids)

In [10]:
k_means_manhattan('KMeans.csv', 2, [1, 0, 0], [0, 1, 1])

The original data:


Unnamed: 0_level_0,D1,D2,D3
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
P1,1,2,3
P2,0,1,2
P3,3,0,5
P4,4,1,3
P5,5,0,1


The seed centroids are:
[1, 0, 0]
[0, 1, 1]

The Distances:


Unnamed: 0_level_0,D1,D2,D3,DistC1,DistC2
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
P1,1,2,3,5,4
P2,0,1,2,4,1
P3,3,0,5,7,8
P4,4,1,3,7,6
P5,5,0,1,5,6


The new centroids are:
[[4.0, 0.0, 3.0], [1.6666666666666667, 1.3333333333333333, 2.6666666666666665]]
The original data:


Unnamed: 0_level_0,D1,D2,D3
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
P1,1,2,3
P2,0,1,2
P3,3,0,5
P4,4,1,3
P5,5,0,1


The seed centroids are:
[4.0, 0.0, 3.0]
[1.6666666666666667, 1.3333333333333333, 2.6666666666666665]

The Distances:


Unnamed: 0_level_0,D1,D2,D3,DistC1,DistC2
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
P1,1,2,3,5.0,1.666667
P2,0,1,2,6.0,2.666667
P3,3,0,5,3.0,5.0
P4,4,1,3,1.0,3.0
P5,5,0,1,3.0,6.333333


The new centroids are:
[[4.0, 0.3333333333333333, 3.0], [0.5, 1.5, 2.5]]
The original data:


Unnamed: 0_level_0,D1,D2,D3
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
P1,1,2,3
P2,0,1,2
P3,3,0,5
P4,4,1,3
P5,5,0,1


The seed centroids are:
[4.0, 0.3333333333333333, 3.0]
[0.5, 1.5, 2.5]

The Distances:


Unnamed: 0_level_0,D1,D2,D3,DistC1,DistC2
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
P1,1,2,3,4.666667,1.5
P2,0,1,2,5.666667,1.5
P3,3,0,5,3.333333,6.5
P4,4,1,3,0.666667,4.5
P5,5,0,1,3.333333,7.5


The new centroids are:
[[4.0, 0.3333333333333333, 3.0], [0.5, 1.5, 2.5]]
The algorithm has converged


Unnamed: 0_level_0,D1,D2,D3,DistC1,DistC2,Clusters
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
P1,1,2,3,4.666667,1.5,C2
P2,0,1,2,5.666667,1.5,C2
P3,3,0,5,3.333333,6.5,C1
P4,4,1,3,0.666667,4.5,C1
P5,5,0,1,3.333333,7.5,C1


In [11]:
k_means_manhattan('KMeans.csv', 3)

The original data:


Unnamed: 0_level_0,D1,D2,D3
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
P1,1,2,3
P2,0,1,2
P3,3,0,5
P4,4,1,3
P5,5,0,1


The seed centroids are:
[1, 2, 3]
[0, 1, 2]
[3, 0, 5]

The Distances:


Unnamed: 0_level_0,D1,D2,D3,DistC1,DistC2,DistC3
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
P1,1,2,3,0,3,6
P2,0,1,2,3,0,7
P3,3,0,5,6,7,0
P4,4,1,3,4,5,4
P5,5,0,1,8,7,6


The new centroids are:
[[2.5, 1.5, 3.0], [0.0, 1.0, 2.0], [4.0, 0.3333333333333333, 3.0]]
The original data:


Unnamed: 0_level_0,D1,D2,D3
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
P1,1,2,3
P2,0,1,2
P3,3,0,5
P4,4,1,3
P5,5,0,1


The seed centroids are:
[2.5, 1.5, 3.0]
[0.0, 1.0, 2.0]
[4.0, 0.3333333333333333, 3.0]

The Distances:


Unnamed: 0_level_0,D1,D2,D3,DistC1,DistC2,DistC3
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
P1,1,2,3,2.0,3.0,4.666667
P2,0,1,2,4.0,0.0,5.666667
P3,3,0,5,4.0,7.0,3.333333
P4,4,1,3,2.0,5.0,0.666667
P5,5,0,1,6.0,7.0,3.333333


The new centroids are:
[[1.0, 2.0, 3.0], [0.0, 1.0, 2.0], [4.0, 0.3333333333333333, 3.0]]
The original data:


Unnamed: 0_level_0,D1,D2,D3
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
P1,1,2,3
P2,0,1,2
P3,3,0,5
P4,4,1,3
P5,5,0,1


The seed centroids are:
[1.0, 2.0, 3.0]
[0.0, 1.0, 2.0]
[4.0, 0.3333333333333333, 3.0]

The Distances:


Unnamed: 0_level_0,D1,D2,D3,DistC1,DistC2,DistC3
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
P1,1,2,3,0.0,3.0,4.666667
P2,0,1,2,3.0,0.0,5.666667
P3,3,0,5,6.0,7.0,3.333333
P4,4,1,3,4.0,5.0,0.666667
P5,5,0,1,8.0,7.0,3.333333


The new centroids are:
[[1.0, 2.0, 3.0], [0.0, 1.0, 2.0], [4.0, 0.3333333333333333, 3.0]]
The algorithm has converged


Unnamed: 0_level_0,D1,D2,D3,DistC1,DistC2,DistC3,Clusters
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
P1,1,2,3,0.0,3.0,4.666667,C1
P2,0,1,2,3.0,0.0,5.666667,C2
P3,3,0,5,6.0,7.0,3.333333,C3
P4,4,1,3,4.0,5.0,0.666667,C3
P5,5,0,1,8.0,7.0,3.333333,C3


In [12]:
"""function that displays the centroids of each iteration based on the K Means Clustering
algorithm using Euclidean distance, when the algorithm converges, it will display the 
final centroids and the datapoints to which they belong"""
def k_means_euclidean(csv_filename, K, *centroids):
    indexed_data = create_indexed_dataframe(csv_filename)
    print("The original data:")
    display(indexed_data)
    #if no centroid arguments are passed, take the first K rows of the dataframe
    if not centroids:
        centroids = [0]*K
        for i in range(K):
            centroids[i] = list(indexed_data.iloc[i, :])
    print("The seed centroids are:")
    for centroid in centroids:
        print(centroid)
    print("")
    print("The Distances:")
    euclidean_distance_k_means(indexed_data, *centroids)
    display(indexed_data)
    print("The new centroids are:")
    new_centroids = next_centroids(indexed_data, K)
    print(new_centroids)
    """in the case of having no centroid arguments, the centroids will be tuples,
    so in order to compare them to the new centroids, they must be converted 
    to lists"""
    if not isinstance(centroids, list):
        centroids = list(centroids)
    """if the new centroids are equal to the current centroids, the algorithm has
    converged and as such, the function can terminate"""
    if(centroids == new_centroids):
        print("The algorithm has converged")
        clusters(indexed_data, K)
        display(indexed_data)
        return
    #otherwise, set the centroids to be equal to the new ones and call the function again
    else:
        for i in range(K):
            centroids[i] = new_centroids[i]
    k_means_euclidean(csv_filename, K, *centroids)

In [13]:
k_means_euclidean('KMeans.csv', 2, [1, 0, 0], [0, 1, 1])

The original data:


Unnamed: 0_level_0,D1,D2,D3
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
P1,1,2,3
P2,0,1,2
P3,3,0,5
P4,4,1,3
P5,5,0,1


The seed centroids are:
[1, 0, 0]
[0, 1, 1]

The Distances:


Unnamed: 0_level_0,D1,D2,D3,DistC1,DistC2
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
P1,1,2,3,3.605551,2.44949
P2,0,1,2,2.44949,1.0
P3,3,0,5,5.385165,5.09902
P4,4,1,3,4.358899,4.472136
P5,5,0,1,4.123106,5.09902


The new centroids are:
[[4.5, 0.5, 2.0], [1.3333333333333333, 1.0, 3.3333333333333335]]
The original data:


Unnamed: 0_level_0,D1,D2,D3
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
P1,1,2,3
P2,0,1,2
P3,3,0,5
P4,4,1,3
P5,5,0,1


The seed centroids are:
[4.5, 0.5, 2.0]
[1.3333333333333333, 1.0, 3.3333333333333335]

The Distances:


Unnamed: 0_level_0,D1,D2,D3,DistC1,DistC2
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
P1,1,2,3,3.937004,1.105542
P2,0,1,2,4.527693,1.885618
P3,3,0,5,3.391165,2.560382
P4,4,1,3,1.224745,2.687419
P5,5,0,1,1.224745,4.459696


The new centroids are:
[[4.5, 0.5, 2.0], [1.3333333333333333, 1.0, 3.3333333333333335]]
The algorithm has converged


Unnamed: 0_level_0,D1,D2,D3,DistC1,DistC2,Clusters
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
P1,1,2,3,3.937004,1.105542,C2
P2,0,1,2,4.527693,1.885618,C2
P3,3,0,5,3.391165,2.560382,C2
P4,4,1,3,1.224745,2.687419,C1
P5,5,0,1,1.224745,4.459696,C1


In [14]:
k_means_euclidean('KMeans.csv', 2)

The original data:


Unnamed: 0_level_0,D1,D2,D3
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
P1,1,2,3
P2,0,1,2
P3,3,0,5
P4,4,1,3
P5,5,0,1


The seed centroids are:
[1, 2, 3]
[0, 1, 2]

The Distances:


Unnamed: 0_level_0,D1,D2,D3,DistC1,DistC2
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
P1,1,2,3,0.0,1.732051
P2,0,1,2,1.732051,0.0
P3,3,0,5,3.464102,4.358899
P4,4,1,3,3.162278,4.123106
P5,5,0,1,4.898979,5.196152


The new centroids are:
[[3.25, 0.75, 3.0], [0.0, 1.0, 2.0]]
The original data:


Unnamed: 0_level_0,D1,D2,D3
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
P1,1,2,3
P2,0,1,2
P3,3,0,5
P4,4,1,3
P5,5,0,1


The seed centroids are:
[3.25, 0.75, 3.0]
[0.0, 1.0, 2.0]

The Distances:


Unnamed: 0_level_0,D1,D2,D3,DistC1,DistC2
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
P1,1,2,3,2.573908,1.732051
P2,0,1,2,3.409545,0.0
P3,3,0,5,2.150581,4.358899
P4,4,1,3,0.790569,4.123106
P5,5,0,1,2.76134,5.196152


The new centroids are:
[[4.0, 0.3333333333333333, 3.0], [0.5, 1.5, 2.5]]
The original data:


Unnamed: 0_level_0,D1,D2,D3
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
P1,1,2,3
P2,0,1,2
P3,3,0,5
P4,4,1,3
P5,5,0,1


The seed centroids are:
[4.0, 0.3333333333333333, 3.0]
[0.5, 1.5, 2.5]

The Distances:


Unnamed: 0_level_0,D1,D2,D3,DistC1,DistC2
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
P1,1,2,3,3.431877,0.866025
P2,0,1,2,4.176655,0.866025
P3,3,0,5,2.260777,3.840573
P4,4,1,3,0.666667,3.570714
P5,5,0,1,2.260777,4.974937


The new centroids are:
[[4.0, 0.3333333333333333, 3.0], [0.5, 1.5, 2.5]]
The algorithm has converged


Unnamed: 0_level_0,D1,D2,D3,DistC1,DistC2,Clusters
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
P1,1,2,3,3.431877,0.866025,C2
P2,0,1,2,4.176655,0.866025,C2
P3,3,0,5,2.260777,3.840573,C1
P4,4,1,3,0.666667,3.570714,C1
P5,5,0,1,2.260777,4.974937,C1


In [15]:
k_means_euclidean('KMeansAdvanced.csv', 2)

The original data:


Unnamed: 0,Height,Weight
0,185,72
1,170,56
2,168,60
3,179,68
4,182,72
5,188,77
6,180,71
7,180,70
8,183,84
9,180,88


The seed centroids are:
[185, 72]
[170, 56]

The Distances:


Unnamed: 0,Height,Weight,DistC1,DistC2
0,185,72,0.0,21.931712
1,170,56,21.931712,0.0
2,168,60,20.808652,4.472136
3,179,68,7.211103,15.0
4,182,72,3.0,20.0
5,188,77,5.830952,27.658633
6,180,71,5.09902,18.027756
7,180,70,5.385165,17.204651
8,183,84,12.165525,30.870698
9,180,88,16.763055,33.526109


The new centroids are:
[[181.4, 74.5], [169.0, 58.0]]
The original data:


Unnamed: 0,Height,Weight
0,185,72
1,170,56
2,168,60
3,179,68
4,182,72
5,188,77
6,180,71
7,180,70
8,183,84
9,180,88


The seed centroids are:
[181.4, 74.5]
[169.0, 58.0]

The Distances:


Unnamed: 0,Height,Weight,DistC1,DistC2
0,185,72,4.382921,21.260292
1,170,56,21.730393,2.236068
2,168,60,19.743607,2.236068
3,179,68,6.928925,14.142136
4,182,72,2.570992,19.104973
5,188,77,7.05762,26.870058
6,180,71,3.769615,17.029386
7,180,70,4.712749,16.278821
8,183,84,9.633795,29.529646
9,180,88,13.572398,31.953091


The new centroids are:
[[181.4, 74.5], [169.0, 58.0]]
The algorithm has converged


Unnamed: 0,Height,Weight,DistC1,DistC2,Clusters
0,185,72,4.382921,21.260292,C1
1,170,56,21.730393,2.236068,C2
2,168,60,19.743607,2.236068,C2
3,179,68,6.928925,14.142136,C1
4,182,72,2.570992,19.104973,C1
5,188,77,7.05762,26.870058,C1
6,180,71,3.769615,17.029386,C1
7,180,70,4.712749,16.278821,C1
8,183,84,9.633795,29.529646,C1
9,180,88,13.572398,31.953091,C1


In [16]:
"""function that displays the centroids of each iteration based on the K Means Clustering
algorithm using Minkowski distance, when the algorithm converges, it will display the 
final centroids and the datapoints to which they belong"""
def k_means_minkowski(csv_filename, K, p, *centroids):
    indexed_data = create_indexed_dataframe(csv_filename)
    print("The original data:")
    display(indexed_data)
    #if no centroid arguments are passed, take the first K rows of the dataframe
    if not centroids:
        centroids = [0]*K
        for i in range(K):
            centroids[i] = list(indexed_data.iloc[i, :])
    print("The seed centroids are:")
    for centroid in centroids:
        print(centroid)
    print("")
    print("The Distances:")
    minkowski_distance_k_means(indexed_data, p, *centroids)
    display(indexed_data)
    print("The new centroids are:")
    new_centroids = next_centroids(indexed_data, K)
    print(new_centroids)
    """in the case of having no centroid arguments, the centroids will be tuples,
    so in order to compare them to the new centroids, they must be converted 
    to lists"""
    if not isinstance(centroids, list):
        centroids = list(centroids)
    """if the new centroids are equal to the current centroids, the algorithm has
    converged and as such, the function can terminate"""
    if(centroids == new_centroids):
        print("The algorithm has converged")
        clusters(indexed_data, K)
        display(indexed_data)
        return
    #otherwise, set the centroids to be equal to the new ones and call the function again
    else:
        for i in range(K):
            centroids[i] = new_centroids[i]
    k_means_minkowski(csv_filename, K, p, *centroids)

In [17]:
k_means_minkowski('KMeans.csv', 2, 3, [1, 0, 0], [0, 1, 1])

The original data:


Unnamed: 0_level_0,D1,D2,D3
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
P1,1,2,3
P2,0,1,2
P3,3,0,5
P4,4,1,3
P5,5,0,1


The seed centroids are:
[1, 0, 0]
[0, 1, 1]

The Distances:


Unnamed: 0_level_0,D1,D2,D3,DistC1,DistC2
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
P1,1,2,3,3.271066,2.154435
P2,0,1,2,2.154435,1.0
P3,3,0,5,5.104469,4.514357
P4,4,1,3,3.802952,4.160168
P5,5,0,1,4.020726,5.013298


The new centroids are:
[[4.5, 0.5, 2.0], [1.3333333333333333, 1.0, 3.3333333333333335]]
The original data:


Unnamed: 0_level_0,D1,D2,D3
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
P1,1,2,3
P2,0,1,2
P3,3,0,5
P4,4,1,3
P5,5,0,1


The seed centroids are:
[4.5, 0.5, 2.0]
[1.3333333333333333, 1.0, 3.3333333333333335]

The Distances:


Unnamed: 0_level_0,D1,D2,D3,DistC1,DistC2
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
P1,1,2,3,3.615213,1.024106
P2,0,1,2,4.502057,1.679895
P3,3,0,5,3.1244,2.172895
P4,4,1,3,1.077217,2.668402
P5,5,0,1,1.077217,3.979057


The new centroids are:
[[4.5, 0.5, 2.0], [1.3333333333333333, 1.0, 3.3333333333333335]]
The algorithm has converged


Unnamed: 0_level_0,D1,D2,D3,DistC1,DistC2,Clusters
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
P1,1,2,3,3.615213,1.024106,C2
P2,0,1,2,4.502057,1.679895,C2
P3,3,0,5,3.1244,2.172895,C2
P4,4,1,3,1.077217,2.668402,C1
P5,5,0,1,1.077217,3.979057,C1


In [18]:
k_means_minkowski('KMeans.csv', 3, 3)

The original data:


Unnamed: 0_level_0,D1,D2,D3
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
P1,1,2,3
P2,0,1,2
P3,3,0,5
P4,4,1,3
P5,5,0,1


The seed centroids are:
[1, 2, 3]
[0, 1, 2]
[3, 0, 5]

The Distances:


Unnamed: 0_level_0,D1,D2,D3,DistC1,DistC2,DistC3
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
P1,1,2,3,0.0,1.44225,2.884499
P2,0,1,2,1.44225,0.0,3.802952
P3,3,0,5,2.884499,3.802952,0.0
P4,4,1,3,3.036589,4.020726,2.154435
P5,5,0,1,4.308869,5.026526,4.160168


The new centroids are:
[[1.0, 2.0, 3.0], [0.0, 1.0, 2.0], [4.0, 0.3333333333333333, 3.0]]
The original data:


Unnamed: 0_level_0,D1,D2,D3
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
P1,1,2,3
P2,0,1,2
P3,3,0,5
P4,4,1,3
P5,5,0,1


The seed centroids are:
[1.0, 2.0, 3.0]
[0.0, 1.0, 2.0]
[4.0, 0.3333333333333333, 3.0]

The Distances:


Unnamed: 0_level_0,D1,D2,D3,DistC1,DistC2,DistC3
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
P1,1,2,3,0.0,1.44225,3.162506
P2,0,1,2,1.44225,0.0,4.026826
P3,3,0,5,2.884499,3.802952,2.082933
P4,4,1,3,3.036589,4.020726,0.666667
P5,5,0,1,4.308869,5.026526,2.082933


The new centroids are:
[[1.0, 2.0, 3.0], [0.0, 1.0, 2.0], [4.0, 0.3333333333333333, 3.0]]
The algorithm has converged


Unnamed: 0_level_0,D1,D2,D3,DistC1,DistC2,DistC3,Clusters
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
P1,1,2,3,0.0,1.44225,3.162506,C1
P2,0,1,2,1.44225,0.0,4.026826,C2
P3,3,0,5,2.884499,3.802952,2.082933,C3
P4,4,1,3,3.036589,4.020726,0.666667,C3
P5,5,0,1,4.308869,5.026526,2.082933,C3


In [19]:
k_means_minkowski('KMeans.csv', 2, 3, [1, 0, 0], [0, 1, 1])

The original data:


Unnamed: 0_level_0,D1,D2,D3
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
P1,1,2,3
P2,0,1,2
P3,3,0,5
P4,4,1,3
P5,5,0,1


The seed centroids are:
[1, 0, 0]
[0, 1, 1]

The Distances:


Unnamed: 0_level_0,D1,D2,D3,DistC1,DistC2
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
P1,1,2,3,3.271066,2.154435
P2,0,1,2,2.154435,1.0
P3,3,0,5,5.104469,4.514357
P4,4,1,3,3.802952,4.160168
P5,5,0,1,4.020726,5.013298


The new centroids are:
[[4.5, 0.5, 2.0], [1.3333333333333333, 1.0, 3.3333333333333335]]
The original data:


Unnamed: 0_level_0,D1,D2,D3
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
P1,1,2,3
P2,0,1,2
P3,3,0,5
P4,4,1,3
P5,5,0,1


The seed centroids are:
[4.5, 0.5, 2.0]
[1.3333333333333333, 1.0, 3.3333333333333335]

The Distances:


Unnamed: 0_level_0,D1,D2,D3,DistC1,DistC2
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
P1,1,2,3,3.615213,1.024106
P2,0,1,2,4.502057,1.679895
P3,3,0,5,3.1244,2.172895
P4,4,1,3,1.077217,2.668402
P5,5,0,1,1.077217,3.979057


The new centroids are:
[[4.5, 0.5, 2.0], [1.3333333333333333, 1.0, 3.3333333333333335]]
The algorithm has converged


Unnamed: 0_level_0,D1,D2,D3,DistC1,DistC2,Clusters
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
P1,1,2,3,3.615213,1.024106,C2
P2,0,1,2,4.502057,1.679895,C2
P3,3,0,5,3.1244,2.172895,C2
P4,4,1,3,1.077217,2.668402,C1
P5,5,0,1,1.077217,3.979057,C1


In [20]:
k_means_minkowski('KMeans.csv', 3, 3)

The original data:


Unnamed: 0_level_0,D1,D2,D3
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
P1,1,2,3
P2,0,1,2
P3,3,0,5
P4,4,1,3
P5,5,0,1


The seed centroids are:
[1, 2, 3]
[0, 1, 2]
[3, 0, 5]

The Distances:


Unnamed: 0_level_0,D1,D2,D3,DistC1,DistC2,DistC3
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
P1,1,2,3,0.0,1.44225,2.884499
P2,0,1,2,1.44225,0.0,3.802952
P3,3,0,5,2.884499,3.802952,0.0
P4,4,1,3,3.036589,4.020726,2.154435
P5,5,0,1,4.308869,5.026526,4.160168


The new centroids are:
[[1.0, 2.0, 3.0], [0.0, 1.0, 2.0], [4.0, 0.3333333333333333, 3.0]]
The original data:


Unnamed: 0_level_0,D1,D2,D3
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
P1,1,2,3
P2,0,1,2
P3,3,0,5
P4,4,1,3
P5,5,0,1


The seed centroids are:
[1.0, 2.0, 3.0]
[0.0, 1.0, 2.0]
[4.0, 0.3333333333333333, 3.0]

The Distances:


Unnamed: 0_level_0,D1,D2,D3,DistC1,DistC2,DistC3
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
P1,1,2,3,0.0,1.44225,3.162506
P2,0,1,2,1.44225,0.0,4.026826
P3,3,0,5,2.884499,3.802952,2.082933
P4,4,1,3,3.036589,4.020726,0.666667
P5,5,0,1,4.308869,5.026526,2.082933


The new centroids are:
[[1.0, 2.0, 3.0], [0.0, 1.0, 2.0], [4.0, 0.3333333333333333, 3.0]]
The algorithm has converged


Unnamed: 0_level_0,D1,D2,D3,DistC1,DistC2,DistC3,Clusters
Point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
P1,1,2,3,0.0,1.44225,3.162506,C1
P2,0,1,2,1.44225,0.0,4.026826,C2
P3,3,0,5,2.884499,3.802952,2.082933,C3
P4,4,1,3,3.036589,4.020726,0.666667,C3
P5,5,0,1,4.308869,5.026526,2.082933,C3


In [21]:
k_means_minkowski('KMeansAdvanced.csv', 2, 3)

The original data:


Unnamed: 0,Height,Weight
0,185,72
1,170,56
2,168,60
3,179,68
4,182,72
5,188,77
6,180,71
7,180,70
8,183,84
9,180,88


The seed centroids are:
[185, 72]
[170, 56]

The Distances:


Unnamed: 0,Height,Weight,DistC1,DistC2
0,185,72,0.0,19.549076
1,170,56,19.549076,0.0
2,168,60,18.796536,4.160168
3,179,68,6.542133,13.493824
4,182,72,3.0,17.991766
5,188,77,5.336803,24.712984
6,180,71,5.013298,16.355332
7,180,70,5.104469,15.527872
8,183,84,12.01849,28.904561
9,180,88,16.161132,32.322264


The new centroids are:
[[181.4, 74.5], [169.0, 58.0]]
The original data:


Unnamed: 0,Height,Weight
0,185,72
1,170,56
2,168,60
3,179,68
4,182,72
5,188,77
6,180,71
7,180,70
8,183,84
9,180,88


The seed centroids are:
[181.4, 74.5]
[169.0, 58.0]

The Distances:


Unnamed: 0,Height,Weight,DistC1,DistC2
0,185,72,3.963862,18.98244
1,170,56,19.84308,2.080084
2,168,60,17.603177,2.080084
3,179,68,6.607285,12.59921
4,182,72,2.511467,17.032234
5,188,77,6.717464,23.9385
6,180,71,3.573128,15.223325
7,180,70,4.544723,14.51643
8,183,84,9.515104,27.28818
9,180,88,13.505017,30.485077


The new centroids are:
[[181.4, 74.5], [169.0, 58.0]]
The algorithm has converged


Unnamed: 0,Height,Weight,DistC1,DistC2,Clusters
0,185,72,3.963862,18.98244,C1
1,170,56,19.84308,2.080084,C2
2,168,60,17.603177,2.080084,C2
3,179,68,6.607285,12.59921,C1
4,182,72,2.511467,17.032234,C1
5,188,77,6.717464,23.9385,C1
6,180,71,3.573128,15.223325,C1
7,180,70,4.544723,14.51643,C1
8,183,84,9.515104,27.28818,C1
9,180,88,13.505017,30.485077,C1
