Clustering
===

Homogeneity
---
---

Homogeneity is a measure of the ratio of samples of a single class pertaining to a single  
cluster. The fewer different classes included in one cluster, the better. The lower bound  
should be 0.0 and the upper bound should be 1.0 (higher is better), and the formulation for  
it is expressed as follows:

Completeness
---
---

V-measure
---
---

In [46]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
import pandas as pd 

style.use('ggplot')

class K_Means:
    def __init__(self, k =3, tolerance = 0.0001, max_iterations = 500):
        
        # We start with 3 clusters
        self.k = k
        
        # The tolerance is set to '0.0001', this means that, if the difference between the new and the old centroids is less than '0.0001', it will stop iterating
        self.tolerance = tolerance
        
        # It will iterate 500 times, if the tolerance doesn't get lower than '0.0001'
        self.max_iterations = max_iterations

    def fit(self, data):

        self.centroids = {}

        #initialize the centroids, the first 'k' elements in the dataset will be our initial centroids
        for i in range(self.k):
            self.centroids[i] = data[i]
            
        #begin iterations
        for i in range(self.max_iterations):
            self.classes = {}
            for i in range(self.k):
                self.classes[i] = []
 
            #find the distance between the point and cluster; choose the nearest centroid
            for features in data:
                distances = [np.linalg.norm(features - self.centroids[centroid]) for centroid in self.centroids]
                print(features)
                print(self.centroids[centroid])
                classification = distances.index(min(distances))
                self.classes[classification].append(features)

            previous = dict(self.centroids)

            #average the cluster datapoints to re-calculate the centroids
            for classification in self.classes:
                self.centroids[classification] = np.average(self.classes[classification], axis = 0)

            isOptimal = True

            for centroid in self.centroids:

                original_centroid = previous[centroid]
                curr = self.centroids[centroid]

                if np.sum((curr - original_centroid)/original_centroid * 100.0) > self.tolerance:
                    isOptimal = False

            #break out of the main loop if the results are optimal, ie. the centroids don't change their positions much(more than our tolerance)
            if isOptimal:
                break

    def pred(self, data):
        distances = [np.linalg.norm(data - self.centroids[centroid]) for centroid in self.centroids]
        classification = distances.index(min(distances))
        return classification

In [47]:
def main():

    df = pd.read_csv("datasets/ipl.csv")
    df = df[['one', 'two']]
    dataset = df.astype(float).values.tolist()

    X = df.values #returns a numpy array
    km = K_Means(3)
    km.fit(X)

    # Plotting starts here
    colors = 10*["r", "g", "c", "b", "k"]

    for centroid in km.centroids:
        plt.scatter(km.centroids[centroid][0], km.centroids[centroid][1], s = 130, marker = "x")

    for classification in km.classes:
        color = colors[classification]
        for features in km.classes[classification]:
            plt.scatter(features[0], features[1], color = color,s = 30)
            
    plt.show()

In [48]:
main()

[0.22767982 0.85820415]


UnboundLocalError: local variable 'centroid' referenced before assignment

In [50]:
import numpy as np
import os

def compute_euclidean_distance(point, centroid):
    return np.sqrt(np.sum((point - centroid)**2))

def assign_label_cluster(distance, data_point, centroids):
    index_of_minimum = min(distance, key=distance.get)
    return [index_of_minimum, data_point, centroids[index_of_minimum]]

def compute_new_centroids(cluster_label, centroids):
    return np.array(cluster_label + centroids)/2

def iterate_k_means(data_points, centroids, total_iteration):
    label = []
    cluster_label = []
    total_points = len(data_points)
    k = len(centroids)
    
    for iteration in range(0, total_iteration):
        for index_point in range(0, total_points):
            distance = {}
            for index_centroid in range(0, k):
                distance[index_centroid] = compute_euclidean_distance(data_points[index_point], centroids[index_centroid])
            label = assign_label_cluster(distance, data_points[index_point], centroids)
            centroids[label[0]] = compute_new_centroids(label[1], centroids[label[0]])

            if iteration == (total_iteration - 1):
                cluster_label.append(label)

    return [cluster_label, centroids]

def print_label_data(result):
    print("Result of k-Means Clustering: \n")
    for data in result[0]:
        print("data point: {}".format(data[1]))
        print("cluster number: {} \n".format(data[0]))
    print("Last centroids position: \n {}".format(result[1]))

def create_centroids():
    centroids = []
    centroids.append([5.0, 0.0])
    centroids.append([45.0, 70.0])
    centroids.append([50.0, 90.0])
    return np.array(centroids)

if __name__ == "__main__":
    filename = 'https://raw.githubusercontent.com/corvasto/Simple-k-Means-Clustering-Python/master/data.csv'
    data_points = np.genfromtxt(filename, delimiter=",")
    centroids = create_centroids()
    total_iteration = 100
    
    [cluster_label, new_centroids] = iterate_k_means(data_points, centroids, total_iteration)
    print_label_data([cluster_label, new_centroids])
    print()

Result of k-Means Clustering: 

data point: [15. 16.]
cluster number: 0 

data point: [16.  18.5]
cluster number: 0 

data point: [17.  20.2]
cluster number: 0 

data point: [16.4  17.12]
cluster number: 0 

data point: [17.23 18.12]
cluster number: 0 

data point: [43. 43.]
cluster number: 1 

data point: [44.43  45.212]
cluster number: 1 

data point: [45.8  54.23]
cluster number: 1 

data point: [46.313 43.123]
cluster number: 1 

data point: [50.21 46.3 ]
cluster number: 1 

data point: [99.   99.22]
cluster number: 2 

data point: [100.32   98.123]
cluster number: 2 

data point: [100.32   97.423]
cluster number: 2 

data point: [102.    93.23]
cluster number: 2 

data point: [102.23  94.23]
cluster number: 2 

Last centroids position: 
 [[ 16.83483871  18.08645161]
 [ 48.02980645  46.32670968]
 [101.69677419  94.79606452]]

