In [1]:
import pandas as pd
import numpy as np
import random

import warnings
# NOTE(review): with no category/module arguments this suppresses every
# warning for the rest of the session, so numerical RuntimeWarnings raised by
# NumPy/pandas in later cells will not be displayed.
warnings.filterwarnings('ignore')

# K-Means

In [2]:
class KMeans:
    """K-means clustering (assign / update iterations) on a 2-D array.

    Parameters
    ----------
    k : int
        Number of centroids (clusters).
    scaling : bool, default False
        When True, features are standardised column-wise before clustering.
    random_state : int or None, default -1
        Seed for the centroid initialisation. ``-1`` (or ``None``) means
        "unseeded": rows are shuffled with NumPy's global generator. Any other
        value seeds a private generator, so NumPy's global random state is
        left untouched.
    max_iter : int or None, default 300
        Upper bound on assign/update iterations; ``None`` disables the bound.
        At least one iteration is always performed.

    Attributes set by ``fitTransform``
    ----------------------------------
    centroids : ndarray, shape(#centroids, #features)
    old_centroids : ndarray, shape(#centroids, #features)
    clusters : ndarray, shape(#samples, )
        Index of the nearest centroid for every sample.
    n_iter : int
        Number of assign/update iterations that were run.
    """

    def __init__(self, k, scaling=False, random_state=-1, max_iter=300):

        # scalar : #centroids
        self.k = k
        self.random_state = random_state
        self.scaling = scaling
        self.max_iter = max_iter

    def standardScaler(self, data):
        """Return ``data`` centred to zero mean and scaled to unit variance per column.

        Columns with zero standard deviation are only centred (divided by 1),
        which avoids the 0/0 -> NaN that would otherwise poison every distance.
        """
        # shape(#features, )
        data_mean = data.mean(axis=0)
        data_std = data.std(axis=0)

        # Constant features have std == 0; divide those columns by 1 instead.
        safe_std = np.where(data_std == 0, 1.0, data_std)

        # shape(#samples, #features)
        return (data - data_mean) / safe_std

    def centroidInit(self, data):
        """Pick ``k`` rows of ``data`` at random as the initial centroids.

        A private ``RandomState`` is used when a seed is given; it produces the
        same shuffle as seeding the global generator would, without the global
        side effect.
        """
        if self.random_state is None or self.random_state == -1:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        # shape(#samples, #features) -- shuffle a copy so the input stays intact
        centroids = data.copy()
        rng.shuffle(centroids)

        # shape(#centroids, #features)
        return centroids[:self.k]

    def clusterAssignment(self, data):
        """Store in ``self.clusters`` the index of the nearest centroid per sample."""
        # Broadcasting: (#samples, #features) - (#centroids, 1, #features)
        # -> (#centroids, #samples, #features)
        diff = data - self.centroids[:, np.newaxis]

        # shape(#centroids, #samples) : Euclidean distances
        distances = np.sqrt((diff ** 2).sum(axis=2))

        # shape(#samples, )
        self.clusters = np.argmin(distances, axis=0)

    def centroidUpdate(self, data):
        """Move every centroid to the mean of the samples assigned to it.

        A centroid with no assigned samples keeps its previous position;
        averaging an empty slice would yield NaN and the exact-equality
        convergence test in ``fitTransform`` could then never succeed.
        """
        # shape(#centroids, #features)
        self.old_centroids = self.centroids.copy()

        updated = []
        for i in range(self.k):
            members = data[self.clusters == i]
            if len(members):
                updated.append(members.mean(axis=0))
            else:
                updated.append(self.old_centroids[i])

        # shape(#centroids, #features)
        self.centroids = np.array(updated)

    def fitTransform(self, data):
        """Cluster ``data`` and store the result on the instance.

        Parameters
        ----------
        data : array-like, shape(#samples, #features)

        Returns
        -------
        None
            Results are exposed through ``self.clusters`` and ``self.centroids``.

        Raises
        ------
        ValueError
            If ``data`` is not 2-D or ``k`` is not in ``[1, #samples]``.
        """
        data = np.asarray(data)
        if data.ndim != 2:
            raise ValueError(
                "data must be 2-D (#samples, #features); got %d dimension(s)" % data.ndim
            )
        if not np.issubdtype(data.dtype, np.floating):
            # Means are fractional; work in floating point from the start.
            data = data.astype(float)
        if not 1 <= self.k <= data.shape[0]:
            raise ValueError(
                "k must be between 1 and the number of samples (%d); got %r"
                % (data.shape[0], self.k)
            )

        if self.scaling:
            # shape(#samples, #features)
            data = self.standardScaler(data)

        # shape(#centroids, #features)
        self.centroids = self.centroidInit(data)

        self.n_iter = 0
        while True:

            self.clusterAssignment(data)
            self.centroidUpdate(data)
            self.n_iter += 1

            # Converged: the update step left every centroid where it was.
            if np.array_equal(self.old_centroids, self.centroids):
                break
            # Safety net against pathological non-convergence.
            if self.max_iter is not None and self.n_iter >= self.max_iter:
                break

## Data Preparation

In [3]:
# Read the Iris table, then strip the label column so only the features remain.
df = pd.read_csv('data/Iris.csv')
data = df.drop(columns=['Species']).to_numpy()

In [4]:
# Dimensions of the feature matrix as (rows, columns).
data.shape

(150, 4)

## Train

#### Without Scaling

In [5]:
# Three clusters on the raw features; a fixed seed keeps the run repeatable.
km = KMeans(k=3, random_state=0)
km.fitTransform(data)

# Cluster index assigned to each sample.
km.clusters

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1])

#### With Scaling

In [6]:
# Same setup, but with the features standardised before clustering.
km = KMeans(k=3, random_state=0, scaling=True)
km.fitTransform(data)

# Cluster index assigned to each sample.
km.clusters

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0])