<a href="https://colab.research.google.com/github/mohak1/machine-learning-labs/blob/main/ml_practical_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 1. K-Means Clustering

In [None]:
#using all the features(columns) of the dataset
from sklearn.datasets import load_iris
import numpy as np

#seed NumPy's global RNG so the random cluster-center initialisation is reproducible
np.random.seed(1)

class KMeansClustering:
  '''
  Minimal K-Means clustering on a 2-D (samples x features) dataset.

  The data is divided by its global maximum so that (for non-negative data)
  it lives in the same 0-1 range as the randomly drawn initial centers.
  All cluster centers reported by this class are in that scaled space.

  data: array-like of shape (n_samples, n_features)
  target: labels of the samples; stored but not used by the algorithm
  clusters: number of cluster centers (k)
  '''
  def __init__(self, data, target, clusters=3):
    data = np.asarray(data, dtype=float)
    #because the random coordinates are b/w 0-1
    scale = np.max(data)
    if scale != 0:
      #skip the scaling when the max is 0 to avoid a division by zero
      data = data/scale
    self.data = data
    self.target = target
    self.k = clusters

  def init_cluster_centers(self):
    '''
    randomly initialise k points as cluster centers (stored in self.centers)
    each center has the same number of dimensions as a data point and every
    coordinate is drawn uniformly from [0, 1)
    '''
    dims = self.data.shape[1]
    self.centers = np.random.random((self.k, dims))

  def update_cluster_centers(self):
    '''
    assign every data point to its nearest center and move each center to the
    mean of the points assigned to it; needs self.dist, so call distance() first

    self.dist is a k*no_of_data list; in its transpose the value at
    (ith row, jth col) is the distance of the ith data point from the jth center,
    so an argmin over each row gives the nearest center of each data point

    self.centers_dict: {cluster_no: center point}
    self.clustered_points: {cluster_no: [data points of that cluster]}
    a center with no points assigned to it is left where it is
    '''
    self.centers_dict = {}
    self.clustered_points = {}
    for i in range(self.k):
      #keeps track of which no. corresponds to which center point
      self.centers_dict[i] = self.centers[i]
      #start with empty member lists, filled in below
      self.clustered_points[i] = []

    self.dist_transpose = np.transpose(self.dist)
    #cluster count starts with 0
    nearest_clusters = np.argmin(self.dist_transpose, axis=1)
    for point, cluster in zip(self.data, nearest_clusters):
      self.clustered_points[int(cluster)].append(point)

    #move every non-empty cluster's center to the mean of its members
    for i, members in self.clustered_points.items():
      if members:
        self.centers[i] = np.mean(members, axis=0)
    print('\ncenters after an update: \n',self.centers)

  def distance(self):
    '''
    calculate the Euclidean distance of every cluster center from every data point

    the result is stored in self.dist as a list of k arrays: the jth element
    of the ith array is the distance of the ith center from the jth data point
    '''
    self.dist = [
      np.sqrt(np.sum((self.data - center)**2, axis=1))
      for center in self.centers
    ]

  def fit(self):
    '''
    initialise the centers randomly, then keep updating them until an update
    leaves every center unchanged
    '''
    self.init_cluster_centers()
    print('\ninitial cluster centers = \n', self.centers)

    while True:
      #take a real copy: update_cluster_centers() changes self.centers in place,
      #so a plain reference would always compare equal to the updated centers
      #and the loop would stop after the very first update
      before_update = np.array(self.centers)
      self.distance()
      self.update_cluster_centers()

      #check for convergence
      if np.array_equal(before_update, self.centers):
        break

    print('\ncluster centers after the K-Means convergence = \n', self.centers)

In [None]:
#runner code: cluster the iris samples into 5 groups
dataset = load_iris()
data, target = dataset.data, dataset.target

obj = KMeansClustering(data=data, target=target, clusters=5)
obj.fit()


initial cluster centers = 
 [[0.90337952 0.57367949 0.00287033 0.61714491]
 [0.3266449  0.5270581  0.8859421  0.35726976]
 [0.90853515 0.62336012 0.01582124 0.92943723]
 [0.69089692 0.99732285 0.17234051 0.13713575]
 [0.93259546 0.69681816 0.06600017 0.75546305]]

centers after an update: 
 [[0.90337952 0.57367949 0.00287033 0.61714491]
 [0.79265823 0.3635443  0.62101266 0.2121519 ]
 [0.90853515 0.62336012 0.01582124 0.92943723]
 [0.63367089 0.43392405 0.18506329 0.03113924]
 [0.93259546 0.69681816 0.06600017 0.75546305]]

cluster centers after the KNN convergence = 
 [[0.90337952 0.57367949 0.00287033 0.61714491]
 [0.79265823 0.3635443  0.62101266 0.2121519 ]
 [0.90853515 0.62336012 0.01582124 0.92943723]
 [0.63367089 0.43392405 0.18506329 0.03113924]
 [0.93259546 0.69681816 0.06600017 0.75546305]]
