In [None]:
# Mount Google Drive at /content/drive (Colab-only; this import is unavailable outside Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Importing necessary libraries


In [None]:
import numpy as np
import pandas as pd
import tracemalloc
from copy import deepcopy
import time
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
from scipy.stats import mode
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_recall_fscore_support

## Algorithm Directory path

In [None]:
# All the results are stored in this directory.
# It lives on the mounted Google Drive, so the drive-mount cell must be run first.
# The trailing slash matters: paths are built by direct string formatting onto this prefix.
directoryPath = '/content/drive/MyDrive/Colab Notebooks/Data Mining Assignments/Clustering/'

## K Medoids Clustering

In [None]:
class KMedoidsClustering:
  """K-medoids clustering with random restarts for numerical, nominal, ordinal or mixed data.

  Every restart draws K random objects as medoids (representative objects) and
  keeps swapping a medoid with a non-medoid as long as a swap lowers the
  absolute-error criterion, i.e. the sum of distances from each object to its
  closest medoid. The restart with the lowest criterion is kept.

  The distance between two objects is the square root of: squared Euclidean
  distance over min-max scaled numerical attributes, plus squared Euclidean
  distance over rank-scaled ordinal attributes, plus the number of mismatching
  nominal attributes.
  """

  def __init__(self, X, Y, K, dataType="numerical", attributeTypes = None, ranks = None, numberOfIterations = 5, randomState = 24, symmetricity = None):
    """Store the data set and clustering configuration.

    Args:
      X: 2-D numpy array, one row per object and one column per attribute.
      Y: 1-D numpy array of ground-truth labels used for the BCubed metrics;
        pass an empty array (or None) when no labels are available.
      K: number of clusters.
      dataType: "numerical", "nominal", "ordinal" or "mixed".
      attributeTypes: per-attribute type names; required (one per column) when
        dataType is "mixed", otherwise missing entries default to dataType.
      ranks: dict mapping an attribute index to a dict {value: rank} for
        ordinal attributes. Ranks are scaled with (rank - 1) / (max - 1).
      numberOfIterations: number of random restarts performed by train().
      randomState: base seed; restart i is seeded with randomState + i.
      symmetricity: dict mapping a binary attribute index to "symmetric" or
        "asymmetric".
    """
    # None defaults avoid sharing one mutable default object between instances.
    attributeTypes = [] if attributeTypes is None else list(attributeTypes)
    ranks = {} if ranks is None else ranks
    symmetricity = {} if symmetricity is None else symmetricity

    # Validate before building attribute info, which would otherwise pad the list.
    if dataType == "mixed":
      assert len(attributeTypes) == X.shape[1], "If dataset type is mixed, then attributeType for each attribute needs to be passed."

    self.X = X
    self.Y = Y
    self.K = K
    self.representativeObjects = {}
    self.representativeObjectsIdx = np.array([])
    self.clusterLabels = None
    self.dataType = dataType
    self.attributeInfo = self.get_attribute_info(attributeTypes, ranks, symmetricity)
    self.randomState = randomState
    self.numberOfIterations = numberOfIterations
    self.absoluteErrorCriterion = 0.0
    self.withinClusterVariance = 0.0
    self.bCubedPrecision = 0.0
    self.bCubedRecall = 0.0
    self.silhouetteCoefficient = 0.0

  def get_withinClusterVariance(self):
    """Return the sum of squared distances to the closest medoid of the best restart."""
    return self.withinClusterVariance

  def get_BCubedPrecision(self):
    """Return the BCubed precision computed by train() (0.0 when no labels were given)."""
    return self.bCubedPrecision

  def get_BCubedRecall(self):
    """Return the BCubed recall computed by train() (0.0 when no labels were given)."""
    return self.bCubedRecall

  def get_silhouetteCoefficient(self):
    """Return the mean silhouette coefficient computed by train()."""
    return self.silhouetteCoefficient

  def train(self):
    """Run all restarts, keep the best clustering and compute the quality metrics.

    Raises:
      ValueError: if numberOfIterations is smaller than 1 (no clustering would exist).
    """
    if self.numberOfIterations < 1:
      raise ValueError("numberOfIterations must be at least 1.")

    lowestError = np.inf
    XNumerical, XNominal, XOrdinal = self.preprocess_X()
    for i in range(self.numberOfIterations):
      representativeObjectsIdx, absoluteErrorCriterion, clusterLabels, withinClusterVariance = self.trainOnce(XNumerical, XNominal, XOrdinal, i)

      if absoluteErrorCriterion < lowestError:
        lowestError = absoluteErrorCriterion
        self.representativeObjectsIdx = representativeObjectsIdx
        self.clusterLabels = clusterLabels
        self.withinClusterVariance = withinClusterVariance
        self.absoluteErrorCriterion = lowestError

    self.silhouetteCoefficient = self.calculate_silhouette_coefficient()
    # The extrinsic (BCubed) metrics need ground-truth labels.
    if self.Y is not None and len(self.Y):
      self.bCubedPrecision = self.calculate_bcubed_precision()
      self.bCubedRecall = self.calculate_bcubed_recall()

  def calculate_distances_from_representativeObjects(self, XNumerical, XOrdinal, XNominal, representativeObjectsIdx):
    """Return the (objects x medoids) distance matrix from every object to the given medoids."""
    return self.calculate_pairwise_distances(XNumerical, XOrdinal, XNominal, slice(None), representativeObjectsIdx)

  def preprocess_X(self):
    """Split X by attribute kind and scale it; self.X itself is never modified.

    Returns:
      (XNumerical, XNominal, XOrdinal); a kind that is absent is an empty array.

    Raises:
      ValueError: if dataType is not one of the supported kinds.
    """
    XNumerical, XNominal, XOrdinal = np.array([]), np.array([]), np.array([])
    if self.dataType == 'numerical':
      XNumerical = self.normalize_numerical_values(self.X.astype(float))

    elif self.dataType == 'nominal':
      XNominal = self.X

    elif self.dataType == 'ordinal':
      XOrdinal = self.format_ordinal_values(self.X)

    elif self.dataType == 'mixed':
      XNumerical, XNominal, XOrdinal, _, _ = self.seperate_mixed_data()
      # A kind with no columns comes back with shape (n, 0); treat it as absent.
      XNumerical = self.normalize_numerical_values(XNumerical) if XNumerical.size else np.array([])
      XNominal = XNominal if XNominal.size else np.array([])
      XOrdinal = self.format_ordinal_values(XOrdinal) if XOrdinal.size else np.array([])

    else:
      raise ValueError("Unsupported dataType '{}'; expected 'numerical', 'nominal', 'ordinal' or 'mixed'.".format(self.dataType))

    return XNumerical, XNominal, XOrdinal

  def trainOnce(self, XNumerical, XNominal, XOrdinal, iterationNum = 0):
    """Run one restart of the medoid-swapping search.

    Args:
      XNumerical, XNominal, XOrdinal: preprocessed arrays from preprocess_X().
      iterationNum: restart number, added to randomState to seed this restart.

    Returns:
      (representativeObjectsIdx, absoluteErrorCriterion, clusterLabels, withinClusterVariance)
    """
    # A local generator draws the same stream as np.random.seed(seed) would,
    # without reseeding NumPy's global generator as a side effect.
    rng = np.random.RandomState(self.randomState + iterationNum)
    numberOfObjects = self.X.shape[0]
    representativeObjectsIdx = rng.choice(numberOfObjects, self.K, replace=False)

    isNonRepresentative = np.ones(numberOfObjects, dtype=bool)
    isNonRepresentative[representativeObjectsIdx] = False
    nonRepresentativeObjectsIdx = np.arange(numberOfObjects)[isNonRepresentative]

    pairwiseDistances = self.calculate_distances_from_representativeObjects(XNumerical, XOrdinal, XNominal, representativeObjectsIdx)
    closestDistances = np.min(pairwiseDistances, axis = 1)
    absoluteErrorCriterion = np.sum(closestDistances)
    withinClusterVariance = np.sum(closestDistances**2)
    clusterLabels = np.argmin(pairwiseDistances, axis = 1)

    while True:
      anyCenterUpdated = False
      for k in range(self.K):
        rng.shuffle(nonRepresentativeObjectsIdx)
        tempRepresentativeObjectsIdx = representativeObjectsIdx.copy()
        for i in range(len(nonRepresentativeObjectsIdx)):
          # Tentatively replace medoid k with candidate i and re-evaluate the cost.
          tempRepresentativeObjectsIdx[k] = nonRepresentativeObjectsIdx[i]
          tempPairwiseDistances = self.calculate_distances_from_representativeObjects(XNumerical, XOrdinal, XNominal, tempRepresentativeObjectsIdx)
          tempClosestDistances = np.min(tempPairwiseDistances, axis = 1)
          tempAbsoluteErrorCriterion = np.sum(tempClosestDistances)

          if tempAbsoluteErrorCriterion < absoluteErrorCriterion:
            # Accept the first improving swap for this medoid and move to the next one.
            absoluteErrorCriterion = tempAbsoluteErrorCriterion
            withinClusterVariance = np.sum(tempClosestDistances**2)
            nonRepresentativeObjectsIdx[i], representativeObjectsIdx[k] = representativeObjectsIdx[k], nonRepresentativeObjectsIdx[i]
            clusterLabels = np.argmin(tempPairwiseDistances, axis = 1)
            anyCenterUpdated = True
            break

      # A full pass without any improving swap means a local optimum was reached.
      if not anyCenterUpdated:
        break

    return representativeObjectsIdx, absoluteErrorCriterion, clusterLabels, withinClusterVariance

  def calculate_hamming_distance(self, X, Y):
    """Return the matrix of mismatch counts between every row of X and every row of Y."""
    X, Y = np.asarray(X), np.asarray(Y)
    # Broadcast to (rows of X, rows of Y, attributes) and count differing attributes.
    return (X[:, None, :] != Y[None, :, :]).astype(int).sum(axis = 2)

  def normalize_numerical_values(self, XNumerical):
    """Min-max scale numerical columns with the min/max recorded in attributeInfo.

    Constant columns (max == min) are mapped to 0 instead of producing NaN.
    """
    numericalAttributes = [attr for attr in self.attributeInfo if attr['type'] == 'numerical']
    mx = np.array([attr['max'] for attr in numericalAttributes], dtype=float)
    mn = np.array([attr['min'] for attr in numericalAttributes], dtype=float)

    valueRange = mx - mn
    valueRange[valueRange == 0] = 1.0
    return (XNumerical - mn) / valueRange

  def format_ordinal_values(self, XOrdinal):
    """Map ordinal values to ranks and scale each column with (rank - 1) / (max rank - 1).

    Columns without a rank mapping are assumed to already hold numeric ranks.
    The input array is left untouched so that preprocessing can be repeated.
    """
    ranks = [attr['rank'] for attr in self.attributeInfo if attr['type'] == 'ordinal']

    XOrdinal = np.asarray(XOrdinal)
    XScaled = np.empty(XOrdinal.shape, dtype=float)
    for j in range(XOrdinal.shape[1]):
      rank = ranks[j]
      if len(rank):
        column = np.array([rank[value] for value in XOrdinal[:, j]], dtype=float)
      else:
        column = XOrdinal[:, j].astype(float)

      # Guard against a column whose highest observed rank is 1 (would be 0/0).
      denominator = np.max(column) - 1
      if denominator == 0:
        denominator = 1.0
      XScaled[:, j] = (column - 1) / denominator

    return XScaled

  def get_attribute_info(self, attributeTypes, ranks, symmetricity):
    """Build one metadata dict per column of X.

    Each dict holds 'idx', 'type', 'distinctValues' (non-numerical only),
    'max' and 'min' (numerical only), 'rank' and 'symmetricity'. Attributes
    without an explicit type take self.dataType. The passed list is not modified.

    Returns:
      numpy object array of the metadata dicts.
    """
    types = list(attributeTypes)
    attributeInfo = []

    for i in range(self.X.shape[1]):
      column = self.X[:, i]
      if len(types) <= i:
        types.append(self.dataType)

      isNumerical = (types[i] == 'numerical')
      numericColumn = column.astype(float) if isNumerical else None

      attributeInfo.append({
          'idx' : i,
          'type' : types[i],
          'distinctValues' : None if isNumerical else np.unique(column),
          'max' : np.amax(numericColumn) if isNumerical else None,
          'min' : np.amin(numericColumn) if isNumerical else None,
          'rank' : ranks[i] if i in ranks else {},
          'symmetricity' : symmetricity[i] if i in symmetricity else None
      })

    return np.array(attributeInfo)

  def seperate_mixed_data(self):
    """Split the columns of X by attribute type.

    Returns:
      (XNumerical, XNominal, XOrdinal, XBinarySymmetric, XBinaryAsymmetric).
      XNumerical is cast to float; the binary parts are empty arrays when no
      binary attribute with that symmetricity exists.
    """
    types = [attr['type'] for attr in self.attributeInfo]
    # Integer index lists: float index arrays cannot be used to index X.
    symmetricIdxs = [attr['idx'] for attr in self.attributeInfo
                     if attr['type'] == 'binary' and attr['symmetricity'] == 'symmetric']
    asymmetricIdxs = [attr['idx'] for attr in self.attributeInfo
                      if attr['type'] == 'binary' and attr['symmetricity'] == 'asymmetric']

    def columnsOfType(typeName):
      return self.X[:, np.array([t == typeName for t in types], dtype=bool)]

    XNumerical = columnsOfType('numerical').astype(float)
    XNominal = columnsOfType('nominal')
    XOrdinal = columnsOfType('ordinal')
    XBinarySymmetric = self.X[:, symmetricIdxs] if len(symmetricIdxs) else np.array([])
    XBinaryAsymmetric = self.X[:, asymmetricIdxs] if len(asymmetricIdxs) else np.array([])

    return XNumerical, XNominal, XOrdinal, XBinarySymmetric, XBinaryAsymmetric

  def calculate_pairwise_distances(self, XNumerical, XOrdinal, XNominal, ObjectsIdx1, objectsIdx2):
    """Return distances between the objects selected by ObjectsIdx1 and by objectsIdx2.

    Both selectors are applied to the rows of the preprocessed arrays (index
    array, boolean mask or slice). Attribute kinds with no data contribute 0.
    """
    ## the squared parts are summed first and the square root is taken once at the end
    pairwiseDistancesNumerical = 0 if np.size(XNumerical) == 0 else cdist(XNumerical[ObjectsIdx1, :], XNumerical[objectsIdx2, :], metric='euclidean') ** 2
    pairwiseDistancesOrdinal = 0 if np.size(XOrdinal) == 0 else cdist(XOrdinal[ObjectsIdx1, :], XOrdinal[objectsIdx2, :], metric='euclidean') ** 2
    # hamming distance for nominal attributes
    pairwiseDistancesNominal = 0 if np.size(XNominal) == 0 else self.calculate_hamming_distance(XNominal[ObjectsIdx1, :], XNominal[objectsIdx2, :])

    pairwiseDistances = pairwiseDistancesNumerical + pairwiseDistancesNominal + pairwiseDistancesOrdinal
    return np.sqrt(pairwiseDistances)

  def calculate_silhouette_coefficient(self):
    """Return the mean silhouette coefficient of the current clustering.

    For object i, a is its mean distance to the other members of its cluster
    (0 for a single-member cluster) and b is the smallest mean distance to the
    members of any other non-empty cluster; s(i) = (b - a) / max(a, b). When
    there is no other non-empty cluster, or a and b are both 0, s(i) is 0.
    """
    XNumerical, XNominal, XOrdinal = self.preprocess_X()
    numberOfObjects = self.X.shape[0]
    # Membership masks and sizes do not depend on i, so compute them once.
    clusterMasks = [(self.clusterLabels == k) for k in range(self.K)]
    clusterSizes = [mask.sum() for mask in clusterMasks]

    s = np.zeros(numberOfObjects)
    for i in range(numberOfObjects):
      a, b = 0.0, np.inf
      for k in range(self.K):
        if clusterSizes[k] == 0:
          continue
        totalDistance = self.calculate_pairwise_distances(XNumerical, XOrdinal, XNominal, np.array([i]), clusterMasks[k]).sum()
        if self.clusterLabels[i] == k:
          # The object itself contributes distance 0, hence the size - 1 divisor.
          a = 0.0 if clusterSizes[k] == 1 else totalDistance / (clusterSizes[k] - 1)
        else:
          b = min(b, totalDistance / clusterSizes[k])

      if np.isfinite(b) and max(a, b) > 0:
        s[i] = (b - a) / max(a, b)

    return np.mean(s)

  def calculate_bcubed_precision(self):
    """Return BCubed precision: per object, the share of its other cluster mates with the same label.

    Objects alone in their cluster contribute 0.
    """
    bCubedPrecision = 0.0
    for i in range(self.X.shape[0]):
      clusterObjectsIdx = (self.clusterLabels == self.clusterLabels[i])
      clusterSize = clusterObjectsIdx.sum()
      correctNess = (self.Y[clusterObjectsIdx] == self.Y[i]).sum()
      # Subtract 1 on both sides to leave the object itself out of the pairs.
      bCubedPrecision += 0 if clusterSize == 1 else (correctNess-1) / (clusterSize-1)

    return bCubedPrecision / self.X.shape[0]

  def calculate_bcubed_recall(self):
    """Return BCubed recall: per object, the share of its other class mates placed in its cluster.

    Objects alone in their class contribute 0.
    """
    bCubedRecall = 0.0
    for i in range(self.X.shape[0]):
      sameClassObjectsIdx = (self.Y == self.Y[i])
      classSize = sameClassObjectsIdx.sum()
      correctNess = (self.clusterLabels[sameClassObjectsIdx] == self.clusterLabels[i]).sum()
      # Subtract 1 on both sides to leave the object itself out of the pairs.
      bCubedRecall += 0 if classSize == 1 else (correctNess-1) / (classSize-1)

    return bCubedRecall / self.X.shape[0]

  def predict_one(self, x):
    """Placeholder: prediction for a single object is not implemented; returns None."""
    return

  def predict(self, XTest):
    """Placeholder: prediction for a test set is not implemented; returns None."""
    return

In [None]:
def plot_cluster(X, K, representativeObjectsIdx, clusterLabels):
    """Scatter-plot the first two columns of X, one colour per cluster, and highlight the medoids.

    Args:
        X: 2-D array; only columns 0 and 1 are plotted.
        K: number of clusters; labels are expected to be 0 .. K-1.
        representativeObjectsIdx: row indices of the medoids in X.
        clusterLabels: array with the cluster label of every row of X.
    """
    color = ["red","green", "blue", "yellow", "black"]
    for k in range(K):
        inCluster = (clusterLabels == k)
        # Cycle through the palette so that K > len(color) no longer raises IndexError.
        plt.scatter(X[inCluster, 0], X[inCluster, 1], color=color[k % len(color)])

    # Large white-edged markers keep the medoids visible even on top of the black cluster.
    plt.scatter(X[representativeObjectsIdx, 0], X[representativeObjectsIdx, 1],
                color="black", marker="X", s=120, edgecolors="white")
    plt.show()

## Running on sample datasets


In [None]:
# dName = 'sampleDataset-1.csv'
# filePath = '{}Datasets/{}'.format(directoryPath, dName)
# df = pd.read_csv(filePath, sep=",", header=None)

# X = df.to_numpy()
# Y = np.array([])
# print('instances = {}, features= {} '.format(X.shape[0], X.shape[1]))

# kMedoidsClustering = KMedoidsClustering(X, Y, K = 4, dataType="numerical")
# kMedoidsClustering.train()
# plot_cluster(kMedoidsClustering.normalize_numerical_values(X), 4, kMedoidsClustering.representativeObjectsIdx, kMedoidsClustering.clusterLabels)

In [None]:
# dName = 'sampleDataset-2.csv'
# filePath = '{}Datasets/{}'.format(directoryPath, dName)
# df = pd.read_csv(filePath, sep=",", header=None)

# X = df.to_numpy()
# Y = np.array([])
# print('instances = {}, features= {} '.format(X.shape[0], X.shape[1]))

# kMedoidsClustering = KMedoidsClustering(X, Y, K = 4, dataType="numerical")
# kMedoidsClustering.train()

In [None]:
# from sklearn.datasets import load_iris
# X, Y = load_iris(return_X_y= True)
# Y = np.squeeze(Y)
# # print(Y)
# # print('instances = {}, features= {} '.format(X.shape[0], X.shape[1]))

# kMedoidsClustering = KMedoidsClustering(X, Y, K = 3, dataType="numerical")
# kMedoidsClustering.train()
# # print(kMedoidsClustering.absoluteErrorCriterion)
# # print(kMedoidsClustering.clusterLabels)

In [None]:
# 