In [1]:
from constants import *
import numpy as np
import os
import numpy as np
from enum import Enum
import pandas as pd

# Reading the data

In [2]:

TRAIN_FILE =os.path.join("C:/Users/jessy/Desktop/Train_Arabic_Digit.txt")

NUM_MFCC = 13

ALL_COEFFS = np.arange(0, NUM_MFCC, 1)

NUM_DIGITS = 10

NUM_TRAIN_BLOCKS = 660 * NUM_DIGITS

In [3]:
def convert_list_to_mask(list_of_indices):
    mask = [False] * NUM_MFCC
    for index in list_of_indices:
        assert (0 <= index < NUM_MFCC)
        mask[index] = True
    return mask

In [4]:
def add_digit(digits, current_digit):
    """
    :param digits: List of digits, which are each a numpy array
    :param current_digit: Current digit, which is a list of the coefficients for the digit
    :return:
    Formats and reshapes the current_digit list as a numpy array and appends it to digits and returns digits
    """
    digit_matrix = np.asarray(current_digit)
    digit_matrix = np.reshape(digit_matrix, (-1, len(current_digit[0])))
    digits.append(digit_matrix)
    return digits

# Loading the data

In [5]:
def load_data(filepath, coeffs=ALL_COEFFS):
    """
    :param filepath: Path for the file to read, either the TRAIN_FILE or TEST_FILE
    :return:
    Organizes the data in the input file into a list of matrices. Each matrix is the data for a spoken digit,
     where the columns are the coefficients and each row is the index of the time window
    """
    mask = convert_list_to_mask(coeffs)

    digits = []
    labels = []
    current_digit = []
    with open(filepath, 'r') as f:
        # For each block, append the items into one long list and then reshape to be (-1, 13) and append to digits
        for idx, line in enumerate(f):
            if idx == 0:
                continue
            if line.isspace() or line == "\n":
                digits = add_digit(digits, current_digit)
                current_digit = []
            else:
                mfcc = list(map(float, line.split(' ')))
                mfcc = np.asarray(mfcc)
                mfcc_filtered = mfcc[mask]
                current_digit.append(mfcc_filtered)
        if len(current_digit):
            digits = add_digit(digits, current_digit)

    num_entries_per_digit = int(len(digits) / 10)  # Should be 660 for train, 220 for test
    for digit_value in range(NUM_DIGITS):
        for i in range(num_entries_per_digit):
            labels.append(digit_value)

    return np.asarray(digits, dtype=object), np.asarray(labels)


In [6]:
def get_train_data(coeffs=ALL_COEFFS):
    return load_data(TRAIN_FILE, coeffs)


In [7]:
coeffs = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
train_data, train_labels = get_train_data()

In [8]:
print(train_labels)
print(np.unique(train_labels))
print(len(train_labels))

[0 0 0 ... 9 9 9]
[0 1 2 3 4 5 6 7 8 9]
6600


# Selecting 20 from each class

In [9]:
from sklearn.preprocessing import StandardScaler

true_labels = np.repeat(np.arange(10), 20)  
points_per_class = 20
selected_indices = []

for class_label in np.unique(true_labels):
    indices_for_class = np.where(train_labels == class_label)[0]
    selected_indices.extend(indices_for_class[:points_per_class])

selected_data = train_data[selected_indices]

print(selected_indices)
print(len(selected_indices))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675, 676, 677, 678, 679, 1320, 1321, 1322, 1323, 1324, 1325, 1326, 1327, 1328, 1329, 1330, 1331, 1332, 1333, 1334, 1335, 1336, 1337, 1338, 1339, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2640, 2641, 2642, 2643, 2644, 2645, 2646, 2647, 2648, 2649, 2650, 2651, 2652, 2653, 2654, 2655, 2656, 2657, 2658, 2659, 3300, 3301, 3302, 3303, 3304, 3305, 3306, 3307, 3308, 3309, 3310, 3311, 3312, 3313, 3314, 3315, 3316, 3317, 3318, 3319, 3960, 3961, 3962, 3963, 3964, 3965, 3966, 3967, 3968, 3969, 3970, 3971, 3972, 3973, 3974, 3975, 3976, 3977, 3978, 3979, 4620, 4621, 4622, 4623, 4624, 4625, 4626, 4627, 4628, 4629, 4630, 4631, 4632, 4633, 4634, 4635, 4636, 4637, 4638, 4639, 5280, 5281, 5282, 5283, 5284, 5285, 5286, 5287, 5288, 5289, 5290, 5291, 5292, 5293, 5294, 5295, 5296, 5297, 5

In [10]:
length_of_selected_series = np.array([])

for serie in selected_data:
    length_of_selected_series = np.append(length_of_selected_series, len(serie))
    
print(max(length_of_selected_series))
print(min(length_of_selected_series))


49.0
23.0


# Padding and Reverse Padding 

In [11]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences


def reverse_padding(padded_sequences, padding_value=-1):
    mask = (padded_sequences != padding_value)
    unpadded_sequences = [seq[mask[i]] for i, seq in enumerate(padded_sequences)]
    return unpadded_sequences

padded_data = pad_sequences(selected_data, padding = 'post' , value= -1, dtype=object)

print(padded_data.shape)


reshaped_data = padded_data.reshape(padded_data.shape[0], -1)

print(reshaped_data.shape)

unpadded_data = reverse_padding(padded_data,-1)


(200, 49, 13)
(200, 637)


In [12]:
print("Original data:")
print(selected_data)

Original data:
[array([[-8.1101e-01, -7.2382e+00,  1.5429e+00, -6.4774e-01,  1.4271e+00,
          6.1356e-01,  3.6516e-01,  8.8906e-02,  4.7031e-01,  9.8844e-01,
          4.4692e-02,  2.0817e-01,  5.1140e-01],
        [-3.7028e-01, -7.1336e+00,  1.8856e+00, -3.4316e-01,  9.6733e-01,
          3.2763e-01,  4.2988e-01,  5.0479e-01,  4.1533e-01,  2.8804e-01,
          8.6109e-02,  6.2690e-01,  7.8115e-01],
        [ 5.9659e-01, -8.3059e+00,  1.6943e+00, -6.6611e-01,  3.4967e-01,
         -1.7425e-01,  8.2077e-01,  1.2611e+00,  4.1653e-01,  5.0050e-01,
          5.7163e-01,  4.5316e-01,  6.4465e-01],
        [ 1.4585e+00, -8.1957e+00,  1.8454e+00, -1.1496e+00,  8.2660e-01,
         -5.1313e-01,  6.7443e-02,  2.5637e-01,  1.1500e-01, -1.0915e-01,
          8.5991e-02,  6.9064e-01,  3.3769e-01],
        [ 2.0824e+00, -8.6670e+00,  1.1995e+00, -1.1240e+00,  1.2445e+00,
         -1.0251e-01,  9.9867e-01,  5.7174e-01,  1.0384e+00,  1.7564e-01,
         -3.2857e-02,  5.3229e-01,  3.2941e-01],


In [13]:
print("\nPadded data:")
print(padded_data)


Padded data:
[[[-0.81101 -7.2382 1.5429 ... 0.044692 0.20817 0.5114]
  [-0.37028 -7.1336 1.8856 ... 0.086109 0.6269 0.78115]
  [0.59659 -8.3059 1.6943 ... 0.57163 0.45316 0.64465]
  ...
  [-1 -1 -1 ... -1 -1 -1]
  [-1 -1 -1 ... -1 -1 -1]
  [-1 -1 -1 ... -1 -1 -1]]

 [[-2.5929 -2.889 0.29554 ... 0.0050341 0.12431 0.44326]
  [-0.099047 -7.1106 1.9788 ... 1.0089 0.7256 0.37813]
  [-0.91929 -6.6428 1.4909 ... 0.31544 0.49671 0.10368]
  ...
  [-1 -1 -1 ... -1 -1 -1]
  [-1 -1 -1 ... -1 -1 -1]
  [-1 -1 -1 ... -1 -1 -1]]

 [[1.2158 -8.1442 1.3789 ... 0.95991 0.38903 0.66992]
  [1.1349 -8.1129 1.2175 ... 0.89815 0.58747 0.84121]
  [-1.5079 -6.2549 1.9707 ... 0.81544 0.024665 0.07027]
  ...
  [-1 -1 -1 ... -1 -1 -1]
  [-1 -1 -1 ... -1 -1 -1]
  [-1 -1 -1 ... -1 -1 -1]]

 ...

 [[3.079 -3.672 0.9792 ... 0.61481 -0.35329 -0.014982]
  [3.669 -3.5752 0.87549 ... 0.29005 0.08524 -0.11282]
  [3.6754 -3.0152 0.58462 ... 0.24898 0.34973 0.143]
  ...
  [-1 -1 -1 ... -1 -1 -1]
  [-1 -1 -1 ... -1 -1 -1]
  

In [14]:
print("\nUnpadded data:")
print(unpadded_data)


Unpadded data:
[array([-0.81101, -7.2382, 1.5429, -0.64774, 1.4271, 0.61356, 0.36516,
       0.088906, 0.47031, 0.98844, 0.044692, 0.20817, 0.5114, -0.37028,
       -7.1336, 1.8856, -0.34316, 0.96733, 0.32763, 0.42988, 0.50479,
       0.41533, 0.28804, 0.086109, 0.6269, 0.78115, 0.59659, -8.3059,
       1.6943, -0.66611, 0.34967, -0.17425, 0.82077, 1.2611, 0.41653,
       0.5005, 0.57163, 0.45316, 0.64465, 1.4585, -8.1957, 1.8454,
       -1.1496, 0.8266, -0.51313, 0.067443, 0.25637, 0.115, -0.10915,
       0.085991, 0.69064, 0.33769, 2.0824, -8.667, 1.1995, -1.124, 1.2445,
       -0.10251, 0.99867, 0.57174, 1.0384, 0.17564, -0.032857, 0.53229,
       0.32941, 1.9996, -8.5369, 1.3252, -0.64902, 1.2954, -0.17102,
       0.78212, -0.14486, 0.18813, -0.1401, 0.67831, 0.59122, -0.001105,
       2.5784, -7.7573, 2.1048, -1.1125, 1.0108, -0.69216, 0.7215,
       0.16652, 0.57481, 0.13809, 0.038214, -0.41387, -0.11564, 2.2584,
       -8.6071, 0.90713, -1.3752, 0.12514, -0.66582, 0.60558, 0.25

In [15]:
print(len(selected_data))
print(len(padded_data))
print(len(unpadded_data))

200
200
200


# User Defined Function

In [16]:
import pyclustering
from pyclustering.utils.metric import type_metric, distance_metric

from fastdtw import fastdtw

padding_value = -1

def unpadd_data(seq, padded_value = -1):
        return [val for val in seq if val != padded_value]

user_function = lambda point1, point2: fastdtw(unpadd_data(point1, padding_value), unpadd_data(point2, padding_value))[0]

metric1 = distance_metric(type_metric.USER_DEFINED, func=user_function)


# Distance Matrix(DTW)

In [17]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

def calculate_distance_matrix(padded_sequences, padding_value=-1, distance_metric_func= None):
    unpadded_sequences = reverse_padding(padded_sequences,padding_value)
    num_sequences = len(unpadded_sequences)
    distance_matrix = np.zeros((num_sequences, num_sequences))

    for i in range(num_sequences):
        for j in range(i + 1, num_sequences):
            distance_matrix[i, j] = distance_metric_func(unpadded_sequences[i], unpadded_sequences[j])
            distance_matrix[j, i] = distance_matrix[i, j]

    return distance_matrix


padded_data = pad_sequences(selected_data, padding = 'post',value= -1, dtype=object)

distance_matrix = calculate_distance_matrix(padded_data, padding_value= -1, distance_metric_func= metric1)

print("Distance Matrix:")
print(distance_matrix)

Distance Matrix:
[[  0.         283.1744909  396.3121355  ... 535.00876525 525.7115176
  443.3007059 ]
 [283.1744909    0.         289.86600915 ... 497.65606065 420.8069698
  431.3949849 ]
 [396.3121355  289.86600915   0.         ... 492.7562228  534.94811745
  530.31201845]
 ...
 [535.00876525 497.65606065 492.7562228  ...   0.         177.83234895
  174.50759635]
 [525.7115176  420.8069698  534.94811745 ... 177.83234895   0.
  140.4218166 ]
 [443.3007059  431.3949849  530.31201845 ... 174.50759635 140.4218166
    0.        ]]


In [18]:
distance_matrix.shape

(200, 200)

# Reshaped data

In [19]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

num_sequences, max_length, num_features = padded_data.shape
print(num_sequences)
print(max_length)
print(num_features)
reshaped_data = padded_data.reshape(num_sequences, -1)  
print(reshaped_data.shape)

200
49
13
(200, 637)


# Custom davis bouldin

In [20]:
from sklearn.metrics import pairwise_distances
import numpy as np

def davis_bouldin_score1(X, labels):
    n_clusters = len(np.unique(labels))
    cluster_centers = [np.mean(X[labels == i], axis=0) for i in range(n_clusters)]
    
    distances = pairwise_distances(X, cluster_centers, metric = metric1)
    
    avg_distances = np.zeros(n_clusters)
    for i in range(n_clusters):
        avg_distances[i] = np.mean(distances[labels == i, i])
    
    db = 0
    for i in range(n_clusters):
        max_similarities = np.max([(avg_distances[i] + avg_distances[j]) / np.linalg.norm(cluster_centers[i] - cluster_centers[j]) for j in range(n_clusters) if j != i])
        db += max_similarities
    
    return db / n_clusters

# MDS(Feature matrix)

In [21]:
from sklearn.cluster import MiniBatchKMeans
from sklearn.manifold import MDS
import numpy as np

n_samples = distance_matrix.shape[0]

mds = MDS(n_components=num_features, dissimilarity='precomputed', random_state=42)

feature_matrix = mds.fit_transform(distance_matrix)

feature_matrix

array([[ 137.28160156,  -92.66078706,   36.44104081, ...,    3.50531105,
         191.9055605 ,  -11.54911272],
       [  69.83199967,  -92.3968055 ,   -4.94500433, ...,  -76.10189938,
          -7.7276799 ,  -92.72915907],
       [  71.35146497,   35.30614316,   -9.57539699, ...,   17.22935937,
        -105.19710616,  -17.5012862 ],
       ...,
       [-122.57954408,   38.54120901,   76.27241992, ...,  -29.35090278,
        -134.23747289,   31.22578564],
       [ -73.66299795,   -1.97885994,   59.22347281, ...,   33.90941534,
         -78.4288463 ,   26.36917582],
       [ -59.47237238,  -74.80314171,   41.52578219, ...,  -21.39522072,
        -104.5172495 ,   21.21372311]])

In [22]:
feature_matrix.shape

(200, 13)

# BIRCH

In [23]:
from sklearn.cluster import Birch
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import v_measure_score


birch_model = Birch(n_clusters= 10, threshold=0.5)

birch_model.fit(feature_matrix)

labels = birch_model.labels_

silhouette_score_value = silhouette_score(distance_matrix, labels, metric = 'precomputed')
davies_bouldin_score_value = davies_bouldin_score(reshaped_data, labels)
davies_bouldin_score_value1 = davis_bouldin_score1(reshaped_data, labels)
v_measure = v_measure_score(true_labels, labels)


print("Cluster Labels:", labels)
print("Silhouette Score: ",silhouette_score_value)
print("Davies Bouldin Score: ",davies_bouldin_score_value)
print("Davies Bouldin Score From Scratch: ",davies_bouldin_score_value1)
print("V-Measure: ",v_measure)



Cluster Labels: [5 5 5 5 5 5 5 2 5 5 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
 4 4 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 9 9 9 9 9 9 9 9 9 9 9
 9 9 9 9 9 9 9 9 9 5 5 5 5 5 5 5 5 5 5 5 3 3 3 3 3 3 3 3 3 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 8 8 8 8 8 8 8 8 8 8 7 7 7 7 7 7 7 7 7 7 6 6 6 2 6
 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6]
Silhouette Score:  0.1842739182209675
Davies Bouldin Score:  1.672855660235199
Davies Bouldin Score From Scratch:  30.2057603437057
V-Measure:  0.8754877180610987


# MiniBatchKMeans

In [24]:
from sklearn.cluster import MiniBatchKMeans
mbk = MiniBatchKMeans(n_clusters=10, random_state=42)
mbk.fit(feature_matrix)
mbk_labels = mbk.labels_

silhouette_score_value = silhouette_score(distance_matrix, mbk_labels,metric = 'precomputed')
davies_bouldin_score_value = davies_bouldin_score(reshaped_data, mbk_labels)
davies_bouldin_score_value1 = davis_bouldin_score1(reshaped_data, mbk_labels)
v_measure = v_measure_score(true_labels, mbk_labels)


print("Cluster Labels:", mbk_labels)
print("Silhouette Score: ",silhouette_score_value)
print("Davies Bouldin Score: ",davies_bouldin_score_value)
print("Davies Bouldin Score From Scratch: ",davies_bouldin_score_value1)
print("V-Measure: ",v_measure)

Cluster Labels: [8 8 8 8 8 8 8 4 8 8 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 0 0 0 0 0 0 0 0 0 0 3 9 0 0 9 0 0 0 9 9 6 6 6 6 6 6 6 6 6 6 2 2 2 2
 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 8 8 8 8 8 8 8 8 8 8 1 1 1 1 1 1 1 1 1 1 3 4 4 3 3 3 4 4
 4 3 4 4 4 4 4 4 4 4 4 4 6 6 6 6 6 6 6 6 6 6 9 9 9 9 9 9 9 9 9 9 7 7 7 7 7
 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7]
Silhouette Score:  0.17252593818700868
Davies Bouldin Score:  2.064277045445933
Davies Bouldin Score From Scratch:  36.45463062622061
V-Measure:  0.8116306243463898


# Results DataFrame 

In [25]:
padded_data_notinf = pad_sequences(selected_data, padding = 'post',value= -1, dtype=object)
reshaped_data_notinf = padded_data_notinf.reshape(num_sequences, -1)  
unpadded_data = reverse_padding(reshaped_data_notinf, -1)
print(padded_data_notinf.shape)
print(reshaped_data_notinf.shape)

(200, 49, 13)
(200, 637)


In [26]:
import pandas as pd
from sklearn.cluster import Birch, KMeans
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import silhouette_score, davies_bouldin_score, v_measure_score

methods = ['Birch', 'KMeans']
v_measure_scores = []
davies_bouldin_scores = []
davies_bouldin_scores_scratch = []
silhouette_scores = []


n_clusters = 10
random_state = 42


for method in methods:
    if method == 'Birch':
        clustering_model = Birch(n_clusters=n_clusters, threshold=0.5)
    elif method == 'KMeans':
        clustering_model = KMeans(n_clusters=n_clusters, random_state=random_state)

    
    clustering_model.fit(feature_matrix)
    labels = clustering_model.labels_


    print("Cluster Labels:", labels)
    print("True Labels:", true_labels)


    
    silhouette_score_value = silhouette_score(distance_matrix, labels, metric='precomputed')

    
    davies_bouldin_score_value = davies_bouldin_score(reshaped_data_notinf, labels)
    
    davies_bouldin_score_value1 = davis_bouldin_score1(reshaped_data_notinf, labels)  

    
    v_measure = v_measure_score(true_labels, labels)

    silhouette_scores.append(silhouette_score_value)

    davies_bouldin_scores.append(davies_bouldin_score_value)

    davies_bouldin_scores_scratch.append(davies_bouldin_score_value1)

    v_measure_scores.append(v_measure)

Cluster Labels: [5 5 5 5 5 5 5 2 5 5 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
 4 4 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 9 9 9 9 9 9 9 9 9 9 9
 9 9 9 9 9 9 9 9 9 5 5 5 5 5 5 5 5 5 5 5 3 3 3 3 3 3 3 3 3 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 8 8 8 8 8 8 8 8 8 8 7 7 7 7 7 7 7 7 7 7 6 6 6 2 6
 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6]
True Labels: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7
 7 7 7 7 7 7 7 7 7 7 7 7 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 9 9 9 9 9
 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9]
Cluster Labels: [2 2 2 2 2 2 2 2 2 2 8 8 8 1 1 1 1 1 1 1 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 0 0 0 9 0 0 0 9 0 0

In [27]:
from IPython.display import display, HTML

scores = pd.DataFrame(columns=['Clustering Method', 'Silhouette Score', 'Davies Bouldin Score', 'Davies Bouldin Score From Scratch', 'V-Measure'])

for k, method in enumerate(methods):
    scores.loc[k] = [method, silhouette_scores[k], davies_bouldin_scores[k], davies_bouldin_scores_scratch[k], v_measure_scores[k]]

display(HTML(scores.to_html(index=False)))

Clustering Method,Silhouette Score,Davies Bouldin Score,Davies Bouldin Score From Scratch,V-Measure
Birch,0.184274,1.672856,30.20576,0.875488
KMeans,0.168123,1.88445,33.582646,0.843298


In [28]:
print(reshaped_data_notinf.shape)
print(type(reshaped_data_notinf))

(200, 637)
<class 'numpy.ndarray'>


# 2D DCT 

# ZIGZAG Algorithm

In [29]:
from scipy.fftpack import dctn
import matplotlib.pyplot as plt

In [30]:
# Zigzag scan of a matrix
# Argument is a two-dimensional matrix of any size,
# not strictly a square one.
# Function returns a 1-by-(m*n) array,
# where m and n are sizes of an input matrix,
# consisting of its items scanned by a zigzag method.
#
# Matlab Code:
# Alexey S. Sokolov a.k.a. nICKEL, Moscow, Russia
# June 2007
# alex.nickel@gmail.com

import numpy as np

def zigzag(input):
    #initializing the variables
    #----------------------------------
    h = 0
    v = 0

    vmin = 0
    hmin = 0

    vmax = input.shape[0]
    hmax = input.shape[1]
    
    #print(vmax ,hmax)

    i = 0

    output = np.zeros(( vmax * hmax))
    #----------------------------------
    while ((v < vmax) and (h < hmax)):
        if ((h + v) % 2) == 0:   # going up
            if (v == vmin):
            	#print(1)
                output[i] = input[v, h]   # if we got to the first line
                if (h == hmax):
                    v = v + 1
                else:
                    h = h + 1                        
                i = i + 1
            elif ((h == hmax -1 ) and (v < vmax)):   # if we got to the last column
            	#print(2)
            	output[i] = input[v, h] 
            	v = v + 1
            	i = i + 1
            elif ((v > vmin) and (h < hmax -1 )):    # all other cases
            	#print(3)
            	output[i] = input[v, h] 
            	v = v - 1
            	h = h + 1
            	i = i + 1
        else:                                    # going down
        	if ((v == vmax -1) and (h <= hmax -1)):       # if we got to the last line
        		#print(4)
        		output[i] = input[v, h] 
        		h = h + 1
        		i = i + 1
        	elif (h == hmin):                  # if we got to the first column
        		#print(5)
        		output[i] = input[v, h] 
        		if (v == vmax -1):
        			h = h + 1
        		else:
        			v = v + 1
        		i = i + 1
        	elif ((v < vmax -1) and (h > hmin)):     # all other cases
        		#print(6)
        		output[i] = input[v, h] 
        		v = v + 1
        		h = h - 1
        		i = i + 1
        if ((v == vmax-1) and (h == hmax-1)):          # bottom right element
        	#print(7)        	
        	output[i] = input[v, h] 
        	break
    #print ('v:',v,', h:',h,', i:',i)
    return output




# Inverse zigzag scan of a matrix
# Arguments are: a 1-by-m*n array, 
# where m & n are vertical & horizontal sizes of an output matrix.
# Function returns a two-dimensional matrix of defined sizes,
# consisting of input array items gathered by a zigzag method.
#
# Matlab Code:
# Alexey S. Sokolov a.k.a. nICKEL, Moscow, Russia
# June 2007
# alex.nickel@gmail.com


def inverse_zigzag(input, vmax, hmax):
	
	#print input.shape

	# initializing the variables
	#----------------------------------
	h = 0
	v = 0

	vmin = 0
	hmin = 0

	output = np.zeros((vmax, hmax))

	i = 0
    #----------------------------------

	while ((v < vmax) and (h < hmax)): 
		#print ('v:',v,', h:',h,', i:',i)   	
		if ((h + v) % 2) == 0:                 # going up
            
			if (v == vmin):
				#print(1)
				
				output[v, h] = input[i]        # if we got to the first line

				if (h == hmax):
					v = v + 1
				else:
					h = h + 1                        

				i = i + 1

			elif ((h == hmax -1 ) and (v < vmax)):   # if we got to the last column
				#print(2)
				output[v, h] = input[i] 
				v = v + 1
				i = i + 1

			elif ((v > vmin) and (h < hmax -1 )):    # all other cases
				#print(3)
				output[v, h] = input[i] 
				v = v - 1
				h = h + 1
				i = i + 1
		else:                                    # going down
			if ((v == vmax -1) and (h <= hmax -1)):       # if we got to the last line
				#print(4)
				output[v, h] = input[i] 
				h = h + 1
				i = i + 1
			elif (h == hmin):                  # if we got to the first column
				#print(5)
				output[v, h] = input[i] 
				if (v == vmax -1):
					h = h + 1
				else:
					v = v + 1
				i = i + 1		
			elif((v < vmax -1) and (h > hmin)):     # all other cases
				output[v, h] = input[i] 
				v = v + 1
				h = h - 1
				i = i + 1
		if ((v == vmax-1) and (h == hmax-1)):          # bottom right element
			#print(7)        	
			output[v, h] = input[i] 
			break


	return output


In [31]:
dct_results = [dctn(series, type=2, norm='ortho') for series in selected_data]

zigzag_results = []
for dct_result in dct_results:
    height, width = dct_result.shape[-2:]
    zigzag_result = zigzag(dct_result)
    zigzag_results.append(zigzag_result)

min_length = min(len(zigzag_result) for zigzag_result in zigzag_results)

truncated_results = [zigzag_result[:min_length] for zigzag_result in zigzag_results]
print(truncated_results)


[array([-6.28782093e+00, -3.02279822e+00,  1.69355129e+00, -6.22025558e-01,
       -5.19900779e+00,  2.15120584e+00,  2.83812967e+00, -5.11825565e+00,
       -2.83373273e+00,  1.08570423e-01,  7.89348698e-01, -3.99131647e+00,
       -4.57829910e+00, -2.78014377e+00, -1.37825744e-01,  2.84952028e+00,
        7.24243143e-01, -3.31932648e+00, -3.48084738e+00, -2.45122523e+00,
       -4.65555197e-01,  4.71130825e-01,  4.05823522e+00, -3.90738486e+00,
       -1.03212129e+00, -1.16083860e+00,  1.03364532e+00,  1.01613665e+01,
        1.35579729e+01, -2.72537636e-01, -4.67606879e-01,  1.67946661e+00,
       -4.75139851e+00,  5.64522507e+00,  1.76080151e+00, -3.32112561e-01,
        7.42014269e-01, -1.38883470e+00,  2.17499663e+00,  2.78112435e+00,
       -3.39649714e+00,  1.45051181e+00, -3.19143981e-01,  4.71297133e+00,
        1.23015971e+01,  1.32463515e+01,  5.67626559e+00, -3.35691221e+00,
        1.37458116e+00, -2.06163887e+00,  8.42469066e-03,  4.07724547e-01,
       -1.47614392e+00, 

In [32]:
z = zigzag(feature_matrix)
print(z)
print(z.shape)

[137.28160156 -92.66078706  69.83199967 ...   0.           0.
   0.        ]
(2600,)


In [33]:
truncated_results = np.array(truncated_results)
print(truncated_results.shape)

(200, 299)


# BIRCH with 2D DCT

In [34]:
from sklearn.cluster import Birch
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import v_measure_score


birch_model_dct = Birch(n_clusters=10, threshold=0.5)
birch_model_dct.fit(truncated_results)
labels_dct = birch_model_dct.labels_



sil = silhouette_score(reshaped_data, labels_dct)
davies_bouldin_score_value = davies_bouldin_score(reshaped_data, labels_dct)

v_measure = v_measure_score(true_labels, labels_dct)


print("Cluster Labels:", labels_dct)
print("Silhouette Score: ",sil)
print("Davies Bouldin Score: ",davies_bouldin_score_value)
print("Davies Bouldin Score From Scratch: ",davies_bouldin_score_value1)
print("V-Measure: ",v_measure)

Cluster Labels: [0 0 0 0 4 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 8 8 8 8 8 8 8 8 8 8 8 8 8 8
 8 8 8 8 8 8 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 6 6 6 6 6 6 6 6 6 6 0
 0 0 6 6 6 6 6 6 6 4 4 4 4 0 4 4 4 4 4 0 4 4 4 4 4 4 4 4 4 2 2 2 9 2 9 2 2
 2 9 9 9 9 9 9 9 9 9 9 9 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 7 7 7 7 7
 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7]
Silhouette Score:  0.2166704835771563
Davies Bouldin Score:  1.7994506425180252
Davies Bouldin Score From Scratch:  33.58264645052774
V-Measure:  0.9218289906371402


In [35]:
reshaped_data.shape

(200, 637)

In [36]:
labels_dct.shape

(200,)

In [40]:
birch_model_dct = Birch(n_clusters=10, threshold=0.5)
birch_model_dct.fit(feature_matrix)
labels_dct = birch_model_dct.labels_


silhouette_score_value = silhouette_score(distance_matrix, labels_dct,metric = 'precomputed')
davies_bouldin_score_value = davies_bouldin_score(reshaped_data, labels_dct)
davies_bouldin_score_value1 = davis_bouldin_score1(reshaped_data, labels_dct)
v_measure = v_measure_score(true_labels, labels_dct)


print("Cluster Labels:", labels_dct)
print("Silhouette Score: ",silhouette_score_value)
print("Davies Bouldin Score: ",davies_bouldin_score_value)
print("Davies Bouldin Score From Scratch: ",davies_bouldin_score_value1)
print("V-Measure: ",v_measure)

Cluster Labels: [5 5 5 5 5 5 5 2 5 5 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
 4 4 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 9 9 9 9 9 9 9 9 9 9 9
 9 9 9 9 9 9 9 9 9 5 5 5 5 5 5 5 5 5 5 5 3 3 3 3 3 3 3 3 3 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 8 8 8 8 8 8 8 8 8 8 7 7 7 7 7 7 7 7 7 7 6 6 6 2 6
 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6]
Silhouette Score:  0.1842739182209675
Davies Bouldin Score:  1.672855660235199
Davies Bouldin Score From Scratch:  30.2057603437057
V-Measure:  0.8754877180610987


# K-Means with 2D DCT

In [42]:
from sklearn.cluster import MiniBatchKMeans
mbk = MiniBatchKMeans(n_clusters=10, random_state=42)
mbk.fit(feature_matrix)
mbk_labels = mbk.labels_

silhouette_score_value = silhouette_score(distance_matrix, mbk_labels,metric = 'precomputed')
davies_bouldin_score_value = davies_bouldin_score(reshaped_data, mbk_labels)
davies_bouldin_score_value1 = davis_bouldin_score1(reshaped_data, mbk_labels)
v_measure = v_measure_score(true_labels, mbk_labels)


print("Cluster Labels:", mbk_labels)
print("Silhouette Score: ",silhouette_score_value)
print("Davies Bouldin Score: ",davies_bouldin_score_value)
print("Davies Bouldin Score From Scratch: ",davies_bouldin_score_value1)
print("V-Measure: ",v_measure)

Cluster Labels: [8 8 8 8 8 8 8 4 8 8 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 0 0 0 0 0 0 0 0 0 0 3 9 0 0 9 0 0 0 9 9 6 6 6 6 6 6 6 6 6 6 2 2 2 2
 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 8 8 8 8 8 8 8 8 8 8 1 1 1 1 1 1 1 1 1 1 3 4 4 3 3 3 4 4
 4 3 4 4 4 4 4 4 4 4 4 4 6 6 6 6 6 6 6 6 6 6 9 9 9 9 9 9 9 9 9 9 7 7 7 7 7
 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7]
Silhouette Score:  0.17252593818700868
Davies Bouldin Score:  2.064277045445933
Davies Bouldin Score From Scratch:  36.45463062622061
V-Measure:  0.8116306243463898


# Results DataFrame(CT + 2D DCT)

In [43]:
import pandas as pd
from sklearn.cluster import Birch, KMeans
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import silhouette_score, davies_bouldin_score, v_measure_score, calinski_harabasz_score

methods = ['Birch', 'KMeans']
v_measure_scores = []
davies_bouldin_scores = []
davies_bouldin_scores_scratch = []
silhouette_scores = []
calinski_harabasz_scores = []

n_clusters = 10
random_state = 42

for method in methods:
    if method == 'Birch':
        clustering_model = Birch(n_clusters=n_clusters, threshold=0.5)
    elif method == 'KMeans':
        clustering_model = KMeans(n_clusters=n_clusters, random_state=random_state)

    
    clustering_model.fit(truncated_results)
    labels = clustering_model.labels_
    print("Cluster Labels:", labels)
    print("True Labels:", true_labels)

    silhouette_score_value = silhouette_score(truncated_results, labels)

    davies_bouldin_score_value = davies_bouldin_score(truncated_results, labels)

    v_measure = v_measure_score(true_labels, labels)

    calinski_harabasz_score1 = calinski_harabasz_score(truncated_results, labels)

    
    silhouette_scores.append(silhouette_score_value)
    davies_bouldin_scores.append(davies_bouldin_score_value)

    calinski_harabasz_scores.append(calinski_harabasz_score1)
    v_measure_scores.append(v_measure)

Cluster Labels: [0 0 0 0 4 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 8 8 8 8 8 8 8 8 8 8 8 8 8 8
 8 8 8 8 8 8 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 6 6 6 6 6 6 6 6 6 6 0
 0 0 6 6 6 6 6 6 6 4 4 4 4 0 4 4 4 4 4 0 4 4 4 4 4 4 4 4 4 2 2 2 9 2 9 2 2
 2 9 9 9 9 9 9 9 9 9 9 9 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 7 7 7 7 7
 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7]
True Labels: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7
 7 7 7 7 7 7 7 7 7 7 7 7 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 9 9 9 9 9
 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9]
Cluster Labels: [8 8 8 8 8 8 8 9 8 8 2 2 8 8 8 8 8 8 8 8 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 0 0 0 0 0 0 0 0 0 0

In [44]:
from IPython.display import display, HTML

scores = pd.DataFrame(columns=['Clustering Method', 'Silhouette Score', 'Davies Bouldin Score', 'Calinski Harabasz Index', 'V-Measure'])

for k, method in enumerate(methods):
    scores.loc[k] = [method, silhouette_scores[k], davies_bouldin_scores[k], calinski_harabasz_scores[k], v_measure_scores[k]]


display(HTML(scores.to_html(index=False)))

Clustering Method,Silhouette Score,Davies Bouldin Score,Calinski Harabasz Index,V-Measure
Birch,0.343417,1.245634,48.423397,0.921829
KMeans,0.310314,1.28258,44.525758,0.910682
