<a href="https://colab.research.google.com/github/mm6396/ClusterComp/blob/main/evaluation_modulus7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##**Imports Section**


***New Experiments***

In [None]:
#-------------------------------------------------------------------------------
# Method for preprocessing datasets and replacing nulls:
#-------------------------------------------------------------------------------

import pandas as pd
import os

def processing_procedure(files):
    """Fill missing values in each Excel workbook and save a processed copy.

    Every workbook is read with ``pd.read_excel``, gaps are filled in place by
    nearest-value interpolation in both directions, and the result is written
    as ``<name>_processed<ext>`` alongside the input.

    Args:
        files: Iterable of paths to Excel workbooks.

    Returns:
        List with the paths of the processed workbooks, in input order.
    """
    processed_files = []
    for filename in files:
        df = pd.read_excel(filename)
        df.interpolate(method='nearest', inplace=True, limit_direction='both')

        # splitext only separates the final extension, so directories or stems
        # that contain dots (e.g. "./data/run.1.xlsx") stay intact.
        stem, extension = os.path.splitext(filename)
        processed_filename = f"{stem}_processed{extension}"

        if os.path.exists(processed_filename):
            print(f"{processed_filename} already exists. Overwriting...")

        # Save (or overwrite) the processed data into the new spreadsheet.
        df.to_excel(processed_filename, index=False)
        processed_files.append(processed_filename)

    return processed_files


In [None]:
#-------------------------------------------------------------------------------
#------------GENERAL KEMENY YOUNG METHOD----------------------------------------
#-------------------------------------------------------------------------------
!pip install cylp
#sudo apt-get install coinor-cbc coinor-clp


import numpy as np
import scipy.sparse as sp
import itertools
import numpy as np
from __future__ import division
from timeit import default_timer as time
from cylp.cy import CyClpSimplex
from cylp.py.pivots import PositiveEdgePivot
import importlib
import itertools
import numpy as np
import scipy.sparse as sp
#from utils import combs, perms
#from fairlearn.reductions import extended_condorcet_simple

def extended_condorcet_simple(rankings):
    """Find the candidate pairs on which every ranking agrees.

    Args:
        rankings: 2-D integer array with one ranking per row. Each row is
            expected to contain every candidate id ``0..n-1`` exactly once.

    Returns:
        scipy.sparse.coo_matrix of shape ``(n, n)`` holding a 1 at ``[a, b]``
        whenever ``a`` is placed before ``b`` in all rankings.
    """
    # assumes: cands -> 0,N-1
    n = rankings.shape[1]
    condorcet_rows, condorcet_cols = [], []

    for cand, other_cand in itertools.combinations(range(n), 2):
        # Column index of each candidate in every ranking (one entry per row).
        cand_pos = np.where(rankings == cand)[1]
        other_pos = np.where(rankings == other_cand)[1]

        if np.all(cand_pos < other_pos):
            condorcet_rows.append(cand)
            condorcet_cols.append(other_cand)
        elif np.all(other_pos < cand_pos):
            condorcet_rows.append(other_cand)
            condorcet_cols.append(cand)

    # The shape is given explicitly: without it scipy infers it from the
    # largest index and raises when no unanimous pair exists.
    mat = sp.coo_matrix(
        (np.ones(len(condorcet_rows)),
         (np.asarray(condorcet_rows, dtype=int),
          np.asarray(condorcet_cols, dtype=int))),
        shape=(n, n))
    return mat

def combs(a, r):
    """
    Build a 2-D array whose rows are the r-length combinations of ``a``.

    Equivalent to ``array(list(combinations(a, r)))``; the tuples are read
    into a structured array first and then viewed as a plain
    ``(n_combinations, r)`` array of ``a``'s dtype.
    """
    source = np.asarray(a)
    # One unnamed field per tuple position, so fromiter can consume tuples.
    record_dtype = np.dtype([('', source.dtype)] * r)
    records = np.fromiter(itertools.combinations(source, r), dtype=record_dtype)
    return records.view(source.dtype).reshape(-1, r)

def perms(a, r):
    """
    Build a 2-D array whose rows are the r-length permutations of ``a``.

    Same technique as ``combs``: tuples are read into a structured array and
    then viewed as a plain ``(n_permutations, r)`` array of ``a``'s dtype.
    """
    source = np.asarray(a)
    # One unnamed field per tuple position, so fromiter can consume tuples.
    record_dtype = np.dtype([('', source.dtype)] * r)
    records = np.fromiter(itertools.permutations(source, r), dtype=record_dtype)
    return records.view(source.dtype).reshape(-1, r)

class KemenyRanking():
    """Kemeny-Young rank aggregation solved as an integer linear program.

    Instantiating the class runs the whole pipeline: parse the vote file,
    build the pairwise preference matrix, solve the ILP with CoinOR CBC
    (through cylp), map the result back to the original labels and print it.

    Input format: one vote per line, ``<voter> : <cand> <cand> ...``. Colons
    are stripped and the line is split on whitespace; the first token is the
    voter, the remaining tokens are the ranked candidates.

    Attributes set along the way:
        votes_arr: (votes x candidates) integer array of candidate ids.
        Q: (n x n) array, ``Q[a, b]`` = number of votes placing a before b.
        obj_sol: objective value reported by the solver.
        aggr_rank: aggregated ranking as candidate ids.
        final_solution: aggregated ranking as original labels.
    """

    def __init__(self, fp, verbose=True, condorcet_red=True):
        """Run the full aggregation on the vote file ``fp``.

        Args:
            fp: Path of the vote file.
            verbose: Print progress information and enable solver logging.
            condorcet_red: Fix unanimously agreed pairs before solving.
        """
        self.verbose = verbose
        # Honour the caller's choice (it used to be forced to True).
        self.condorcet_red = condorcet_red
        self.parse_file(fp)
        self.build_Q()
        self.solve_ilp()
        self.postprocess()
        self.print_sol()

    def parse_file(self, fp):
        """ Reads and preprocesses input

        Raises:
            ValueError: if the file contains no votes.
        """
        # TODO add checks
        # TODO add specification
        if self.verbose:
            print('Parse input')

        rows = []
        with open(fp) as file:
            for line in file:
                # remove newlines and ":", then split the line into tokens
                tokens = line.strip().replace(':', '').split()
                if not tokens:
                    # Blank lines would make the array ragged; skip them.
                    continue
                rows.append(np.array(tokens, dtype=object))

        if not rows:
            raise ValueError('No votes found in {!r}'.format(fp))

        raw_arr = np.array(rows)
        self.voters_raw = raw_arr[:, 0]
        self.votes_raw = raw_arr[:, 1:]

        # Map to 0, N -> only votes!
        self.orig2id = {}
        self.id2orig = {}
        for id_, label in enumerate(np.unique(self.votes_raw)):
            self.orig2id[label] = id_
            self.id2orig[id_] = label
        self.votes_arr = np.vectorize(self.orig2id.get)(self.votes_raw)

        if self.verbose:
            print('     ... finished')

            print('Problem statistics')
            print('  {} votes'.format(self.votes_arr.shape[0]))
            print('  {} candidates'.format(self.votes_arr.shape[1]))

    def build_Q(self):
        """ Creates incidence-matrix: form used in MIP-model """
        if self.verbose:
            print('Build incidence-matrix')

        N, n = self.votes_arr.shape                                              # N votes, n cands
        self.Q = np.zeros((n,n))
        for a,b in itertools.combinations(range(n), 2):
            # Position of each candidate within every vote.
            a_pos = np.where(self.votes_arr == a)[1]
            b_pos = np.where(self.votes_arr == b)[1]
            self.Q[a,b] = np.count_nonzero(a_pos < b_pos)
            self.Q[b,a] = np.count_nonzero(a_pos > b_pos)

        if self.verbose:
            print('     ... finished')

    def solve_ilp(self):
        """ Solves problem exactly using MIP/ILP approach
            Used solver: CoinOR CBC
            Incidence-matrix Q holds complete information needed for opt-process

            Sets ``obj_sol`` (solver objective) and ``aggr_rank`` (candidate
            ids ordered by decreasing column sum of the solution matrix).
        """
        if self.verbose:
            print('Solve: build model')

        if self.condorcet_red:
            condorcet_red_mat = extended_condorcet_simple(self.votes_arr)

        n = self.Q.shape[0]
        x_n = n*n

        model = CyClpSimplex()                                           # MODEL
        x = model.addVariable('x', x_n, isInt=True)                      # VARS

        model.objective = self.Q.ravel()                                 # OBJ

        # x_ab = boolean (already int; need to constrain to [0,1])
        model += sp.eye(x_n) * x >= np.zeros(x_n)
        model += sp.eye(x_n) * x <= np.ones(x_n)

        # constraints for every pair: x[a,b] + x[b,a] == 1
        start_time = time()
        n_pairwise_constr = n*(n-1)//2
        if self.verbose:
            print('  # pairwise constr: ', n_pairwise_constr)

        # Somewhat bloated just to get some vectorization / speed !
        combs_ = combs(range(n), 2)

        inds_a = np.ravel_multi_index(combs_.T, (n, n))
        inds_b = np.ravel_multi_index(combs_.T[::-1], (n, n))

        row_inds = np.tile(np.arange(n_pairwise_constr), 2)
        col_inds = np.hstack((inds_a, inds_b))

        pairwise_constraints = sp.coo_matrix((np.ones(n_pairwise_constr*2),
                                              (row_inds, col_inds)),
                                              shape=(n_pairwise_constr, n*n))
        end_time = time()
        if self.verbose:
            print("    Took {:.{prec}f} secs".format(end_time - start_time, prec=3))

        # and for every cycle of length 3: x[a,b] + x[b,c] + x[c,a] >= 1
        start_time = time()
        n_triangle_constrs = n*(n-1)*(n-2)
        if self.verbose:
            print('  # triangle constr: ', n_triangle_constrs)

        # Somewhat bloated just to get some vectorization / speed !
        perms_ = perms(range(n), 3)

        inds_a = np.ravel_multi_index(perms_.T[(0,1), :], (n, n))
        inds_b = np.ravel_multi_index(perms_.T[(1,2), :], (n, n))
        inds_c = np.ravel_multi_index(perms_.T[(2,0), :], (n, n))

        row_inds = np.tile(np.arange(n_triangle_constrs), 3)
        col_inds = np.hstack((inds_a, inds_b, inds_c))

        triangle_constraints = sp.coo_matrix((np.ones(n_triangle_constrs*3),
                                              (row_inds, col_inds)),
                                              shape=(n_triangle_constrs, n*n))
        end_time = time()
        if self.verbose:
            print("    Took {:.{prec}f} secs".format(end_time - start_time, prec=3))


        model += pairwise_constraints * x == np.ones(n_pairwise_constr)
        model += triangle_constraints * x >= np.ones(n_triangle_constrs)

        if self.condorcet_red:
            # For each unanimous pair (I before J) fix x[J,I] = 1 and x[I,J] = 0.
            I, J, V = sp.find(condorcet_red_mat)
            indices_pos = np.ravel_multi_index([J, I], (n,n))
            indices_neg = np.ravel_multi_index([I, J], (n,n))
            nnz = len(indices_pos)

            if self.verbose:
                print('  Extended Condorcet reductions: {} * 2 relations fixed'.format(nnz))

            lhs = sp.coo_matrix((np.ones(nnz*2),
                        (np.arange(nnz*2),
                         np.hstack((indices_pos, indices_neg)))),
                  shape=(nnz*2, n*n))
            rhs = np.hstack((np.ones(len(indices_pos)), np.zeros(len(indices_neg))))
            model += lhs * x == rhs

        cbcModel = model.getCbcModel()  # Clp -> Cbc model / LP -> MIP
        cbcModel.logLevel = self.verbose

        if self.verbose:
            print('Solve: run MIP\n')
        start_time = time()
        cbcModel.solve()                    #-> "Call CbcMain. Solve the problem
                                            #   "using the same parameters used
                                            #   "by CbcSolver."
                                            # This deviates from cylp's docs which are sparse!
                                            # -> preprocessing will be used and is very important!
        end_time = time()
        if self.verbose:
            print("  CoinOR CBC used {:.{prec}f} secs".format(end_time - start_time, prec=3))

        x_sol = cbcModel.primalVariableSolution['x']
        self.obj_sol = cbcModel.objectiveValue
        x = np.array(x_sol).reshape((n, n)).round().astype(int)
        # Candidates ordered by decreasing column sum of the 0/1 solution.
        self.aggr_rank = np.argsort(x.sum(axis=0))[::-1]

    def postprocess(self):
        """Translate the aggregated ranking from ids back to original labels."""
        if self.verbose:
            print('Postprocessing')
        self.final_solution = np.vectorize(self.id2orig.get)(self.aggr_rank)
        if self.verbose:
            print('    ... finished')

    def print_sol(self):
        """Print the objective value and the aggregated ranking."""
        print('--------')
        print('SOLUTION')
        print('  objective: ', self.obj_sol)
        print('  aggregation: ')
        print(self.final_solution)




# Please Run : KemenyRanking('/content/yourfile_name.txt')






In [None]:
# Visualizing data
!pip install scikit-learn-extra
!pip install fuzzy-c-means


import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, OPTICS, Birch
from sklearn_extra.cluster import KMedoids
from sklearn.mixture import GaussianMixture
from fcmeans import FCM
import matplotlib.pyplot as plt


# Pre-process the raw workbooks, drop the 'Depth' column and standardise the
# remaining features of every dataset.
files = ["test1.xlsx", "test2.xlsx", "test3.xlsx", "test4.xlsx"]
files_p = processing_procedure(files)
datasets = []
for file in files_p:
    frame = pd.read_excel(file, engine='openpyxl')
    datasets.append(frame.drop('Depth', axis=1))
scaled_datasets = [StandardScaler().fit_transform(df) for df in datasets]

# Clustering methods to compare, keyed by the name shown in the plot titles.
clustering_algorithms = {
    'KMeans': KMeans(n_clusters=3, random_state=42),
    'DBSCAN': DBSCAN(eps=0.5),
    # linkage may be one of 'ward', 'complete', 'average', 'single'
    'Agglomerative': AgglomerativeClustering(n_clusters=3, linkage='complete'),
    'OPTICS': OPTICS(),
    'KMedoids': KMedoids(n_clusters=3, random_state=42),
    'GMM': GaussianMixture(n_components=3, random_state=42),
    'BIRCH': Birch(n_clusters=3),
    'FCM': FCM(n_clusters=3),
}

# One row of panels per dataset, one column per algorithm.
n_rows, n_cols = len(scaled_datasets), len(clustering_algorithms)
fig, axs = plt.subplots(n_rows, n_cols, figsize=(20, 15))

for row, scaled_data in enumerate(scaled_datasets):
    # The 2-D PCA projection is only used for plotting; the clustering itself
    # is fitted on the full scaled feature set.
    pca = PCA(n_components=2)
    data_pca = pca.fit_transform(scaled_data)
    xs, ys = data_pca[:, 0], data_pca[:, 1]

    for col, (algorithm_name, algorithm) in enumerate(clustering_algorithms.items()):
        if algorithm_name == 'FCM':
            # FCM is fitted first and the label is the argmax over its `u`
            # attribute (presumably the membership matrix).
            algorithm.fit(scaled_data)
            cluster_labels = algorithm.u.argmax(axis=1)
        else:
            cluster_labels = algorithm.fit_predict(scaled_data)

        ax = axs[row, col]
        ax.scatter(xs, ys, c=cluster_labels, cmap='viridis', edgecolor='k', s=50)
        ax.set_title(f'Dataset {row + 1} using {algorithm_name}')

        if row == 0:
            ax.set_xlabel(algorithm_name)
        if col == 0:
            ax.set_ylabel(f'Dataset {row + 1}')

plt.tight_layout()
plt.show()

In [None]:
#------------------------------------------------------------------------------
# Visualization: Stiffness based on X and Y positions
#------------------------------------------------------------------------------
import pandas as pd
import matplotlib.pyplot as plt

# Scatter map of Stiffness over the (X, Y) positions, one panel per dataset.
files = ["test1.xlsx", "test2.xlsx", "test3.xlsx", "test4.xlsx"]
files_p = processing_procedure(files)

datasets = [pd.read_excel(file, engine='openpyxl') for file in files_p]

# Use one colour scale for all panels. Without it every scatter autoscales to
# its own data and the single colourbar would only be valid for the last panel.
vmin = min(df['Stiffness'].min() for df in datasets)
vmax = max(df['Stiffness'].max() for df in datasets)

fig, axs = plt.subplots(1, len(datasets), figsize=(15, 5), sharex=True, sharey=True)
cbar_ax = fig.add_axes([.91, .3, .03, .4])

for ax, df, file in zip(axs, datasets, files_p):
    sc = ax.scatter(df['X Position'], df['Y Position'], c=df['Stiffness'],
                    cmap='viridis', vmin=vmin, vmax=vmax)
    ax.set_title(file)
    ax.set_xlabel('X Position')
    if ax is axs[0]:
        # The y axis is shared, so only the first panel is labelled.
        ax.set_ylabel('Y Position')

fig.colorbar(sc, cax=cbar_ax, label='Stiffness')
plt.suptitle('Stiffness based on X and Y Position')
plt.tight_layout()
# Leave room on the right for the colourbar axes added above.
plt.subplots_adjust(right=0.9)
plt.show()



#------------------------------------------------------------------------------
# Visualization: Load based on X and Y positions
#------------------------------------------------------------------------------
import pandas as pd
import matplotlib.pyplot as plt


# Scatter map of Load over the (X, Y) positions, one panel per dataset.
files = ["test1.xlsx", "test2.xlsx", "test3.xlsx", "test4.xlsx"]
files_p = processing_procedure(files)

datasets = [pd.read_excel(file, engine='openpyxl') for file in files_p]

# Use one colour scale for all panels. Without it every scatter autoscales to
# its own data and the single colourbar would only be valid for the last panel.
vmin = min(df['Load'].min() for df in datasets)
vmax = max(df['Load'].max() for df in datasets)

fig, axs = plt.subplots(1, len(datasets), figsize=(15, 5), sharex=True, sharey=True)
cbar_ax = fig.add_axes([.91, .3, .03, .4])

for ax, df, file in zip(axs, datasets, files_p):
    sc = ax.scatter(df['X Position'], df['Y Position'], c=df['Load'],
                    cmap='viridis', vmin=vmin, vmax=vmax)
    ax.set_title(file)
    ax.set_xlabel('X Position')
    if ax is axs[0]:
        # The y axis is shared, so only the first panel is labelled.
        ax.set_ylabel('Y Position')

fig.colorbar(sc, cax=cbar_ax, label='Load')

plt.suptitle('Load based on X and Y Position')
plt.tight_layout()
# Leave room on the right for the colourbar axes added above.
plt.subplots_adjust(right=0.9)
plt.show()



#-------------------------------------------------------------------------------
# Visualization: MODULUS based on X and Y positions
#-------------------------------------------------------------------------------
import pandas as pd
import matplotlib.pyplot as plt


# Scatter map of MODULUS over the (X, Y) positions, one panel per dataset.
files = ["test1.xlsx", "test2.xlsx", "test3.xlsx", "test4.xlsx"]
files_p = processing_procedure(files)

datasets = [pd.read_excel(file, engine='openpyxl') for file in files_p]

# Use one colour scale for all panels. Without it every scatter autoscales to
# its own data and the single colourbar would only be valid for the last panel.
vmin = min(df['MODULUS'].min() for df in datasets)
vmax = max(df['MODULUS'].max() for df in datasets)

fig, axs = plt.subplots(1, len(datasets), figsize=(15, 5), sharex=True, sharey=True)
cbar_ax = fig.add_axes([.91, .3, .03, .4])

for ax, df, file in zip(axs, datasets, files_p):
    sc = ax.scatter(df['X Position'], df['Y Position'], c=df['MODULUS'],
                    cmap='viridis', vmin=vmin, vmax=vmax)
    ax.set_title(file)
    ax.set_xlabel('X Position')
    if ax is axs[0]:
        # The y axis is shared, so only the first panel is labelled.
        ax.set_ylabel('Y Position')

fig.colorbar(sc, cax=cbar_ax, label='Modulus')

plt.suptitle('Modulus based on X and Y Position')
plt.tight_layout()
# Leave room on the right for the colourbar axes added above.
plt.subplots_adjust(right=0.9)
plt.show()


#-------------------------------------------------------------------------------
#Show HARDNESS based on X and Y positions
#-------------------------------------------------------------------------------


import pandas as pd
import matplotlib.pyplot as plt


# Scatter map of HARDNESS over the (X, Y) positions, one panel per dataset.
files = ["test1.xlsx", "test2.xlsx", "test3.xlsx", "test4.xlsx"]
files_p = processing_procedure(files)

datasets = [pd.read_excel(file, engine='openpyxl') for file in files_p]

# Use one colour scale for all panels. Without it every scatter autoscales to
# its own data and the single colourbar would only be valid for the last panel.
vmin = min(df['HARDNESS'].min() for df in datasets)
vmax = max(df['HARDNESS'].max() for df in datasets)

fig, axs = plt.subplots(1, len(datasets), figsize=(15, 5), sharex=True, sharey=True)
cbar_ax = fig.add_axes([.91, .3, .03, .4])

for ax, df, file in zip(axs, datasets, files_p):
    sc = ax.scatter(df['X Position'], df['Y Position'], c=df['HARDNESS'],
                    cmap='viridis', vmin=vmin, vmax=vmax)
    ax.set_title(file)
    ax.set_xlabel('X Position')
    if ax is axs[0]:
        # The y axis is shared, so only the first panel is labelled.
        ax.set_ylabel('Y Position')

fig.colorbar(sc, cax=cbar_ax, label='Hardness')

plt.suptitle('Hardness based on X and Y Position')
plt.tight_layout()
# Leave room on the right for the colourbar axes added above.
plt.subplots_adjust(right=0.9)
plt.show()

In [None]:
# Install the required library packages
!pip install scikit-learn-extra
!pip install fuzzy-c-means

In [None]:
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
#Tuning parameters for Agglomerative
#-------------------------------------------------------------------------------

import os,csv
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

# Hyper-parameter grid explored for AgglomerativeClustering.
param_grid = {
    'n_clusters': [3],
    'affinity': ['euclidean', 'manhattan'],
    'linkage': ['ward', 'complete', 'average']
}

files = ["test1.xlsx", "test2.xlsx", "test3.xlsx", "test4.xlsx"]
files_p = processing_procedure(files)

# Columns excluded from the feature set before clustering.
columns_to_drop = ['Depth', 'X Position', 'Y Position', 'Z Position', 'Load', 'Stiffness']

eval_files = []
tuning_files = []

# Mapping of files to the position we're interested in
position_mapping = {
    0: 'X Position',  # test1
    1: 'Y Position',  # test2
    2: 'X Position',  # test3
    3: 'Y Position'   # test4
}

# Split every processed workbook at coordinate 125 along its mapped axis and
# save both halves.
# NOTE(review): rows whose coordinate is exactly 125 end up in neither file -
# confirm this is intended.
for idx, file in enumerate(files_p):
    df = pd.read_excel(file, engine='openpyxl')
    position = position_mapping[idx]

    eval_data = df[df[position] < 125]
    eval_filename = f'test_{idx+1}_eval.xlsx'
    eval_data.to_excel(eval_filename, index=False)
    eval_files.append(eval_filename)

    # tuning
    tuning_data = df[df[position] > 125]
    tuning_filename = f'test_{idx+1}_tuning.xlsx'
    tuning_data.to_excel(tuning_filename, index=False)
    tuning_files.append(tuning_filename)


def _make_agglomerative(n_clusters, affinity, linkage):
    """Build an AgglomerativeClustering model on old and new scikit-learn.

    Newer scikit-learn releases take the distance as ``metric`` (``affinity``
    was deprecated and later removed); older releases only know ``affinity``
    and reject ``metric`` with a TypeError.
    """
    try:
        return AgglomerativeClustering(n_clusters=n_clusters, metric=affinity, linkage=linkage)
    except TypeError:
        return AgglomerativeClustering(n_clusters=n_clusters, affinity=affinity, linkage=linkage)


results = []
max_silhouette = -1
best_params = {}

# Score every parameter combination on every evaluation file.
for eval_filename in eval_files:
    eval_df = pd.read_excel(eval_filename, engine='openpyxl')
    eval_df = eval_df.drop(columns=columns_to_drop, errors='ignore')

    scaled_eval = StandardScaler().fit_transform(eval_df)

    for n_clusters in param_grid['n_clusters']:
        for affinity in param_grid['affinity']:
            for linkage in param_grid['linkage']:
                # 'ward' can only work with 'euclidean'
                if linkage == 'ward' and affinity != 'euclidean':
                    continue

                model = _make_agglomerative(n_clusters, affinity, linkage)
                labels = model.fit_predict(scaled_eval)

                score = silhouette_score(scaled_eval, labels)

                # Append results to the list
                results.append({
                    'file': eval_filename,
                    'n_clusters': n_clusters,
                    'affinity': affinity,
                    'linkage': linkage,
                    'silhouette_score': score
                })

df_results = pd.DataFrame(results)

def concat_parameters(group):
    """Join a group's quoted parameter labels into one space-separated string."""
    labels = list(group['parameters_combinations'])
    return ' '.join(labels)

# Collapse the three hyper-parameter columns (positions 1-3) into a single
# quoted label such as "3,euclidean,ward", then drop the separate columns.
param_columns = df_results.columns[1:4]
df_results['parameters_combinations'] = df_results[param_columns].apply(
    lambda row: '"{}"'.format(','.join(row.dropna().astype(str))), axis=1)
df_results = df_results.drop(param_columns, axis=1)

# Best score first; the rank is computed within each evaluation file.
df_results = df_results.sort_values(by='silhouette_score', ascending=False)
per_file_scores = df_results.groupby('file')['silhouette_score']
df_results['rank'] = per_file_scores.rank(method='first', ascending=False).astype(int)

# One string per file listing its labels from best to worst.
grouped_combinations = df_results.groupby(['file']).apply(concat_parameters)

output_file = "Agglomerative+parameterTuning.xlsx"
df_results.to_excel(output_file, index=False)

# Write one "A<i> : <labels>" line per file; this is the vote list that
# KemenyRanking parses.
with open('tuning_list_kemeny_agglomerative.txt', 'w') as f:
    for i, group_string in enumerate(grouped_combinations, start=1):
        f.write(f'A{i} : {group_string}\n')












In [None]:
# Read the vote list from the working directory, which is where the tuning
# cell writes it; the hard-coded /content/ prefix only resolved when the
# working directory happened to be /content.
KemenyRanking('tuning_list_kemeny_agglomerative.txt')

In [None]:
#-------------------------------------------------------------------------------
#Tuning parameters for KMeans
#-------------------------------------------------------------------------------

import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Hyper-parameter grid explored for KMeans.
param_grid_KMeans = {
    'n_clusters': [3],
    'init': ['k-means++', 'random'],
    'n_init': [10, 20, 30],
    'max_iter': [100, 200, 300],
    'random_state': [0, 42]
}

files = ["test1.xlsx", "test2.xlsx", "test3.xlsx", "test4.xlsx"]
files_p = processing_procedure(files)

# Columns excluded from the feature set before clustering.
columns_to_drop = ['Depth', 'X Position', 'Y Position', 'Z Position', 'Load', 'Stiffness']

# Axis along which each workbook is split.
position_mapping = {
    0: 'X Position',  # test1
    1: 'Y Position',  # test2
    2: 'X Position',  # test3
    3: 'Y Position'   # test4
}

eval_files = []
tuning_files = []

# Split every processed workbook at coordinate 125 and save both parts.
for idx, file in enumerate(files_p):
    df = pd.read_excel(file, engine='openpyxl')
    position = position_mapping[idx]

    eval_data, tuning_data = df[df[position] < 125], df[df[position] > 125]
    eval_filename = f'test_{idx+1}_eval.xlsx'
    tuning_filename = f'test_{idx+1}_tuning.xlsx'

    eval_data.to_excel(eval_filename, index=False)
    eval_files.append(eval_filename)
    tuning_data.to_excel(tuning_filename, index=False)
    tuning_files.append(tuning_filename)

results = []
max_silhouette = -1
best_params = {}

# Every parameter combination, in the same order as the nested grid loops.
kmeans_settings = [
    {'n_clusters': n_clusters, 'init': init, 'n_init': n_init,
     'max_iter': max_iter, 'random_state': random_state}
    for n_clusters in param_grid_KMeans['n_clusters']
    for init in param_grid_KMeans['init']
    for n_init in param_grid_KMeans['n_init']
    for max_iter in param_grid_KMeans['max_iter']
    for random_state in param_grid_KMeans['random_state']
]

# Score every combination on every evaluation file.
for eval_filename in eval_files:
    eval_df = pd.read_excel(eval_filename, engine='openpyxl')
    eval_df = eval_df.drop(columns=columns_to_drop, errors='ignore')
    scaled_eval = StandardScaler().fit_transform(eval_df)

    for settings in kmeans_settings:
        model = KMeans(**settings)
        labels = model.fit_predict(scaled_eval)
        score = silhouette_score(scaled_eval, labels)

        # Key order fixes the column order of the results table.
        results.append({'file': eval_filename, **settings, 'silhouette_score': score})

df_results = pd.DataFrame(results)

def concat_parameters(group):
    """Join a group's quoted parameter labels into one space-separated string."""
    labels = list(group['parameters_combinations'])
    return ' '.join(labels)

# Collapse the five hyper-parameter columns (positions 1-5) into a single
# quoted, comma-joined label, then drop the separate columns.
param_columns = df_results.columns[1:6]
df_results['parameters_combinations'] = df_results[param_columns].apply(
    lambda row: '"{}"'.format(','.join(row.dropna().astype(str))), axis=1)
df_results = df_results.drop(param_columns, axis=1)

# Best score first; the rank is computed within each evaluation file.
df_results = df_results.sort_values(by='silhouette_score', ascending=False)
per_file_scores = df_results.groupby('file')['silhouette_score']
df_results['rank'] = per_file_scores.rank(method='first', ascending=False).astype(int)

# One string per file listing its labels from best to worst.
grouped_combinations = df_results.groupby(['file']).apply(concat_parameters)

output_file = "Kmeans+parameterTuning.xlsx"
df_results.to_excel(output_file, index=False)

# Write one "A<i> : <labels>" line per file; this is the vote list that
# KemenyRanking parses.
with open('tuning_list_Kemeny_kmeans.txt', 'w') as f:
    for i, group_string in enumerate(grouped_combinations, start=1):
        f.write(f'A{i} : {group_string}\n')


In [None]:
# Read the vote list from the working directory, which is where the tuning
# cell writes it; the hard-coded /content/ prefix only resolved when the
# working directory happened to be /content.
KemenyRanking('tuning_list_Kemeny_kmeans.txt')

In [None]:
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
#Tuning parameters for DBSCAN
#-------------------------------------------------------------------------------
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN  # Import DBSCAN
from sklearn.metrics import silhouette_score

# Hyper-parameter grid explored for DBSCAN.
param_grid_DBSCAN = {
    'eps': [0.1, 0.2, 0.3],
    'min_samples': [5, 10, 15]
}

files = ["test1.xlsx", "test2.xlsx", "test3.xlsx", "test4.xlsx"]
files_p = processing_procedure(files)

# Columns excluded from the feature set before clustering.
columns_to_drop = ['Depth', 'X Position', 'Y Position', 'Z Position', 'Load', 'Stiffness']

# Axis along which each workbook is split.
position_mapping = {
    0: 'X Position',  # test1
    1: 'Y Position',  # test2
    2: 'X Position',  # test3
    3: 'Y Position'   # test4
}

eval_files = []
tuning_files = []

# Split every processed workbook at coordinate 125 and save both parts.
for idx, file in enumerate(files_p):
    df = pd.read_excel(file, engine='openpyxl')
    position = position_mapping[idx]

    eval_data, tuning_data = df[df[position] < 125], df[df[position] > 125]
    eval_filename = f'test_{idx+1}_eval.xlsx'
    tuning_filename = f'test_{idx+1}_tuning.xlsx'

    eval_data.to_excel(eval_filename, index=False)
    eval_files.append(eval_filename)
    tuning_data.to_excel(tuning_filename, index=False)
    tuning_files.append(tuning_filename)

results = []
max_silhouette = -1
best_params = {}

# Every parameter combination, in the same order as the nested grid loops.
dbscan_settings = [
    {'eps': eps, 'min_samples': min_samples}
    for eps in param_grid_DBSCAN['eps']
    for min_samples in param_grid_DBSCAN['min_samples']
]

# Score every combination on every evaluation file.
for eval_filename in eval_files:
    eval_df = pd.read_excel(eval_filename, engine='openpyxl')
    eval_df = eval_df.drop(columns=columns_to_drop, errors='ignore')
    scaled_eval = StandardScaler().fit_transform(eval_df)

    for settings in dbscan_settings:
        model = DBSCAN(**settings)
        labels = model.fit_predict(scaled_eval)

        # Points labelled -1 are noise; they are left out of the silhouette.
        core_samples_mask = labels != -1
        labels_core = labels[core_samples_mask]
        scaled_eval_core = scaled_eval[core_samples_mask]

        # The silhouette needs at least two clusters; otherwise record -1.
        if len(set(labels_core)) > 1:
            score = silhouette_score(scaled_eval_core, labels_core)
        else:
            score = -1

        # Key order fixes the column order of the results table.
        results.append({'file': eval_filename, **settings, 'silhouette_score': score})

df_results = pd.DataFrame(results)

def concat_parameters(group):
    """Join a group's quoted parameter labels into one space-separated string."""
    labels = list(group['parameters_combinations'])
    return ' '.join(labels)

# Build one quoted label per row, e.g. "0.1,5". Values are read column by
# column so integer parameters keep their integer form; a row-wise apply over
# mixed int/float columns upcasts them and would yield "0.1,5.0".
param_columns = ['eps', 'min_samples']
df_results['parameters_combinations'] = [
    '"' + ','.join(str(value) for value in values if pd.notna(value)) + '"'
    for values in zip(*(df_results[column].tolist() for column in param_columns))
]
df_results = df_results.drop(columns=param_columns)

# Best score first; the rank is computed within each evaluation file.
df_results = df_results.sort_values(by='silhouette_score', ascending=False)
df_results['rank'] = df_results.groupby('file')['silhouette_score'].rank(method='first', ascending=False).astype(int)

# One string per file listing its labels from best to worst.
grouped_combinations = df_results.groupby(['file']).apply(concat_parameters)

output_file = "DBSCAN+parameterTuning.xlsx"
df_results.to_excel(output_file, index=False)

# Write one "A<i> : <labels>" line per file; this is the vote list that
# KemenyRanking parses.
with open('tuning_list_Kemeny_DBSCAN.txt', 'w') as f:
    for i, group_string in enumerate(grouped_combinations, start=1):
        f.write(f'A{i} : {group_string}\n')



In [None]:
# Read the vote list from the working directory, which is where the tuning
# cell writes it; the hard-coded /content/ prefix only resolved when the
# working directory happened to be /content.
KemenyRanking('tuning_list_Kemeny_DBSCAN.txt')

In [None]:

#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
#Tuning parameters for FCM
#-------------------------------------------------------------------------------

!pip install Fuzzy-c-means
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
from fcmeans import FCM
from sklearn.metrics import silhouette_score

files = ["test1.xlsx", "test2.xlsx", "test3.xlsx", "test4.xlsx"]
files_p = processing_procedure(files)

# Columns to be excluded
columns_to_drop = ['Depth', 'X Position', 'Y Position', 'Z Position', 'Load', 'Stiffness']

# Axis along which each workbook is split.
position_mapping = {
    0: 'X Position',  # test1
    1: 'Y Position',  # test2
    2: 'X Position',  # test3
    3: 'Y Position'   # test4
}

eval_files = []
tuning_files = []

# Split every processed workbook at coordinate 125 and save both parts.
for idx, file in enumerate(files_p):
    df = pd.read_excel(file, engine='openpyxl')
    position = position_mapping[idx]

    eval_data, tuning_data = df[df[position] < 125], df[df[position] > 125]
    eval_filename = f'test_{idx+1}_eval.xlsx'
    tuning_filename = f'test_{idx+1}_tuning.xlsx'

    eval_data.to_excel(eval_filename, index=False)
    eval_files.append(eval_filename)
    tuning_data.to_excel(tuning_filename, index=False)
    tuning_files.append(tuning_filename)

# Define hyperparameters to search
n_clusters_values = [3]
m_values = [1.1, 1.5, 2.0]

results = []
best_score = -1
best_params = None

# Every parameter combination, in the same order as the nested grid loops.
fcm_settings = [
    {'n_clusters': n_clusters, 'm': m}
    for n_clusters in n_clusters_values
    for m in m_values
]

# Score every combination on every evaluation file.
for eval_filename in eval_files:
    eval_df = pd.read_excel(eval_filename, engine='openpyxl')
    eval_df = eval_df.drop(columns=columns_to_drop, errors='ignore')
    scaled_data = StandardScaler().fit_transform(eval_df)

    for settings in fcm_settings:
        fcm = FCM(**settings)
        fcm.fit(scaled_data)
        cluster_labels = fcm.predict(scaled_data)

        score = silhouette_score(scaled_data, cluster_labels)

        # Key order fixes the column order of the results table.
        results.append({'file': eval_filename, **settings, 'silhouette_score': score})


df_results = pd.DataFrame(results)

def concat_parameters(group):
    """Join a group's quoted parameter labels into one space-separated string."""
    labels = list(group['parameters_combinations'])
    return ' '.join(labels)

# Build one quoted label per row, e.g. "3,1.1". Values are read column by
# column so the integer cluster count keeps its integer form; a row-wise apply
# over mixed int/float columns upcasts it and would yield "3.0,1.1".
param_columns = ['n_clusters', 'm']
df_results['parameters_combinations'] = [
    '"' + ','.join(str(value) for value in values if pd.notna(value)) + '"'
    for values in zip(*(df_results[column].tolist() for column in param_columns))
]
df_results = df_results.drop(columns=param_columns)

# Best score first; the rank is computed within each evaluation file.
df_results = df_results.sort_values(by='silhouette_score', ascending=False)
df_results['rank'] = df_results.groupby('file')['silhouette_score'].rank(method='first', ascending=False).astype(int)

# One string per file listing its labels from best to worst.
grouped_combinations = df_results.groupby(['file']).apply(concat_parameters)

output_file = "FCM+parameterTuning.xlsx"
df_results.to_excel(output_file, index=False)

# Write one "A<i> : <labels>" line per file; this is the vote list that
# KemenyRanking parses.
with open('tuning_list_Kemeny_FCM.txt', 'w') as f:
    for i, group_string in enumerate(grouped_combinations, start=1):
        f.write(f'A{i} : {group_string}\n')


In [None]:
# Read the vote list from the working directory, which is where the tuning
# cell writes it; the hard-coded /content/ prefix only resolved when the
# working directory happened to be /content.
KemenyRanking('tuning_list_Kemeny_FCM.txt')

In [None]:
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
#Tuning parameters for GMM
#-------------------------------------------------------------------------------

!pip install scikit-learn-extra
!pip install fuzzy-c-means


import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score

# processing_procedure gap-fills each workbook and returns the paths of the
# "*_processed" copies it writes.
files = ["test1.xlsx", "test2.xlsx", "test3.xlsx", "test4.xlsx"]
files_p = processing_procedure(files)

# Columns to be excluded
columns_to_drop = ['Depth', 'X Position', 'Y Position', 'Z Position', 'Load', 'Stiffness']

eval_files, tuning_files = [], []

# Column used to split each workbook, keyed by its index in `files` (test1..test4).
position_mapping = dict(enumerate(['X Position', 'Y Position', 'X Position', 'Y Position']))

for idx, file in enumerate(files_p):
    df = pd.read_excel(file, engine='openpyxl')
    position = position_mapping[idx]

    # Rows strictly below 125 on the split column form the "eval" part, rows
    # strictly above form the "tuning" part; rows equal to 125 land in neither.
    eval_data = df.loc[df[position] < 125]
    tuning_data = df.loc[df[position] > 125]

    eval_filename = f'test_{idx+1}_eval.xlsx'
    tuning_filename = f'test_{idx+1}_tuning.xlsx'
    eval_data.to_excel(eval_filename, index=False)
    tuning_data.to_excel(tuning_filename, index=False)
    eval_files.append(eval_filename)
    tuning_files.append(tuning_filename)

# GMM parameters
param_grid_GMM = {
    'n_components': [3],
    'covariance_type': ['full', 'tied', 'diag', 'spherical'],
    'max_iter': [100, 200, 300],
    'random_state': [0, 42]
}

# Every parameter combination, in the same order nested loops would visit them.
gmm_grid = [
    (nc, cov, iters, seed)
    for nc in param_grid_GMM['n_components']
    for cov in param_grid_GMM['covariance_type']
    for iters in param_grid_GMM['max_iter']
    for seed in param_grid_GMM['random_state']
]

results = []
max_silhouette = -1
best_params = {}

for eval_filename in eval_files:
    eval_df = pd.read_excel(eval_filename, engine='openpyxl').drop(columns=columns_to_drop, errors='ignore')
    scaled_eval = StandardScaler().fit_transform(eval_df)

    for n_components, covariance_type, max_iter, random_state in gmm_grid:
        # Fit one mixture model and score its hard labels on the scaled data.
        model = GaussianMixture(
            n_components=n_components,
            covariance_type=covariance_type,
            max_iter=max_iter,
            random_state=random_state,
        )
        labels = model.fit_predict(scaled_eval)
        score = silhouette_score(scaled_eval, labels)

        results.append(dict(
            file=eval_filename,
            n_components=n_components,
            covariance_type=covariance_type,
            max_iter=max_iter,
            random_state=random_state,
            silhouette_score=score,
        ))

df_results = pd.DataFrame(results)

def concat_parameters(group):
    """Space-join every string in the group's 'parameters_combinations' column."""
    return ' '.join(group['parameters_combinations'].to_list())

# Fold the four GMM parameter columns that follow 'file' into one quoted,
# comma-separated string per run, then drop the separate columns.
param_cols = df_results.columns[1:5]
df_results['parameters_combinations'] = df_results[param_cols].apply(
    lambda row: '"{}"'.format(','.join(row.dropna().astype(str))), axis=1)
df_results = df_results.drop(columns=param_cols)

# Best silhouette first; 'rank' restarts at 1 inside every file.
df_results = df_results.sort_values(by='silhouette_score', ascending=False)
per_file_scores = df_results.groupby('file')['silhouette_score']
df_results['rank'] = per_file_scores.rank(method='first', ascending=False).astype(int)

# One space-separated, best-first list of parameter strings per file.
grouped_combinations = df_results.groupby(['file']).apply(concat_parameters)

output_file = "GMM+parameterTuning.xlsx"
df_results.to_excel(output_file, index=False)


# Write one "A<k> : ..." line per file, k counting from 1.
with open('tuning_list_Kemeny_GMM.txt', 'w') as f:
    for i, (_, group_string) in enumerate(grouped_combinations.items(), start=1):
        f.write(f'A{i} : {group_string}\n')


In [None]:
# Pass the GMM tuning list to KemenyRanking (defined outside this section of the
# notebook). NOTE(review): the list is written to the working directory by the
# previous cell; the absolute /content/ path presumably relies on running in Colab.
KemenyRanking('/content/tuning_list_Kemeny_GMM.txt')

In [None]:
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
#Tuning parameters for Kmedoids
#-------------------------------------------------------------------------------



import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn_extra.cluster import KMedoids
from sklearn.metrics import silhouette_score

# processing_procedure gap-fills each workbook and returns the paths of the
# "*_processed" copies it writes.
files = ["test1.xlsx", "test2.xlsx", "test3.xlsx", "test4.xlsx"]
files_p = processing_procedure(files)

# Columns to be excluded
columns_to_drop = ['Depth', 'X Position', 'Y Position', 'Z Position', 'Load', 'Stiffness','label']

eval_files, tuning_files = [], []

# Column used to split each workbook, keyed by its index in `files` (test1..test4).
position_mapping = dict(enumerate(['X Position', 'Y Position', 'X Position', 'Y Position']))

for idx, file in enumerate(files_p):
    df = pd.read_excel(file, engine='openpyxl')
    position = position_mapping[idx]

    # Rows strictly below 125 on the split column form the "eval" part, rows
    # strictly above form the "tuning" part; rows equal to 125 land in neither.
    eval_data = df.loc[df[position] < 125]
    tuning_data = df.loc[df[position] > 125]

    eval_filename = f'test_{idx+1}_eval.xlsx'
    tuning_filename = f'test_{idx+1}_tuning.xlsx'
    eval_data.to_excel(eval_filename, index=False)
    tuning_data.to_excel(tuning_filename, index=False)
    eval_files.append(eval_filename)
    tuning_files.append(tuning_filename)

# KMedoids parameters
param_grid_KMedoids = {
    'n_clusters': [3],
    'init': ['random'],
    'max_iter': [100, 200],
    'random_state': [0, 42 ]
}

# Every parameter combination, in the same order nested loops would visit them.
kmedoids_grid = [
    (k, start, iters, seed)
    for k in param_grid_KMedoids['n_clusters']
    for start in param_grid_KMedoids['init']
    for iters in param_grid_KMedoids['max_iter']
    for seed in param_grid_KMedoids['random_state']
]

results = []
max_silhouette = -1
best_params = {}

for eval_filename in eval_files:
    eval_df = pd.read_excel(eval_filename, engine='openpyxl').drop(columns=columns_to_drop, errors='ignore')
    scaled_eval = StandardScaler().fit_transform(eval_df)

    for n_clusters, init, max_iter, random_state in kmedoids_grid:
        # Fit one KMedoids model and score its labels on the scaled data.
        model = KMedoids(
            n_clusters=n_clusters,
            init=init,
            max_iter=max_iter,
            random_state=random_state,
        )
        model.fit(scaled_eval)
        labels = model.labels_
        score = silhouette_score(scaled_eval, labels)

        results.append(dict(
            file=eval_filename,
            n_clusters=n_clusters,
            init=init,
            max_iter=max_iter,
            random_state=random_state,
            silhouette_score=score,
        ))

df_results = pd.DataFrame(results)

def concat_parameters(group):
    """Concatenate the group's 'parameters_combinations' strings with a space between each."""
    separator = ' '
    return separator.join(list(group['parameters_combinations']))

# Fold the four KMedoids parameter columns that follow 'file' into one quoted,
# comma-separated string per run, then drop the separate columns.
param_cols = df_results.columns[1:5]
df_results['parameters_combinations'] = df_results[param_cols].apply(
    lambda row: '"{}"'.format(','.join(row.dropna().astype(str))), axis=1)
df_results = df_results.drop(columns=param_cols)

# Best silhouette first; 'rank' restarts at 1 inside every file.
df_results = df_results.sort_values(by='silhouette_score', ascending=False)
per_file_scores = df_results.groupby('file')['silhouette_score']
df_results['rank'] = per_file_scores.rank(method='first', ascending=False).astype(int)

# One space-separated, best-first list of parameter strings per file.
grouped_combinations = df_results.groupby(['file']).apply(concat_parameters)

output_file = "Kmedoids+parameterTuning.xlsx"
df_results.to_excel(output_file, index=False)


# Write one "A<k> : ..." line per file, k counting from 1.
with open('tuning_list_Kemeny_Kmedoids.txt', 'w') as f:
    for i, (_, group_string) in enumerate(grouped_combinations.items(), start=1):
        f.write(f'A{i} : {group_string}\n')

In [None]:
# Pass the KMedoids tuning list to KemenyRanking (defined outside this section of
# the notebook). NOTE(review): the list is written to the working directory by the
# previous cell; the absolute /content/ path presumably relies on running in Colab.
KemenyRanking('/content/tuning_list_Kemeny_Kmedoids.txt')

In [None]:
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
#Tuning parameters for OPTICS
#-------------------------------------------------------------------------------
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import OPTICS
from sklearn.metrics import silhouette_score

# processing_procedure gap-fills each workbook and returns the paths of the
# "*_processed" copies it writes.
files = ["test1.xlsx", "test2.xlsx", "test3.xlsx", "test4.xlsx"]
files_p = processing_procedure(files)

# Columns to be excluded
columns_to_drop = ['Depth', 'X Position', 'Y Position', 'Z Position', 'Load', 'Stiffness']

eval_files = []
tuning_files = []

# Column used to split each workbook, keyed by its index in `files`.
position_mapping = {
    0: 'X Position',  # test1
    1: 'Y Position',  # test2
    2: 'X Position',  # test3
    3: 'Y Position'   # test4
}

for idx, file in enumerate(files_p):
    df = pd.read_excel(file, engine='openpyxl')
    position = position_mapping[idx]

    # Rows strictly below 125 on the split column form the "eval" part.
    eval_data = df[df[position] < 125]
    eval_filename = f'test_{idx+1}_eval.xlsx'
    eval_data.to_excel(eval_filename, index=False)
    eval_files.append(eval_filename)

    # Rows strictly above 125 form the "tuning" part. This slice must be bound
    # to tuning_data: binding it to eval_data (as before) made the write below
    # use a stale tuning_data left over from another cell, or fail with a
    # NameError in a fresh kernel.
    tuning_data = df[df[position] > 125]
    tuning_filename = f'test_{idx+1}_tuning.xlsx'
    tuning_data.to_excel(tuning_filename, index=False)
    tuning_files.append(tuning_filename)

# OPTICS parameters
param_grid_OPTICS = {
    'min_samples': [3 ,5, 10, 15],
    'xi': [0.01,0.03,0.05, 0.1, 0.2]
}

results = []
max_silhouette = -1
best_params = {}

for eval_filename in eval_files:
    eval_df = pd.read_excel(eval_filename, engine='openpyxl')
    eval_df = eval_df.drop(columns=columns_to_drop, errors='ignore')
    scaled_eval = StandardScaler().fit_transform(eval_df)

    for min_samples in param_grid_OPTICS['min_samples']:
        for xi in param_grid_OPTICS['xi']:
            # OPTICS clustering
            model = OPTICS(min_samples=min_samples, xi=xi)
            model.fit(scaled_eval)
            labels = model.labels_

            # Skip runs with at most one distinct label, or with exactly two
            # labels one of which is -1 (scikit-learn's noise label): there is
            # no real partition to score in either case.
            distinct_labels = set(labels)
            if len(distinct_labels) <= 1 or (len(distinct_labels) == 2 and -1 in distinct_labels):
                continue

            score = silhouette_score(scaled_eval, labels)

            # Append results to the list
            results.append({
                'file': eval_filename,
                'min_samples': min_samples,
                'xi': xi,
                'silhouette_score': score
            })

df_results = pd.DataFrame(results)

def concat_parameters(group):
    """Build one space-separated string from the group's 'parameters_combinations' values."""
    values = group['parameters_combinations'].tolist()
    return str.join(' ', values)

# Fold the two OPTICS parameter columns that follow 'file' (min_samples, xi)
# into one quoted, comma-separated string per run, then drop the separate columns.
param_cols = df_results.columns[1:3]
df_results['parameters_combinations'] = df_results[param_cols].apply(
    lambda row: '"{}"'.format(','.join(row.dropna().astype(str))), axis=1)
df_results = df_results.drop(columns=param_cols)

# Best silhouette first; 'rank' restarts at 1 inside every file.
df_results = df_results.sort_values(by='silhouette_score', ascending=False)
per_file_scores = df_results.groupby('file')['silhouette_score']
df_results['rank'] = per_file_scores.rank(method='first', ascending=False).astype(int)

# One space-separated, best-first list of parameter strings per file.
grouped_combinations = df_results.groupby(['file']).apply(concat_parameters)

output_file = "OPTICS+parameterTuning.xlsx"
df_results.to_excel(output_file, index=False)


# Write one "A<k> : ..." line per file, k counting from 1.
with open('tuning_list_Kemeny_OPTICS.txt', 'w') as f:
    for i, (_, group_string) in enumerate(grouped_combinations.items(), start=1):
        f.write(f'A{i} : {group_string}\n')


In [None]:
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
#Tuning parameters for BIRCH
#-------------------------------------------------------------------------------

import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import Birch
from sklearn.metrics import silhouette_score

# processing_procedure gap-fills each workbook and returns the paths of the
# "*_processed" copies it writes.
files = ["test1.xlsx", "test2.xlsx", "test3.xlsx", "test4.xlsx"]
files_p = processing_procedure(files)

# Columns to be excluded
columns_to_drop = ['Depth', 'X Position', 'Y Position', 'Z Position', 'Load', 'Stiffness']

eval_files = []
tuning_files = []

# Column used to split each workbook, keyed by its index in `files`.
position_mapping = {
    0: 'X Position',  # test1
    1: 'Y Position',  # test2
    2: 'X Position',  # test3
    3: 'Y Position'   # test4
}

for idx, file in enumerate(files_p):
    df = pd.read_excel(file, engine='openpyxl')
    position = position_mapping[idx]

    # Rows strictly below 125 on the split column form the "eval" part.
    eval_data = df[df[position] < 125]
    eval_filename = f'test_{idx+1}_eval.xlsx'
    eval_data.to_excel(eval_filename, index=False)
    eval_files.append(eval_filename)

    # Rows strictly above 125 form the "tuning" part. This slice must be bound
    # to tuning_data: binding it to eval_data (as before) made the write below
    # use a stale tuning_data left over from another cell, or fail with a
    # NameError in a fresh kernel.
    tuning_data = df[df[position] > 125]
    tuning_filename = f'test_{idx+1}_tuning.xlsx'
    tuning_data.to_excel(tuning_filename, index=False)
    tuning_files.append(tuning_filename)

# BIRCH parameters
param_grid_BIRCH = {
    'n_clusters': [3],
    'threshold': [0.1, 0.2, 0.3],
    'branching_factor': [50, 100, 200]
}

results = []
max_silhouette = -1
best_params = {}

for eval_filename in eval_files:
    eval_df = pd.read_excel(eval_filename, engine='openpyxl')
    eval_df = eval_df.drop(columns=columns_to_drop, errors='ignore')
    scaled_eval = StandardScaler().fit_transform(eval_df)

    for n_clusters in param_grid_BIRCH['n_clusters']:
        for threshold in param_grid_BIRCH['threshold']:
            for branching_factor in param_grid_BIRCH['branching_factor']:
                # BIRCH clustering
                model = Birch(n_clusters=n_clusters, threshold=threshold, branching_factor=branching_factor)
                model.fit(scaled_eval)
                labels = model.labels_

                score = silhouette_score(scaled_eval, labels)

                # Append results to the list
                results.append({
                    'file': eval_filename,
                    'n_clusters': n_clusters,
                    'threshold': threshold,
                    'branching_factor': branching_factor,
                    'silhouette_score': score
                })

df_results = pd.DataFrame(results)

def concat_parameters(group):
    """Return all 'parameters_combinations' strings of the group as one space-delimited string."""
    pieces = group['parameters_combinations'].tolist()
    return ' '.join(pieces)

# BIRCH is tuned over three parameters. The previous positional slice [1:3]
# covered only n_clusters and threshold, so branching_factor was missing from
# the combination string and every file listed the same candidate several
# times. Selecting the columns by name keeps all three.
param_cols = ['n_clusters', 'threshold', 'branching_factor']
df_results['parameters_combinations'] = df_results[param_cols].apply(
    lambda row: '"' + ','.join(row.dropna().astype(str)) + '"', axis=1)
df_results = df_results.drop(columns=param_cols)

# Best silhouette first; 'rank' restarts at 1 inside every file.
df_results = df_results.sort_values(by='silhouette_score', ascending=False)
df_results['rank'] = df_results.groupby('file')['silhouette_score'].rank(method='first', ascending=False).astype(int)

# One space-separated, best-first list of parameter strings per file.
grouped_combinations = df_results.groupby(['file']).apply(concat_parameters)

output_file = "BIRCH+parameterTuning.xlsx"
df_results.to_excel(output_file, index=False)


# Write one "A<k> : ..." line per file, k counting from 1.
with open('tuning_list_Kemeny_BIRCH.txt', 'w') as f:
    for i, (_, group_string) in enumerate(grouped_combinations.items(), start=1):
        f.write(f'A{i} : {group_string}\n')


In [None]:
import pandas as pd

# Load the tuning workbook and render it as a LaTeX table without the row index.
# NOTE(review): no cell in this part of the notebook writes
# 'parameter-tuning.xlsx'; confirm where it comes from.
df = pd.read_excel('parameter-tuning.xlsx')
latex_code = df.to_latex(index=False)

# Save the table source, then echo the name of the file that was written.
with open('output.tex', 'w') as f:
    print(latex_code, end='', file=f)

print('output.tex')

Generating HMresults.xlsx (in 4 steps)

In [None]:
#-------------------------------------------------------------------------------
# ** generating HM-results (1) **
#-------------------------------------------------------------------------------


!pip install scikit-learn-extra
!pip install fuzzy-c-means
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, OPTICS, Birch
from sklearn_extra.cluster import KMedoids
from sklearn.mixture import GaussianMixture
from fcmeans import FCM
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score


# processing_procedure gap-fills each workbook and returns the paths of the
# "*_processed" copies it writes.
files = ["test1.xlsx", "test2.xlsx", "test3.xlsx", "test4.xlsx"]
files_p = processing_procedure(files)

# EXCLUDE *******************************************************
# This is the combined (HM) run, so neither HARDNESS nor MODULUS is dropped.
# The list previously contained 'MODULUS', which made this cell repeat the
# hardness-only experiment of the H-results cell.
columns_to_drop = ['Depth', 'X Position', 'Y Position', 'Z Position', 'Load', 'Stiffness']
datasets = [pd.read_excel(file, engine='openpyxl').drop(columns_to_drop, axis=1, errors='ignore') for file in files_p]
scaled_datasets = [StandardScaler().fit_transform(df) for df in datasets]

clustering_algorithms = {
    'KMeans': KMeans(n_clusters=3, random_state=0, init='k-means++', n_init=10, max_iter=100),
    'DBSCAN': DBSCAN(eps=0.1, min_samples=5),
    # Ward linkage only supports Euclidean distances, which is the default.
    # The explicit `affinity=` keyword was removed in scikit-learn 1.4, so it
    # is omitted to run on both old and new releases.
    'Agglomerative': AgglomerativeClustering(n_clusters=3, linkage='ward'),
    'OPTICS': OPTICS(min_samples=5, xi=0.05),
    'KMedoids': KMedoids(n_clusters=3, random_state=0, init='random', max_iter=100),
    # Seeded like KMeans/KMedoids so that re-running reproduces the table.
    'GMM': GaussianMixture(n_components=3, max_iter=100, covariance_type='full', random_state=0),
    'BIRCH': Birch(n_clusters=3, branching_factor=50, threshold=0.1),
    'FCM': FCM(n_clusters=3, m=2)
}

# Internal validity indices, each called as metric(data, labels).
metrics = {
    'Silhouette': silhouette_score,
    'Calinski-Harabasz': calinski_harabasz_score,
    'Davies-Bouldin': davies_bouldin_score
}

results = []

for index, scaled_data in enumerate(scaled_datasets):
    dataset_label = 'D' + str(index + 1)
    for algorithm_name, algorithm in clustering_algorithms.items():

        if algorithm_name == 'FCM':
            # Hard label = index of the largest entry in each row of `u`
            # (presumably the fuzzy membership matrix -- confirm against fcmeans).
            algorithm.fit(scaled_data)
            cluster_labels = algorithm.u.argmax(axis=1)
        else:
            cluster_labels = algorithm.fit_predict(scaled_data)

        # The metrics need at least two distinct labels; report the skip so a
        # missing cell in the result table is explainable.
        if len(set(cluster_labels)) <= 1:
            print(f"Skipping {algorithm_name} on {dataset_label}: fewer than two distinct labels")
            continue

        for metric_name, metric_func in metrics.items():
            score = metric_func(scaled_data, cluster_labels)
            results.append({'Metric': metric_name,
                            'Dataset': dataset_label,
                            'Method': algorithm_name,
                            'Score': score})

# Pivot to rows = (Metric, Dataset), columns = Method.
results_df = pd.DataFrame(results)
grouped = results_df.groupby(['Metric', 'Dataset', 'Method']).Score.mean().unstack()


if os.path.exists("HMresults.xlsx"):
    os.remove("HMresults.xlsx")

grouped.to_excel("HMresults.xlsx")
print(grouped.to_latex())







In [None]:
#-------------------------------------------------------------------------------
# ** generating HM-results (2) **
#-------------------------------------------------------------------------------



import pandas as pd
import numpy as np


# Step 2: copy HMresults.xlsx row by row and insert an 'Avg' row after every
# 'D4' row, holding the column-wise mean of the rows ending at that D4 row.
df = pd.read_excel("HMresults.xlsx", engine='openpyxl')

# Every column except the two label columns holds per-method scores.
metric_columns = df.columns.difference(['Metric', 'Dataset'])

# Rows are collected as dicts and turned into a frame once at the end;
# DataFrame.append, used previously, was removed in pandas 2.0.
records = []

for idx, row in df.iterrows():
    records.append(row.to_dict())

    if row['Dataset'] == 'D4':
        # idx is positional here because the frame was read with its default
        # RangeIndex; clamp the window start so it never wraps around.
        window_start = max(idx - 3, 0)
        last_4_rows = df.iloc[window_start:idx + 1][metric_columns]
        avg_values = last_4_rows.mean()

        # Start from the D4 row so the label columns carry over, then
        # overwrite the scores with the averages.
        avg_row_data = row.to_dict()
        avg_row_data.update(avg_values.to_dict())
        avg_row_data['Dataset'] = 'Avg'

        records.append(avg_row_data)

updated_rows = pd.DataFrame(records, columns=df.columns)

df_updated = updated_rows.reset_index(drop=True)
df_updated.to_excel("HMresults2.xlsx", index=False)


print(df_updated)

In [None]:

#-------------------------------------------------------------------------------
# ** generating HM-results (3) **
#-------------------------------------------------------------------------------


import pandas as pd


# Step 3: read the step-2 workbook with (metric, dataset) as a two-level index.
df = pd.read_excel("HMresults2.xlsx", engine='openpyxl', index_col=[0, 1])

# Per-metric mean over all rows of that metric, stored in the metric's 'Avg' row.
averages = df.groupby(level=0).mean()
for metric in averages.index:
    df.loc[(metric, 'Avg'), :] = averages.loc[metric]

# Rank the methods on each metric's 'Avg' row. Higher is better for Silhouette
# and Calinski-Harabasz; for any other metric (Davies-Bouldin) lower is better.
metrics = df.index.get_level_values(0).unique()
for metric in metrics:
    higher_is_better = metric in ["Silhouette", "Calinski-Harabasz"]
    rank = df.loc[(metric, 'Avg')].rank(ascending=not higher_is_better).astype(int)
    df.loc[(metric, 'Rank'), :] = rank

# Reorder rows: metrics in order of first appearance, datasets in `order`.
order = ['D1', 'D2', 'D3', 'D4', 'Avg', 'Rank']
metric_names = metrics.tolist()
sorted_tuples = sorted(
    df.index,
    key=lambda pair: (metric_names.index(pair[0]), order.index(pair[1])),
)
df = df.reindex(sorted_tuples)

# Overwrites the workbook produced by step 1 with the averaged and ranked table.
df.to_excel("HMresults.xlsx")
print(df.to_latex())


print(df)

In [None]:
#-------------------------------------------------------------------------------
# ** generating HM-results (4) **
#-------------------------------------------------------------------------------

import pandas as pd
import matplotlib.pyplot as plt

# Sample Data loading
# Step 4: load the ranked table and keep only the 'Rank' row of each metric.
df = pd.read_excel("HMresults.xlsx", engine='openpyxl', index_col=[0, 1])
rankings = df.xs('Rank', level=1)

# Bar colour per metric.
colors = {
    'Silhouette': 'green',
    'Davies-Bouldin': 'blue',
    'Calinski-Harabasz': 'orange'
}

# Fixed left-to-right order of the bars.
algorithms_order = ['Agglomerative', 'BIRCH', 'DBSCAN', 'FCM', 'GMM', 'KMeans', 'KMedoids', 'OPTICS']

# One subplot per metric, side by side.
fig, axes = plt.subplots(nrows=1, ncols=len(rankings), figsize=(15, 5))


plt.rcParams.update({'font.size': 12, 'font.weight': 'bold'})

for ax, (metric, data) in zip(axes, rankings.iterrows()):
    # Bar height is 9 - rank, so rank 1 gives the tallest bar (8) and rank 8 the shortest (1).
    visual_rank = 9 - data.reindex(algorithms_order)
    bars = ax.bar(visual_rank.index, visual_rank, color=colors[metric], width=0.5)

    highest_rank_bar = data.idxmin()

    # Outline the first rank-1 bar (height 8) with a red rectangle.
    for bar in bars:
        if bar.get_height() != 8:
            continue
        rect = plt.Rectangle(
            (bar.get_x() - 0.1, 0),
            bar.get_width() + 0.2,
            bar.get_height(),
            fill=False, edgecolor='red', linewidth=1.5,
        )
        ax.add_patch(rect)
        break

    ax.set_title(metric, fontweight='bold')
    ax.set_ylabel('Rank', fontsize=14, fontweight='bold')
    ax.set_ylim(0, 9)
    ax.set_yticks(range(1, 9))
    # Tick at height h is labelled with rank 9 - h.
    ax.set_yticklabels([str(r) for r in range(8, 0, -1)])
    ax.set_xticklabels(visual_rank.index, rotation=45, ha='right', fontweight='bold')

plt.tight_layout()

# Save at 300 dpi, then display.
plt.savefig('HMresults__evauation3.png', dpi=300)

plt.show()

***Hresults.xlsx (in 4 steps)***

In [None]:
#-------------------------------------------------------------------------------
# ** generating H-results (1) **
#-------------------------------------------------------------------------------


!pip install scikit-learn-extra
!pip install fuzzy-c-means
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, OPTICS, Birch
from sklearn_extra.cluster import KMedoids
from sklearn.mixture import GaussianMixture
from fcmeans import FCM
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score


# processing_procedure gap-fills each workbook and returns the paths of the
# "*_processed" copies it writes.
files = ["test1.xlsx", "test2.xlsx", "test3.xlsx", "test4.xlsx"]
files_p = processing_procedure(files)

# EXCLUDE *******************************************************
# Hardness-only run: 'MODULUS' is dropped together with the columns that are
# never used for clustering.
columns_to_drop = ['Depth', 'X Position', 'Y Position', 'Z Position', 'Load', 'Stiffness', 'MODULUS']
datasets = [pd.read_excel(file, engine='openpyxl').drop(columns_to_drop, axis=1, errors='ignore') for file in files_p]
scaled_datasets = [StandardScaler().fit_transform(df) for df in datasets]

clustering_algorithms = {
    'KMeans': KMeans(n_clusters=3, random_state=0, init='k-means++', n_init=10, max_iter=100),
    'DBSCAN': DBSCAN(eps=0.1, min_samples=5),
    # Ward linkage only supports Euclidean distances, which is the default.
    # The explicit `affinity=` keyword was removed in scikit-learn 1.4, so it
    # is omitted to run on both old and new releases.
    'Agglomerative': AgglomerativeClustering(n_clusters=3, linkage='ward'),
    'OPTICS': OPTICS(min_samples=5, xi=0.05),
    'KMedoids': KMedoids(n_clusters=3, random_state=0, init='random', max_iter=100),
    # Seeded like KMeans/KMedoids so that re-running reproduces the table.
    'GMM': GaussianMixture(n_components=3, max_iter=100, covariance_type='full', random_state=0),
    'BIRCH': Birch(n_clusters=3, branching_factor=50, threshold=0.1),
    'FCM': FCM(n_clusters=3, m=2)
}

# Internal validity indices, each called as metric(data, labels).
metrics = {
    'Silhouette': silhouette_score,
    'Calinski-Harabasz': calinski_harabasz_score,
    'Davies-Bouldin': davies_bouldin_score
}

results = []

for index, scaled_data in enumerate(scaled_datasets):
    dataset_label = 'D' + str(index + 1)
    for algorithm_name, algorithm in clustering_algorithms.items():

        if algorithm_name == 'FCM':
            # Hard label = index of the largest entry in each row of `u`
            # (presumably the fuzzy membership matrix -- confirm against fcmeans).
            algorithm.fit(scaled_data)
            cluster_labels = algorithm.u.argmax(axis=1)
        else:
            cluster_labels = algorithm.fit_predict(scaled_data)

        # The metrics need at least two distinct labels; report the skip so a
        # missing cell in the result table is explainable.
        if len(set(cluster_labels)) <= 1:
            print(f"Skipping {algorithm_name} on {dataset_label}: fewer than two distinct labels")
            continue

        for metric_name, metric_func in metrics.items():
            score = metric_func(scaled_data, cluster_labels)
            results.append({'Metric': metric_name,
                            'Dataset': dataset_label,
                            'Method': algorithm_name,
                            'Score': score})

# Pivot to rows = (Metric, Dataset), columns = Method.
results_df = pd.DataFrame(results)
grouped = results_df.groupby(['Metric', 'Dataset', 'Method']).Score.mean().unstack()


if os.path.exists("Hresults.xlsx"):
    os.remove("Hresults.xlsx")

grouped.to_excel("Hresults.xlsx")
print(grouped.to_latex())


In [None]:


#-------------------------------------------------------------------------------
# ** generating H-results (2) **
#-------------------------------------------------------------------------------



import pandas as pd
import numpy as np


# Step 2: copy Hresults.xlsx row by row and insert an 'Avg' row after every
# 'D4' row, holding the column-wise mean of the rows ending at that D4 row.
df = pd.read_excel("Hresults.xlsx", engine='openpyxl')

# Every column except the two label columns holds per-method scores.
metric_columns = df.columns.difference(['Metric', 'Dataset'])

# Rows are collected as dicts and turned into a frame once at the end;
# DataFrame.append, used previously, was removed in pandas 2.0.
records = []

for idx, row in df.iterrows():
    records.append(row.to_dict())

    if row['Dataset'] == 'D4':
        # idx is positional here because the frame was read with its default
        # RangeIndex; clamp the window start so it never wraps around.
        window_start = max(idx - 3, 0)
        last_4_rows = df.iloc[window_start:idx + 1][metric_columns]
        avg_values = last_4_rows.mean()

        # Start from the D4 row so the label columns carry over, then
        # overwrite the scores with the averages.
        avg_row_data = row.to_dict()
        avg_row_data.update(avg_values.to_dict())
        avg_row_data['Dataset'] = 'Avg'

        records.append(avg_row_data)

updated_rows = pd.DataFrame(records, columns=df.columns)

df_updated = updated_rows.reset_index(drop=True)
df_updated.to_excel("Hresults2.xlsx", index=False)


print(df_updated)

In [None]:

#-------------------------------------------------------------------------------
# ** generating H-results (3) **
#-------------------------------------------------------------------------------



import pandas as pd


# Step 3: read the step-2 workbook with (metric, dataset) as a two-level index.
df = pd.read_excel("Hresults2.xlsx", engine='openpyxl', index_col=[0, 1])

# Per-metric mean over all rows of that metric, stored in the metric's 'Avg' row.
averages = df.groupby(level=0).mean()
for metric in averages.index:
    df.loc[(metric, 'Avg'), :] = averages.loc[metric]

# Rank the methods on each metric's 'Avg' row. Higher is better for Silhouette
# and Calinski-Harabasz; for any other metric (Davies-Bouldin) lower is better.
metrics = df.index.get_level_values(0).unique()
for metric in metrics:
    higher_is_better = metric in ["Silhouette", "Calinski-Harabasz"]
    rank = df.loc[(metric, 'Avg')].rank(ascending=not higher_is_better).astype(int)
    df.loc[(metric, 'Rank'), :] = rank

# Reorder rows: metrics in order of first appearance, datasets in `order`.
order = ['D1', 'D2', 'D3', 'D4', 'Avg', 'Rank']
metric_names = metrics.tolist()
sorted_tuples = sorted(
    df.index,
    key=lambda pair: (metric_names.index(pair[0]), order.index(pair[1])),
)
df = df.reindex(sorted_tuples)

# Overwrites the workbook produced by step 1 with the averaged and ranked table.
df.to_excel("Hresults.xlsx")
print(df.to_latex())


print(df)

In [None]:

#-------------------------------------------------------------------------------
# ** generating H-results (4) **
#-------------------------------------------------------------------------------



import pandas as pd
import matplotlib.pyplot as plt

# Sample Data loading
# Step 4: load the ranked table and keep only the 'Rank' row of each metric.
df = pd.read_excel("Hresults.xlsx", engine='openpyxl', index_col=[0, 1])
rankings = df.xs('Rank', level=1)

# Bar colour per metric.
colors = {
    'Silhouette': 'green',
    'Davies-Bouldin': 'blue',
    'Calinski-Harabasz': 'orange'
}

# Fixed left-to-right order of the bars.
algorithms_order = ['Agglomerative', 'BIRCH', 'DBSCAN', 'FCM', 'GMM', 'KMeans', 'KMedoids', 'OPTICS']

# One subplot per metric, side by side.
fig, axes = plt.subplots(nrows=1, ncols=len(rankings), figsize=(15, 5))


plt.rcParams.update({'font.size': 12, 'font.weight': 'bold'})

for ax, (metric, data) in zip(axes, rankings.iterrows()):
    # Bar height is 9 - rank, so rank 1 gives the tallest bar (8) and rank 8 the shortest (1).
    visual_rank = 9 - data.reindex(algorithms_order)
    bars = ax.bar(visual_rank.index, visual_rank, color=colors[metric], width=0.5)

    highest_rank_bar = data.idxmin()

    # Outline the first rank-1 bar (height 8) with a red rectangle.
    for bar in bars:
        if bar.get_height() != 8:
            continue
        rect = plt.Rectangle(
            (bar.get_x() - 0.1, 0),
            bar.get_width() + 0.2,
            bar.get_height(),
            fill=False, edgecolor='red', linewidth=1.5,
        )
        ax.add_patch(rect)
        break

    ax.set_title(metric, fontweight='bold')
    ax.set_ylabel('Rank', fontsize=14, fontweight='bold')
    ax.set_ylim(0, 9)
    ax.set_yticks(range(1, 9))
    # Tick at height h is labelled with rank 9 - h.
    ax.set_yticklabels([str(r) for r in range(8, 0, -1)])
    ax.set_xticklabels(visual_rank.index, rotation=45, ha='right', fontweight='bold')


plt.tight_layout()


# Save at 300 dpi, then display.
plt.savefig('Hresults__evauation3.png', dpi=300)
plt.show()

Mresults.xlsx (in 4 steps)

In [None]:
#-------------------------------------------------------------------------------
# ** generating M-results (1) **
#-------------------------------------------------------------------------------


!pip install scikit-learn-extra
!pip install fuzzy-c-means
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, OPTICS, Birch
from sklearn_extra.cluster import KMedoids
from sklearn.mixture import GaussianMixture
from fcmeans import FCM
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score


# processing_procedure gap-fills each workbook and returns the paths of the
# "*_processed" copies it writes.
files = ["test1.xlsx", "test2.xlsx", "test3.xlsx", "test4.xlsx"]
files_p = processing_procedure(files)

# EXCLUDE *******************************************************
# Modulus-only run: 'HARDNESS' is dropped together with the columns that are
# never used for clustering.
columns_to_drop = ['Depth', 'X Position', 'Y Position', 'Z Position', 'Load', 'Stiffness', 'HARDNESS']
datasets = [pd.read_excel(file, engine='openpyxl').drop(columns_to_drop, axis=1, errors='ignore') for file in files_p]
scaled_datasets = [StandardScaler().fit_transform(df) for df in datasets]

clustering_algorithms = {
    'KMeans': KMeans(n_clusters=3, random_state=0, init='k-means++', n_init=10, max_iter=100),
    'DBSCAN': DBSCAN(eps=0.1, min_samples=5),
    # Ward linkage only supports Euclidean distances, which is the default.
    # The explicit `affinity=` keyword was removed in scikit-learn 1.4, so it
    # is omitted to run on both old and new releases.
    'Agglomerative': AgglomerativeClustering(n_clusters=3, linkage='ward'),
    'OPTICS': OPTICS(min_samples=5, xi=0.05),
    'KMedoids': KMedoids(n_clusters=3, random_state=0, init='random', max_iter=100),
    # Seeded like KMeans/KMedoids so that re-running reproduces the table.
    'GMM': GaussianMixture(n_components=3, max_iter=100, covariance_type='full', random_state=0),
    'BIRCH': Birch(n_clusters=3, branching_factor=50, threshold=0.1),
    'FCM': FCM(n_clusters=3, m=2)
}

# Internal validity indices, each called as metric(data, labels).
metrics = {
    'Silhouette': silhouette_score,
    'Calinski-Harabasz': calinski_harabasz_score,
    'Davies-Bouldin': davies_bouldin_score
}

results = []

for index, scaled_data in enumerate(scaled_datasets):
    dataset_label = 'D' + str(index + 1)
    for algorithm_name, algorithm in clustering_algorithms.items():

        if algorithm_name == 'FCM':
            # Hard label = index of the largest entry in each row of `u`
            # (presumably the fuzzy membership matrix -- confirm against fcmeans).
            algorithm.fit(scaled_data)
            cluster_labels = algorithm.u.argmax(axis=1)
        else:
            cluster_labels = algorithm.fit_predict(scaled_data)

        # The metrics need at least two distinct labels; report the skip so a
        # missing cell in the result table is explainable.
        if len(set(cluster_labels)) <= 1:
            print(f"Skipping {algorithm_name} on {dataset_label}: fewer than two distinct labels")
            continue

        for metric_name, metric_func in metrics.items():
            score = metric_func(scaled_data, cluster_labels)
            results.append({'Metric': metric_name,
                            'Dataset': dataset_label,
                            'Method': algorithm_name,
                            'Score': score})

# Pivot to rows = (Metric, Dataset), columns = Method.
results_df = pd.DataFrame(results)
grouped = results_df.groupby(['Metric', 'Dataset', 'Method']).Score.mean().unstack()


if os.path.exists("Mresults.xlsx"):
    os.remove("Mresults.xlsx")

grouped.to_excel("Mresults.xlsx")
print(grouped.to_latex())

In [None]:
#-------------------------------------------------------------------------------
# ** generating M-results (2) **
#-------------------------------------------------------------------------------

import pandas as pd
import numpy as np


# Read the table written by the previous stage: one row per (Metric, Dataset),
# one column per clustering method.
df = pd.read_excel("Mresults.xlsx", engine='openpyxl')

# Every column except the two label columns holds scores to be averaged.
metric_columns = df.columns.difference(['Metric', 'Dataset'])

# Rows are collected in a plain list and turned into a DataFrame once at the
# end. DataFrame.append() was removed in pandas 2.0 and copied the whole frame
# on every call.
records = []

for position, (_, row) in enumerate(df.iterrows()):
    records.append(row.to_dict())

    # After each 'D4' row, add an 'Avg' row holding the column means of that
    # row and the (up to) three rows before it.
    if row['Dataset'] == 'D4':
        # Clamp the start so a 'D4' row near the top cannot produce a negative
        # slice start (which would silently select the wrong rows).
        window_start = max(position - 3, 0)
        last_4_rows = df.iloc[window_start:position + 1][metric_columns]

        avg_row_data = row.to_dict()
        avg_row_data.update(last_4_rows.mean().to_dict())
        avg_row_data['Dataset'] = 'Avg'
        records.append(avg_row_data)

# Keep the original column order of the input sheet.
df_updated = pd.DataFrame(records, columns=df.columns)
df_updated.to_excel("Mresults2.xlsx", index=False)


print(df_updated)

In [None]:
#-------------------------------------------------------------------------------
# ** generating M-results (3) **
#-------------------------------------------------------------------------------

import pandas as pd


# Load the stage-2 table with (Metric, Dataset) as a two-level row index.
df = pd.read_excel("Mresults2.xlsx", engine='openpyxl', index_col=[0, 1])

# Per-metric column means, stored under the 'Avg' label of each metric.
averages = df.groupby(level=0).mean()
for metric in averages.index:
    df.loc[(metric, 'Avg'), :] = averages.loc[metric]

# Rank the averaged scores. Silhouette and Calinski-Harabasz are ranked in
# descending order (higher is better); every other metric, i.e.
# Davies-Bouldin, is ranked in ascending order (lower is better).
metrics = df.index.get_level_values(0).unique()
higher_is_better = ["Silhouette", "Calinski-Harabasz"]
for metric in metrics:
    ascending = metric not in higher_is_better
    rank = df.loc[(metric, 'Avg')].rank(ascending=ascending).astype(int)
    df.loc[(metric, 'Rank'), :] = rank

# Order rows by metric (first-seen order), then D1..D4, Avg, Rank.
order = ['D1', 'D2', 'D3', 'D4', 'Avg', 'Rank']
metric_sequence = metrics.tolist()
sorted_tuples = sorted(
    df.index,
    key=lambda label: (metric_sequence.index(label[0]), order.index(label[1])),
)
df = df.reindex(sorted_tuples)

# Note: this overwrites the workbook produced by the first stage.
df.to_excel("Mresults.xlsx")
print(df.to_latex())


print(df)

In [None]:
#-------------------------------------------------------------------------------
# ** generating M-results (4) **
#-------------------------------------------------------------------------------

import pandas as pd
import matplotlib.pyplot as plt

# Load the ranked results table and keep only the 'Rank' row of each metric
# (one row per metric, one column per clustering method).
df = pd.read_excel("Mresults.xlsx", engine='openpyxl', index_col=[0, 1])
rankings = df.xs('Rank', level=1)

# Bar colour used for each metric's subplot.
colors = {
    'Silhouette': 'green',
    'Davies-Bouldin': 'blue',
    'Calinski-Harabasz': 'orange'
}

# Fixed left-to-right order of the bars in every subplot.
algorithms_order = ['Agglomerative', 'BIRCH', 'DBSCAN', 'FCM', 'GMM', 'KMeans', 'KMedoids', 'OPTICS']

# One subplot per metric, side by side.
fig, axes = plt.subplots(nrows=1, ncols=len(rankings), figsize=(15, 5))


# NOTE(review): rcParams are updated after the figure was created — confirm
# the font settings actually reach every element of this figure.
plt.rcParams.update({'font.size': 12, 'font.weight': 'bold'})

for ax, (metric, data) in zip(axes, rankings.iterrows()):
    # Re-order the rankings such that 8 is at the bottom and 1 at the top
    visual_rank = 9 - data.reindex(algorithms_order)  # 9 - rank to invert the bars
    bars = ax.bar(visual_rank.index, visual_rank, color=colors[metric], width=0.5)

    # NOTE(review): this value is not used anywhere in the visible code.
    highest_rank_bar = data.idxmin()

    # Highlight the highest rank bar with a red rectangle
    for bar in bars:
        if 9 - bar.get_height() == 1:  # find the rank 1 bar
            rect = plt.Rectangle((bar.get_x() - 0.1, 0), bar.get_width() + 0.2, bar.get_height(), fill=False, edgecolor='red', linewidth=1.5)
            ax.add_patch(rect)
            break

    ax.set_ylabel('Rank', fontsize=14, fontweight='bold')
    ax.set_ylim(0, 9)  # Set the y-axis limits
    ax.set_yticks(range(1, 9))
    # Bar height h stands for rank 9 - h, so the tick at height 1 reads '8'
    # and the tick at height 8 reads '1'.
    ax.set_yticklabels(['8', '7', '6', '5', '4', '3', '2', '1'])  # Explicitly setting the y-tick labels
    ax.set_title(metric, fontweight='bold')
    ax.set_xticklabels(visual_rank.index, rotation=45, ha='right', fontweight='bold')


plt.tight_layout()


# Save before show() so the written file contains the finished figure.
plt.savefig('Mresults__evauation3.png', dpi=300)


plt.show()

Kemeny Method for Aggregation

In [None]:
#-------------------------------------------------------------------------------
# --- Kemeny-Young Method -----
#-------------------------------------------------------------------------------




import itertools
import pandas as pd
import matplotlib.pyplot as plt

def kendall_tau_distance(rank_A, rank_B):
    """Calculate the number of pairwise disagreements between two rankings.

    Both arguments are index-aligned sequences of numeric ranks: position
    ``i`` in each sequence refers to the same item.

    Args:
        rank_A: Ranks assigned to the items by the first ranking.
        rank_B: Ranks assigned to the same items by the second ranking.

    Returns:
        The number of item pairs that the two rankings order in opposite
        directions (the Kendall tau distance). It is ``0`` for identical
        rankings and never negative. A pair tied in either ranking is not
        counted.
    """
    n = len(rank_A)
    discordant = 0

    for i in range(n):
        for j in range(i + 1, n):
            a = rank_A[i] - rank_A[j]
            b = rank_B[i] - rank_B[j]

            # Opposite signs mean the pair (i, j) is ordered differently.
            if a * b < 0:
                discordant += 1

    # Return the discordant count itself; subtracting the concordant count
    # would give a negative "distance" for rankings that agree.
    return discordant

def kemeny_young_rankings(rankings):
    """Return the Kemeny-Young consensus of several rankings by brute force.

    Every permutation of ``rankings[0]`` is scored by the sum of its
    ``kendall_tau_distance`` to each input ranking. The permutation with the
    smallest total is returned as a tuple; on a tie the permutation generated
    first wins.
    """
    def total_distance(candidate):
        # Summed distance from this candidate to every input ranking.
        return sum(
            kendall_tau_distance(list(candidate), list(rank))
            for rank in rankings
        )

    return min(itertools.permutations(rankings[0]), key=total_distance)

# Workbook holding one sheet of rankings per property.
xls = pd.ExcelFile('aggregate-ranking.xlsx')

# Titles mapping
title_map = {
    "HM-ranking": "(c)Hardness+Modulus",
    "H-ranking": "(a)Hardness",
    "M-ranking": "(b)Modulus"
}

# Desired order for plotting
sheet_order = ["H-ranking", "M-ranking", "HM-ranking"]

# Only consider the sheets in the desired order
sheet_names = [sheet for sheet in sheet_order if sheet in xls.sheet_names]

# Ordering of the algorithms
algorithm_order = ["Agglomerative", "BIRCH", "DBSCAN", "FCM", "GMM", "KMeans", "KMedoids", "OPTICS"]

# One subplot per sheet that was found in the workbook.
fig, axes = plt.subplots(nrows=1, ncols=len(sheet_names), figsize=(15, 5))
plt.rcParams.update({'font.size': 12, 'font.weight': 'bold'})


# The aggregate of every sheet is appended to the same workbook as a new
# '<sheet>_AggregateRank' sheet.
# NOTE(review): the workbook is opened for appending while `xls` above still
# refers to it, and re-running the cell writes sheet names that already
# exist — confirm both behave as intended on the target platform/pandas.
with pd.ExcelWriter('aggregate-ranking.xlsx', engine='openpyxl', mode='a') as writer:
    for ax, sheet_name in zip(axes, sheet_names):
        sheet_data = xls.parse(sheet_name)
        # NOTE(review): iloc[1:] drops the first parsed data row — presumably
        # a label row; verify against the sheet layout.
        rankings = sheet_data.iloc[1:].values.tolist()
        aggregate_ranking = kemeny_young_rankings(rankings)
        print(f'For {sheet_name}, the aggregated rank is {aggregate_ranking}')


        # Pairs the i-th aggregate value with the i-th name in algorithm_order
        # (assumes the sheet columns follow that order — TODO confirm).
        aggregate_df = pd.DataFrame({"Algorithm": algorithm_order, "Aggregate Rank": aggregate_ranking})
        aggregate_df.to_excel(writer, sheet_name=f"{sheet_name}_AggregateRank", index=False)


        # Bar height is 9 - rank, so rank 1 gives the tallest bar.
        visual_data = pd.Series(aggregate_ranking, index=algorithm_order)
        visual_rank = 9 - visual_data.reindex(algorithm_order)

        bars = ax.bar(visual_rank.index, visual_rank, color='blue', width=0.5)
        # Outline the rank-1 bar in red.
        for bar in bars:
            if 9 - bar.get_height() == 1:
                rect = plt.Rectangle((bar.get_x() - 0.1, 0), bar.get_width() + 0.2, bar.get_height(), fill=False, edgecolor='red', linewidth=1.5)
                ax.add_patch(rect)
                break

        ax.set_ylabel('Rank', fontsize=14, fontweight='bold')
        ax.set_ylim(0, 9)
        ax.set_yticks(range(1, 9))
        # Tick at height h is labelled with rank 9 - h.
        ax.set_yticklabels(['8', '7', '6', '5', '4', '3', '2', '1'])

        # Set the title using the title_map dictionary
        ax.set_title(title_map.get(sheet_name, sheet_name), fontweight='bold')

        ax.set_xticklabels(visual_rank.index, rotation=45, ha='right', fontweight='bold')

    plt.tight_layout()
    plt.savefig('aggregate_ranking_evaluation.png', dpi=300)
    plt.show()

# Re-open the workbook so the freshly written '*_AggregateRank' sheets are visible.
xls = pd.ExcelFile('aggregate-ranking.xlsx')


# Load the per-property aggregate ranks, one list per sheet.
# (This loop used to appear twice in a row; one pass is enough.)
aggregate_sheets = ['H-ranking_AggregateRank', 'M-ranking_AggregateRank', 'HM-ranking_AggregateRank']
aggregate_rankings = []

for sheet in aggregate_sheets:
    data = xls.parse(sheet)
    ranking = data['Aggregate Rank'].tolist()
    aggregate_rankings.append(ranking)

# Find the final order using the Kemeny-Young method
final_order = kemeny_young_rankings(aggregate_rankings)
print(f'The final aggregated rank is {final_order}')

# Visualize the final order
fig, ax = plt.subplots(figsize=(8, 5))
plt.rcParams.update({'font.size': 12, 'font.weight': 'bold'})

# Bar height is 9 - rank, so rank 1 gives the tallest bar.
visual_data = pd.Series(final_order, index=algorithm_order)
visual_rank = 9 - visual_data.reindex(algorithm_order)

bars = ax.bar(visual_rank.index, visual_rank, color='blue', width=0.5)
# Outline the rank-1 bar in red.
for bar in bars:
    if 9 - bar.get_height() == 1:
        rect = plt.Rectangle((bar.get_x() - 0.1, 0), bar.get_width() + 0.2, bar.get_height(), fill=False, edgecolor='red', linewidth=1.5)
        ax.add_patch(rect)
        break

ax.set_ylabel('Rank', fontsize=14, fontweight='bold')
ax.set_ylim(0, 9)
ax.set_yticks(range(1, 9))
# Tick at height h is labelled with rank 9 - h.
ax.set_yticklabels(['8', '7', '6', '5', '4', '3', '2', '1'])
ax.set_title('Final Ranking', fontweight='bold')
ax.set_xticklabels(visual_rank.index, rotation=45, ha='right', fontweight='bold')

plt.tight_layout()
plt.savefig('final_ranking_evaluation.png', dpi=300)
plt.show()


In [None]:
import pandas as pd
from itertools import permutations
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle

def kendall_tau_distance(rank_A, rank_B):
    """Count the index pairs that two rank vectors order in opposite directions.

    ``rank_A`` and ``rank_B`` are index-aligned sequences of numeric ranks.
    A pair that is tied in either vector does not count as a disagreement.
    """
    size = len(rank_A)
    return sum(
        1
        for first in range(size)
        for second in range(first + 1, size)
        if (rank_A[first] - rank_A[second]) * (rank_B[first] - rank_B[second]) < 0
    )

def kemeny_young(rankings):
    """Brute-force Kemeny-Young consensus over index-aligned rank vectors.

    Every permutation of ``range(len(rankings[0]))`` is treated as a candidate
    rank vector and scored by its summed ``kendall_tau_distance`` to the input
    rankings. The candidate with the smallest total is returned as a tuple;
    on a tie the first one generated wins.
    """
    item_count = len(rankings[0])

    def total_distance(candidate):
        # Summed distance from this candidate to every input ranking.
        return sum(kendall_tau_distance(candidate, rank) for rank in rankings)

    return min(permutations(range(item_count)), key=total_distance)

# Per-metric 'Rank' rows of the hardness results: one row per metric, one
# column per clustering method.
df = pd.read_excel("Hresults.xlsx", engine='openpyxl', index_col=[0, 1])
rankings_df = df.xs('Rank', level=1)
methods_order = ["Agglomerative", "BIRCH", "DBSCAN","GMM", "FCM", "KMeans", "KMedoids", "OPTICS" ]

# rankings[k][i] is the rank that metric k gives to methods_order[i].
rankings = [[row[method] for method in methods_order] for _, row in rankings_df.iterrows()]

# kemeny_young() compares its candidates position-by-position with these rank
# vectors, so its result is also a rank vector: entry i is the 0-based
# consensus rank of methods_order[i]. Sorting the methods by that value gives
# the best-to-worst ordering. (Indexing methods_order with the returned values
# would apply the inverse permutation and shuffle the methods.)
consensus_positions = kemeny_young(rankings)
aggregate_ranking = [method for _, method in sorted(zip(consensus_positions, methods_order))]

# 1-based rank per method, matching the 1..8 ranks used in the other tables.
rank_of = {method: place for place, method in enumerate(aggregate_ranking, start=1)}

# Bar height is (number of methods + 1) - rank, so rank 1 is the tallest bar.
n_methods = len(methods_order)
height_dict = {method: n_methods + 1 - rank_of[method] for method in methods_order}


bar_heights = [height_dict[method] for method in methods_order]

hardness_results_df = pd.DataFrame({
    'Clustering Methods': methods_order,
    'Rank': [rank_of[method] for method in methods_order]
})


hardness_results_df = hardness_results_df.sort_values(by='Rank')


hardness_results_df.to_excel("hardness_ranking_result.xlsx", index=False)


fig, ax = plt.subplots()
ax.bar(range(len(methods_order)), bar_heights, color='blue')
ax.set_xticks(range(len(methods_order)))
ax.set_xticklabels(methods_order, rotation=90)

# Dotted red outline around the tallest bar, i.e. the rank-1 method.
max_height_idx = bar_heights.index(max(bar_heights))
rect_width = 0.95
rect_x_start = max_height_idx - (rect_width / 2)
rect = Rectangle((rect_x_start, 0), rect_width, max(bar_heights), linewidth=1.5, edgecolor='r', facecolor='none', linestyle='dotted')
ax.add_patch(rect)


ax.set_xlabel('Clustering Methods')
ax.set_ylabel('Rank')
ax.set_title('(a) Hardness')
plt.show()

In [None]:
import pandas as pd

# Load the ranking results for hardness, modulus, and hm
hardness_df = pd.read_excel('hardness_rank_results.xlsx')
modulus_df = pd.read_excel('modulus_rank_results.xlsx')
hm_df = pd.read_excel('hm_rank_results.xlsx')

# Transpose each dataframe so that the clustering methods become columns
# (assumes each file has one row per clustering method — TODO confirm).
hardness_df = hardness_df.T
modulus_df = modulus_df.T
hm_df = hm_df.T

# Extract the ranks for each attribute into a new dataframe
# NOTE(review): each dict key is the first column label of a transposed frame,
# i.e. the first row label of the file as read. The files are read without an
# index column, so these labels look like they would all be the same value;
# the dict would then keep only the last entry. Verify against the files.
# NOTE(review): iloc[0] of a transposed frame is the first column of the file;
# confirm that column holds the ranks rather than the method names.
data = {
    hardness_df.columns[0]: hardness_df.iloc[0].values,
    modulus_df.columns[0]: modulus_df.iloc[0].values,
    hm_df.columns[0]: hm_df.iloc[0].values
}

combined_df = pd.DataFrame(data)

# Set the index to the names of the clustering methods
# NOTE(review): hardness_df.index holds the column headers of the hardness
# file; its length must equal the number of rows in combined_df — confirm.
combined_df.index = hardness_df.index

# Transpose the final dataframe so that clustering methods are column headers
combined_df = combined_df.transpose()

combined_df.to_excel('ready_for_ranking.xlsx')



In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def get_values_for_metric(filename, metric_row):
    """Return one row of scores from an Excel results file.

    The file is read with ``pd.read_excel``. The result is the array of values
    found at positional row ``metric_row`` in the columns at positions 2
    through 9.
    """
    return pd.read_excel(filename).iloc[metric_row, 2:10].values

# Result workbooks to compare, with the line colour and legend label of each.
files = ['Hresults.xlsx', 'HMresults.xlsx', 'Mresults.xlsx']
colors = ['blue', 'red', 'black']
label_mapper = {
    'Hresults': 'Hardness',
    'HMresults': 'Hardness + Modulus',
    'Mresults': 'Modulus'
}
# The legend label is looked up by the file name up to its first dot.
file_labels = []
for filename in files:
    file_labels.append(label_mapper[filename.split('.')[0]])

# The x-axis names come from the header of the first workbook
# (columns at positions 2 through 9).
data_sample = pd.read_excel(files[0])
cluster_names = data_sample.columns[2:10]

# Positional row in each workbook from which every metric's values are read.
metrics = {
    'Silhouette': 16,
    'Calinski-Harabasz': 4,
    'Davies-Bouldin': 10
}

fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(15, 6), constrained_layout=True)

# One subplot per metric, one line per workbook.
for ax, metric_name in zip(axs, metrics):
    metric_row = metrics[metric_name]

    for filename, color, label in zip(files, colors, file_labels):
        ax.plot(
            cluster_names,
            get_values_for_metric(filename, metric_row),
            marker='o',
            color=color,
            linewidth=2,
            label=label,
        )

    ax.set_title(metric_name)
    ax.set_xticklabels(cluster_names, rotation=45, fontsize=10, fontname='sans')
    ax.grid(True)

# Single shared legend, built from the first subplot's lines.
handles, labels = axs[0].get_legend_handles_labels()
fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, 1.2), ncol=3)

plt.show()

In [None]:
#---------------------------------
#----------------------------------
#----------------------------------


import matplotlib.pyplot as plt
from itertools import permutations

def kendall_tau_distance(rank_a, rank_b):
    """Count the item pairs that two orderings place in opposite order.

    ``rank_a`` and ``rank_b`` are sequences of items, best first. Each
    unordered pair of positions in ``rank_a`` is checked once; positions are
    looked up with ``.index``, so every item of ``rank_a`` must also occur in
    ``rank_b``.
    """
    inversions = 0
    for start, earlier in enumerate(rank_a, start=1):
        for later in rank_a[start:]:
            order_in_a = rank_a.index(earlier) - rank_a.index(later)
            order_in_b = rank_b.index(earlier) - rank_b.index(later)
            if order_in_a * order_in_b < 0:
                inversions += 1
    return inversions

def kemeny_young(rankings):
    """Compute the Kemeny-Young consensus ordering by exhaustive search.

    Every permutation of ``rankings[0]`` is scored by the sum of its
    ``kendall_tau_distance`` to each input ordering. The permutation with the
    smallest total is returned as a tuple; on a tie the first one generated
    wins.
    """
    def total_distance(candidate):
        # Summed distance from this candidate to every input ordering.
        return sum(kendall_tau_distance(candidate, rank) for rank in rankings)

    return min(permutations(rankings[0]), key=total_distance)

# Evaluation rankings for HARDNESS, one list per metric
# (Calinski-Harabasz / Davies-Bouldin / Silhouette), best method first.
rankings_Hardness = [
    ['KMeans', 'FCM', 'KMedoids' , 'Agglomerative', 'GMM', 'BIRCH' , 'DBSCAN' , 'OPTICS'],
    ['BIRCH', 'KMeans', 'Agglomerative', 'FCM' , 'KMedoids' , 'GMM', 'DBSCAN' , 'OPTICS'],
    ['DBSCAN', 'BIRCH' , 'KMeans' , 'FCM' , 'GMM' , 'Agglomerative' , 'KMedoids' , 'OPTICS']
]

print(" final ranking for HARDNESS is : ")
print(kemeny_young(rankings_Hardness))




# Evaluation rankings for MODULUS, one list per metric
# (Calinski-Harabasz / Davies-Bouldin / Silhouette), best method first.
rankings_Modulus = [
    ['KMeans', 'FCM', 'KMedoids' , 'Agglomerative', 'GMM', 'BIRCH' , 'DBSCAN' , 'OPTICS'],
    ['BIRCH', 'KMeans', 'Agglomerative', 'FCM' , 'KMedoids' , 'GMM', 'DBSCAN' , 'OPTICS'],
    ['DBSCAN', 'KMeans' , 'FCM' , 'BIRCH' ,  'GMM' , 'Agglomerative' , 'KMedoids' , 'OPTICS']
]

print(" finalranking for MODULUS is : ")
print(kemeny_young(rankings_Modulus))






# Evaluation rankings for HARDNESS + MODULUS (HM), one list per metric
# (Calinski-Harabasz / Davies-Bouldin / Silhouette), best method first.
rankings_HM = [
    ['KMeans', 'FCM', 'KMedoids' , 'Agglomerative', 'GMM', 'BIRCH' , 'DBSCAN' , 'OPTICS'],
    ['BIRCH', 'Agglomerative', 'KMeans','FCM' , 'KMedoids' , 'GMM', 'DBSCAN' , 'OPTICS'],
    ['GMM', 'KMeans' , 'Agglomerative' , 'BIRCH' ,  'FCM' , 'KMedoids' , 'DBSCAN', 'OPTICS']
]

print(" finalranking for HM is : ")
print(kemeny_young(rankings_HM))
import matplotlib.pyplot as plt

import matplotlib.pyplot as plt

def plot_ranking(rankings, titles):
    """Draw one bar chart per ranking, stacked vertically, and save the figure.

    Args:
        rankings: Iterable of orderings. Each ordering is a sequence holding
            every method name exactly as spelled in ``order`` below, best
            method first.
        titles: Iterable of subplot titles, paired with ``rankings`` in order.
            Surplus entries on either side are ignored.

    Raises:
        ValueError: If there is nothing to plot, or (from ``.index``) if a
            ranking lacks one of the method names.

    Side effects:
        Writes ``aggregate_results.png`` and shows the figure.
    """
    # Define the fixed order
    order = ['Agglomerative', 'BIRCH', 'DBSCAN', 'FCM', 'GMM', 'KMeans', 'KMedoids', 'OPTICS']
    n_methods = len(order)

    panels = list(zip(rankings, titles))
    if not panels:
        raise ValueError("plot_ranking() needs at least one ranking to plot")

    # One row per ranking instead of a fixed three, at the same 6x6 inches
    # each (three rankings still give the original 6x18 figure).
    # squeeze=False keeps `axes` two-dimensional even for a single panel.
    fig, axes = plt.subplots(len(panels), 1, figsize=(6, 6 * len(panels)), squeeze=False)

    for ax, (ranking, metric) in zip(axes[:, 0], panels):
        # Bar height is n_methods - position, so the best method (position 0)
        # gets the tallest bar.
        indices = [n_methods - ranking.index(method) for method in order]

        ax.bar(order, indices, color='blue')
        ax.set_title(metric, fontweight='bold')
        ax.set_xticks(order)  # Set x-ticks based on clustering methods
        ax.set_xticklabels(order, rotation=45, ha='right', fontweight='bold')
        ax.set_ylabel('Rank', fontsize=14, fontweight='bold')
        ax.set_ylim(0, n_methods + 1)  # Set the y-axis limits
        ax.set_yticks(range(1, n_methods + 1))
        # The tick at height h is labelled with rank n_methods + 1 - h.
        ax.set_yticklabels([str(rank) for rank in range(n_methods, 0, -1)])
        ax.grid(axis='y')

    plt.tight_layout()
    plt.savefig('aggregate_results.png', dpi=500)
    plt.show()

# Consensus ordering per property, in the same order as the titles below.
all_rankings = [
    kemeny_young(property_rankings)
    for property_rankings in (rankings_HM, rankings_Hardness, rankings_Modulus)
]

# Titles for each ranking plot
all_titles = ["(a)Ranking for Hardness & Modulus", "(b)Ranking for HARDNESS", "(c)Ranking for MODULUS", ]

# Plot the rankings
plot_ranking(all_rankings, all_titles)


# Step 3: Apply the Kemeny-Young method to the three consensus orderings to
# get the final consensus ranking.
final_ranking = kemeny_young(all_rankings)

print("Aggregated Kemeny-Young Ranking:")
print(final_ranking)
# NOTE(review): plot_single_ranking is not defined in this part of the file —
# confirm it exists elsewhere in the notebook before running this cell.
plot_single_ranking(final_ranking, "Aggregated Kemeny-Young Ranking")