In [8]:
#-------------------------------------------------------------------------------
#Method for Preprocessing datasetsand replacing nulls :
#-------------------------------------------------------------------------------

import pandas as pd
import os
import numpy as np
def processing_procedure(files):
    if not os.path.exists("precessed_datasets"):
        os.mkdir("precessed_datasets")
    processed_files = []
    for filename in files:
        processed_filename = "precessed_datasets/"+filename.split('.')[0] + '_processed.' + filename.split('.')[1]
        if os.path.exists(processed_filename):
            print(f"{processed_filename} already exists.")
        else :
            df = pd.read_excel("datasets/"+filename, engine='openpyxl')

            # Interpolate using the nearest method
            df.interpolate(method='nearest', inplace=True, limit_direction='both')
            
            # Fill remaining NaNs with the mean of the column
            df.fillna(df.mean(), inplace=True)

            

            

            # Save (or overwrite) the processed data into the new spreadsheet.
            df.to_excel(processed_filename, index=False)

        processed_files.append(processed_filename)

    return processed_files

def print_rankings(dist, aggr,cols):
    print("A Kemeny-Young aggregation with score {} is:\n {}".format(
    dist,
    "\n ".join(cols[i] for i in np.argsort(aggr))))
    
    
def get_array_rankings(aggr, cols):
    rankings = [cols[i] for i in np.argsort(aggr)]
    return rankings

In [2]:
import numpy as np
from itertools import combinations, permutations



def kendalltau_dist(rank_a, rank_b):
    tau = 0
    n_candidates = len(rank_a)
    for i, j in combinations(range(n_candidates), 2):
        tau += (np.sign(rank_a[i] - rank_a[j]) ==
                -np.sign(rank_b[i] - rank_b[j]))
    return tau


def rankaggr_brute(ranks):
    min_dist = np.inf
    best_rank = None
    n_voters, n_candidates = ranks.shape
    for candidate_rank in permutations(range(n_candidates)):
        distances=[kendalltau_dist(candidate_rank, rank) for rank in ranks]
        dist = np.sum(distances)
        if dist < min_dist:
            min_dist = dist
            best_rank = candidate_rank
    return min_dist, best_rank


def parse_rankings(file_path):
    rankings = []
    cols=[]
    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            if line:
                ranking_str = line.split(':')[1].strip()
                ranking_values = [col[1:-1] for col in ranking_str.split()]
                if cols==[]:
                    cols=ranking_values
                    rankings.append(list(range(1,len(ranking_values)+1)))
                else:
                    rankings.append([cols.index(temp) for temp in ranking_values])
    return np.array(rankings),cols



In [5]:
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
#Tuning parameters for "Agglomerative"
#-------------------------------------------------------------------------------

import os,csv
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

param_grid = {
    'n_clusters': [3],
    'affinity': ['euclidean', 'manhattan'],
    'linkage': ['ward', 'complete', 'average']
}

files = ["test1.xlsx", "test2.xlsx", "test3.xlsx", "test4.xlsx"]
files_p = processing_procedure(files)
for file in files_p:
    df = pd.read_excel(file, engine='openpyxl')
    print(f"NaN values in {file}:", df.isnull().sum().sum())
columns_to_drop = ['Depth', 'X Position', 'Y Position', 'Z Position', 'Load', 'Stiffness' , 'label']

eval_files = []
tuning_files = []

# Mapping of files to the position we're interested in
position_mapping = {
    0: 'X Position',  # test1
    1: 'Y Position',  # test2
    2: 'X Position',  # test3
    3: 'Y Position'   # test4
}
for idx, file in enumerate(files_p):
    if not os.path.exists("eval_datasets"):
        os.mkdir("eval_datasets")

    if not os.path.exists("tuning_datasets"):
        os.mkdir("tuning_datasets")

    df = pd.read_excel(file, engine='openpyxl')


    position = position_mapping[idx]
    eval_data = df[df[position] < 125]


    eval_filename = "eval_datasets/"+ f'test_{idx+1}_eval.xlsx'
    eval_data.to_excel(eval_filename, index=False)
    eval_files.append(eval_filename)

    # tuning
    position = position_mapping[idx]
    tuning_data = df[df[position] > 125]


    tuning_filename ="tuning_datasets/"+ f'test_{idx+1}_tuning.xlsx'
    tuning_data.to_excel(tuning_filename, index=False)
    tuning_files.append(tuning_filename)




results = []
max_silhouette = -1
best_params = {}

for eval_filename in eval_files:
    eval_df = pd.read_excel(eval_filename, engine='openpyxl')
    eval_df = eval_df.drop(columns=columns_to_drop, errors='ignore')



    scaled_eval = StandardScaler().fit_transform(eval_df)

    for n_clusters in param_grid['n_clusters']:
        for affinity in param_grid['affinity']:
            for linkage in param_grid['linkage']:
                # 'ward' can only work with 'euclidean'
                if linkage == 'ward' and affinity != 'euclidean':
                    continue

                model = AgglomerativeClustering(n_clusters=n_clusters, affinity= affinity, linkage=linkage)
                labels = model.fit_predict(scaled_eval)

                score = silhouette_score(scaled_eval, labels)

                # Append results to the list
                results.append({
                    'file': eval_filename,
                    'n_clusters': n_clusters,
                    'affinity': affinity,
                    'linkage': linkage,
                    'silhouette_score': score
                })



df_results = pd.DataFrame(results)

def concat_parameters(group):
    # This will force even single items into a comma-separated string format

    return ' '.join(group['parameters_combinations'].tolist())

df_results['parameters_combinations'] = df_results.iloc[:, 1:4].apply(lambda row: '"' + ','.join(row.dropna().astype(str)) + '"', axis=1)
df_results = df_results.drop(df_results.columns[1:4], axis=1)
df_results = df_results.sort_values(by='silhouette_score', ascending=False)
df_results['rank'] = df_results.groupby('file')['silhouette_score'].rank(method='first', ascending=False).astype(int)

grouped_combinations = df_results.groupby(['file']).apply(concat_parameters)

if not os.path.exists("parameter_list"):
        os.mkdir("parameter_list")

output_file = "parameter_list/Agglomerative+parameterTuning.xlsx"
df_results.to_excel(output_file, index=False)


if not os.path.exists("rank_list"):
        os.mkdir("rank_list")
with open('rank_list/tuning_list_kemeny_agglomerative.txt', 'w') as f:
    i = 1
    for _, group_string in grouped_combinations.items():
        f.write(f'A{i} : {group_string}\n')
        i += 1




precessed_datasets/test1_processed.xlsx already exists.
precessed_datasets/test2_processed.xlsx already exists.
precessed_datasets/test3_processed.xlsx already exists.
precessed_datasets/test4_processed.xlsx already exists.
NaN values in precessed_datasets/test1_processed.xlsx: 0
NaN values in precessed_datasets/test2_processed.xlsx: 0
NaN values in precessed_datasets/test3_processed.xlsx: 0
NaN values in precessed_datasets/test4_processed.xlsx: 0


In [9]:



file_path = "rank_list/tuning_list_kemeny_agglomerative.txt" 
rankings,cols = parse_rankings(file_path)

dist, aggr = rankaggr_brute(rankings)

print_rankings(dist, aggr,cols)
Tuning_Agglo = get_array_rankings(aggr,cols)
print(Tuning_Agglo)



A Kemeny-Young aggregation with score 6 is:
 3,euclidean,average
 3,manhattan,complete
 3,manhattan,average
 3,euclidean,complete
 3,euclidean,ward
['3,euclidean,average', '3,manhattan,complete', '3,manhattan,average', '3,euclidean,complete', '3,euclidean,ward']
