In [37]:
import os
import pandas as pd
import numpy as np
from tabulate import tabulate
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.spatial.distance import pdist
from scipy.cluster import hierarchy
from scipy.cluster.hierarchy import fcluster
from sklearn.metrics import silhouette_score

#Function to prompt the user to provide the filename.
def get_filename():
    
    while True:
        filename = input('Import a csv or an excel file: ')
        if os.path.exists(filename):
            if filename.split('.')[-1].lower() == 'csv' or filename.split('.')[-1].lower() == 'xlsx':
                return filename
            else:
                print('Invalid file format. Please try again')
        else:
            print(f'Error: There is no file called: {filename}. Please try again.')
    
#Funtion to read the file based on its extension.    
def read_file(filename,file_extension):
    
    if file_extension == 'csv':
        while True:
            delim = input('Please enter the delimiter of the csv file. It must be either ";" or ",": ')
            if delim == ',' or delim == ';':
                return pd.read_csv(filename, delimiter = delim)
            else:
                print('Invalid delimiter. Please try again.')
    else:
        return pd.read_excel(filename)

#Function that fills the missing values with the mean of the variables
def fill_missing_values(df):
    df_filled = df.fillna(df.mean())
    return df_filled   

#Function that removes the duplicate instances
def remove_duplicates(data):
    df_without_duplicates = data.drop_duplicates()
    return df_without_duplicates

#Function that encodes categorical data into numerical
def encode_categorical_data(data):
    for column in data.columns[:]:
        if data[column].dtype == 'object':
            data[column] = pd.Categorical(data[column]).codes
    return data

#Function that removes the outliers
def remove_outliers_iqr(df, iqr_factor=1.5):
    conditions = []
    for col in df.columns[:]:
        lower_bound = df[col].quantile(5 / 100)
        upper_bound = df[col].quantile(95 / 100)
        condition = (df[col] < lower_bound) | (df[col] > upper_bound)
        conditions.append(condition)
        
    combined_condition = ~pd.concat(conditions, axis=1).any(axis=1)
    return df[combined_condition]

#Function that performs dimensionality reduction using the PCA algorithm
def perform_pca(data):
    if len(data.columns) >10:
        pca = PCA(n_components = 10)
        pca.fit(data)
        return pd.DataFrame(pca.transform(data))

#Function for the preprocess of the data
def data_preprocess_for_the_clustering_algorithms(data):
    data = fill_missing_values(data)
    data = remove_duplicates(data)
    data = encode_categorical_data(data)
    data = remove_outliers_iqr(data)
    data = perform_pca(data)
    return data    

#Function that implements the Hierarchical clustering algorithm
def hierarchical_clustering(data, linkage='ward'):
    data = data_preprocess_for_the_clustering_algorithms(data.iloc[:, :-1])
    metrics = {
    "1": "euclidean",
    "2": "cityblock",
    "3": "minkowski",
    "4": "chebyshev" 
    }
    
    print("Available Distance Metrics:")
    print(tabulate(metrics.items(), headers=["Metric"], tablefmt="fancy_grid"))

    while True:
        try:
            metric_choice = input('Type the corresponding number of the metric you would like to use for clustering: ')
            metric_choice = int(metric_choice)
            if 1 <= metric_choice <= 4:  # Use inclusive range for valid options (1-4)
                break
            else:
                print('There are only 4 options. Please try again')
                continue
        except ValueError:
            print('Invalid input. Please enter a number between 1 and 4.')

    metric = metrics[str(metric_choice)]
            
    distance_matrix = pdist(data, metric=metric)
    cluster_result = hierarchy.linkage(distance_matrix, method=linkage)
    cluster_labels = fcluster(cluster_result, 2, criterion='maxclust')
    silhouette = round((silhouette_score(data, cluster_labels, metric=metric) * 100), 3)
    print(f"Silhouette Score: {silhouette}")
    
filename = get_filename()
file_extension = filename.split('.')[-1].lower()
data_frame = read_file(filename,file_extension)
hierarchical_clustering(data_frame)

Import a csv or an excel file:  4.csv
Please enter the delimiter of the csv file. It must be either ";" or ",":  ,


Available Distance Metrics:
╒════╤═══════════╕
│    │ Metric    │
╞════╪═══════════╡
│  1 │ euclidean │
├────┼───────────┤
│  2 │ cityblock │
├────┼───────────┤
│  3 │ minkowski │
├────┼───────────┤
│  4 │ chebyshev │
╘════╧═══════════╛


Type the corresponding number of the metric you would like to use for clustering:  1


Silhouette Score: 74.237
