In [14]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import fcluster
from sklearn.metrics import silhouette_score, calinski_harabasz_score

#Function to prompt the user to provide the filename.
def get_filename():
    
    while True:
        filename = input('Import a csv or an excel file: ')
        if os.path.exists(filename):
            if filename.split('.')[-1].lower() == 'csv' or filename.split('.')[-1].lower() == 'xlsx':
                return filename
            else:
                print('Invalid file format. Please try again')
        else:
            print(f'Error: There is no file called: {filename}. Please try again.')
    
#Funtion to read the file based on its extension.    
def read_file(filename,file_extension):
    
    if file_extension == 'csv':
        while True:
            delim = input('Please enter the delimiter of the csv file. It must be either ";" or ",": ')
            if delim == ',' or delim == ';':
                return pd.read_csv(filename, delimiter = delim)
            else:
                print('Invalid delimiter. Please try again.')
    else:
        return pd.read_excel(filename)

#Function that fills the missing values with the mean of the variables
def fill_missing_values(df):
    df_filled = df.fillna(df.mean())
    return df_filled   

#Function that removes the duplicate instances
def remove_duplicates(data):
    df_without_duplicates = data.drop_duplicates()
    return df_without_duplicates

#Function that encodes categorical data into numerical
def encode_categorical_data(data):
    for column in data.columns[:]:
        if data[column].dtype == 'object':
            data[column] = pd.Categorical(data[column]).codes
    return data

#Function that removes the outliers
def remove_outliers_iqr(df, iqr_factor=1.5):
    conditions = []
    for col in df.columns[:]:
        lower_bound = df[col].quantile(5 / 100)
        upper_bound = df[col].quantile(95 / 100)
        condition = (df[col] < lower_bound) | (df[col] > upper_bound)
        conditions.append(condition)
        
    combined_condition = ~pd.concat(conditions, axis=1).any(axis=1)
    return df[combined_condition]

#Function that performs dimensionality reduction using the PCA algorithm
def perform_pca(data):
    if len(data.columns) >10:
        pca = PCA(n_components = 10)
        pca.fit(data)
        return pd.DataFrame(pca.transform(data))

#Function for the preprocess of the data
def data_preprocess_for_the_clustering_algorithms(data):
    data = fill_missing_values(data)
    data = remove_duplicates(data)
    data = encode_categorical_data(data)
    data = remove_outliers_iqr(data)
    data = perform_pca(data)
    return data

#Function that implements the k-means clustering algorithm
def kmeans_algorithm(data_frame):
    X = data_preprocess_for_the_clustering_algorithms(data_frame.iloc[:, :-1])
    while True:
        try:
            n_clusters = input('Specify the number of clusters "k" for the k-means clustering algorithm: ')
            n_clusters = int(n_clusters)
            if n_clusters <= len(X) and n_clusters > 1:
                break
            else:
                print('The number of clusters must be more than 1 and cannot exceed the number of instances. Please try again')
                continue
        except:
            print('Invalid input. Please try again')
            
    kmeans = KMeans(n_clusters=n_clusters)
    kmeans.fit(X)
    labels = kmeans.labels_
    inertia = kmeans.inertia_
    silhouette = round((silhouette_score(X, labels)*100),3)
    print(f"Silhouette Score: {silhouette}")


filename = get_filename()
file_extension = filename.split('.')[-1].lower()
data_frame = read_file(filename,file_extension)
kmeans_algorithm(data_frame)

Import a csv or an excel file:  1.csv
Please enter the delimiter of the csv file. It must be either ";" or ",":  ;
Specify the number of clusters "k" for the k-means clustering algorithm:  1


The number of clusters must be more than 1 and cannot exceed the number of instances. Please try again


Specify the number of clusters "k" for the k-means clustering algorithm:  5


Silhouette Score: 69.145
