# In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

#Function that fills the missing values with the mean of the variables
def fill_missing_values(df):
    """Return a copy of ``df`` with missing numeric values replaced by column means.

    Only numeric columns have a mean, so only they are filled; missing entries
    in non-numeric columns (e.g. strings) are left untouched.

    Args:
        df: Input ``pandas.DataFrame``.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    # numeric_only=True keeps mixed-type frames working: without it, recent
    # pandas versions raise TypeError when asked for the mean of a
    # string/object column.
    column_means = df.mean(numeric_only=True)
    return df.fillna(column_means)

#Function that removes the duplicate instances
def remove_duplicates(data):
    """Return a new DataFrame containing the rows of ``data`` without duplicates.

    Delegates to ``DataFrame.drop_duplicates`` with its default arguments, so
    rows are compared across all columns and ``data`` is left unmodified.
    """
    return data.drop_duplicates()

#Function that encodes categorical data into numerical
def encode_categorical_data(data):
    """Return a copy of ``data`` with categorical columns replaced by integer codes.

    A column is treated as categorical when it has object, string or pandas
    ``category`` dtype. Each such column is replaced by the codes of
    ``pd.Categorical`` built from its values; missing values get the code -1.
    All other columns are passed through unchanged.

    Args:
        data: Input ``pandas.DataFrame``.

    Returns:
        A new DataFrame; ``data`` itself is not modified.
    """
    # Work on a copy so the caller's frame is never mutated (and so assigning
    # columns on a filtered frame does not trigger SettingWithCopyWarning).
    encoded = data.copy()
    for column in encoded.columns:
        values = encoded[column]
        # Checking only `dtype == 'object'` would miss `category` columns and
        # the dedicated pandas string dtype.
        is_categorical = (
            pd.api.types.is_object_dtype(values)
            or pd.api.types.is_string_dtype(values)
            or isinstance(values.dtype, pd.CategoricalDtype)
        )
        if is_categorical:
            encoded[column] = pd.Categorical(values).codes
    return encoded

#Function that removes the outliers
def remove_outliers_iqr(df, iqr_factor=1.5):
    """Drop rows that contain an outlier in any numeric column (IQR rule).

    For every numeric column the first and third quartiles (Q1, Q3) are
    computed; a value is an outlier when it lies below
    ``Q1 - iqr_factor * IQR`` or above ``Q3 + iqr_factor * IQR``, where
    ``IQR = Q3 - Q1``. A row is removed if at least one of its numeric values
    is an outlier. Missing values never count as outliers.

    Args:
        df: Input ``pandas.DataFrame``.
        iqr_factor: Width of the fences in multiples of the IQR.

    Returns:
        A new DataFrame with the outlier rows removed; the original index
        labels of the surviving rows are kept. ``df`` is not modified.
    """
    # Quantiles are only defined for numeric data, so non-numeric columns are
    # ignored when looking for outliers (but still returned in the result).
    numeric = df.select_dtypes(include="number")
    if numeric.empty:
        # No numeric columns (or no rows): nothing can be flagged.
        return df.copy()

    first_quartile = numeric.quantile(0.25)
    third_quartile = numeric.quantile(0.75)
    margin = (third_quartile - first_quartile) * iqr_factor
    lower_fence = first_quartile - margin
    upper_fence = third_quartile + margin

    # axis=1 aligns the per-column fences with the frame's columns.
    is_outlier = numeric.lt(lower_fence, axis=1) | numeric.gt(upper_fence, axis=1)
    return df[~is_outlier.any(axis=1)]

#Function that performs dimensionality reduction using the PCA algorithm
def perform_pca(data, n_components=10):
    """Reduce ``data`` to ``n_components`` principal components when it is wider.

    Args:
        data: Numeric ``pandas.DataFrame`` to reduce.
        n_components: Number of principal components to keep. The reduction is
            applied only when ``data`` has more columns than this.

    Returns:
        A new DataFrame holding the projected data (integer column labels and a
        fresh default index) when ``data`` has more than ``n_components``
        columns; otherwise ``data`` itself, unchanged.
    """
    # Nothing to reduce: hand the frame back instead of implicitly returning
    # None, which would break every caller that expects a DataFrame.
    if len(data.columns) <= n_components:
        return data

    pca = PCA(n_components=n_components)
    return pd.DataFrame(pca.fit_transform(data))

#Function for the preprocess of the data
def data_preprocess_for_the_clustering_algorithms(data):
    """Run the full preprocessing pipeline and return its final result.

    The steps are applied in this order, each one receiving the output of the
    previous one: fill missing values, remove duplicate rows, encode
    categorical columns, remove outliers, and reduce dimensionality with PCA.
    """
    pipeline = (
        fill_missing_values,
        remove_duplicates,
        encode_categorical_data,
        remove_outliers_iqr,
        perform_pca,
    )
    result = data
    for step in pipeline:
        result = step(result)
    return result