In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# magic word for producing visualizations in notebook
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')



In [None]:
# Loading the general population (azdias) and the customers datasets.
# Both files are semicolon-separated and are read from the ../data folder.
azdias = pd.read_csv('../data/azdias.csv', sep=';')
customers=pd.read_csv('../data/customers.csv',sep=';')

In [None]:
# Mapping each variable to its type: 'num' (numerical/ordinal), 'cat' (categorical) or 'bin' (binary):
dtypes_dict = {'AKT_DAT_KL': 'num', 'ALTER_HH': 'num', 'ALTERSKATEGORIE_FEIN': 'num', 'ANZ_HAUSHALTE_AKTIV': 'num',
               'ANZ_HH_TITEL': 'num', 'ANZ_KINDER': 'num', 'ANZ_PERSONEN': 'num', 'ANZ_STATISTISCHE_HAUSHALTE': 'num',
               'ANZ_TITEL': 'num', 'ARBEIT': 'num', 'BALLRAUM': 'num', 'CAMEO_DEU_2015': 'cat', 'CAMEO_DEUG_2015': 'num',
               'CAMEO_INTL_2015': 'num', 'CJT_GESAMTTYP': 'cat', 'CJT_KATALOGNUTZER': 'num', 'CJT_TYP_1': 'num',
               'CJT_TYP_2': 'num', 'CJT_TYP_3': 'num', 'CJT_TYP_4': 'num', 'CJT_TYP_5': 'num', 'CJT_TYP_6': 'num',
               'D19_LETZTER_KAUF_BRANCHE': 'cat', 'DSL_FLAG': 'bin', 'EWDICHTE': 'num', 'FINANZ_ANLEGER': 'num',
               'FINANZ_HAUSBAUER': 'num', 'FINANZ_MINIMALIST': 'num', 'FINANZ_SPARER': 'num', 'FINANZ_UNAUFFAELLIGER': 'num',
               'FINANZ_VORSORGER': 'num', 'FINANZTYP': 'cat', 'FIRMENDICHTE': 'num', 'GEBAEUDETYP': 'cat',
               'GEBAEUDETYP_RASTER': 'num', 'GEMEINDETYP': 'cat', 'GFK_URLAUBERTYP': 'cat', 'GREEN_AVANTGARDE': 'bin',
               'HEALTH_TYP': 'cat', 'HH_DELTA_FLAG': 'bin', 'HH_EINKOMMEN_SCORE': 'num', 'INNENSTADT': 'num',
               'KBA05_ALTER1': 'num', 'KBA05_ALTER2': 'num', 'KBA05_ALTER3': 'num', 'KBA05_ALTER4': 'num',
               'KBA05_ANHANG': 'num', 'KBA05_ANTG1': 'num', 'KBA05_ANTG2': 'num', 'KBA05_ANTG3': 'num', 'KBA05_ANTG4': 'num',
               'KBA05_AUTOQUOT': 'num', 'KBA05_CCM1': 'num', 'KBA05_CCM2': 'num', 'KBA05_CCM3': 'num', 'KBA05_CCM4': 'num',
               'KBA05_DIESEL': 'num', 'KBA05_FRAU': 'num', 'KBA05_GBZ': 'num', 'KBA05_HERST1': 'num', 'KBA05_HERST2': 'num',
               'KBA05_HERST3': 'num', 'KBA05_HERST4': 'num', 'KBA05_HERST5': 'num', 'KBA05_HERSTTEMP': 'cat',
               'KBA05_KRSAQUOT': 'num', 'KBA05_KRSHERST1': 'num', 'KBA05_KRSHERST2': 'num', 'KBA05_KRSHERST3': 'num',
               'KBA05_KRSKLEIN': 'num', 'KBA05_KRSOBER': 'num', 'KBA05_KRSVAN': 'num', 'KBA05_KRSZUL': 'num',
               'KBA05_KW1': 'num', 'KBA05_KW2': 'num', 'KBA05_KW3': 'num', 'KBA05_MAXAH': 'num', 'KBA05_MAXBJ': 'num',
               'KBA05_MAXHERST': 'cat', 'KBA05_MAXSEG': 'num', 'KBA05_MAXVORB': 'num', 'KBA05_MOD1': 'num',
               'KBA05_MOD2': 'num', 'KBA05_MOD3': 'num', 'KBA05_MOD4': 'num', 'KBA05_MOD8': 'num', 'KBA05_MODTEMP': 'cat',
               'KBA05_MOTOR': 'num', 'KBA05_MOTRAD': 'num', 'KBA05_SEG1': 'num', 'KBA05_SEG10': 'num', 'KBA05_SEG2': 'num',
               'KBA05_SEG3': 'num', 'KBA05_SEG4': 'num', 'KBA05_SEG5': 'num', 'KBA05_SEG6': 'num', 'KBA05_SEG7': 'num',
               'KBA05_SEG8': 'num', 'KBA05_SEG9': 'num', 'KBA05_VORB0': 'num', 'KBA05_VORB1': 'num', 'KBA05_VORB2': 'num',
               'KBA05_ZUL1': 'num', 'KBA05_ZUL2': 'num', 'KBA05_ZUL3': 'num', 'KBA05_ZUL4': 'num',
               'KBA13_ALTERHALTER_30': 'num', 'KBA13_ALTERHALTER_45': 'num', 'KBA13_ALTERHALTER_60': 'num',
               'KBA13_ALTERHALTER_61': 'num', 'KBA13_ANTG1': 'num', 'KBA13_ANTG2': 'num', 'KBA13_ANTG3': 'num',
               'KBA13_ANTG4': 'num', 'KBA13_ANZAHL_PKW': 'num', 'KBA13_AUDI': 'num', 'KBA13_AUTOQUOTE': 'num',
               'KBA13_BAUMAX': 'num', 'KBA13_BJ_1999': 'num', 'KBA13_BJ_2000': 'num', 'KBA13_BJ_2004': 'num',
               'KBA13_BJ_2006': 'num', 'KBA13_BJ_2008': 'num', 'KBA13_BJ_2009': 'num', 'KBA13_BMW': 'num',
               'KBA13_CCM_0_1400': 'num', 'KBA13_CCM_1000': 'num', 'KBA13_CCM_1200': 'num', 'KBA13_CCM_1400': 'num',
               'KBA13_CCM_1401_2500': 'num', 'KBA13_CCM_1500': 'num', 'KBA13_CCM_1600': 'num', 'KBA13_CCM_1800': 'num',
               'KBA13_CCM_2000': 'num', 'KBA13_CCM_2500': 'num', 'KBA13_CCM_2501': 'num', 'KBA13_CCM_3000': 'num',
               'KBA13_CCM_3001': 'num', 'KBA13_FAB_ASIEN': 'num', 'KBA13_FAB_SONSTIGE': 'num', 'KBA13_FIAT': 'num',
               'KBA13_FORD': 'num', 'KBA13_GBZ': 'num', 'KBA13_HALTER_20': 'num', 'KBA13_HALTER_25': 'num',
               'KBA13_HALTER_30': 'num', 'KBA13_HALTER_35': 'num', 'KBA13_HALTER_40': 'num', 'KBA13_HALTER_45': 'num',
               'KBA13_HALTER_50': 'num', 'KBA13_HALTER_55': 'num', 'KBA13_HALTER_60': 'num', 'KBA13_HALTER_65': 'num',
               'KBA13_HALTER_66': 'num', 'KBA13_HERST_ASIEN': 'num', 'KBA13_HERST_AUDI_VW': 'num',
               'KBA13_HERST_BMW_BENZ': 'num', 'KBA13_HERST_EUROPA': 'num', 'KBA13_HERST_FORD_OPEL': 'num',
               'KBA13_HERST_SONST': 'num', 'KBA13_HHZ': 'num', 'KBA13_KMH_0_140': 'num', 'KBA13_KMH_110': 'num',
               'KBA13_KMH_140': 'num', 'KBA13_KMH_140_210': 'num', 'KBA13_KMH_180': 'num', 'KBA13_KMH_210': 'num',
               'KBA13_KMH_211': 'num', 'KBA13_KMH_250': 'num', 'KBA13_KMH_251': 'num', 'KBA13_KRSAQUOT': 'num',
               'KBA13_KRSHERST_AUDI_VW': 'num', 'KBA13_KRSHERST_BMW_BENZ': 'num', 'KBA13_KRSHERST_FORD_OPEL': 'num',
               'KBA13_KRSSEG_KLEIN': 'num', 'KBA13_KRSSEG_OBER': 'num', 'KBA13_KRSSEG_VAN': 'num', 'KBA13_KRSZUL_NEU': 'num',
               'KBA13_KW_0_60': 'num', 'KBA13_KW_110': 'num', 'KBA13_KW_120': 'num', 'KBA13_KW_121': 'num',
               'KBA13_KW_30': 'num', 'KBA13_KW_40': 'num', 'KBA13_KW_50': 'num', 'KBA13_KW_60': 'num',
               'KBA13_KW_61_120': 'num', 'KBA13_KW_70': 'num', 'KBA13_KW_80': 'num', 'KBA13_KW_90': 'num',
               'KBA13_MAZDA': 'num', 'KBA13_MERCEDES': 'num', 'KBA13_MOTOR': 'num', 'KBA13_NISSAN': 'num', 'KBA13_OPEL': 'num',
               'KBA13_PEUGEOT': 'num', 'KBA13_RENAULT': 'num', 'KBA13_SEG_GELAENDEWAGEN': 'num',
               'KBA13_SEG_GROSSRAUMVANS': 'num', 'KBA13_SEG_KLEINST': 'num', 'KBA13_SEG_KLEINWAGEN': 'num',
               'KBA13_SEG_KOMPAKTKLASSE': 'num', 'KBA13_SEG_MINIVANS': 'num', 'KBA13_SEG_MINIWAGEN': 'num',
               'KBA13_SEG_MITTELKLASSE': 'num', 'KBA13_SEG_OBEREMITTELKLASSE': 'num', 'KBA13_SEG_OBERKLASSE': 'num',
               'KBA13_SEG_SONSTIGE': 'num', 'KBA13_SEG_SPORTWAGEN': 'num', 'KBA13_SEG_UTILITIES': 'num',
               'KBA13_SEG_VAN': 'num', 'KBA13_SEG_WOHNMOBILE': 'num', 'KBA13_SITZE_4': 'num', 'KBA13_SITZE_5': 'num',
               'KBA13_SITZE_6': 'num', 'KBA13_TOYOTA': 'num', 'KBA13_VORB_0': 'num', 'KBA13_VORB_1': 'num',
               'KBA13_VORB_1_2': 'num', 'KBA13_VORB_2': 'num', 'KBA13_VORB_3': 'num', 'KBA13_VW': 'num', 'KKK': 'num',
               'KOMBIALTER': 'num', 'KONSUMNAEHE': 'num', 'KONSUMZELLE': 'bin', 'LP_FAMILIE_FEIN': 'cat',
               'LP_FAMILIE_GROB': 'num', 'LP_LEBENSPHASE_FEIN': 'cat', 'LP_LEBENSPHASE_GROB': 'cat', 'LP_STATUS_FEIN': 'num',
               'LP_STATUS_GROB': 'num', 'MIN_GEBAEUDEJAHR': 'num', 'MOBI_RASTER': 'num', 'MOBI_REGIO': 'num',
               'NATIONALITAET_KZ': 'cat', 'ONLINE_AFFINITAET': 'num', 'ORTSGR_KLS9': 'num', 'OST_WEST_KZ': 'cat',
               'PLZ8_ANTG1': 'num', 'PLZ8_ANTG2': 'num', 'PLZ8_ANTG3': 'num', 'PLZ8_ANTG4': 'num', 'PLZ8_BAUMAX': 'cat',
               'PLZ8_GBZ': 'num', 'PLZ8_HHZ': 'num', 'PRAEGENDE_JUGENDJAHRE': 'num', 'REGIOTYP': 'num', 'RELAT_AB': 'num',
               'RETOURTYP_BK_S': 'cat', 'RT_KEIN_ANREIZ': 'num', 'RT_SCHNAEPPCHEN': 'num', 'RT_UEBERGROESSE': 'num',
               'SEMIO_DOM': 'num', 'SEMIO_ERL': 'num', 'SEMIO_FAM': 'num', 'SEMIO_KAEM': 'num', 'SEMIO_KRIT': 'num',
               'SEMIO_KULT': 'num', 'SEMIO_LUST': 'num', 'SEMIO_MAT': 'num', 'SEMIO_PFLICHT': 'num', 'SEMIO_RAT': 'num',
               'SEMIO_REL': 'num', 'SEMIO_SOZ': 'num', 'SEMIO_TRADV': 'num', 'SEMIO_VERT': 'num', 'SHOPPER_TYP': 'cat',
               'SOHO_KZ': 'bin', 'STRUKTURTYP': 'cat', 'UMFELD_ALT': 'num', 'UMFELD_JUNG': 'num', 'UNGLEICHENN_FLAG': 'bin',
               'VERDICHTUNGSRAUM': 'num', 'VERS_TYP': 'cat', 'VHA': 'num', 'VHN': 'num', 'VK_DHT4A': 'num',
               'VK_DISTANZ': 'num', 'VK_ZG11': 'num', 'W_KEIT_KIND_HH': 'num', 'WOHNDAUER_2008': 'num', 'WOHNLAGE': 'cat',
               'ZABEOTYP': 'cat', 'ANREDE_KZ': 'cat', 'ALTERSKATEGORIE_GROB': 'num'}


### 1.5 Cluster Analysis<a name="cluster"></a>

Now, unsupervised machine learning techniques will be used in order to identify hidden patterns in the data, clustering the population into different groups, each one composed of people with similar characteristics.

With the defined clusters, it will be possible to perform a new comparison between customers and the general population. The difference is that, this time, the comparison won't be performed over one dimension (one variable), but over the different groups created through the effect that all the variables together have on these groups.

#### 1.5.1 Feature Engineering<a name="feateng2"></a>

First, there will be one more feature engineering step, this time on the `CAMEO_DEU_2015` feature. Since there are 44 different classifications, they will be grouped according to the behavior observed when comparing customers and the general population, using the following codes:
* `0`: *underrepresented classes among clients*;
* `1`: *almost equally represented with slight underrepresentation tendency*;
* `2`: *almost equally represented with slight overrepresentation tendency*;
* `3`: *overrepresented classes among clients*.

In [None]:
# Defining CAMEO_DEU_2015 transformation:
def transform_cameo_deu(df):
    '''
    It simplifies CAMEO_DEU_2015 classes according to the representation pattern presented in the comparison
    between customers and the general population.

    The new column CAMEO_DEU_REPRESENTATION receives one of the following codes:
    * 0: underrepresented classes among clients;
    * 1: almost equally represented with slight underrepresentation tendency;
    * 2: almost equally represented with slight overrepresentation tendency;
    * 3: overrepresented classes among clients;
    * nan: any value that is not listed in one of the groups below (missing values included).

    Input:
    df: dataframe containing the CAMEO_DEU_2015 column. It is modified in place.

    Output:
    df: the same dataframe object, with CAMEO_DEU_REPRESENTATION (category dtype) added and
        CAMEO_DEU_2015 removed. Since the original column is dropped, calling the function a second
        time on the same dataframe raises a KeyError.
    '''
    feat = 'CAMEO_DEU_2015'

    # Classes grouped by representation code:
    # NOTE(review): the groups below list 42 classes; a class that is absent from all of them
    # (for instance '1E' or '4D', if they occur in the data) becomes nan - confirm this is intended.
    groups = {0: ['6A', '7A', '7B', '7C', '8A', '8B', '8C', '8D', '9A', '9B', '9C', '9D'],
              1: ['5A', '5B', '5C', '6B', '7D'],
              2: ['3A', '3B', '3C', '4B', '4C', '4E', '5E', '5F', '6C', '6D', '6E', '6F', '7E', '9E'],
              3: ['1A', '1B', '1C', '1D', '2A', '2B', '2C', '2D', '3D', '4A', '5D']}

    # Inverting the groups into a "class -> code" lookup table:
    class_to_code = {cameo_class: code for code, classes in groups.items() for cameo_class in classes}

    # Creating new column with a single vectorized lookup (values without a match become nan),
    # instead of evaluating each row in a Python loop:
    df['CAMEO_DEU_REPRESENTATION'] = df[feat].map(class_to_code)

    # Transforming the column to categorical type:
    df['CAMEO_DEU_REPRESENTATION'] = df['CAMEO_DEU_REPRESENTATION'].astype('category')

    # Deleting original column:
    df.drop(columns = [feat], inplace = True)

    return df

In [None]:
# Applying transform_cameo_deu on azdias dataframe:
# The function drops CAMEO_DEU_2015 from the dataframe it receives, so this cell cannot be
# executed twice without reloading azdias.
azdias = transform_cameo_deu(azdias)

Since new features were created during the process, a complementary dictionary will be created to specify the new columns dtypes:

In [None]:
# Defining dtypes for new features:
# Same convention as dtypes_dict: 'num' columns have nan values filled with the median and 'cat'
# columns are one-hot encoded by deal_with_nan_values.
# NOTE(review): transform_cameo_deu casts CAMEO_DEU_REPRESENTATION to category dtype, while it is
# mapped as 'num' here (presumably because its codes 0-3 are ordinal) - confirm this is intended.
# NOTE(review): the remaining features are not created in the cells shown here; presumably they
# come from an earlier feature engineering step - verify they exist before running this section.
new_feat_dtypes_dict = {'AVANT_GARDE': 'cat',
                        'CAMEO_DEU_REPRESENTATION': 'num',
                        'CAMEO_INTL_FAM_COMPOSITION': 'num',
                        'CAMEO_INTL_FAM_STATUS': 'num',
                        'YOUTH_DECADE': 'num'}

#### 1.5.2 NaN Values<a name="nan"></a>

The approach that will be used to fill in *nan* values is the following:
* `Numerical` and `ordinal` variables will have *nan* values replaced with the median of the existing values;

* `Categorical` variables will be binarized (One-Hot Encoding), and *nan* values will be indirectly considered.

In [None]:
# Defining function to replace nan values with the median or create dummy variables for following specific distribution:
def deal_with_nan_values(df, dict1 = dtypes_dict, dict2 = new_feat_dtypes_dict):
    '''
    It deals with nan values in two different ways:
    * if the feature is defined as 'num' (numerical), it fills nan values with the median value;
    * if the feature is defined as 'cat' (categorical) or 'bin' (binary), it creates dummy variables
      and deletes the original column. No dummy column is created for nan itself, so rows with nan
      get 0 in every dummy column of that feature.

    Only columns that contain at least one nan value are processed; every other column is returned
    unchanged, whatever its type.

    Inputs:
    df: original dataframe (it is not modified, a copy is transformed);
    dict1: main dictionary mapping columns and dtypes;
    dict2: auxiliar dictionary mapping new columns and dtypes.

    Output:
    df: dataframe without nan values. Dummy columns are appended at the end of the dataframe.

    Raises:
    KeyError: if a column with nan values is not mapped in dict1 nor in dict2.
    '''
    # Working on a copy: filling a column through df[col].fillna(..., inplace = True) is a chained
    # assignment, which either changes the caller's dataframe or has no effect at all, depending on
    # the pandas version.
    df = df.copy()

    # Selecting columns with nan values:
    nan_cols = list(df.columns[df.isnull().any()])

    # For each column, replace nan values:
    for col in nan_cols:

        # Verify in which dictionary the column is:
        if col in dict1:
            col_type = dict1[col]
        elif col in dict2:
            col_type = dict2[col]
        else:
            raise KeyError("Column '{}' has nan values but no dtype mapped in dict1 or dict2".format(col))

        # Verify dtype, if numeric:
        if col_type == 'num':
            # Fill in nan values with median:
            df[col] = df[col].fillna(np.nanmedian(df[col]))

        else:
            # Delete original column and concatenate dummy columns:
            dummies = pd.get_dummies(df[col], prefix = col, prefix_sep = '_', dummy_na = False)
            df = pd.concat([df.drop(col, axis = 1), dummies], axis = 1)

    return df

#### 1.5.3 Standardizing Data<a name="standard"></a>

Even after the feature selection process, many columns were left to be analyzed. Because of that, Principal Component Analysis will be applied to the data.

In order to apply the `PCA` algorithm, the values need to be on the same scale. For that, one function will be defined to fit the model to the data, and another to use the fitted model to transform the data:

In [None]:
# Fitting StandardScaler model:
def fit_std_scaler(df):
    '''
    It fits a new StandardScaler model to the dataframe and records the columns it was fitted on.

    Input:
    df: dataframe used to fit the model.

    Output:
    cols: list with the dataframe columns, in the order used in the fitting process;
    std_model: fitted StandardScaler model.
    '''
    # Recording the column layout the scaler is fitted on:
    fitted_columns = [name for name in df.columns]

    # Creating the scaler and fitting it to the data:
    std_model = StandardScaler()
    std_model.fit(df)

    return fitted_columns, std_model

In [None]:
# Applying fitted StandardScaler model:
def transform_std_scaler(df, cols, scaler):
    '''
    It uses the fitted StandardScaler model to transform the data.

    Before scaling, the dataframe is aligned to the columns used to fit the scaler:
    * columns missing from the dataframe are created and filled with 0;
    * columns that were not used in the fitting process are dropped;
    * columns are placed in the same order used in the fitting process.

    Input:
    df: dataframe used to be transformed (it is not modified);
    cols: list of columns to assure dataframe compatibility.
    scaler: fitted StandardScaler model.

    Output:
    df: transformed dataframe, with the columns listed in cols and a new default index.
    '''
    # Dataframe columns compatibility:
    # The scaler works by position, so the columns must match the fitted layout exactly. Only
    # adding the missing columns would leave them in a different order, and would keep columns
    # the scaler has never seen.
    df = df.reindex(columns = cols, fill_value = 0)

    # Scaling data:
    df = pd.DataFrame(scaler.transform(df), columns = df.columns)

    return df

Since there are different levels of information, according to the informational spreadsheet, the dimensionality reduction, in this case, the `PCA` technique, will not be applied to the whole dataset at once. Different components will be created for different levels of information.

Because of that, the dataset will be split according to their information level, and the transformations will be applied to these subsets.

In [None]:
# Creating dictionary to map each information level to the features that belong to it:
info_level = {'person': ['ANREDE_KZ', 'ALTERSKATEGORIE_GROB', 'AVANT_GARDE', 'CJT_GESAMTTYP', 'CJT_KATALOGNUTZER', 'CJT_TYP_6',
                         'FINANZ_ANLEGER', 'FINANZ_HAUSBAUER', 'FINANZTYP', 'GFK_URLAUBERTYP', 'HEALTH_TYP', 'LP_FAMILIE_FEIN',
                         'LP_FAMILIE_GROB', 'LP_LEBENSPHASE_FEIN', 'LP_LEBENSPHASE_GROB', 'LP_STATUS_GROB', 'NATIONALITAET_KZ',
                         'OST_WEST_KZ', 'RETOURTYP_BK_S', 'SEMIO_ERL', 'SEMIO_FAM', 'SEMIO_KRIT', 'SEMIO_KULT', 'SOHO_KZ',
                         'SEMIO_LUST', 'SEMIO_MAT', 'SEMIO_RAT', 'SEMIO_REL', 'SEMIO_SOZ', 'SEMIO_TRADV', 'SEMIO_VERT',
                         'SHOPPER_TYP', 'VERS_TYP', 'YOUTH_DECADE', 'ZABEOTYP'],
              'household': ['AKT_DAT_KL', 'ANZ_KINDER', 'ANZ_STATISTISCHE_HAUSHALTE', 'ANZ_TITEL', 'CAMEO_DEU_REPRESENTATION',
                            'CAMEO_INTL_FAM_COMPOSITION', 'CAMEO_INTL_FAM_STATUS', 'D19_LETZTER_KAUF_BRANCHE',
                            'HH_EINKOMMEN_SCORE', 'WOHNDAUER_2008', 'W_KEIT_KIND_HH', 'RT_KEIN_ANREIZ', 'RT_SCHNAEPPCHEN',
                            'RT_UEBERGROESSE'],
              'microcell': ['ANZ_HH_TITEL', 'DSL_FLAG', 'HH_DELTA_FLAG', 'KBA05_ALTER1', 'KBA05_ALTER2', 'KBA05_ALTER3',
                            'KBA05_ALTER4', 'KBA05_ANHANG', 'KBA05_ANTG2', 'KBA05_ANTG3', 'KBA05_ANTG4', 'KBA05_CCM1',
                            'KBA05_CCM2', 'KBA05_CCM3', 'KBA05_CCM4', 'KBA05_DIESEL', 'KBA05_FRAU', 'KBA05_HERST4',
                            'KBA05_HERST5', 'KBA05_HERSTTEMP', 'KBA05_KRSAQUOT', 'KBA05_KRSHERST1', 'KBA05_KRSHERST2',
                            'KBA05_KRSHERST3', 'KBA05_KW2', 'KBA05_KW3', 'KBA05_MAXAH', 'KBA05_MAXBJ', 'KBA05_MAXHERST',
                            'KBA05_MAXSEG', 'KBA05_MAXVORB', 'KBA05_MOD1', 'KBA05_MOD2', 'KBA05_MOD3', 'KBA05_MOD4',
                            'KBA05_MODTEMP', 'KBA05_MOTOR', 'KBA05_MOTRAD', 'KBA05_SEG1', 'KBA05_SEG10', 'KBA05_SEG2',
                            'KBA05_SEG3', 'KBA05_SEG4', 'KBA05_SEG5', 'KBA05_SEG6', 'KBA05_SEG7', 'KBA05_SEG8', 'KBA05_SEG9',
                            'KBA05_VORB0', 'KBA05_VORB1', 'KBA05_VORB2', 'KBA05_ZUL1', 'KBA05_ZUL2', 'KBA05_ZUL3',
                            'KBA05_ZUL4', 'KONSUMZELLE', 'MIN_GEBAEUDEJAHR', 'STRUKTURTYP', 'UMFELD_ALT', 'UMFELD_JUNG',
                            'WOHNLAGE'],
              'macrocell': ['BALLRAUM', 'GEBAEUDETYP_RASTER', 'GEBAEUDETYP', 'GEMEINDETYP', 'INNENSTADT', 'KBA13_AUDI',
                            'KBA13_AUTOQUOTE', 'KBA13_BJ_2000', 'KBA13_BJ_2006', 'KBA13_BJ_2008', 'KBA13_BJ_2009', 'KBA13_BMW',
                            'KBA13_CCM_0_1400', 'KBA13_CCM_1000', 'KBA13_CCM_1200', 'KBA13_CCM_1400', 'KBA13_CCM_1401_2500',
                            'KBA13_CCM_1500', 'KBA13_CCM_1600', 'KBA13_CCM_1800', 'KBA13_CCM_2000', 'KBA13_CCM_2500',
                            'KBA13_CCM_2501', 'KBA13_CCM_3000', 'KBA13_CCM_3001', 'KBA13_FAB_ASIEN', 'KBA13_FIAT',
                            'KBA13_FORD', 'KBA13_HALTER_20', 'KBA13_HALTER_25', 'KBA13_HALTER_35', 'KBA13_HALTER_40',
                            'KBA13_HALTER_45', 'KBA13_HALTER_50', 'KBA13_HALTER_55', 'KBA13_HALTER_60', 'KBA13_HALTER_65',
                            'KBA13_HALTER_66', 'KBA13_HERST_ASIEN', 'KBA13_HERST_EUROPA', 'KBA13_HERST_SONST', 'KBA13_KMH_140',
                            'KBA13_KMH_180', 'KBA13_KMH_210', 'KBA13_KMH_250', 'KBA13_KMH_251', 'KBA13_KRSAQUOT',
                            'KBA13_KRSHERST_AUDI_VW', 'KBA13_KRSHERST_BMW_BENZ', 'KBA13_KRSHERST_FORD_OPEL',
                            'KBA13_KRSSEG_KLEIN', 'KBA13_KRSSEG_OBER', 'KBA13_KRSZUL_NEU', 'KBA13_KW_110', 'KBA13_KW_120',
                            'KBA13_KW_121', 'KBA13_KW_30', 'KBA13_KW_40', 'KBA13_KW_50', 'KBA13_KW_60', 'KBA13_KW_61_120',
                            'KBA13_KW_70', 'KBA13_KW_80', 'KBA13_KW_90', 'KBA13_MAZDA', 'KBA13_MERCEDES', 'KBA13_MOTOR',
                            'KBA13_NISSAN', 'KBA13_OPEL', 'KBA13_PEUGEOT', 'KBA13_RENAULT', 'KBA13_SEG_GELAENDEWAGEN',
                            'KBA13_SEG_KLEINWAGEN', 'KBA13_SEG_KOMPAKTKLASSE', 'KBA13_SEG_MINIWAGEN', 'KBA13_SEG_MITTELKLASSE',
                            'KBA13_SEG_OBEREMITTELKLASSE', 'KBA13_SEG_OBERKLASSE', 'KBA13_SEG_SONSTIGE',
                            'KBA13_SEG_SPORTWAGEN', 'KBA13_SEG_UTILITIES', 'KBA13_SEG_VAN', 'KBA13_SEG_WOHNMOBILE',
                            'KBA13_SITZE_5', 'KBA13_SITZE_6', 'KBA13_TOYOTA', 'KBA13_VORB_0', 'KBA13_VORB_1', 'KBA13_VORB_1_2',
                            'KBA13_VORB_2', 'KBA13_VORB_3', 'KBA13_VW', 'MOBI_REGIO', 'PLZ8_ANTG2', 'PLZ8_ANTG4',
                            'PLZ8_BAUMAX', 'PLZ8_GBZ', 'PLZ8_HHZ', 'VK_ZG11', 'VK_DHT4A', 'VHN', 'VHA', 'VERDICHTUNGSRAUM',
                            'UNGLEICHENN_FLAG'],
              'community': ['ARBEIT', 'ORTSGR_KLS9', 'RELAT_AB']}

Next, all the transformations will be joined together in one function, and then it will be applied to the different levels of information.

In [None]:
# Defining function to apply nan and standard scaler transformations:
def transform_data(df, info_level, dic = info_level, cols = None, scaler = None):
    '''
    It applies the functions that treat nan values and standardize the values.

    The function works in two modes:
    * fitting mode (scaler is None): a new StandardScaler is fitted to the treated data;
    * transforming mode (scaler is given): the given scaler and columns are used, so that another
      dataframe gets the same column layout and scale as the one used in the fitting process.

    Inputs:
    df: original dataframe (it is not modified);
    info_level: string indicating which information level will be treated;
    dic: dictionary indicating the information level of each feature;
    cols: list of columns used to fit the scaler (required when scaler is given);
    scaler: fitted StandardScaler model, or None to fit a new one.

    Output:
    * fitting mode: tuple (df, cols, scaler) with the transformed dataframe, the list of columns
      used in the fitting process and the fitted scaler;
    * transforming mode: df, the transformed dataframe with the level of information columns.

    Raises:
    TypeError: if scaler is given without cols.
    '''
    # Selecting information level columns (copying, so the original dataframe is never changed
    # through the selection):
    sel_cols = dic[info_level]
    df = df[sel_cols].copy()

    # Applying deal_with_nan_values function:
    df = deal_with_nan_values(df)

    # If scaler is not defined, create scaler:
    if scaler is None:
        # Fitting StandardScaler model:
        cols, scaler = fit_std_scaler(df)

        # Standardizing data:
        df = transform_std_scaler(df, cols, scaler)

        return df, cols, scaler

    # A fitted scaler can only be applied with the column layout it was fitted on:
    if cols is None:
        raise TypeError("'cols' must be provided together with 'scaler'")

    # Standardizing data:
    df = transform_std_scaler(df, cols, scaler)

    return df

In [None]:
# Applying data transformation on:
# 'person' info level:
# No scaler is passed, so a new one is fitted; the columns and the scaler are kept to transform
# the customers data later with the same layout and scale.
azdias_pers, pers_cols, pers_scaler = transform_data(azdias, info_level = 'person')
azdias_pers.head()

In [None]:
azdias_pers.shape

In [None]:
# 'household' info level:
# No scaler is passed, so a new one is fitted and returned together with its columns.
azdias_hh, hh_cols, hh_scaler = transform_data(azdias, info_level = 'household')
azdias_hh.head()

In [None]:
azdias_hh.shape

In [None]:
# 'microcell' info level:
# No scaler is passed, so a new one is fitted and returned together with its columns.
azdias_mic, mic_cols, mic_scaler = transform_data(azdias, info_level = 'microcell')
azdias_mic.head()

In [None]:
azdias_mic.shape

In [None]:
# 'macrocell' info level:
# No scaler is passed, so a new one is fitted and returned together with its columns.
azdias_mac, mac_cols, mac_scaler = transform_data(azdias, info_level = 'macrocell')
azdias_mac.head()

In [None]:
azdias_mac.shape

In [None]:
# 'community' info level:
# No scaler is passed, so a new one is fitted and returned together with its columns.
azdias_com, com_cols, com_scaler = transform_data(azdias, info_level = 'community')
azdias_com.head()

In [None]:
azdias_com.shape

In [None]:
# Printing the number of features of each information level after the transformations:
print('Information Level Summary:')
print('--------------------------')
for message, level_df in (('Personal level has {} features.', azdias_pers),
                          ('Household level has {} features.', azdias_hh),
                          ('Microcell level has {} features.', azdias_mic),
                          ('Macrocell level has {} features.', azdias_mac),
                          ('Community level has {} features.', azdias_com)):
    print(message.format(level_df.shape[1]))

Considering the number of features after the transformations, `PCA` will be applied only on the first four levels of information:
* Personal;
* Household;
* Microcell;
* Macrocell.

The Community level has only three features, and because of that, they will be kept without further transformations.

#### 1.5.4 Dimensionality Reduction<a name="pca"></a>

Once the data is treated and standardized, before actually applying the dimensionality reduction, `PCA` will be applied with standard parameters in order to decide the number of components to keep.

To make the number of components decision, a *scree plot* will be created for each level of information data.

In [None]:
# Defining function to create scree plot to decide how many components to keep:
def create_scree_plot(pca):
    '''
    It creates a scree plot for the fitted principal component analysis in order to define the number
    of components to keep.

    The plot shows the cumulative explained variance (y axis) against the number of components
    kept (x axis, starting at 1).

    Input:
    pca: principal component analysis object fitted to data.
    '''
    # Getting cumulative values of the variance's percentage explained by each component:
    cum_perc_vars = np.cumsum(pca.explained_variance_ratio_)

    # Defining the number of components related to each cumulative value:
    # The first value is the variance explained by ONE component, so the axis starts at 1.
    # Starting at 0 would make every reading of the plot one component short.
    n_components = np.arange(1, len(cum_perc_vars) + 1)

    # Creating scree plot:
    sns.set_theme(style = "whitegrid", font_scale = 1.1)
    fig, ax = plt.subplots(nrows = 1, ncols = 1, figsize = (13, 5))

    # Drawing lineplot on the created axes:
    # ('markers = True' was removed: without a 'style' variable it has no effect on the plot.)
    sns.lineplot(x = n_components, y = cum_perc_vars, color = 'black', ax = ax)
    ax.set(xlabel = 'Number of Principal Components',
           ylabel = 'Explained Variance',
           title = "Scree Plot - Cumulative Explained Variance",
           ylim = (0,1.02))
    sns.despine()

    # plt.show() renders the figure with the notebook inline backend; fig.show() requires a GUI backend.
    plt.show()

The purpose is to retain most of the data variability with as few components as possible, simplifying the resulting data.

Since it's a client segmentation problem, the assumption is that the `person` information level must be more important than the `household` information level, which in turn is more important than the `microcell` and the `macrocell` information levels, the last one being the most general on this scale.

Because of that, when deciding the number of components to keep, more important levels will have the number of components necessary to explain about 60% of the variance, while less important levels will be allowed to have a lower explained variance rate.

In [None]:
# PERSON level:
# PCA with default parameters (no number of components set), used only to inspect the explained variance.
# NOTE(review): scikit-learn's fit returns the estimator itself, so pers_pca and pca presumably
# refer to the same object - any later pca.fit(...) call would also change pers_pca.
pca = PCA()
pers_pca = pca.fit(azdias_pers)

# Getting the scree plot:
create_scree_plot(pers_pca)

In order to keep about 60% of the explained variance, in the `person` level **30 components** will be kept.

In [None]:
# HOUSEHOLD level:
# A dedicated PCA object is fitted for this level: fitting a PCA object shared with other levels
# would overwrite their results, and would make this cell depend on the cell that created it.
hh_pca = PCA().fit(azdias_hh)

# Getting the scree plot:
create_scree_plot(hh_pca)

In the `household` level, **20 components** explain over 50% of the data variance.

In [None]:
# MICROCELL level:
# A dedicated PCA object is fitted for this level: fitting a PCA object shared with other levels
# would overwrite their results, and would make this cell depend on the cell that created it.
mic_pca = PCA().fit(azdias_mic)

# Getting the scree plot:
create_scree_plot(mic_pca)

In [None]:
# MACROCELL level:
# A dedicated PCA object is fitted for this level: fitting a PCA object shared with other levels
# would overwrite their results, and would make this cell depend on the cell that created it.
mac_pca = PCA().fit(azdias_mac)

# Getting the scree plot:
create_scree_plot(mac_pca)

Because of the scale of importance assumed before, `microcell` and `macrocell` levels will be allowed to have their rate of explained variance between 40 and 50%, keeping **10 components** for each one of them.

<h4>Applying PCA</h4>

In [None]:
# One PCA object is fitted per information level, keeping the number of components chosen from
# the scree plots (30, 20, 10 and 10). Each level has its own random_state.
#____________________________________________________________
# PERSON:

# Instantiating pca object:
pca_pers = PCA(n_components = 30, random_state = 101)

# Fitting pca and applying transformation on azdias dataset:
azdias_pers_pca = pca_pers.fit_transform(azdias_pers)

#____________________________________________________________
# HOUSEHOLD:

# Instantiating pca object:
pca_hh = PCA(n_components = 20, random_state = 201)

# Fitting pca and applying transformation on azdias dataset:
azdias_hh_pca = pca_hh.fit_transform(azdias_hh)

#____________________________________________________________
# MICROCELL:

# Instantiating pca object:
pca_mic = PCA(n_components = 10, random_state = 301)

# Fitting pca and applying transformation on azdias dataset:
azdias_mic_pca = pca_mic.fit_transform(azdias_mic)

#____________________________________________________________
# MACROCELL:

# Instantiating pca object:
pca_mac = PCA(n_components = 10, random_state = 401)

# Fitting pca and applying transformation on azdias dataset:
azdias_mac_pca = pca_mac.fit_transform(azdias_mac)

In [None]:
# Column names for general dataframe:
# Names are a prefix followed by the zero-padded, 1-based component number ('pers01', 'pers02', ...).
# They are generated from the width of each PCA output, so they always match the number of
# components kept for the level (with the current settings: 30, 20, 10 and 10 names).
person_cols = ['pers{:02d}'.format(i) for i in range(1, azdias_pers_pca.shape[1] + 1)]

household_cols = ['hh{:02d}'.format(i) for i in range(1, azdias_hh_pca.shape[1] + 1)]

microcell_cols = ['mic{:02d}'.format(i) for i in range(1, azdias_mic_pca.shape[1] + 1)]

macrocell_cols = ['mac{:02d}'.format(i) for i in range(1, azdias_mac_pca.shape[1] + 1)]

# Concatenating pca dataframes together, also including the 3 features related to the community information level:
# Rows are matched by index, which is the default one (0, 1, 2, ...) in every dataframe of the list.
gen_azdias = pd.concat([pd.DataFrame(azdias_pers_pca, columns = person_cols),
                        pd.DataFrame(azdias_hh_pca, columns = household_cols),
                        pd.DataFrame(azdias_mic_pca, columns = microcell_cols),
                        pd.DataFrame(azdias_mac_pca, columns = macrocell_cols),
                        azdias_com],
                       axis = 1)

gen_azdias.head()

To better understand how the original features compose these components created through `PCA`, a function will be defined to return the most important features for each component:

In [None]:
# Defining function to show most important features for each component:
def analyze_component(component, top = 3):
    '''
    It prints the features with the highest positive and negative weights on one principal component.

    Components are numbered following the order of the information levels:
    0-29 person, 30-49 household, 50-59 microcell and 60-69 macrocell.

    Inputs:
    component: integer indicating the number of the component to be analyzed (0-69);
    top: integer indicating the number of top features to return;

    Output:
    None. The result is printed; a message is printed if the component is out of range.
    '''
    # Finding the information level of the component, with its pca object, its dataframe and the
    # number of the first component of the level:
    if component in range(30):
        print('\nPERSON Component:')
        print('-----------------')
        level_pca, level_df, first = pca_pers, azdias_pers, 0

    elif component in range(30, 50):
        print('\nHOUSEHOLD Component:')
        print('--------------------')
        level_pca, level_df, first = pca_hh, azdias_hh, 30

    elif component in range(50, 60):
        print('\nMICROCELL Component:')
        print('--------------------')
        level_pca, level_df, first = pca_mic, azdias_mic, 50

    elif component in range(60, 70):
        print('\nMACROCELL Component:')
        print('--------------------')
        level_pca, level_df, first = pca_mac, azdias_mac, 60

    else:
        print('Component out of range (0-69)')
        return None

    # Original features of the level and position of the component inside the level:
    features = list(level_df.columns)
    component = component - first

    # Weight of each feature on the component:
    weights = list(level_pca.components_[component])

    # Feature positions, from the highest to the lowest weight:
    descending = np.argsort(weights)[::-1]

    # Highest weights first for the positive side, lowest weights first for the negative side:
    positive_side = descending[:top]
    negative_side = descending[-top:][::-1]

    # Printing the report:
    print('\nComponent {}'.format(component))
    print('------------')

    print('\n* Top {} Positive Weights:'.format(top))
    print('--------------------------')
    for pos in positive_side:
        print('{}: {:.3f}'.format(features[pos], weights[pos]))

    print('\n* Top {} Negative Weights:'.format(top))
    print('--------------------------')
    for pos in negative_side:
        print('{}: {:.3f}'.format(features[pos], weights[pos]))

In [None]:
# Testing analyze_component:
analyze_component(0, top = 5)

Unfortunately, there was no explanation provided for the feature `CJT_TYP_6` (it's only known to be related to *customer journey typology*). However, as an example, the **Component 0** related to the **person** information level will be interpreted without this specific feature.

`ALTERSKATEGORIE_GROB` relates to age classification through prename analysis, where higher values represent higher ages, indicating that this component represents older people. This is reinforced by the strongest negative weight, which belongs to the `YOUTH_DECADE` feature.

`YOUTH_DECADE` indicates the decade in which the person lived his/her youth. In other words, the earlier the decade, the older the person is. Basically, these two variables represent the same information, but their values run in opposite directions, confirming that this component represents older people.

It can also be seen that `SEMIO_ERL` and `SEMIO_LUST` are important features that positively represent this component. `SEMIO_ERL` describes whether the person is event-oriented, while `SEMIO_LUST` indicates whether the person is sensual-minded. Higher values indicate lower affinity with that specific characteristic (*1 - highest affinity*, *7 - lowest affinity*). This suggests that the component represents older people who are *neither* event-oriented *nor* sensual-minded.

On the other hand, `SEMIO_TRADV` and `SEMIO_REL` represent important aspects in the opposite direction, indicating that the component represents people who are traditional-minded and religious. At the same time, they have a rational mind (`SEMIO_RAT`).

`RETOURTYP_BK_S_5.0` indicates that the return type of these people is classified as *determined minimal-returner*, and `FINANZ_ANLEGER` indicates a high correlation with the *investor* financial typology, which may hint that these people are also associated with higher incomes.

#### 1.5.5 Defining the Number of Clusters<a name="nclusters"></a>

Before classifying the data in different clusters, it's necessary to find the optimal number of clusters. For that, the *Elbow Method* will be used:

In [None]:
# Defining function to create Elbow Method Visualization
def apply_elbow_method(df = gen_azdias, n_cluster = range(2, 32, 2), random_state = 501):
    '''
    It shows a visualization with the sum of squared distances of samples to their closest cluster
    center for each attempt of number of clusters.

    Inputs:
    df: dataframe on which cluster analysis will be performed. The default value is taken when
        this function is defined, so gen_azdias must exist before this cell is executed;
    n_cluster: numbers of clusters to be tested (default: 2, 4, ..., 30);
    random_state: seed used by K-Means to start the cluster centers. A fixed value makes the
                  visualization the same on every execution; None gives a different start each time.

    Output:
    None. The visualization is displayed.
    '''
    # Defining number of clusters attempts:
    n_cluster = list(n_cluster)

    # Creating inertia list:
    wcss = list()

    # Perform K-Means for each attempt and extract its 'inertia' (sum of squared distances to their closest cluster):
    for n in n_cluster:
        kmeans = KMeans(n_clusters = n, random_state = random_state)
        kmeans.fit(df)
        wcss.append(kmeans.inertia_)

    # Creating the visualization:
    fig, ax = plt.subplots(nrows = 1, ncols = 1, figsize = (13, 5))

    # Drawing lineplot on the created axes:
    # ('markers = True' was removed: without a 'style' variable it has no effect on the plot.)
    sns.lineplot(x = n_cluster, y = wcss, color = 'black', ax = ax)
    ax.set(xlabel = 'Number of Clusters',
           ylabel = 'Sum of Squared Distances',
           title = "Elbow Method - Optimizing Number of Clusters Choice")
    sns.despine()

    # plt.show() renders the figure with the notebook inline backend; fig.show() requires a GUI backend.
    plt.show()

In [None]:
# Checking Elbow visualization (called with its defaults, so KMeans is fitted on `gen_azdias`
# once per attempted number of clusters):
apply_elbow_method()

Unfortunately, there's no clear elbow indicating the best number of clusters to choose. However, it's possible to see that the most pronounced change in slope happens between 15 and 20 clusters.

With that, the clustering process will be built considering **18 clusters**.

#### 1.5.6 Applying Transformations on Customer Data<a name="transcustomer"></a>

Before proceeding to the clustering process, data transformation will be applied to the customers data:

In [None]:
# CAMEO_DEU feature engineering:
# NOTE(review): `customers` is overwritten with the transformed version, so re-running this cell
# feeds already-transformed data back into transform_cameo_deu - confirm that this is harmless.
customers = transform_cameo_deu(customers)

In [None]:
# Applying data transformation on the customers data, one information level per cell.
# 'person' info level (presumably `pers_cols` and `pers_scaler` were produced while processing
# the general population - TODO confirm):
customers_pers = transform_data(customers, info_level = 'person', cols = pers_cols, scaler = pers_scaler)

In [None]:
# Applying data transformation on the customers data - 'household' info level:
customers_hh = transform_data(customers, info_level = 'household', cols = hh_cols, scaler = hh_scaler)

In [None]:
# Applying data transformation on the customers data - 'microcell' info level:
customers_mic = transform_data(customers, info_level = 'microcell', cols = mic_cols, scaler = mic_scaler)

In [None]:
# Applying data transformation on the customers data - 'macrocell' info level:
customers_mac = transform_data(customers, info_level = 'macrocell', cols = mac_cols, scaler = mac_scaler)

In [None]:
# Applying data transformation on the customers data - 'community' info level
# (this level is not passed through PCA below; it is concatenated as-is later on):
customers_com = transform_data(customers, info_level = 'community', cols = com_cols, scaler = com_scaler)

In [None]:
# PCA transformation:
# Each information level is projected with its own PCA object; only `.transform` is called here,
# so the customers data is never used to fit the components.
# NOTE(review): presumably pca_pers / pca_hh / pca_mic / pca_mac were fitted on the general
# population data - confirm against the cells where they are created.

#_______________________________________________________
# PERSON:

# Applying transformation on customers dataset:
customers_pers_pca = pca_pers.transform(customers_pers)

#_______________________________________________________
# HOUSEHOLD:

# Applying transformation on customers dataset:
customers_hh_pca = pca_hh.transform(customers_hh)

#_______________________________________________________
# MICROCELL:

# Applying transformation on customers dataset:
customers_mic_pca = pca_mic.transform(customers_mic)

#_______________________________________________________
# MACROCELL:

# Applying transformation on customers dataset:
customers_mac_pca = pca_mac.transform(customers_mac)

In [None]:
# Concatenating pca dataframes together, also including the 3 features related to the community information level.
# The PCA outputs are wrapped in new DataFrames, which get a fresh 0..n-1 index, while `customers_com`
# keeps whatever index transform_data returned.
# NOTE(review): axis = 1 concatenation aligns on the index, so this assumes `customers_com` also has
# a 0..n-1 index in the same row order - confirm, otherwise rows would be misaligned / filled with NaN.
gen_customers = pd.concat([pd.DataFrame(customers_pers_pca, columns = person_cols),
                           pd.DataFrame(customers_hh_pca, columns = household_cols),
                           pd.DataFrame(customers_mic_pca, columns = microcell_cols),
                           pd.DataFrame(customers_mac_pca, columns = macrocell_cols),
                           customers_com], \
                          axis = 1)

gen_customers.head()

#### 1.5.7 Clustering<a name="clustering"></a>

In [None]:
# Training KMeans algorithm and making cluster predictions.
# The model (18 clusters, fixed seed) is instantiated and fitted on the general population data
# in one step; `fit` returns the fitted estimator itself:
kmeans = KMeans(n_clusters = 18, random_state = 101).fit(gen_azdias)

# Assigning each general population observation to its closest cluster center:
azdias_clusters = kmeans.predict(gen_azdias)

# Assigning each customer observation to its closest cluster center, using the same fitted model:
customers_clusters = kmeans.predict(gen_customers)

#### 1.5.8 Evaluating Clusters<a name="clustereval"></a>

Once each observation has been assigned to its corresponding cluster, the job now is to check which clusters are proportionally more represented among customers than in the general population, and, conversely, which clusters occur more frequently in the general population than in the customers' group.

With that, it will be possible to understand the different combinations of features that result in a person being more likely to become a client or the other way around.

In [None]:
# Defining a function to create a comparison barplot:
def compare_cluster_occurance(customers = customers_clusters, general = azdias_clusters):
    '''
    It creates a bar plot comparing the percentage of observations assigned to each one of the clusters,
    both for customers and the general population, and prints the 5 clusters that are most over- and
    underrepresented among customers.

    Inputs:
    customers: array containing the assigned cluster for each customer observation;
    general: array containing the assigned cluster for each general population observation.
    '''
    # Both groups must be counted over the same set of clusters. Without `minlength`, np.bincount
    # stops at the highest label present, so a cluster missing from one group would produce
    # arrays of different lengths:
    n_clusters = int(max(np.max(customers), np.max(general))) + 1

    # Counting cluster occurrences in customer population:
    customer_cluster_occ = np.bincount(customers, minlength = n_clusters)

    # Counting cluster occurrences in general population:
    gen_pop_cluster_occ = np.bincount(general, minlength = n_clusters)

    # Creating dataframe with the share (in %) of each cluster inside its own group:
    cluster_df = pd.DataFrame({
        'general_population': gen_pop_cluster_occ / np.sum(gen_pop_cluster_occ) * 100,
        'customers': customer_cluster_occ / np.sum(customer_cluster_occ) * 100})
    cluster_df['Cluster'] = cluster_df.index.values

    # Transforming df to melted version (one row per cluster and group), as expected by seaborn's `hue`:
    melted_df = pd.melt(cluster_df, id_vars = ['Cluster'], value_vars = ['general_population', 'customers'],
                        var_name = 'Group', value_name = 'Percentage')

    palette = {'customers': 'darkcyan', 'general_population': 'springgreen'}

    fig, ax = plt.subplots(nrows = 1, ncols = 1, figsize = (16.5, 5))
    sns.barplot(x = 'Cluster', y = 'Percentage', hue = 'Group', data = melted_df, palette = palette, ax = ax)
    ax.set(title = 'Comparison Between Customers and General Population - Clusters')
    sns.despine(left = True, top = True)

    # plt.show() works with the inline backend, whereas fig.show() requires a GUI backend:
    plt.show()

    # Computing percentage differences of the customers' share relative to the general population's share:
    perc_diff = ((cluster_df.customers - cluster_df.general_population)
                 / cluster_df.general_population * 100).to_numpy()

    # Getting cluster indexes ordered from the most over- to the most underrepresented:
    ordered_idx = np.argsort(perc_diff)[::-1]

    # Printing top 5 customers' clusters with highest representativity in comparison to general population:
    print('Top 5 Overrepresented Clusters among Customers:')
    print('-----------------------------------------------\n')
    for i in ordered_idx[:5]:
        print('Cluster {}: increased {:.1f} %'.format(i, perc_diff[i]))

    # Printing top 5 customers' clusters with lowest representativity in comparison to general population:
    print('\nTop 5 Underrepresented Clusters among Customers:')
    print('-----------------------------------------------\n')
    for i in ordered_idx[-5:][::-1]:
        print('Cluster {}: decreased {:.1f} %'.format(i, perc_diff[i]))

In [None]:
# Comparing cluster shares between customers and the general population (called with its
# defaults, i.e. the `customers_clusters` and `azdias_clusters` arrays):
compare_cluster_occurance()

To better understand what these clusters represent, `cluster center` will provide the most important components related to that specific cluster, and then it will be possible to go back to those specific components to understand the features that better represent them.

That way, it will be possible to develop a cluster overview.

In [None]:
# Creating a function that, given one specific cluster, returns the most important features associated with it:
def cluster_most_important_features(cluster, kmeans = kmeans, top_comp = 3):
    '''
    For the given cluster, it picks the components with the largest cluster-center coordinates and
    hands each one of them to `analyze_component`, which is responsible for reporting the features
    behind that component.

    Input:
    cluster: integer indicating the target cluster (a row index of `kmeans.cluster_centers_`);
    kmeans: trained kmeans model;
    top_comp: number of most important components to be analyzed.
    '''
    # Coordinates of the requested cluster center, one value per component:
    center = kmeans.cluster_centers_[cluster]

    # Component indexes ordered by descending coordinate, truncated to the requested amount
    # (only the largest positive side is considered):
    top_components = np.argsort(center)[::-1][:top_comp]

    # Analyzing components and extracting their most important features:
    header = 'Principal Components with Highest Impact on Cluster {}:'.format(cluster)
    print(header)
    print('--------------------------------------------------------')
    for component in top_components:
        analyze_component(component)

#### 1.5.8.1 Overrepresented Clusters<a name="over"></a>

**Cluster 2 - GREEN DREAMERS**

In [None]:
# Analyzing Cluster 2 (its 3 components with the largest cluster-center coordinates):
cluster_most_important_features(2, top_comp = 3)

`CLUSTER 2` is the one with the highest overrepresentation, being proportionally over 205% more representative in customers than in the general population.

Analyzing the most important components, that's how this cluster could be described:
* their vacation habits can be classified as *nature fans*, already emphasizing the green tendency as an important aspect present in people represented by this cluster;

* the green aspect is reinforced with the *avant-garde* aspect, relating this cluster to people with the avant-garde mindset, which also relates them to the *green avant-garde* movements;

* they don't seem to be familiar minded;

* this cluster is representing mostly men that are not single, not critical-minded but are dreamers and social-minded.

**Cluster 9 - HIGH-SOCIETY TRADITIONAL ELDERS**

In [None]:
# Analyzing Cluster 9 (its 3 components with the largest cluster-center coordinates):
cluster_most_important_features(9, top_comp = 3)

`CLUSTER 9` is also about 205% more representative in customers than in the general population.

Analyzing the most important components, that's how this cluster could be described:
* they are in advanced life stages, being represented as families or multiperson households, and also related to higher incomes;

* their social status can be classified as independents, house owners and top-earners;

* they are not dreamers, nor eventful oriented, but they can be represented as being critical and traditional-minded, as well as religious;

* the cluster describes elder people.

**Cluster 4 - EMPTY NEST, FULL WALLET**

In [None]:
# Analyzing Cluster 4 (its 3 components with the largest cluster-center coordinates):
cluster_most_important_features(4, top_comp = 3)

`CLUSTER 4` is over 100% more representative in customers than in the general population.

Analyzing the most important components, that's how this cluster could be described:
* it represents mostly average to high-income couples;

* it could also represent house owner couples and top earner-couples of higher age;

* the third component brings aspects seen before, as being  traditional-minded, and religious, but not eventful oriented;

* it's possible to say that some aspects seen before are now applied mostly to couples with high-incomes from different ages, and also mature and top earner-couples.

#### 1.5.8.2 Underrepresented Clusters<a name="under"></a>

**Cluster 0 - LESS FORTUNATE BEGINNERS**

In [None]:
# Analyzing Cluster 0 (its 3 components with the largest cluster-center coordinates):
cluster_most_important_features(0, top_comp = 3)

`CLUSTER 0` is 95% less representative in customers than in the general population.

Analyzing the most important components, that's how this cluster could be described:
* it represents people that could be classified as less affluent or poorer, mostly indicating pre-family couples and singles or young couples with children;

* their household incomes can be considered as lower or very low incomes, and their insurance typology is mostly classified as *social safety-driven*;

* in terms of health and shopping typologies, they could be described as *sanitary affine* and *demanding shoppers*;

* the microcell component indicates areas with a high share of cars built before 1994 that had two or more pre-owners, creating the picture of people with lower incomes that live in poorer areas.

**Cluster 16 - MULTI-GENERATION MONEY SAVERS**

In [None]:
# Analyzing Cluster 16:
cluster_most_important_features(16, top_comp = 3)

`CLUSTER 16` is also about 95% less representative in customers than in the general population.

Analyzing the most important components, that's how this cluster could be described:
* this cluster describes people with low-income and average earners of higher age from multiperson households, and families classified as *two-generation household*;

* there is also a strong aspect of singles with low-incomes or average-earners families, and also average earners of younger age from multiperson households. This profile has a tendency of being related to people living in the Westside;

* their financial typology tends to be classified as *low financial interest* or *money-savers*.

**Cluster 14 - SECOND-HAND CAR CELLS**

In [None]:
# Analyzing Cluster 14 (its 3 components with the largest cluster-center coordinates):
cluster_most_important_features(14, top_comp = 3)

`CLUSTER 14` is about 83% less representative in customers than in the general population.

Analyzing the most important components, that's how this cluster could be described:
* it represents people that could be classified as less affluent or poorer, mostly indicating pre-family couples and singles or young couples with children;

* the microcell component indicates areas with a high share of cars built before 1994 that had two or more pre-owners, creating the picture of people with lower incomes that live in poorer areas;

* it also describes areas with a high share of family houses, with a very low share of cars per household, and also close to city centers;

* most of these aspects were already highlighted in other clusters, but in this case, it seems to emphasize more the micro and macrocells aspects than the ones related to the person.

