In [0]:
####################################################################################################################################
####################################################################################################################################
#############################################################LIBRARIES##############################################################

In [0]:
%pip install unidecode

Python interpreter will be restarted.
Collecting unidecode
  Downloading Unidecode-1.3.2-py3-none-any.whl (235 kB)
Installing collected packages: unidecode
Successfully installed unidecode-1.3.2
Python interpreter will be restarted.


In [0]:
import pandas as pd
import numpy as np
from datetime import datetime
import time
from copy import deepcopy

import re
from unidecode import unidecode

from sklearn.preprocessing import StandardScaler

In [0]:
####################################################################################################################################
####################################################################################################################################
#######################################################FUNCTIONS AND CLASSES########################################################

In [0]:
####################################################################################################################################
#############################################################UTILS##################################################################

In [0]:
# Function that converts epoch into date:
def epoch_to_date(x):
    """
    Function that converts an epoch timestamp expressed in milliseconds into a datetime.

    :param x: epoch in milliseconds (missing values are expected as NaN).
    :type x: float or integer.

    :return: naive datetime in the local timezone, truncated to whole seconds; NaN when the input is NaN.
    :rtype: datetime (or float NaN).
    """
    if np.isnan(x):
        # 'np.nan' is used since the 'np.NaN' alias was removed in NumPy 2.0:
        return np.nan

    # Building the datetime straight from the local time fields (year, month, day, hour, minute, second) gives the
    # same result as formatting and re-parsing a string, without depending on locale-specific month abbreviations:
    return datetime(*time.localtime(x/1000)[:6])

In [0]:
# Function that splits data into train and test set:
def train_test_split(dataframe, preserve_date=False, date_var='date', test_ratio=0.5, shuffle=False, seed=None):
    """
    Function that splits data into train and test set.

    The input dataframe is not modified: the split is performed over a copy whose index is reset.

    :param dataframe: complete set of data.
    :type dataframe: dataframe.
    :param preserve_date: indicates whether to perform split based on volume of data, but not mingling instances
    from the same date.
    :type preserve_date: boolean.
    :param date_var: name of the date variable to consider during the split (only used when 'preserve_date' is True).
    Its values are expected to expose a '.date()' method.
    :type date_var: string.
    :param seed: seed for shuffle.
    :type seed: integer.
    :param test_ratio: proportion of data to be allocated into test set.
    :type test_ratio: float.
    :param shuffle: indicates whether to shuffle data previously to the split.
    :type shuffle: boolean.

    :return: training and test dataframes.
    :rtype: tuple.
    """
    df = dataframe.copy()
    df.reset_index(drop=True, inplace=True)

    if shuffle:
        df = df.sample(len(df), random_state=seed)

    if preserve_date:
        # Calendar day of each instance (computed once, from the column named by 'date_var'):
        days = df[date_var].apply(lambda x: x.date())

        # Number of instances by date, in chronological order:
        freq_by_day = days.value_counts().sort_index()

        # Accumulated share of instances by date:
        acum_share = freq_by_day.cumsum()/freq_by_day.sum()

        # Date gathering (approximately) 1 - test_ratio of data. A plain array is used so that the result of argmin
        # is always a position:
        closest = int(np.argmin(np.abs(acum_share.to_numpy() - (1 - test_ratio))))
        last_date_train = acum_share.index[closest]

        # Train-test split:
        in_train = days <= last_date_train
        df_train = df[in_train]
        df_test = df[~in_train]

    else:
        # Number of leading rows allocated to training data. The remaining rows go to the test set, so that the
        # test set holds len(df) - n_train rows (the former 'i > n_train' rule left one extra row in training):
        n_train = int(len(df)*(1 - test_ratio))

        # Train-test split:
        df_train = df.iloc[:n_train, :]
        df_test = df.iloc[n_train:, :]

    return (df_train, df_test)

In [0]:
# Function that produces a dataframe with frequency of features by class and returns lists with features names by class:
def classify_variables(dataframe, vars_to_drop=None, drop_excessive_miss=True, excessive_miss=0.95,
                       drop_no_var=True, validation_data=None, test_data=None):
    """
    Function that produces a dataframe with frequency of features by class and returns lists with features names by class.

    Attention: features selected for removal are dropped in place from 'dataframe' and, when provided, from
    'validation_data' and 'test_data'.

    :param dataframe: reference data.
    :type dataframe: dataframe.

    :param vars_to_drop: list of support columns, which are neither classified nor dropped (default: no support columns).
    :type vars_to_drop: list.

    :param drop_excessive_miss: flag indicating whether columns with excessive missings should be dropped out.
    :type drop_excessive_miss: boolean.

    :param excessive_miss: share of missings above which columns are dropped from the dataframes.
    :type excessive_miss: float.

    :param drop_no_var: flag indicating whether columns with no variance should be dropped out.
    :type drop_no_var: boolean.

    :param validation_data: additional data.
    :type validation_data: dataframe.

    :param test_data: additional data.
    :type test_data: dataframe.

    :return: dataframe and lists with features by class. The keys 'excessive_miss_train' and 'no_variance' are only
    present when the respective flags are True. Note that 'binary_vars' and 'cont_vars' are built before the
    no-variance selection, so they may still name features listed under 'no_variance'.
    :rtype: dictionary.
    """
    # Avoiding a mutable default argument:
    vars_to_drop = [] if vars_to_drop is None else vars_to_drop

    # Additional datasets that should follow the selection made over the reference data:
    additional_data = [d for d in (validation_data, test_data) if d is not None]

    print(f'Initial number of features: {dataframe.drop(vars_to_drop, axis=1).shape[1]}.')

    if drop_excessive_miss:
        # Dropping features whose share of missings in the training data exceeds 'excessive_miss' (the mean of the
        # missing flags is the share of missings, and does not fail for an empty dataframe):
        excessive_miss_train = [c for c in dataframe.drop(vars_to_drop, axis=1) if
                                dataframe[c].isnull().mean() > excessive_miss]

        if len(excessive_miss_train) > 0:
            dataframe.drop(excessive_miss_train, axis=1, inplace=True)

            # Additional data may lack some of these columns, then they are simply ignored:
            for extra in additional_data:
                extra.drop(excessive_miss_train, axis=1, inplace=True, errors='ignore')

        print(f'{len(excessive_miss_train)} features were dropped for excessive number of missings!')

    # Data type of each variable:
    type_vars = dict(dataframe.drop(vars_to_drop, axis=1).dtypes.items())

    # Classifying features:
    cat_vars = []
    binary_vars = []
    cont_vars = []

    # Loop over variables:
    for v, v_type in type_vars.items():
        # Categorical features:
        if v_type == object:
            cat_vars.append(v)

        # Binary variables (the unique values are only sorted when there are exactly two of them):
        elif dataframe[v].nunique() == 2 and sorted(dataframe[v].unique()) == [0, 1]:
            binary_vars.append(v)

        # Continuous variables:
        else:
            cont_vars.append(v)

    if drop_no_var:
        # Dropping features with no variance in the training data:
        no_variance = [c for c in dataframe.drop(vars_to_drop, axis=1).drop(cat_vars, axis=1)
                       if np.nanvar(dataframe[c]) == 0]

        if len(no_variance) > 0:
            dataframe.drop(no_variance, axis=1, inplace=True)

            for extra in additional_data:
                extra.drop(no_variance, axis=1, inplace=True, errors='ignore')

        print(f'{len(no_variance)} features were dropped for having no variance!')

    print(f'{dataframe.drop(vars_to_drop, axis=1).shape[1]} remaining features.')
    print('\n')

    # Dataframe presenting the frequency of features by class:
    feats_assess = pd.DataFrame(data={
        'class': ['cat_vars', 'binary_vars', 'cont_vars', 'vars_to_drop'],
        'frequency': [len(cat_vars), len(binary_vars), len(cont_vars), len(vars_to_drop)]
    })
    feats_assess.sort_values('frequency', ascending=False, inplace=True)

    # Dictionary with outputs from the function:
    feats_assess_dict = {
        'feats_assess': feats_assess,
        'cat_vars': cat_vars,
        'binary_vars': binary_vars,
        'cont_vars': cont_vars
    }

    if drop_excessive_miss:
        feats_assess_dict['excessive_miss_train'] = excessive_miss_train

    if drop_no_var:
        feats_assess_dict['no_variance'] = no_variance

    return feats_assess_dict

In [0]:
# Function that produces an assessment of the occurrence of missing values:
def assessing_missings(dataframe):
    """
    Function that summarizes how many missing values each feature has, printing an overall report.

    :param dataframe: reference data.
    :type dataframe: dataframe.

    :return: dataframe with one row per feature (sorted by descending number of missings) holding the frequency
    and the share of missings.
    :rtype: dataframe.
    """
    num_obs = len(dataframe)

    # Number of missings by feature, from the most to the least affected feature:
    missings_by_feature = dataframe.isnull().sum().sort_values(ascending=False).to_dict()
    features = list(missings_by_feature.keys())
    counts = list(missings_by_feature.values())

    missings_df = pd.DataFrame(data={
        'feature': features,
        'missings': counts,
        'share': [count/num_obs for count in counts]
    })

    # Figures for the printed report:
    num_feats = len(missings_df)
    num_feats_miss = sum(missings_df.missings > 0)
    share_feats_miss = round((num_feats_miss/num_feats)*100, 2)
    avg_miss = int(missings_df.missings.mean())
    share_avg_miss = round((avg_miss/num_obs)*100, 2)

    print(f'\033[1mNumber of features with missings:\033[0m {num_feats_miss}'
          f' out of {num_feats} features'
          f' ({share_feats_miss}%).')
    print(f'\033[1mAverage number of missings:\033[0m {avg_miss}'
          f' out of {num_obs} observations'
          f' ({share_avg_miss}%).')

    return missings_df

In [0]:
# Function that assesses the number of missings in a dataframe:
def missings_detection(dataframe, name='df', var=None):
    """
    Function that reports the number of missings found either in a single variable or in a whole dataframe.

    :param dataframe: dataframe for which missings should be detected.
    :type dataframe: dataframe.

    :param name: name of dataframe (only used in the printed message).
    :type name: string.

    :param var: name of variable whose missings should be detected (optional). When not provided, missings are
    counted over the entire dataframe.
    :type var: string.

    :return: nothing is returned; a message is printed only when there is a positive amount of missings.
    """
    # Choosing the scope of the count and the corresponding message:
    if var:
        num_miss = dataframe[var].isnull().sum()
        message = f'Problem - There are {num_miss} missings for "{var}" in dataframe {name}.'
    else:
        num_miss = dataframe.isnull().sum().sum()
        message = f'Problem - Number of overall missings detected in dataframe {name}: {num_miss}.'

    # Silent when no missing is found:
    if num_miss > 0:
        print(message)

In [0]:
# Function that forces consistency between reference (training) and additional (validation, test) data:
def data_consistency(dataframe, *args, **kwargs):
    """
    Function that makes additional (validation, test) data follow the columns of the reference (training) data.

    Each additional dataframe is copied; columns of the reference data that it lacks are created with zeros, and
    only the columns of the reference data are kept, in the same order.

    The keyword arguments are expected to be dataframes whose argument names indicate the nature of the passed data. For instance,
    'test_data=df_test' would be a dataframe with test instances. Positional extra arguments are not used.

    :param dataframe: reference data.
    :type dataframe: dataframe.

    :return: dataframes with consistent data, under the same names given by the keyword arguments.
    :rtype: dictionary.
    """
    reference_cols = dataframe.columns
    consistent_data = {}

    for label, additional in kwargs.items():
        aligned = additional.copy()

        # Creating the columns that exist only in the reference data:
        for col in reference_cols:
            if col not in additional.columns:
                aligned[col] = 0

        # Keeping only the reference columns, in the reference order:
        aligned = aligned[reference_cols]
        consistent_data[label] = aligned

        # Checking consistency:
        mismatches = [ref for ref, new in zip(reference_cols, aligned.columns) if ref != new]
        if mismatches:
            print('Problem - Reference and additional datasets are inconsistent!')
        else:
            print(f'Training and {label.replace("_", " ")} are consistent with each other.')

    return consistent_data

In [0]:
# Function for cleaning texts:
def text_clean(text, lower=True):
    """
    Function that normalizes a text: accents are removed, repeated spaces are collapsed, leading and trailing spaces
    are stripped, remaining spaces become underscores, and a set of signs is deleted.

    :param text: text to be cleaned (any non-missing value is converted into string).
    :type text: string.

    :param lower: indicates whether the cleaned text should be set to lower case.
    :type lower: boolean.

    :return: cleaned text; missing values are returned unchanged.
    :rtype: string.
    """
    # Missing values are left as they are:
    if pd.isnull(text):
        return text

    # Removing accent:
    text_cleaned = unidecode(str(text))

    # Removing extra spaces, then spaces before and after text, then replacing the remaining spaces:
    text_cleaned = re.sub(' +', ' ', text_cleaned).strip().replace(' ', '_')

    # Deleting signs:
    signs = '.,;+-!@#$%¨&*()[]{}\\/|'
    text_cleaned = text_cleaned.translate(str.maketrans('', '', signs))

    # Setting text to lower case:
    return text_cleaned.lower() if lower else text_cleaned

In [0]:
####################################################################################################################################
######################################################TRANSFORMATIONS###############################################################

In [0]:
class log_transformation(object):
    """Log-transforms every column of a dataframe, except for those explicitly declared in 'not_log'.
    After calling 'transform', the attribute 'log_transformed' holds the dataframe with the selected
    variables log-transformed and their names changed to 'L#PREVIOUS_NAME'.

    Attention: 'log_transformed' is the very same object passed to 'transform', i.e., the input
    dataframe is modified in place."""

    def __init__(self, not_log):
        # Names of the columns that should be kept as they are:
        self.not_log = not_log

    def transform(self, data):
        # Function that applies natural logarithm to a single numerical value:
        def log_func(x):
            """Negative values are not expected for the numerical features treated here (the few cases found in a
            sample assessment suggest technical issues), so they are truncated to zero before the logarithm is
            taken. A small constant is added so that zeros can be transformed."""
            floored = 0 if x < 0 else x
            return np.log(floored + 0.0001)

        original_cols = list(data.columns)

        # Columns selected for the transformation and their new names:
        to_log = [col for col in original_cols if col not in self.not_log]
        log_vars = ['L#' + col for col in to_log]

        # The transformation takes place over the input dataframe itself:
        self.log_transformed = data

        # Applying logarithmic transformation to selected variables:
        for col in to_log:
            self.log_transformed[col] = data[col].apply(log_func)

        # Redefining names of columns:
        self.log_transformed.columns = [col if col in self.not_log else 'L#' + col for col in original_cols]

        print('\033[1mNumber of numerical variables log-transformed:\033[0m ' + str(len(log_vars)) + '.')

In [0]:
class standard_scale(object):
    """Standardizes all variables in a dataframe, except for those explicitly defined to not scale
    ('not_stand'). Relies on 'StandardScaler' from sklearn, fitted over training data only, and stores the
    outcomes as dataframes: 'train_scaled' and, if test data is provided, 'test_scaled' (whose values are
    standardized using means and variances from training data). In both outcomes, the non-scaled columns
    come first, followed by the scaled ones."""

    def __init__(self, not_stand):
        # Names of the columns that should not be standardized:
        self.not_stand = not_stand

    def _to_dataframe(self, original, scaled_values, scaled_columns):
        # Puts standardized values back into a dataframe with the original index and concatenates
        # non-selected and selected variables:
        scaled = pd.DataFrame(data=scaled_values, columns=scaled_columns)
        scaled.index = original.index
        return pd.concat([original[self.not_stand], scaled], axis=1)

    def scale(self, train, test=None):
        # Variables selected for standardization (training data):
        train_selected = train.drop(self.not_stand, axis=1)

        # Creating standardizing object and calculating means and variances:
        scaler = StandardScaler()
        scaler.fit(train_selected)

        # Standardizing selected variables:
        self.train_scaled = self._to_dataframe(train, scaler.transform(train_selected), train_selected.columns)

        # Test data:
        if test is not None:
            test_selected = test.drop(self.not_stand, axis=1)
            self.test_scaled = self._to_dataframe(test, scaler.transform(test_selected), test_selected.columns)

In [0]:
class one_hot_encoding(object):
    """
    One-hot encoding of categorical features with selection of dummy variables based on their variance.

    Arguments for initialization:
        'categorical_features': list of categorical features whose categories should be selected.
        'variance_param': a dummy variable is only kept when its variance in the training data is greater than this value.
    Methods:
        'create_dummies': for a given training data ('categorical_train'), creates dummy variables named
        'C#FEATURE#CATEGORY' (category in upper case) and selects them based on the variance criterion. Then, if
        test data ('categorical_test') is provided, creates the same set of dummy variables for it, filling with
        zeros those dummies whose categories do not occur in test data.
    Output objects:
        'self.categorical_features': list of categorical features whose categories should be selected.
        'self.variance_param': parameter for selection based on the variance of a given dummy variable.
        'self.dummies_train': dataframe with selected dummies for training data.
        'self.dummies_test': dataframe for test data with dummies selected from training data (empty when no test
        data is provided).
        'self.categories_assessment': dictionary with number of overall categories, number of selected categories, and selected
        categories for each categorical feature.
    """
    def __init__(self, categorical_features,  variance_param = 0.01):
        self.categorical_features = categorical_features
        self.variance_param = variance_param

    @staticmethod
    def _named_dummies(values, feature):
        # Creates dummy variables for a single feature and names them as 'C#FEATURE#CATEGORY':
        dummies = pd.get_dummies(values)
        dummies.columns = ['C#' + feature + '#' + str(category).upper() for category in dummies.columns]
        return dummies

    def create_dummies(self, categorical_train, categorical_test = None):
        self.dummies_train = pd.DataFrame(data=[])
        self.dummies_test = pd.DataFrame(data=[])
        self.categories_assessment = {}

        # Loop over categorical features:
        for feature in self.categorical_features:
            # Training data:
            train_dummies = self._named_dummies(categorical_train[feature], feature)

            # Selecting dummies depending on their variance:
            kept = [name for name in train_dummies.columns if train_dummies[name].var() > self.variance_param]

            # Dataframe with dummy variables for all categorical features (training data):
            self.dummies_train = pd.concat([self.dummies_train, train_dummies[kept]], axis=1)

            # Assessing categories:
            self.categories_assessment[feature] = {
                "num_categories": len(train_dummies.columns),
                "num_selected_categories": len(kept),
                "selected_categories": kept
            }

            if categorical_test is None:
                continue

            # Test data:
            test_dummies = self._named_dummies(categorical_test[feature], feature)

            # Categories selected from training data that do not exist for test data become columns of zeros:
            for name in kept:
                if name not in test_dummies.columns:
                    test_dummies[name] = [0]*len(test_dummies)

            # Dataframe with dummy variables for all categorical features (test data):
            self.dummies_test = pd.concat([self.dummies_test, test_dummies[kept]], axis=1)

            # Preserving columns order as the same for training data:
            self.dummies_test = self.dummies_test[list(self.dummies_train.columns)]

In [0]:
# Function that recreates original missing values from dummy variable of missing value status:
def recreate_missings(var, missing_var):
    """
    Function that puts missing values back into a variable wherever its missing value status is equal to 1.

    Arguments:
        'var': variable (series, array, or list) whose missing values should be recreated.
        'missing_var': variable (series, array, or list) that indicates missing data (1 for missing).
        Attention: both arguments should have the same length and should have the same index.
    Outputs:
        A list with missing values recreated (if any exists in 'missing_var').
    """
    var_list = list(var)
    missing_var_list = list(missing_var)

    # 'np.nan' is used since the 'np.NaN' alias was removed in NumPy 2.0. Indexing 'missing_var_list' by position
    # keeps the IndexError raised when it is shorter than 'var':
    return [np.nan if missing_var_list[i] == 1 else value for i, value in enumerate(var_list)]

In [0]:
# Function that treats missing values by imputing 0 whenever they are found:
def impute_missing(var):
    """
    Function that replaces missing values by 0 and keeps track of where they were found.

    Arguments:
        'var': variable (series, array, or list) of numerical values whose missing values should be replaced by 0.
    Outputs:
        A dictionary containing a list of values for the variable after missing values treatment ('var'), and a
        list of missing value status ('missing_var', 1 for missing and 0 otherwise).
    """
    values = list(var)

    # Missing value status of each observation:
    missing_var = [1 if np.isnan(value) else 0 for value in values]

    # Values after treatment:
    new_values = [0 if status == 1 else value for value, status in zip(values, missing_var)]

    return {'var': new_values, 'missing_var': missing_var}

In [0]:
# Function that applies log-transformation:
def applying_log_transf(dataframe, not_log):
    """
    Function that applies log-transformation based upon the 'log_transformation' class.

    The transformation is performed over a copy, so the input dataframe is left untouched. Messages are printed if
    the number of missings or the dimensions of the data change with the transformation.

    :param dataframe: reference data.
    :type dataframe: dataframe.

    :param not_log: list with features names that should not be log-transformed.
    :type not_log: list.

    :return: dataframe with numerical variables log-transformed.
    :rtype: dataframe.
    """
    log_dataframe = dataframe.copy()

    # Assessing missing values (before logarithmic transformation):
    num_miss = log_dataframe.isnull().sum().sum()

    # Applying the log-transformation:
    log_transf = log_transformation(not_log=not_log)
    log_transf.transform(log_dataframe)
    log_dataframe = log_transf.log_transformed

    # Assessing missing values (after logarithmic transformation):
    num_miss_log = log_dataframe.isnull().sum().sum()

    # Checking consistency in the number of missings:
    if num_miss_log != num_miss:
        print('Problem - Inconsistent number of overall missings!')

    # Assessing consistency of dimensions:
    if not log_dataframe.shape == dataframe.shape:
        print('Problem - Inconsistent dimensions!')
        # The input data gives the shape before the transformation, and the transformed data the shape after it:
        print(f'Shape before log-transformation: {dataframe.shape}.\n' +
              f'Shape after log-transformation: {log_dataframe.shape}.')

    return log_dataframe

In [0]:
# Function that applies the standard scaling transformation:
def applying_standard_scale(training_data, not_stand, *args, **kwargs):
    """
    Function that applies the standard scaling transformation based upon the 'standard_scale' class.

    The keyword arguments are expected to be dataframes whose argument names indicate the nature of the passed data. For instance,
    'test_data=df_test' would be a dataframe with test instances. Additional data is always standardized using
    statistics from training data. Messages are printed if dimensions or number of missings change with the scaling.

    :param training_data: reference data.
    :type training_data: dataframe.

    :param not_stand: list with features names that should not be standardized.
    :type not_stand: list.

    :return: dataframes with numerical variables standardized, under the key 'training_data' and under the names
    given by the keyword arguments.
    :rtype: dictionary.
    """
    scaled_data = {}

    print('\033[1mStandard scaling training data...\033[0m')

    stand_scale = standard_scale(not_stand = not_stand)
    stand_scale.scale(train = training_data, test = None)

    scaled_data['training_data'] = stand_scale.train_scaled

    # Assessing consistency of dimensions:
    if not scaled_data['training_data'].shape == training_data.shape:
        print('Problem - Inconsistent dimensions!')
        print(f'Shape before scaling: {training_data.shape}.\nShape after scaling: {scaled_data["training_data"].shape}.')

    # Assessing consistency of missing values:
    num_miss = training_data.isnull().sum().sum()
    num_miss_scaled = scaled_data['training_data'].isnull().sum().sum()

    if num_miss_scaled != num_miss:
        print('Problem - Inconsistent number of overall missings!')

    # Loop over additional data:
    for d in kwargs.keys():
        print(f'\033[1mStandard scaling {d.replace("_", " ")}...\033[0m')
        stand_scale = standard_scale(not_stand = not_stand)
        stand_scale.scale(train = training_data, test = kwargs[d])

        scaled_data[d] = stand_scale.test_scaled

        # Assessing consistency of dimensions:
        if not scaled_data[d].shape == kwargs[d].shape:
            print('Problem - Inconsistent dimensions!')
            # The shape before scaling is the one from the additional data itself, not from the training data:
            print(f'Shape before scaling: {kwargs[d].shape}.\nShape after scaling: {scaled_data[d].shape}.')

        # Assessing consistency of missing values:
        num_miss = kwargs[d].isnull().sum().sum()
        num_miss_scaled = scaled_data[d].isnull().sum().sum()

        if num_miss != num_miss_scaled:
            print('Problem - Inconsistent number of overall missings!')

    return scaled_data

In [0]:
# Function that treats missing values:
def treating_missings(dataframe, cat_vars, drop_vars):
    """
    Function that treats missing values both from categorical and numerical features. Missings from categorical
    features are replaced by the category 'missing_value'. Remaining features (neither categorical nor support
    variables) with missings have them treated using the function 'impute_missing', and a new column named 'NA#'
    plus the name of the feature (without any 'L#') receives the missing value status.

    The treatment is performed over a copy of the data. Messages are printed if the number of treated missings differs
    from the initial number of missings, or if any missing remains after the treatment.

    :param dataframe: reference data.
    :type dataframe: dataframe.

    :param cat_vars: list with categorical variables.
    :type cat_vars: list.

    :param drop_vars: list with support variables.
    :type drop_vars: list.

    :return: dataframe with treated missing values.
    :rtype: dataframe.
    """
    treated_dataframe = dataframe.copy()

    # Overall number of missings before any treatment:
    num_miss = treated_dataframe.isnull().sum().sum()

    # Loop over categorical features:
    num_miss_cat_treat = 0
    for cat in cat_vars:
        treated_dataframe[cat] = ['missing_value' if pd.isnull(value) else value
                                  for value in treated_dataframe[cat]]
        num_miss_cat_treat += sum(1 for value in treated_dataframe[cat] if value == 'missing_value')

    # Loop over non-categorical features (listed before any column of missing value status is created):
    non_cat_vars = list(treated_dataframe.drop(drop_vars, axis=1).drop(cat_vars, axis=1).columns)
    for feat in non_cat_vars:
        # Features with no missing values are left as they are:
        if not treated_dataframe[feat].isnull().sum() > 0:
            continue

        check_missing = impute_missing(treated_dataframe[feat])
        treated_dataframe[feat] = check_missing['var']
        treated_dataframe['NA#' + feat.replace('L#', '')] = check_missing['missing_var']

    # Number of treated missings (columns of missing value status plus categorical features):
    status_cols = [col for col in treated_dataframe.columns if 'NA#' in col]
    num_miss_treat = int(sum([sum(treated_dataframe[col]) for col in status_cols]))
    num_miss_treat = num_miss_treat + num_miss_cat_treat

    if num_miss_treat != num_miss:
        print('Problem - Inconsistent number of overall missings!')
        print(f'Number of missings before treatment: {num_miss}.')
        print(f'Number of missings after treatment: {num_miss_treat}.')

    if treated_dataframe.isnull().sum().sum() > 0:
        print('Problem - Number of overall missings detected: ' +
              str(treated_dataframe.isnull().sum().sum()) + '.')

    return treated_dataframe

In [0]:
# Function that applies one-hot encoding transformation over categorical variables:
def applying_one_hot(training_data, cat_vars, variance_param=0.01, *args, **kwargs):
    """
    Function that applies one-hot encoding transformation over categorical variables based upon the 'one_hot_encoding' class.

    Categories are cleaned with the function 'text_clean' before the creation of dummy variables. The cleaning takes
    place over separate copies of the categorical columns, so the passed dataframes are not modified.

    The keyword arguments are expected to be dataframes whose argument names indicate the nature of the passed data. For instance,
    'test_data=df_test' would be a dataframe with test instances.

    :param training_data: reference data.
    :type training_data: dataframe.

    :param cat_vars: list with categorical variables.
    :type cat_vars: list.

    :param variance_param: parameter for selection of dummy variables based on their variance in training data.
    :type variance_param: float.

    :return: dataframes with categorical variables transformed into dummy variables, under the key 'training_data'
    and under the names given by the keyword arguments.
    :rtype: dictionary.
    """
    # Function that cleans the texts of the categorical columns of a dataframe, returning a separate dataframe:
    def clean_categories(data):
        cleaned = data[cat_vars].copy()
        for f in cat_vars:
            cleaned[f] = cleaned[f].apply(text_clean)
        return cleaned

    # Function that gives to the dummies the same index as the data they were created from:
    def set_index(dummies, index):
        # With no dummy variable at all (e.g., empty 'cat_vars') the dummies dataframe may have no rows, and the
        # index assignment would fail for a length mismatch:
        if dummies.shape[1] == 0:
            return pd.DataFrame(index=index)
        dummies.index = index
        return dummies

    dummies_df = {}
    transf_data = {}

    # Create object for one-hot encoding:
    categorical_transf = one_hot_encoding(categorical_features = cat_vars, variance_param = variance_param)

    # Treating texts:
    train_cat = clean_categories(training_data)

    if kwargs:
        for d in kwargs:
            # Treating texts:
            additional_cat = clean_categories(kwargs[d])

            # Creating dummies:
            categorical_transf.create_dummies(categorical_train = train_cat,
                                              categorical_test = additional_cat)

            # Additional data:
            dummies_df[d] = set_index(categorical_transf.dummies_test, kwargs[d].index)

            # Concatenating dummy variables with remaining columns and dropping out original categorical features:
            transf_data[d] = pd.concat([kwargs[d], dummies_df[d]], axis=1).drop(cat_vars, axis=1)

    else:
        # Creating dummies:
        categorical_transf.create_dummies(categorical_train = train_cat)

    # Training data:
    dummies_df['training_data'] = set_index(categorical_transf.dummies_train, training_data.index)

    # Concatenating dummy variables with remaining columns and dropping out original categorical features:
    transf_data['training_data'] = pd.concat([training_data, dummies_df['training_data']],
                                             axis=1).drop(cat_vars, axis=1)

    print(f'\033[1mNumber of categorical features:\033[0m {len(cat_vars)}')
    print(f'\033[1mNumber of overall selected dummies:\033[0m {dummies_df["training_data"].shape[1]}.')

    return transf_data

In [0]:
####################################################################################################################################
########################################################PRE-PROCESS#################################################################

In [0]:
# Function that applies distinct functions and classes in order to pre-process training and test data:
def pre_process(training_data, test_data, vars_to_drop, log_transform=True, standardize=True):
    """
    Runs the whole pre-processing pipeline over a pair of training and test datasets.

    The stages are executed in this order, each one announced by a printed banner:
    classification of variables, assessment of missing values, (optional) logarithmic
    transformation of continuous variables, (optional) standard scaling of continuous
    variables, treatment of missing values, one-hot encoding of categorical variables,
    and a final check of missings and of train/test consistency.

    The inputs are copied up front, so every transformation is applied to the copies.

    :param training_data: training data.
    :type training_data: dataframe.
    :param test_data: test data.
    :type test_data: dataframe.
    :param vars_to_drop: collection of variables that should not be considered during data pre-processing.
    :type vars_to_drop: list.
    :param log_transform: indicates whether to log-transform numerical variables.
    :type log_transform: boolean.
    :param standardize: indicates whether to standard scale numerical variables.
    :type standardize: boolean.

    :return: four dataframes, in this order: training and test data as they stand right after the
        log-transformation stage (untouched copies of the inputs when log_transform is False), followed by
        the fully pre-processed training and test data.
    :rtype: tuple.
    """

    rule = '---------------------------------------------------------------------------------------------------------'

    def open_section(title):
        # Every stage starts with a horizontal rule, its title and a blank gap.
        print(rule)
        print(title)
        print('\n')

    def close_section():
        # Every stage ends with a horizontal rule followed by a blank gap.
        print(rule)
        print('\n')

    # Working copies of the inputs:
    df_train, df_test = training_data.copy(), test_data.copy()
    excluded_vars = deepcopy(vars_to_drop)

    # --- Classification of variables ---
    open_section('\033[1mCLASSIFYING FEATURES AND EARLY SELECTION\033[0m')

    var_groups = classify_variables(dataframe=df_train, vars_to_drop=excluded_vars, test_data=df_test)

    # Lists of variables (the binary ones are read but not used by the remaining stages):
    cat_vars, _binary_vars, cont_vars = (
        var_groups[key] for key in ('cat_vars', 'binary_vars', 'cont_vars')
    )

    close_section()

    # --- Assessment of missing values ---
    open_section('\033[1mASSESSING MISSING VALUES\033[0m')

    for label, frame in (('\033[1mTraining data:\033[0m', df_train),
                         ('\n\033[1mTest data:\033[0m', df_test)):
        print(label)
        assessing_missings(dataframe=frame)
    print('\n')

    close_section()

    # --- Logarithmic transformation ---
    open_section('\033[1mAPPLYING LOGARITHMIC TRANSFORMATION OVER NUMERICAL DATA\033[0m')

    # Everything that is not a continuous variable is left out of the log-transformation:
    skip_log = [col for col in df_train.columns if col not in cont_vars]

    if not log_transform:
        print('\033[1mNo transformation performed!\033[0m')

    else:
        print('\033[1mTraining data:\033[0m')
        df_train = applying_log_transf(dataframe=df_train, not_log=skip_log)

        print('\033[1mTest data:\033[0m')
        df_test = applying_log_transf(dataframe=df_test, not_log=skip_log)

    print('\n')

    close_section()

    # --- Standard scaling ---
    open_section('\033[1mAPPLYING STANDARD SCALE TRANSFORMATION OVER NUMERICAL DATA\033[0m')

    # Columns are matched against the continuous variables after removing 'L#' from their names
    # (presumably the marker added to log-transformed columns - TODO confirm against applying_log_transf):
    skip_scaling = [col for col in df_train.columns if col.replace('L#', '') not in cont_vars]

    if standardize:
        scaled = applying_standard_scale(training_data=df_train, not_stand=skip_scaling,
                                         test_data=df_test)
        df_train_scaled, df_test_scaled = scaled['training_data'], scaled['test_data']

    else:
        df_train_scaled, df_test_scaled = df_train.copy(), df_test.copy()

        print('\033[1mNo transformation performed!\033[0m')

    print('\n')
    close_section()

    # --- Treatment of missing values ---
    open_section('\033[1mTREATING MISSING VALUES\033[0m')

    print('\033[1mTreating missing values of training data...\033[0m')
    df_train_scaled = treating_missings(dataframe=df_train_scaled, cat_vars=cat_vars,
                                        drop_vars=excluded_vars)

    print('\033[1mTreating missing values of test data...\033[0m')
    df_test_scaled = treating_missings(dataframe=df_test_scaled, cat_vars=cat_vars,
                                       drop_vars=excluded_vars)

    print('\n')
    close_section()

    # --- One-hot encoding ---
    open_section('\033[1mTRANSFORMING CATEGORICAL FEATURES\033[0m')

    encoded = applying_one_hot(df_train_scaled, cat_vars, test_data=df_test_scaled)
    df_train_scaled, df_test_scaled = encoded['training_data'], encoded['test_data']

    print(f'\033[1mShape of df_train_scaled:\033[0m {df_train_scaled.shape}.')
    print(f'\033[1mShape of df_test_scaled:\033[0m {df_test_scaled.shape}.')
    print('\n')

    close_section()

    # --- Final checks ---
    open_section('\033[1mFINAL ASSESSMENT OF MISSINGS AND CHECKING DATASETS CONSISTENCY\033[0m')

    # Assessing missing values (training data, then test data):
    missings_detection(df_train_scaled, name='df_train_scaled')
    missings_detection(df_test_scaled, name='df_test_scaled')

    # Checking datasets structure (only the test data is taken from the result):
    checked = data_consistency(dataframe=df_train_scaled, test_data=df_test_scaled)
    df_test_scaled = checked['test_data']

    close_section()

    return df_train, df_test, df_train_scaled, df_test_scaled