#### 1. Import libraries

In [None]:
###Importing libraries
import math
import numpy
import pandas
import matplotlib.pyplot
import matplotlib.ticker as mtick
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

#### 2. Functions

In [None]:
def uncouping_x_y_clf(data, target_name):
    """Split a classification dataset into features and -1/+1 labels.

    The target column is removed from ``data`` IN PLACE, and that same
    (mutated) DataFrame object is returned as the feature matrix.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset containing the target column.
    target_name : str
        Name of the target column.

    Returns
    -------
    tuple
        ``(data, labels)`` where ``labels`` is a numpy array holding -1 for
        rows whose target equals the smallest unique target value (the first
        entry of ``numpy.unique``) and 1 for every other row.
    """
    targets = data[target_name].values
    distinct_targets = numpy.unique(targets)
    data.drop(columns=[target_name], inplace=True, errors='ignore')
    # numpy.unique returns sorted values, so index 0 is the smallest class.
    reference_class = distinct_targets[0]
    labels = numpy.where(targets == reference_class, -1, 1)
    return data, labels

def uncouping_x_y_reg(data, target_name):
    """Split a regression dataset into features and raw target values.

    The target column is removed from ``data`` IN PLACE, and that same
    (mutated) DataFrame object is returned as the feature matrix.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset containing the target column.
    target_name : str
        Name of the target column.

    Returns
    -------
    tuple
        ``(data, targets)`` where ``targets`` is the ``.values`` array of the
        target column, captured before the column was dropped.
    """
    targets = data[target_name].values
    data.drop(columns=[target_name], inplace=True, errors='ignore')
    return data, targets

def VIF(X_train, print_flag=False):
    """Iteratively remove the feature with the highest variance inflation factor.

    While the largest VIF is at least 10 and more than two columns remain,
    the offending column is dropped from ``X_train`` IN PLACE and the VIFs
    are recomputed.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Numeric design matrix. It is mutated: removed columns are dropped
        from this very object.
    print_flag : bool, default False
        When true, print the VIF table at every iteration.

    Returns
    -------
    list
        Names of the dropped columns, in removal order.
    """
    vif_threshold = 10

    def compute_VIF(X_train, print_flag=False):
        # One VIF per column, computed against all the other columns.
        values = X_train.values
        vif_data = pandas.DataFrame()
        vif_data["feature"] = X_train.columns
        vif_data["VIF"] = [variance_inflation_factor(values, i) for i in range(len(X_train.columns))]
        if print_flag:
            print(vif_data)
        return vif_data

    def worst_feature(vif_data):
        # Series.max()/idxmax() skip NaN, so an undefined VIF can never mask
        # a genuinely collinear feature (builtin max() is order-dependent
        # when NaN is present).
        scores = vif_data["VIF"]
        highest = scores.max()
        if pandas.isna(highest) or highest < vif_threshold:
            return None
        # idxmax returns the first occurrence of the maximum.
        return vif_data.loc[scores.idxmax(), "feature"]

    del_vars = []
    del_var = worst_feature(compute_VIF(X_train, print_flag=print_flag))
    while (del_var is not None) and (X_train.shape[1] > 2):
        del_vars.append(del_var)
        X_train.drop([del_var], axis=1, inplace=True, errors='ignore')
        del_var = worst_feature(compute_VIF(X_train, print_flag=print_flag))

    return del_vars

def reset_index_data(data_1=None, data_2=None, data_3=None, data_4=None):
    """Return copies of the given pandas objects with a fresh 0..n-1 index.

    Arguments are consumed left to right and processing stops at the first
    ``None`` among ``data_2``..``data_4``; anything after that is ignored.
    ``data_1`` is always processed.

    Returns
    -------
    A single re-indexed object when only ``data_1`` is used, otherwise a
    tuple of re-indexed objects in argument order.
    """
    selected = [data_1]
    for candidate in (data_2, data_3, data_4):
        if candidate is None:
            break
        selected.append(candidate)

    reindexed = tuple(item.reset_index(drop=True) for item in selected)
    if len(reindexed) == 1:
        return reindexed[0]
    return reindexed

In [None]:
def num_and_cat_features(dataset, print_var = False):
    """Partition the columns of ``dataset`` into numeric and categorical names.

    A column is categorical when its dtype is ``object``, a pandas
    categorical dtype, or a pandas string extension dtype (``'string'`` /
    ``str``). Every other column is reported as numeric.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Data whose columns are classified.
    print_var : bool, default False
        When ``True``, print both lists.

    Returns
    -------
    tuple
        ``(numeric_types_list, category_types_list)``: a pandas Series of
        the numeric column names and a list of the categorical column names,
        both in the original column order.
    """
    # String extension dtypes are neither 'object' nor 'category'; without
    # this check text columns would be reported as numeric.
    text_like_dtypes = (pandas.CategoricalDtype, pandas.StringDtype)
    category_types_list = [
        colName
        for colName, dtype in zip(dataset.columns.values.tolist(), dataset.dtypes)
        if dtype == 'object' or isinstance(dtype, text_like_dtypes)
    ]

    numeric_types_list = pandas.Series(dataset.columns.drop(category_types_list, errors='ignore'))
    if print_var == True:
        print('Numeric variables: \n', [i for i in numeric_types_list])
        print('Category variables: \n', category_types_list)
    return numeric_types_list, category_types_list

def category_encoding(xtrain=None, ytrain=None, data_1=None, data_2=None, data_3=None):
    """One-hot encode the categorical columns of the training set and of up to
    three further datasets.

    The encoder is fitted on ``xtrain`` only (``drop='first'``, unknown
    categories ignored) and then applied to every supplied dataset. Numeric
    columns are kept as they are and the encoded columns are appended after
    them. When ``xtrain`` has no categorical column the inputs are returned
    unchanged (the very same objects).

    Parameters
    ----------
    xtrain : pandas.DataFrame
        Training features; determines which columns are categorical.
    ytrain : any
        Unused; kept for interface compatibility.
    data_1, data_2, data_3 : pandas.DataFrame or None
        Optional extra datasets. They are consumed in order and processing
        stops at the first ``None``; later arguments are ignored.

    Returns
    -------
    The encoded ``xtrain`` alone when ``data_1`` is ``None``, otherwise a
    tuple ``(xtrain, data_1[, data_2[, data_3]])`` of encoded frames.
    """
    numeric_vars, category_vars = num_and_cat_features(xtrain, print_var = False)

    frames = [xtrain]
    for extra in (data_1, data_2, data_3):
        if extra is None:
            break
        frames.append(extra)

    if len(category_vars) != 0:
        enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first').fit(xtrain[category_vars])
        encoded_names = enc.get_feature_names_out(category_vars)

        def encode(frame):
            # Reuse the frame's own index: concat(axis=1) aligns on index
            # labels, so a default RangeIndex here would misalign (and
            # NaN-pad) any frame whose index is not 0..n-1.
            one_hot_df = pandas.DataFrame(enc.transform(frame[category_vars]), columns=encoded_names, index=frame.index)
            return pandas.concat([frame[numeric_vars], one_hot_df], axis=1)  # every column is numeric from here on

        frames = [encode(frame) for frame in frames]

    if len(frames) == 1:
        return frames[0]
    return tuple(frames)
        
def data_scaling(xtrain=None, data_1=None, data_2=None, data_3=None):
    """Standardise the training set and apply the same scaling to other sets.

    A ``StandardScaler`` is fitted on ``xtrain`` and then used to transform
    each extra dataset. Extra datasets are consumed in order and processing
    stops at the first ``None``; later arguments are ignored.

    Returns
    -------
    The scaled ``xtrain`` alone when ``data_1`` is ``None``, otherwise a
    tuple ``(xtrain, data_1[, data_2[, data_3]])`` of scaled outputs.
    """
    scaler = StandardScaler()
    scaled_sets = [scaler.fit_transform(xtrain)]

    for extra in (data_1, data_2, data_3):
        if extra is None:
            break
        scaled_sets.append(scaler.transform(extra))

    if len(scaled_sets) == 1:
        return scaled_sets[0]
    return tuple(scaled_sets)
    
def data_processing(xtrain=None, ytrain=None, xtest_1=None, xtest_2=None, xtest_3=None, check_multicollinearity=False):
    """Encode, optionally prune collinear features, then standardise.

    Pipeline: ``category_encoding`` -> (optional) ``VIF`` based column
    removal -> ``data_scaling``. Everything is fitted on ``xtrain`` and
    applied unchanged to the other datasets.

    Parameters
    ----------
    xtrain : pandas.DataFrame
        Training features.
    ytrain : any
        Forwarded to ``category_encoding``.
    xtest_1, xtest_2, xtest_3 : pandas.DataFrame or None
        Optional extra datasets. They are consumed in order and processing
        stops at the first ``None``; later arguments are ignored.
    check_multicollinearity : bool, default False
        When ``True`` and ``xtrain`` has more than one column, columns
        selected by ``VIF`` on the encoded training set are dropped from
        every dataset and the resulting shapes are printed.

    Returns
    -------
    The scaled training set alone when ``xtest_1`` is ``None``, otherwise a
    tuple ``(xtrain, xtest_1[, xtest_2[, xtest_3]])`` of scaled outputs.
    """
    condition = (check_multicollinearity == True) and (xtrain.shape[1] > 1)

    extras = []
    for extra in (xtest_1, xtest_2, xtest_3):
        if extra is None:
            break
        extras.append(extra)

    padded = extras + [None] * (3 - len(extras))
    encoded = category_encoding(xtrain=xtrain, ytrain=ytrain, data_1=padded[0], data_2=padded[1], data_3=padded[2])
    # category_encoding returns a bare frame when no extra dataset is given.
    frames = list(encoded) if extras else [encoded]

    if condition:
        # VIF drops columns in place and, without categorical columns, the
        # encoded frames ARE the caller's frames: run it on a copy and drop
        # out-of-place so the caller's data is never modified.
        del_vars = VIF(frames[0].copy(), print_flag=False)
        frames = [frame.drop(del_vars, axis=1, errors='ignore') for frame in frames]

        labels = {
            1: ('Training',),
            2: ('Training', 'Testing'),
            3: ('Training', 'Validation', 'Testing'),
            4: ('Training', 'Validation', 'Testing', 'Point_of_interest'),
        }[len(frames)]
        summary = ', '.join(f'{label}_n_samples = {frame.shape}' for label, frame in zip(labels, frames))
        print(f'Check multicollinearity, {summary}')

    padded = frames[1:] + [None] * (4 - len(frames))
    return data_scaling(xtrain=frames[0], data_1=padded[0], data_2=padded[1], data_3=padded[2])

#### 3. Data visualization 

In [4]:
def bar_plot(data, target_name):
    """Draw a bar chart of the percentage of rows in each class of a column.

    Each bar is annotated with its own percentage, rounded to one decimal.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset containing the column to summarise.
    target_name : str
        Name of the column whose value frequencies are plotted.

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module, so the caller can call ``show()``
        or ``savefig()`` on the current figure.
    """
    matplotlib.pyplot.figure(figsize=(10,5))
    class_percentages = data[target_name].value_counts()*100.0 /len(data[target_name])
    ax = class_percentages.plot(kind='bar', stacked = True, rot = 0)
    ax.yaxis.set_major_formatter(mtick.PercentFormatter())
    ax.set_ylabel('Frequency Percentage')
    ax.set_xlabel('Class')
    ax.set_title('Frequency Percentage by Class')
    matplotlib.pyplot.grid(True)

    # Bar heights are already percentages, so they are the labels. Dividing
    # by the sum of bar widths is only right for two bars of width 0.5.
    for bar in ax.patches:
        height = bar.get_height()
        ax.text(bar.get_x()+.15, height-3.5, str(round(height, 1))+'%', color='black', weight = 'bold')
    return matplotlib.pyplot

In [5]:
def _open_subplot_grid(n_plots, default_rows, figsize, n_cols=4):
    """Open a figure with room for ``n_plots`` panels and return its row count."""
    n_rows = max(default_rows, math.ceil(n_plots / n_cols))
    width, height = figsize
    # When extra rows are needed, grow the figure so each row keeps its height.
    matplotlib.pyplot.figure(figsize=(width, height * n_rows / default_rows))
    return n_rows


def data_visualisation(dataset, data_name, a):
    """Display one family of exploratory plots for ``dataset``.

    Columns of dtype ``int64`` or ``float64`` are treated as numeric, all
    others as categorical.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Data to plot.
    data_name : str
        Dataset identifier. For ``'ssq'`` the last two categorical columns
        are left out of the pie charts.
    a : int
        Plot selector: 1 = pie chart per categorical column, 2 = box plot
        per numeric column, 3 = histogram per numeric column,
        4 = correlation matrix of the numeric columns. Any other value
        plots nothing.

    Returns
    -------
    None
        The figure is shown with ``matplotlib.pyplot.show()``.
    """
    numeric_feature_names = []
    category_feature_names = []
    for colName in dataset.columns.values.tolist():
        if (dataset[colName].dtypes == 'int64' or dataset[colName].dtypes == 'float64'):
            numeric_feature_names.append(colName)
        else:
            category_feature_names.append(colName)

    if data_name =='ssq':
        # 'fullkey_sub' & 'd_cdpo_sub' removed
        category_feature_names = category_feature_names[0: len(category_feature_names)-2]

    if (a==1):
        # Pie charts, four per row.
        print(); print("Les variables categorielles : ", category_feature_names)
        print(); print("Camembert de chacune des variables categorielles", len(category_feature_names))
        n_rows = _open_subplot_grid(len(category_feature_names), 11, (20,55))
        for position, col in enumerate(category_feature_names, start=1):
            matplotlib.pyplot.gcf().subplots_adjust(wspace = 0.5)
            matplotlib.pyplot.subplot(n_rows,4,position)
            dataset.groupby(col).size().plot(kind='pie', autopct='%1.1f%%', textprops={'fontsize': 15})
            matplotlib.pyplot.title(col, size=18)
            matplotlib.pyplot.ylabel(' ')
        matplotlib.pyplot.show()

    if (a==2):
        # Box-and-whisker plots, four per row.
        print(); print("Les variables numériques : ", numeric_feature_names)
        print(); print("Boîte à moustache de chacune des variables numériques", len(numeric_feature_names))
        n_rows = _open_subplot_grid(len(numeric_feature_names), 28, (14,100))
        for position, col in enumerate(numeric_feature_names, start=1):
            matplotlib.pyplot.gcf().subplots_adjust(wspace = 0.5, hspace = 0.4)
            matplotlib.pyplot.subplot(n_rows,4,position)
            matplotlib.pyplot.axis('on')
            matplotlib.pyplot.tick_params(axis='both', left=True, top=False, right=False, bottom=True)
            matplotlib.pyplot.tick_params(axis="x", labelsize=12)
            dataset[col].plot(kind='box', subplots=True, sharex=False, sharey=False)
        matplotlib.pyplot.show()

    if (a==3):
        # Histograms, four per row.
        print(); print("Histogramme de chacune des variables numériques", len(numeric_feature_names))
        n_rows = _open_subplot_grid(len(numeric_feature_names), 28, (18,100))
        for position, col in enumerate(numeric_feature_names, start=1):
            matplotlib.pyplot.gcf().subplots_adjust(wspace = 0.5, hspace = 0.4)
            matplotlib.pyplot.subplot(n_rows,4,position)
            matplotlib.pyplot.axis('on')
            matplotlib.pyplot.tick_params(axis='both', left=True, top=False, right=False, bottom=True)
            dataset[col].hist()
            matplotlib.pyplot.title(col, size=15)
        matplotlib.pyplot.show()

    if (a==4):
        # Correlation matrix of the numeric columns, colour scale fixed to [-1, 1].
        print(); print("Matrice de corrélation pour toutes les variables numériques", len(numeric_feature_names))
        fig = matplotlib.pyplot.figure(figsize=(10,8))
        ax = fig.add_subplot(111)
        cax = ax.matshow(dataset[numeric_feature_names].corr(), vmin=-1, vmax=1, interpolation='none')
        fig.colorbar(cax)
        matplotlib.pyplot.show()