## Old Code

In [None]:
"""Copyright © by Boston Consulting Group. All rights reserved."""
import datetime
import logging
from pathlib import Path
from typing import Any, Dict, List, Tuple, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import ticker
from sklearn import metrics
from tqdm import tqdm
from sklearn.metrics import auc, roc_curve

# from wex.checks import pd_check_cols_in_df

_logger = logging.getLogger(__name__)


def apply_woe_transformation(
    df: pd.DataFrame, var: str, mapper: pd.DataFrame
) -> pd.DataFrame:
    """
    Attach the weight-of-evidence (WOE) value of ``var`` as column ``<var>_woe``.

    Two modes are supported:

    * Lookup: if ``var`` is not numeric, or ``mapper`` has no ``median``
      column, the WOE is looked up by joining ``df[var]`` against
      ``mapper["binned"]``. All mapper columns (including ``binned``) are kept.
    * Interpolation: otherwise ``df`` must already carry a ``binned`` column.
      The bin WOE is joined in and then linearly interpolated between the
      median of the row's bin and the median of the neighbouring bin on the
      side the value lies on. Rows for which any ingredient of the
      interpolation is non-finite keep the plain bin WOE. The helper columns
      and ``binned`` are removed again.

    :param df: Input data frame
    :param var: Column name of variable to be transformed
    :param mapper: Mapping table for transformation (columns ``binned`` and
        ``woe``; for interpolation also ``median``, ``median_left``,
        ``median_right``, ``woe_left`` and ``woe_right``)
    :return: New data frame with the additional column ``<var>_woe``
    """
    woe_col = f"{var}_woe"
    # ``rename`` returns a new frame, so the caller's mapping table is untouched.
    lookup = mapper.rename(columns={"woe": woe_col})

    interpolate = pd.api.types.is_numeric_dtype(df[var]) and "median" in lookup
    if not interpolate:
        return df.merge(lookup, how="left", left_on=var, right_on="binned")

    needed = [
        "binned",
        "median",
        "median_left",
        "median_right",
        woe_col,
        "woe_left",
        "woe_right",
    ]
    merged = df.merge(lookup[needed], how="left", on="binned")

    # Values left of the bin median: blend with the left neighbour's WOE.
    left_inputs = merged[[var, "median", "median_left", woe_col, "woe_left"]]
    left_mask = (merged[var] < merged["median"]) & np.isfinite(left_inputs).all(
        axis=1
    )
    left_numerator = (merged[var] - merged["median_left"]) * merged[woe_col] + (
        merged["median"] - merged[var]
    ) * merged["woe_left"]
    merged.loc[left_mask, woe_col] = left_numerator / (
        merged["median"] - merged["median_left"]
    )

    # Values right of the bin median: blend with the right neighbour's WOE.
    right_inputs = merged[[var, "median", "median_right", woe_col, "woe_right"]]
    right_mask = (merged[var] > merged["median"]) & np.isfinite(right_inputs).all(
        axis=1
    )
    right_numerator = (merged["median_right"] - merged[var]) * merged[woe_col] + (
        merged[var] - merged["median"]
    ) * merged["woe_right"]
    merged.loc[right_mask, woe_col] = right_numerator / (
        merged["median_right"] - merged["median"]
    )

    helper_cols = [name for name in needed if name != woe_col]
    return merged.drop(columns=helper_cols)


def _woe_mapping_row(bin_id: int, median: float, woe: float) -> pd.DataFrame:
    """Build a one-row mapping table entry without neighbour information."""
    return pd.DataFrame(
        {
            "binned": [bin_id],
            "median": [median],
            "median_left": [np.nan],
            "median_right": [np.nan],
            "woe": [woe],
            "woe_left": [np.nan],
            "woe_right": [np.nan],
        }
    )


def _woe_of_bin(mapper: pd.DataFrame, bin_id: int) -> float:
    """Return the WOE of ``bin_id`` if exactly one row exists for it, else NaN."""
    hits = mapper.loc[mapper["binned"] == bin_id, "woe"]
    return hits.values[0] if len(hits) == 1 else np.nan


def create_woe_mapping_table(
    df: pd.DataFrame, n_bins: int, var_median: str, var_woe: str
) -> pd.DataFrame:
    """
    Create mapping table for WOE transformation based on binning results.

    The table has one row per bin with the columns ``binned``, ``median``,
    ``woe`` and, for interpolation, the median/WOE of the previous and next
    bin (``*_left`` / ``*_right``, taken after sorting by ``binned``).

    Rows that are missing from the binning results are added afterwards
    (without neighbour information, so they are never interpolated):

    * regular bins ``0 .. n_bins - 1`` without observations get the mean WOE
      of their direct neighbours; bins are visited left-to-right and then
      right-to-left so that gaps of several bins are filled as well,
    * bin ``-1`` (``-inf``) inherits the WOE of bin ``0``,
    * bin ``n_bins`` (``+inf``) inherits the WOE of bin ``n_bins - 1``,
    * bin ``n_bins + 1`` (missing values) gets a neutral WOE of ``0.0``.

    :param n_bins: Number of bins
    :param df: Binning results (needs the columns ``binned``, ``var_median``
        and ``var_woe``)
    :param var_median: Column name in res for the median of var
    :param var_woe: Column name in res for the WOE of var
    :return: Mapping table
    :raises IndexError: If bin ``0`` or bin ``n_bins - 1`` is still absent
        after gap filling (i.e. no regular bin had any observation)
    """
    mapper = (
        df[["binned", var_median, var_woe]]
        .rename(columns={var_median: "median", var_woe: "woe"})
        .sort_values("binned")
    )

    mapper["woe_left"] = mapper["woe"].shift(1)
    mapper["woe_right"] = mapper["woe"].shift(-1)
    mapper["median_left"] = mapper["median"].shift(1)
    mapper["median_right"] = mapper["median"].shift(-1)

    # Add replacement values for empty bins (forward pass, then backward pass)
    for b in list(range(n_bins)) + list(range(n_bins - 1, -1, -1)):
        if (mapper["binned"] == b).any():
            continue
        neighbours = []
        if b > 0:
            neighbours.append(_woe_of_bin(mapper, b - 1))
        if b < n_bins - 1:
            neighbours.append(_woe_of_bin(mapper, b + 1))
        known = [woe for woe in neighbours if not np.isnan(woe)]
        if known:
            mapper = pd.concat(
                [mapper, _woe_mapping_row(b, np.nan, float(np.mean(known)))]
            )

    # Check that the table include replacements for nan and inf
    if not (mapper["binned"] == -1).any():
        first_woe = mapper.loc[mapper["binned"] == 0, "woe"].values[0]
        mapper = pd.concat([mapper, _woe_mapping_row(-1, -np.inf, first_woe)])
    if not (mapper["binned"] == n_bins).any():
        last_woe = mapper.loc[mapper["binned"] == (n_bins - 1), "woe"].values[0]
        mapper = pd.concat([mapper, _woe_mapping_row(n_bins, np.inf, last_woe)])
    if not (mapper["binned"] == n_bins + 1).any():
        mapper = pd.concat([mapper, _woe_mapping_row(n_bins + 1, np.nan, 0.0)])

    return mapper


def _flatten_agg_columns(frame: pd.DataFrame) -> pd.DataFrame:
    """Collapse two-level ``groupby().agg()`` columns to ``<column>_<stat>``."""
    level_one = frame.columns.get_level_values(0).astype(str)
    level_two = frame.columns.get_level_values(1).astype(str)
    frame.columns = [
        f"{name}_{stat}" if stat != "" else name
        for name, stat in zip(level_one, level_two)
    ]
    return frame


def _add_woe_and_iv(
    res: pd.DataFrame, target: str, var_woem: str, n_bad: Any, n_obs: Any
) -> float:
    """Add the clipped bad rate, WOE and IV contribution per bin; return total IV."""
    rate_col = f"{target}_mean"
    # Clip the bad rate so that pure bins do not yield infinite log-odds.
    res[rate_col] = np.clip(res[rate_col], 1e-4, 1 - 1e-4)
    # WOE = log(good/bad odds of the bin) - log(good/bad odds of the sample).
    res[var_woem] = np.log((1.0 / res[rate_col]) - 1.0) + np.log(
        n_bad / (n_obs - n_bad)
    )
    res["iv"] = (
        (res[f"{target}_count"] - res[f"{target}_sum"]) / (n_obs - n_bad)
        - (res[f"{target}_sum"] / n_bad)
    ) * res[var_woem]
    return res["iv"].sum()


def create_woe_transformation(
    df: pd.DataFrame,
    target: str,
    var: str,
    n_bins: int = 10,
    btype: str = "number",
    bins: np.ndarray = None,
    min_var: Any = None,
    max_var: Any = None,
) -> Tuple[np.ndarray, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Create weight of evidence transformation, statistics and plots.

    The variable is binned (one group per distinct value for non-numeric
    variables or numeric variables with fewer than ``n_bins`` distinct values;
    otherwise by thresholds), the WOE and information value per bin are
    computed, the WOE transformation is applied to the data and the AUC of
    both the raw and the transformed variable is calculated. Two figures are
    shown: observations / bad rate / WOE per bin, and the ROC curves.

    :param df: Input data frame
    :param target: Column name of target variable (summed to count bads, so
        presumably coded 0/1 with 1 = bad)
    :param var: Column name to calculate WOE/IV for
    :param n_bins: Number of bins
    :param btype: Method for creating bins: ``"distance"`` for equidistant
        thresholds between ``min_var`` and ``max_var``; any other value for
        quantile based thresholds
    :param bins: Predefined bin thresholds (overrides ``n_bins``/``btype``)
    :param min_var: Lower bound of bin thresholds (``"distance"`` only;
        defaults to the 10% quantile)
    :param max_var: Upper bound of bin thresholds (``"distance"`` only;
        defaults to the 90% quantile)
    :return: Tuple ``(bins, mapper, power, res)``: the thresholds used
        (``None`` if none were given or derived), the WOE mapping table, a
        one-row summary (AUCs, share of missing values, IV) and the per-bin
        statistics. If ``var`` has no valid values only ``power`` is not
        ``None``.
    """
    _logger.info(f"Analyzing variable {var}...")

    df = df[[target, var]].copy()
    n_bad = df[target].sum()
    n_obs = df[target].count()
    n_miss = df[var].isna().sum()

    if bins is not None:
        # asarray: pd.unique no longer accepts plain lists in recent pandas.
        bins = pd.unique(np.asarray(bins))
        n_bins = len(bins) + 1

    if df[var].count() == 0:
        # All rows missing
        power = pd.DataFrame(
            {"name": [var], "auc": [0.5], "auc_woe": [0.5], "miss_pc": [100], "iv": [0]}
        )
        _logger.info(f"Variable {var} has no valid values.")
        return None, None, power, None

    non_finite = [-np.inf, np.inf, np.nan]
    is_numeric = pd.api.types.is_numeric_dtype(df[var])
    var_median = f"{var}_median"
    var_woem = f"{var}_woem"

    if (not is_numeric) or (df[var].nunique() < n_bins):
        # Calculate results for categorical variables
        res = _flatten_agg_columns(
            df.groupby(var, dropna=False)
            .agg({target: ["count", "mean", "sum"]})
            .reset_index()
        )
        iv = _add_woe_and_iv(res, target, var_woem, n_bad, n_obs)
        res.rename(columns={var: "binned"}, inplace=True)

        mapper = res.drop(
            columns=[f"{target}_count", f"{target}_mean", f"{target}_sum", "iv"]
        ).rename(columns={var_woem: "woe"})

        # Guarantee a neutral WOE for missing values unseen during fitting.
        if not mapper["binned"].isnull().any():
            mapper = pd.concat(
                [mapper, pd.DataFrame({"binned": [np.nan], "woe": [0.0]})]
            )
    else:
        # Calculate results for continuous variables
        if bins is None:
            finite_values = df.loc[~df[var].isin(non_finite), var]
            if btype == "distance":
                # Replace outliers
                if min_var is None:
                    min_var = finite_values.quantile(0.1)
                if max_var is None:
                    max_var = finite_values.quantile(0.9)
                bins = pd.unique(np.linspace(min_var, max_var, n_bins - 1))
            else:
                percentile = np.linspace(1.0 / n_bins, 1.0 - 1.0 / n_bins, n_bins - 1)
                bins = finite_values.quantile(percentile).unique()
            # Duplicate thresholds collapse, so the bin count may shrink.
            n_bins = len(bins) + 1

        # Regular bins are 0..n_bins-1; -inf, +inf and NaN get their own ids.
        df["binned"] = np.digitize(df[var].to_numpy(), bins)
        df["binned"] = np.where(df[var] == -np.inf, -1, df["binned"])
        df["binned"] = np.where(df[var] == np.inf, n_bins, df["binned"])
        df["binned"] = np.where(df[var].isna(), n_bins + 1, df["binned"])
        res = _flatten_agg_columns(
            df.groupby("binned")
            .agg({var: ["mean", "median"], target: ["count", "mean", "sum"]})
            .reset_index()
        )
        iv = _add_woe_and_iv(res, target, var_woem, n_bad, n_obs)

        # Create mapping table for woe transformation
        mapper = create_woe_mapping_table(res, n_bins, var_median, var_woem)

    # Apply woe transformation
    df = apply_woe_transformation(df, var, mapper)
    df.sort_values(by=var, inplace=True)

    # Calculate AUCs for WOE transformed and original variable
    fpr_woe, tpr_woe, _ = metrics.roc_curve(df[target], -df[f"{var}_woe"], pos_label=1)
    auc_woe = metrics.auc(fpr_woe, tpr_woe)
    if is_numeric:
        tmp = df.loc[~df[var].isin(non_finite)]
        fpr, tpr, _ = metrics.roc_curve(tmp[target], -tmp[var], pos_label=1)
        auc_raw = metrics.auc(fpr, tpr)
    else:
        auc_raw = np.nan

    # Plot default rate and number of observations for each bin
    _, (al, ar) = plt.subplots(
        ncols=2, figsize=(16, 4), gridspec_kw={"width_ratios": [1, 1]}
    )

    if is_numeric and (df[var].nunique() >= len(mapper)):
        # Group in many bins to plot woe, only for numeric variables
        n_plot_bins = 25
        percentiles = np.linspace(1.0 / n_plot_bins, 1.0, n_plot_bins)
        plot_bins = df.loc[~df[var].isin(non_finite), var].quantile(percentiles)
        # Push the top edge slightly up so the maximum stays in the last regular
        # bin. Adding |edge| * eps (instead of multiplying) also works for
        # negative maxima, where multiplying would move the edge downwards.
        top_edge = plot_bins.iloc[n_plot_bins - 1]
        plot_bins.iloc[n_plot_bins - 1] = top_edge + abs(top_edge) * 1e-6

        df["plotbin"] = np.digitize(df[var].to_numpy(), plot_bins.to_numpy())
        df["plotbin"] = np.where(df[var] == -np.inf, -1, df["plotbin"])
        df["plotbin"] = np.where(df[var] == np.inf, n_plot_bins, df["plotbin"])
        df["plotbin"] = np.where(df[var].isna(), n_plot_bins + 1, df["plotbin"])
        plotres = _flatten_agg_columns(
            df.groupby("plotbin")
            .agg({var: ["median"], f"{var}_woe": ["median"]})
            .reset_index()
        )

        xx = res["binned"]
        xx_r = plotres["plotbin"]
        yy = plotres[f"{var}_woe_median"]
        labels = res[var_median].map("{:,.2f}".format)
        labels_r = plotres[var_median].map("{:,.2f}".format)
    else:
        yy = res[var_woem]
        xx = np.arange(res.index.size)
        xx_r = xx
        labels = res["binned"].astype(str).str[:12]
        labels_r = labels

    al.set_xlabel(var)
    al.set_ylabel("Clients")
    al.set_xticks(xx)
    al.set_xticklabels(labels, rotation="vertical")
    al.bar(xx, res[f"{target}_count"])
    al2 = al.twinx()
    al2.set_ylabel("Bad rate")
    al2.set_yscale("log")
    # Plot at the same x positions as the bars so the line stays aligned when
    # bin ids are not 0..k (e.g. a -inf bin with id -1).
    al2.plot(
        xx,
        res[f"{target}_mean"],
        color="darkorange",
        label="bad_rate",
    )
    al2.legend(loc=0)
    plt.subplots_adjust(bottom=0.2, wspace=0.3)

    ar.set_xlabel(var)
    ar.set_ylabel("Weight of evidence")
    ar.set_xticks(xx_r)
    ar.set_xticklabels(labels_r, rotation="vertical")
    ar.plot(xx_r, yy, color="darkorange", label="woe")
    ar.legend(loc=0)
    plt.subplots_adjust(bottom=0.2, wspace=0.3)
    plt.show()

    # Plot ROC curves
    fig, ax = plt.subplots()
    ax.plot(
        fpr_woe,
        tpr_woe,
        color="darkorange",
        lw=2,
        label=f"{var}_woe (area = {auc_woe:.3f})",
    )
    if is_numeric:
        ax.plot(fpr, tpr, color="navy", lw=2, label=f"{var} (area = {auc_raw:.3f})")
    ax.plot([0, 1], [0, 1], color="lightgrey", lw=1, linestyle="--")
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel("Cumulated goods")
    ax.set_ylabel("Cumulated bads")
    ax.set_title("ROC curve")
    ax.legend(loc="lower right")
    plt.show()

    _logger.info(f"Information value: {iv}")
    _logger.info(f"AUC (WOE): {auc_woe}")
    _logger.info(f"AUC: {auc_raw}")
    if is_numeric:
        quantiles = df[var].quantile([0, 0.01, 0.05, 0.1, 0.5, 0.9, 0.95, 0.99, 1])
        _logger.info(f"Quantiles:\n{quantiles}")

    power = pd.DataFrame(
        {
            "name": [var],
            "auc": [auc_raw],
            "auc_woe": [auc_woe],
            "miss_pc": [100 * n_miss / n_obs],
            "iv": [iv],
        }
    )

    return bins, mapper, power, res


def create_binary_target_plots(
    df: pd.DataFrame,
    target_col: str,
    feature_col: str,
    n_bins: int = 10,
    min_feature_val: float = None,
    max_feature_val: float = None,
    min_feature_q: float = None,
    max_feature_q: float = None,
    create_plot=True,
    invert_xaxis=False,
    use_quantiles: bool = True,
    fillna: bool = False
) -> Tuple[Union[pd.DataFrame, None], Union["plt.Figure", None]]:
    """Creates the univariate analysis plots of one feature for a binary target.

    Numeric features get three panels (ROC curve, class-wise densities and
    event rate per bin); non-numeric features only get the event rate panel.

    :param df: Input data frame
    :param target_col: Column name of target variable
    :param feature_col: Column name of the feature to analyse.
    :param n_bins: Number of bins
    :param min_feature_val: Lower feature value at which to
        winsorize (only applied to numerical columns).
    :param max_feature_val: Upper feature value at which to
        winsorize (only applied to numerical columns).
    :param min_feature_q: Lower quantile at which to
        winsorize (only applied to numerical columns).
    :param max_feature_q: Upper quantile at which to
        winsorize (only applied to numerical columns).
    :param create_plot: Determines whether figure will be plotted.
    :param invert_xaxis: Whether to invert the x axis of the event rate plot.
    :param use_quantiles: Whether to use quantiles for event rate bins.
    :param fillna: If True, NaN/inf feature values are replaced by the median
        of the valid values for the ROC curve; otherwise those rows are
        excluded from it.
    :return: Dataframe containing feature and target metrics per bin and the
        figure; ``(None, None)`` if the feature cannot be plotted.
    :raises ValueError: If the target contains NA values or is not binary, or
        if the winsorization bounds are inconsistent.
    """

    # Ensure columns exist
    # pd_check_cols_in_df(df, col_list=[feature_col, target_col], raise_flag=True)

    if df[feature_col].nunique() < 2:
        _logger.info("Feature contains less than 2 features!")
        return None, None

    # Ensure target contains no NA values
    if df[target_col].isna().sum() > 0:
        raise ValueError("Target column contains NA values!")

    # Ensure target is binary
    if df[target_col].nunique() != 2:
        raise ValueError("Target variable is not binary!")

    # Ensure min/max values are in correct order
    if (
        min_feature_val is not None
        and max_feature_val is not None
        and min_feature_val >= max_feature_val
    ):
        raise ValueError(
            "min_feature_value must be strictly smaller than max_feature_val!"
        )
    if (
        min_feature_q is not None
        and max_feature_q is not None
        and min_feature_q >= max_feature_q
    ):
        raise ValueError("min_feature_q must be strictly smaller than max_feature_q!")
    for quantile in (min_feature_q, max_feature_q):
        if quantile is not None and not 0 <= quantile <= 1:
            raise ValueError("Quantiles must be values between 0 and 1!!")

    # Create fresh copy of dataframe
    df = df.copy()

    # Calculate global event rate
    event_rate = np.mean(df[target_col])

    is_numeric = pd.api.types.is_numeric_dtype(df[feature_col])

    # Case 1: At most n_bins elements
    if df[feature_col].nunique() < n_bins:

        # Sorted distinct non-missing values; each gets its rank as bin id.
        # Missing values stay unmapped and therefore drop out of the grouping.
        unique_vals = df[feature_col].dropna().sort_values().unique()
        mapping_dict = {value: code for code, value in enumerate(unique_vals)}

        # Apply mapping to column
        df = df.assign(bins=df[feature_col].map(mapping_dict))

        # Set labels for plotting
        labels = list(unique_vals)

    # Case 2: More than n_bins elements and categorical
    elif not is_numeric:
        _logger.info(
            f"--- Number of unique elements ({df[feature_col].nunique()} "
            f"exceeds n_bins ({n_bins})). "
            f"Increase n_bins to create a plot!"
        )
        return None, None

    # Case 3: More than n_bins elements and numerical
    else:

        # Adjust n_bins if less unique values exist
        n_bins = int(min(df[feature_col].nunique(), n_bins))

        min_val = np.nanmin(df[feature_col])
        max_val = np.nanmax(df[feature_col])

        if min_feature_q is not None:
            min_val = np.nanquantile(df[feature_col], min_feature_q)

        if max_feature_q is not None:
            max_val = np.nanquantile(df[feature_col], max_feature_q)

        if min_feature_val is not None:
            min_val = np.maximum(min_val, min_feature_val)

        if max_feature_val is not None:
            max_val = np.minimum(max_val, max_feature_val)

        # Inner bin edges must be finite even if the feature contains +-inf.
        idx_neg_inf = df[feature_col] == -np.inf
        idx_pos_inf = df[feature_col] == np.inf
        min_val_excl_inf = np.nanmin(df[feature_col][~idx_neg_inf])
        max_val_excl_inf = np.nanmax(df[feature_col][~idx_pos_inf])

        min_val_adj = np.maximum(min_val, min_val_excl_inf)
        max_val_adj = np.minimum(max_val, max_val_excl_inf)

        # Create bin edges: quantile based or equidistant
        if use_quantiles:
            bins = np.unique(
                df[feature_col]
                .clip(min_val_adj, max_val_adj)
                .quantile(np.linspace(0, 1, n_bins))
            )
            n_bins = len(bins)
        else:
            bins = np.linspace(min_val_adj, max_val_adj, n_bins)

        # Outer edges use the unadjusted bounds so that +-inf values are covered.
        bins[0] = min_val
        bins[n_bins - 1] = max_val

        # Clip values according to min/max values (explicit write-back; a
        # chained in-place clip is not guaranteed to modify the frame).
        df[feature_col] = df[feature_col].clip(lower=min_val, upper=max_val)

        # Ensure clipping values does not remove all but a single value
        if df[feature_col].nunique() < 2:
            _logger.info(
                "Feature contains less than 2 features after clipping outliers!!"
            )
            return None, None

        # Create bins (return None if binning is not successful)
        try:
            df = df.assign(
                bins=pd.cut(
                    x=df.loc[:, feature_col],
                    bins=bins,
                    include_lowest=True,
                    right=True,
                    labels=False,
                )
            )
        except Exception as e:
            _logger.warning(e)
            return None, None

        # Create plot labels
        bins = list(bins)
        labels = [
            f"({bins[i]:,.2f}, {bins[i + 1]:,.2f}]" for i in range(n_bins - 1)
        ]

        # Handle NAs: they get their own bin right after the last interval
        if df["bins"].isna().sum() > 0:
            df.loc[:, "bins"] = df.loc[:, "bins"].where(
                ~df.loc[:, "bins"].isna(), n_bins - 1
            )
            labels.append("NA")
            n_bins += 1

        # Convert bins to categories and register all of them so that empty
        # bins still show up in the grouped result
        df["bins"] = df["bins"].astype("category")
        df["bins"] = df["bins"].cat.set_categories(list(range(n_bins - 1)))

    # Group into bins and calculate required metrics. observed=False keeps
    # empty categorical bins regardless of the pandas default.
    df_binned = df.groupby("bins", observed=False).agg(
        {feature_col: [len], target_col: ["mean"]}
    )

    # Rename columns
    level_one = df_binned.columns.get_level_values(0).astype(str)
    level_two = df_binned.columns.get_level_values(1).astype(str)
    column_separator = ["_" if x != "" else "" for x in level_two]
    df_binned.columns = level_one + column_separator + level_two

    # Set NA counts to zero
    df_binned[f"{feature_col}_len"] = df_binned[f"{feature_col}_len"].fillna(0)

    # Add lift rate
    df_binned.loc[:, "lift_rate"] = np.divide(
        df_binned[f"{target_col}_mean"], event_rate
    )

    # Build plots dependent on feature type
    if not is_numeric:

        grid = plt.GridSpec(1, 6, wspace=1.2, hspace=0.2)
        fig_grid = plt.figure(figsize=(18, 12))
        fig_grid.suptitle(f"{feature_col}: Event Rates", fontsize=20)

    else:

        grid = plt.GridSpec(2, 6, wspace=1.2, hspace=0.2, height_ratios=[2, 2])
        fig_grid = plt.figure(figsize=(14, 9))
        fig_grid.suptitle(
            f"{feature_col}: Roc Curve | Densities | Event Rates", fontsize=20
        )

        invalid = df[feature_col].isin([-np.inf, np.inf, np.nan])
        if fillna:
            df_imputed = df.copy()
            df_imputed.loc[invalid, feature_col] = df.loc[
                ~invalid, feature_col
            ].median()
        else:
            df_imputed = df.loc[~invalid]

        fpr, tpr, _ = metrics.roc_curve(
            df_imputed[target_col], -df_imputed[feature_col], pos_label=1
        )
        auc_value = metrics.auc(fpr, tpr)
        if auc_value < 0.5:
            # Flip the score direction so the curve is always above the diagonal
            fpr, tpr, _ = metrics.roc_curve(
                df_imputed[target_col], df_imputed[feature_col], pos_label=1
            )
            auc_value = metrics.auc(fpr, tpr)

        # Build Roc-Curve plot
        upper_ax_left = fig_grid.add_subplot(grid[0, 0:3])
        upper_ax_left.plot(
            fpr, tpr, lw=2, label=f"{feature_col} (area = {auc_value:.3f})"
        )
        upper_ax_left.plot([0, 1], [0, 1], color="lightgrey", lw=1, linestyle="--")
        upper_ax_left.set_xlim([0.0, 1.0])
        upper_ax_left.set_ylim([0.0, 1.05])
        upper_ax_left.set_xlabel("Cumulated goods")
        upper_ax_left.set_ylabel("Cumulated bads")
        upper_ax_left.legend(loc="lower right")

        # Build density plot
        upper_ax_right = fig_grid.add_subplot(grid[0, 3:6])

        sns.kdeplot(
            data=df,
            x=feature_col,
            hue=target_col,
            log_scale=False,
            fill=True,
            cumulative=False,
            common_norm=False,
            ax=upper_ax_right,
            clip=(df[feature_col].quantile(0.01), df[feature_col].quantile(0.99))
        )
        if (df[feature_col] > 0).all():
            upper_ax_right.set_xscale("log")

    # Build event rate plot
    if is_numeric:
        lower_ax = fig_grid.add_subplot(grid[1, :])
    else:
        lower_ax = fig_grid.add_subplot(grid[0, :])

    # Add ticks & labels to axis
    lower_ax.set_xlabel(feature_col)
    lower_ax.set_ylabel("# Observations")
    lower_ax.set_xticks(df_binned.index)
    lower_ax.set_xticklabels(labels, rotation=90)

    # Plot barplot containing number of observations
    lower_ax.bar(df_binned.index, df_binned[f"{feature_col}_len"], color="lightgray")

    lower_ax.yaxis.grid(False)

    # Mirror plot and add event rates
    max_rate = df_binned[f"{target_col}_mean"].max()
    lower_ax_2 = lower_ax.twinx()
    lower_ax_2.set_ylabel("Event rate")
    lower_ax_2.set_ylim(bottom=0, top=max_rate + 0.1)
    lower_ax_2.set_yticks(np.arange(0, max_rate + 0.1, step=0.1))
    lower_ax_2.plot(
        df_binned.index, df_binned[f"{target_col}_mean"], label="event_rate", marker="o"
    )

    # Add global event rate as baseline
    lower_ax_2.plot(
        [min(df_binned.index) - 1, max(df_binned.index) + 1],
        [event_rate, event_rate],
        color="darkgrey",
        lw=1,
        linestyle="--",
        label=f"total_event_rate\n({event_rate:.1%})",
    )
    lower_ax_2.legend(loc=0)
    lower_ax_2.yaxis.set_major_formatter(ticker.PercentFormatter(xmax=1, decimals=0))

    lower_ax_2.yaxis.grid(False)
    lower_ax_2.set_xlim([min(df_binned.index) - 0.5, max(df_binned.index) + 0.5])

    if invert_xaxis:
        lower_ax_2.invert_xaxis()

    if create_plot:
        plt.show()

    return df_binned, fig_grid


def create_save_binary_target_plots(
    df: pd.DataFrame,
    target_col: str,
    feature_col_list: Union[None, List[str]],
    path: Union[Path, str],
    n_bins: int = 10,
    min_feature_q: Union[None, float] = None,
    max_feature_q: Union[None, float] = None,
    use_quantiles: bool = True,
) -> None:
    """A wrapper to create binary target plots and save to disk for multiple features.

    One ``<feature>.png`` is written per feature; features for which no plot
    can be created are skipped.

    :param df: Input data frame
    :param target_col: Column name of target variable
    :param feature_col_list: List of feature names to consider. If None, all
        columns of ``df`` except ``target_col`` are used.
    :param path: The path (existing directory) where plots should be saved.
    :param n_bins: Number of bins
    :param min_feature_q: Lower quantile at which to
        winsorize (only applied to numerical columns).
    :param max_feature_q: Upper quantile at which to
        winsorize (only applied to numerical columns).
    :param use_quantiles: Whether to use quantiles for event rate bins.
    :return: None
    """

    # Use all columns if no feature list is provided
    if feature_col_list is None:
        feature_col_list = [col for col in df.columns if col != target_col]

    # Ensure all columns are present in data
    # pd_check_cols_in_df(df=df, col_list=feature_col_list, raise_flag=True)

    # Accept plain strings as well, as promised by the signature
    output_dir = Path(path)

    # Loop through feature list and save figure
    for feature_name in tqdm(feature_col_list):

        # Create filename
        fpath = output_dir.joinpath(f"{feature_name}.png")

        # Create output
        _, fig_grid = create_binary_target_plots(
            df=df,
            target_col=target_col,
            feature_col=feature_name,
            n_bins=n_bins,
            min_feature_val=None,
            max_feature_val=None,
            min_feature_q=min_feature_q,
            max_feature_q=max_feature_q,
            create_plot=False,
            use_quantiles=use_quantiles,
        )

        if fig_grid is not None:
            fig_grid.savefig(fname=fpath, bbox_inches="tight")
            # The figure is never shown; release it so that figures do not
            # pile up in memory while looping over many features.
            plt.close(fig_grid)


def create_univariate_feature_target_stats(
    df: pd.DataFrame,
    target_col: str,
    feature_col_list: Union[None, List[str]],
    path: Union[Path, str],
    folder_suffix=None,
    n_bins: int = 10,
    n_elem_max: int = 20,
    min_feature_q: Union[None, float] = None,
    max_feature_q: Union[None, float] = None,
    use_quantiles: bool = True,
) -> None:
    """Saves binary feature target plots for a given sample.

    A time-stamped folder ``<path>/<YYYY-mm-dd-HH-MM-SS>[_<folder_suffix>]``
    is created with the sub-folders ``plots`` and ``metrics``. The plots of
    all selected features are written to ``plots``; ``metrics`` is only
    created and not filled by this function.

    :param df: Input data frame
    :param target_col: Column name of target variable
    :param feature_col_list: List of feature names to consider. If None, all
        columns of ``df`` are used.
    :param path: The path where plots should be saved.
    :param folder_suffix: Optional suffix appended to the output folder name.
    :param n_bins: Number of bins
    :param n_elem_max: Maximum number of elements for non-numeric columns.
        Currently not used by this function.
    :param min_feature_q: Lower quantile at which to
        winsorize (only applied to numerical columns).
    :param max_feature_q: Upper quantile at which to
        winsorize (only applied to numerical columns).
    :param use_quantiles: Whether to use quantiles for event rate bins.
    :return: None
    """

    if feature_col_list is None:
        selected_cols = df.columns.tolist()
    else:
        # De-duplicate while keeping order: selecting a column name twice (e.g.
        # when the caller already listed the target) would turn df[target_col]
        # into a two-column frame and break the checks downstream.
        selected_cols = list(dict.fromkeys([*feature_col_list, target_col]))

    # Ensure columns are present
    # _ = pd_check_cols_in_df(df=df, col_list=feature_col_list, raise_flag=True)

    # Select column subset
    df = df[selected_cols]

    # Create folder name
    folder_name = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    if folder_suffix is not None:
        folder_name = f"{folder_name}_{folder_suffix}"

    # Create output path
    output_path = Path(path).joinpath(folder_name)

    # Create folder
    output_path.mkdir(parents=True, exist_ok=True)
    _logger.info(f"--- Saving output to: {output_path} ---")

    # Create subfolder
    output_path_plot = output_path.joinpath("plots")
    output_path_plot.mkdir(parents=True, exist_ok=True)
    output_path_metrics = output_path.joinpath("metrics")
    output_path_metrics.mkdir(parents=True, exist_ok=True)

    # Build and save plots (feature_col_list=None: every non-target column of
    # the subset selected above)
    _logger.info("--- Create binary target plots ---")
    create_save_binary_target_plots(
        df=df,
        target_col=target_col,
        feature_col_list=None,
        n_bins=n_bins,
        min_feature_q=min_feature_q,
        max_feature_q=max_feature_q,
        path=output_path_plot,
        use_quantiles=use_quantiles,
    )


def compute_univariate_aucs(
    sample: pd.DataFrame, 
    targets: List, 
    features_per_module: Dict, 
    feature_to_flag: Dict
) -> pd.DataFrame:
    """Computes the univariate AUC of every feature against every target.

    For each feature of the modules ``account``, ``transaction`` and
    ``delinquency`` two AUCs per target are calculated: one on the rows with
    a valid feature value (``*_dropna``) and one on all rows after replacing
    NaN/inf by the median of the valid values (``*_fillna``). The
    ``absolute_auc_*`` columns mirror AUCs below 0.5 to above 0.5 so that
    features can be ranked independently of the direction of the effect.

    :param sample: Input data frame
    :param targets: List of target column names
    :param features_per_module: Dictionary of feature by module.
    :param feature_to_flag: Dictionary of module flags.
    :return: DataFrame with one row per feature and the columns ``feature``,
        ``flag``, ``missing`` (share of NaN/inf values), ``auc_<target>_dropna``,
        ``auc_<target>_fillna`` and the corresponding ``absolute_auc_*`` columns
    """
    all_features = (
        features_per_module['account']
        + features_per_module['transaction']
        + features_per_module['delinquency']
    )

    non_finite = [-np.inf, np.inf, np.nan]
    dropna_cols = ['auc_' + t + '_dropna' for t in targets]
    fillna_cols = ['auc_' + t + '_fillna' for t in targets]

    # Collect plain Python rows and build the frame once: this keeps numeric
    # values numeric and avoids concatenating frames inside the loop.
    records = []
    for f in all_features:
        flag = feature_to_flag[f]

        values = sample[f]
        invalid = values.isin(non_finite)
        valid = ~invalid
        filled = values.where(valid, values[valid].median())

        aucs_dropna = []
        aucs_fillna = []
        for t in targets:
            fpr, tpr, _ = roc_curve(sample.loc[valid, t], values[valid], pos_label=1)
            aucs_dropna.append(auc(fpr, tpr))

            fpr, tpr, _ = roc_curve(sample[t], filled, pos_label=1)
            aucs_fillna.append(auc(fpr, tpr))

        records.append([f, flag, float(invalid.mean())] + aucs_dropna + aucs_fillna)

    stat_df = pd.DataFrame(
        records, columns=['feature', 'flag', 'missing'] + dropna_cols + fillna_cols
    )

    for t in targets:
        for suffix in ('_dropna', '_fillna'):
            auc_col = 'auc_' + t + suffix
            stat_df[auc_col] = stat_df[auc_col].astype(float)
            stat_df['absolute_' + auc_col] = 0.5 + (stat_df[auc_col] - 0.5).abs()

    return stat_df.reset_index(drop=True)