In [1]:
"""Copyright © by Boston Consulting Group. All rights reserved."""
import datetime
import logging
import kaleido
from dataclasses import dataclass, field
import pathlib
import os
from typing import Any, Dict, List, Tuple, Union, Protocol

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import ticker
from sklearn import metrics
from tqdm import tqdm
from sklearn.metrics import auc, roc_curve
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import plotly.graph_objects as go

pd.set_option('display.max_columns', None)

import plotly.io as pio
pio.renderers.default = "iframe"

In [2]:
base_path = pathlib.Path().resolve().parent
data_path = base_path / 'data'
sample_data_loc = data_path / 'test_sample.csv'
df_sample = pd.read_csv(sample_data_loc)

In [3]:
def get_n_quantile_bins(
    df: pd.DataFrame,
    feature_col: str,
    min_val_adj: float,
    max_val_adj: float,
    n_bins: int
) -> np.ndarray:
    
    """ ensures we use the appropriate number of quantiles to get nbin bins
    
        Example:
            For a heavily skewed distribution, when using 10 quantiles, we might only get 2 values
            for example with [0,0,0,0,0,0,0,0,0,0,0,0,0,0,1], all but the last quantile will be 0, and the only bin
            will be (0,1). 

        Args:
            df: pd.DataFrame,
            feature_col: str,
            min_val_adj: float,
            max_val_adj: float,
            n_bins: int
    
        Returns:
            Bins
    """
    
    # start count at number of bins
    i = n_bins
    
    bins = np.unique(
                df[feature_col]
                .clip(min_val_adj, max_val_adj)
                .quantile(np.linspace(0, 1, i))
            )
    
    # if after creating n_bin quantiles, we don't have n_bin bins, increase number of quantiles
    while(len(bins)) < n_bins:
        bins = np.unique(
                df[feature_col]
                .clip(min_val_adj, max_val_adj)
                .quantile(np.linspace(0, 1, i))
            )
        i += 1
    
    return bins

def remove_or_impute_nan_infs(df: pd.DataFrame, feature_col: str, target_col: str, fillna: bool = False):
    """
    remove nan, inf and -inf elements or impute w/ median
    """
    if fillna == True:
        df.loc[df[feature_col].isin([-np.inf, np.inf, np.nan]), feature_col] = \
            df.loc[~df[feature_col].isin([-np.inf, np.inf, np.nan]), feature_col].median()
    else:
        df = df.loc[~df[feature_col].isin([-np.inf, np.inf, np.nan])]
    
    return df

def get_bins(df: pd.DataFrame, 
             feature_col: str, 
             target_col: str,
             min_val_adj: float,
             max_val_adj: float,
             n_bins:int = 10, 
             method: str = 'quantile',
             bins: list = None) -> np.ndarray:
    """
    creates bins using various methods
    """
    
    # check: if method not manual then must be quantile or linear
    if method not in ['quantile', 'linear']:
        raise ValueError(f'{method} not one of the allowed methods, must be in [quantile, linear]')
        
    # Create equidistant grid
    if method =='quantile':
        
        # use this function to ensure we get nbin bins even when using quantiles
        bins = get_n_quantile_bins(
            df,
            feature_col,
            min_val_adj,
            max_val_adj,
            n_bins
        )
        
    else:
        
        # get bins using linspace
        bins = np.linspace(min_val_adj, max_val_adj, self.n_bins)
        
    return bins


def get_min_max(df: pd.DataFrame, feature_col: str) -> (float, float):
    """
    get min and max while ignoring nan values
    
    see: https://stackoverflow.com/questions/62865647/why-is-max-and-min-of-numpy-array-nan
    """
    
    # Get unadjusted min and max
    min_val = np.nanmin(df[feature_col])
    max_val = np.nanmax(df[feature_col])
        
    return (min_val, max_val)

def get_min_max_adj(df: pd.DataFrame, feature_col: str) -> (float, float):
    """
    get min and max while ignoring nan values and possible +/- inf
    
    see: https://stackoverflow.com/questions/62865647/why-is-max-and-min-of-numpy-array-nan
    """
    
    # Get unadjusted min and max
    min_val = np.nanmin(df[feature_col])
    max_val = np.nanmax(df[feature_col])
    
    # get min & max excluding +/- inf
    idx_neg_inf = df[feature_col] == -np.inf
    idx_pos_inf = df[feature_col] == np.inf
    min_val_excl_inf = np.nanmin(df[feature_col][~idx_neg_inf])
    max_val_excl_inf = np.nanmax(df[feature_col][~idx_pos_inf])
    
    # take largest minimum and smallest maximum
    min_val_adj = np.maximum(min_val, min_val_excl_inf)
    max_val_adj = np.minimum(max_val, max_val_excl_inf)
    
    return (min_val_adj, max_val_adj)

def labels_from_bins(bins: list) -> list:
    """
    generate labels from bins
    
    example: [0, 1, 2] -> ['(0.00, 1.00]', '(1.00, 2.00]']
    """
    labels = [
        f'({"{:,.2f}".format(bins[i])}, {"{:,.2f}".format(bins[i+1])}]'
        for i in range(len(bins) - 1)
    ]
    return labels

In [5]:
class PlotProtocol(Protocol):
    """
    Defines an interface for all the plots that can be combined in the OtaPlotter Object
    """
    
    def do_math(self, df, feature_col, target_col, fillna: bool = False):
        """
        does the required math to generate the traces, annotations and axes for the roc-curve plot
        """
        ...
    
    def get_traces(self) -> List:
        ...
    
    def get_x_axes_layout(self, row, col):
        ...
    
    def get_y_axes_layout(self, row, col):
        ...
    
    def get_annotations(self, ref) -> List:
        ...

@dataclass
class PlotColors:
    """
    container for colors to use
    """
    
    primary_color: str = '40, 186, 116'
    secondary_color: str = '41, 94, 126'
    tertiary_color: str = '153, 204, 235'
    grey_tint_color: str = '110, 111, 115'
    
    
    def __post_init__(self):
        self.colors = {
            "primary_color"  : self.primary_color, 
            "secondary_color" : self.secondary_color, 
            "tertiary_color" : self.tertiary_color, 
            "grey_tint_color" : self.grey_tint_color}
        
    def get_rgba(self, color: str = 'primary_color', opacity: float = 1):
        
        if color not in self.colors.keys():
            raise ValueError(f"{color} is not one of the colors, choose from: {list(self.colors.keys())}")
        
        return 'rgba(' + self.colors[color] + f', {opacity})'
    
    def get_grey_rgba(self, opacity: float = 1):
        
        return 'rgba(' + self.colors['grey_tint_color'] + f', {opacity})'

In [6]:
@dataclass
class RocCurvePlot(PlotProtocol):
    
    # set the colorway
    colors: PlotColors = field(default_factory = lambda: PlotColors())
        
    # set hover setting
    hoverinfo: str = field(default_factory = lambda: 'skip')
        
    def do_math(self, df, feature_col, target_col, fillna: bool = False):
        """
        does the required math to generate the traces, annotations and axes for the roc-curve plot
        
        1. imputes/removes missing values
        2. calculated fpr, tpr and AUC
        """
        
        # 1. impute/remove missing values
        self.df_imputed = remove_or_impute_nan_infs(df.copy(), feature_col, target_col)
        
        # 2. calculate fpr, tpr and AUC
        self.fpr, self.tpr, _ = metrics.roc_curve(
            self.df_imputed[target_col], -self.df_imputed[feature_col], pos_label=1
        )
        self.auc = metrics.auc(self.fpr, self.tpr)
        
        if self.auc < 0.5:
            self.fpr, self.tpr, _ = metrics.roc_curve(
                self.df_imputed[target_col], self.df_imputed[feature_col], pos_label=1
            )
            self.auc = metrics.auc(self.fpr, self.tpr)
    
    def get_traces(self):
        
        return [
            
            # plot the roc-curve
            (
                go.Scatter(
                    x=self.fpr, 
                    y=self.tpr,
                    mode='lines',
                    line= dict(
                        color = self.colors.get_rgba(),
                        width = 1.5,
                    ),
                    hoverinfo=self.hoverinfo,
                    showlegend=False,
                ),
                
                #share y 
                False
            ),
            
            # plot the baseline
            (
                go.Scatter(
                    x=np.linspace(0, 1, 10), 
                    y=np.linspace(0, 1, 10),
                    mode='lines',
                    line= dict(
                        color = self.colors.get_grey_rgba(),
                        dash  = 'dash',
                        width = 0.5,
                    ),
                    hoverinfo=self.hoverinfo,
                    showlegend=False,
                ),
                
                #share y 
                False
            )
        ]
    
    def get_x_axes_layout(self, row, col):
        return dict(
            title_text="Cumulated goods", 
            title_font = {"size": 12},
            range=[0, 1],
            row=row, 
            col=col,
            title_standoff = 5 #decrease space between title and plot
        )
    
    def get_y_axes_layout(self, row, col):
        return dict(
            title_text="Cumulated bads", 
            title_font = {"size": 12},
            range=[0, 1.05],
            row=row, 
            col=col,
            title_standoff = 5 #decrease space between title and plot
        )
    
    def get_annotations(self, xref, yref):
        return [dict(
                    x=0.65,
                    y=0.1,
                    xref=xref,
                    yref=yref,
                    text=f"area = {self.auc:.3f}",
                    showarrow=False,
                    bordercolor='rgba(255,255,255,1)',
                    borderwidth=2,
                    borderpad=4,
                    bgcolor='rgba(255,255,255,1)',
                    opacity=0.8
        )]

In [7]:
@dataclass
class DistPlot(PlotProtocol):
    
    # set the colorway
    colors: PlotColors = field(default_factory = lambda: PlotColors())
        
    # set hover setting
    hoverinfo: str = field(default_factory = lambda: 'skip')
    
    def do_math(self, df, feature_col, target_col, fillna: bool = False):
        """
        does the required math to generate the traces, annotations and axes for the roc-curve plot
        
        1. imputes/removes missing values
        2. extract traces from the distplot function from plotly
        3. get the max density and feature value after imputing 
        """
        
        # 1. impute/remove missing values
        self.df_imputed = remove_or_impute_nan_infs(df.copy(), feature_col, target_col)
        
        # 2. extract traces from the distplot function from plotly
        self.hist_data = [
            self.df_imputed.loc[(self.df_imputed[target_col]==0), feature_col].values, 
            self.df_imputed.loc[(self.df_imputed[target_col]==1), feature_col].values
        ]
        self.group_labels = ['0', '1']
        self.distplot = ff.create_distplot(self.hist_data, self.group_labels)
        
        # 3. get the max density and feature value after imputing 
        self.max_density = max(self.distplot['data'][2].y.max(), self.distplot['data'][3].y.max())
        self.max_val_adj = self.df_imputed[feature_col].max()
    
    def get_traces(self):
        
        return [
            
            # plot the first distribution: 
            (
                go.Scatter(
                    self.distplot['data'][2],
                    line=dict(
                        color = self.colors.get_rgba(), 
                        width=0.5
                    ),
                    fill='tonexty',
                    fillcolor = self.colors.get_rgba(opacity=0.2),
                    hoverinfo=self.hoverinfo,
            ),
                
                #share y 
                False
             
            ),
            
            # plot the second distribution
            (
                go.Scatter(
                    self.distplot['data'][3],
                    line = dict(
                        color = self.colors.get_rgba('secondary_color'), 
                        width=0.5),
                    fill='tozeroy',
                    fillcolor= self.colors.get_rgba('secondary_color', opacity = 0.2),
                    hoverinfo=self.hoverinfo,
                ),
                
                #share y 
                False
            )
        ]
    
    def get_x_axes_layout(self, row, col):
        return None
    
    def get_y_axes_layout(self, row, col):
        return dict(
            title_text="Density", 
                title_font = {"size": 12},
                row=row, 
                col=col,
                title_standoff = 5 #decrease space between title and plot
        )
    
    def get_annotations(self, xref, yref):
        return [dict(
                    x=0.9 * self.max_val_adj,
                    y=1 * self.max_density,
                    xref=xref,
                    yref=yref,
                    text=f"Class: 0",
                    font=dict(
                        color= self.colors.get_rgba()
                    ),
                    showarrow=False,
                    bordercolor='rgba(255,255,255,1)',
                    borderwidth=2,
                    borderpad=4,
                    bgcolor='rgba(255,255,255,1)',
                    opacity=0.8
                    ),
                dict(
                    x=0.9 * self.max_val_adj,
                    y=0.9 * self.max_density,
                    xref=xref,
                    yref=yref,
                    text=f"Class: 1",
                    font=dict(
                        color=self.colors.get_rgba('secondary_color')
                    ),
                    showarrow=False,
                    bordercolor='rgba(255,255,255,1)',
                    borderwidth=2,
                    borderpad=4,
                    bgcolor='rgba(255,255,255,1)',
                    opacity=0.8
                    ),
               ]

In [21]:
@dataclass
class BinEvenRatePlot(PlotProtocol):
    
    # set the colorway
    colors: PlotColors = field(default_factory = lambda: PlotColors())
    
    # set (number of) bins
    bins: list = field(default_factory = lambda: None)
    n_bins: int = field(default_factory = lambda: 10)
        
    # set hover setting
    hoverinfo: str = field(default_factory = lambda: 'skip')
    
    def do_math(self, df, feature_col, target_col, fillna: bool = False, method: str = 'quantile'):
        """
        does the required math to generate the traces, annotations and axes for the roc-curve plot
        
        1. imputes/removes missing values
        2. extract traces from the distplot function from plotly
        3. get the max density and feature value after imputing 
        """
        
        # set feature and target column names
        self.feature_col = feature_col
        self.target_col = target_col
        
        # make fresh copy of df
        self.df = df.copy()
        
        # Calculate global event rate
        self.event_rate = np.mean(self.df[target_col])

        # Adjust n_bins if less unique values exist
        self.n_bins = self.n_bins if self.bins is None else len(self.bins)
        n_unique_feat_vals = df[feature_col].nunique()
        self.n_bins = np.minimum(n_unique_feat_vals, self.n_bins)
        
        # Get unadjusted min and max
        min_val, max_val = get_min_max_adj(self.df, feature_col)
        min_val_adj, max_val_adj = get_min_max_adj(self.df, feature_col)
        
        ## BINNING
        self.bins = self.bins if self.bins is not None else get_bins(self.df,
                                                           self.feature_col, 
                                                           self.target_col,
                                                           min_val_adj,
                                                           max_val_adj,
                                                           n_bins = self.n_bins, 
                                                           method = method)
        # convert type to list
        self.bins = list(self.bins)
        
        # set first and last value back to min and max before imputing
        self.bins[0] = min_val
        self.bins[self.n_bins - 1] = max_val
        
        # update number of bins
        assert self.n_bins == len(self.bins)
        
        # Create bins (return None if binning is not successfull)
        try:
            # create new column indicating what bin record belongs to
            self.df = self.df.assign(
                    bins=pd.cut(
                        x=self.df.loc[:, feature_col],
                        bins=self.bins,
                        include_lowest=True,
                        right=True,
                        labels=False,
                    )
            )
        except Exception as e:
            _logger.warning(e)
            return None
        
        # Create plot labels: [(4, 6), (6, 10), ...]
        self.labels = labels_from_bins(self.bins)
        
        ## CLIPPING
        
        # Clip values according to min/max values
        df[feature_col].clip(lower=min_val, upper=max_val, inplace=True)

        # Ensure clipping values does not remove all but a single value
        if df[feature_col].nunique() < 2:
            _logger.info(
                "Feature contains less than 2 features after clipping outliers!!"
            )
            return None
        
        
        ## NA's

        # Handle NAs
        if self.df["bins"].isna().sum() > 0:
            
            # replace the NA bin w/ n_bins - 1
            self.df.loc[:, "bins"] = self.df.loc[:, "bins"].where(
                ~self.df.loc[:, "bins"].isna(), self.n_bins - 1
            )
            self.labels.append("NA")
            self.n_bins += 1

        # Convert bins to categories
        self.df.bins = self.df.bins.astype("category")

        # Set all categories
        self.df.bins = self.df.bins.cat.set_categories(list(range(self.n_bins - 1)))

        # Group into bins and calculate required metrics
        self.df_binned = self.df.groupby("bins").agg({feature_col: [len], target_col: ["mean"]})

        # Rename columns
        level_one = self.df_binned.columns.get_level_values(0).astype(str)
        level_two = self.df_binned.columns.get_level_values(1).astype(str)
        column_separator = ["_" if x != "" else "" for x in level_two]
        self.df_binned.columns = level_one + column_separator + level_two

        # Set NA counts to zero
        self.df_binned[f"{feature_col}_len"] = self.df_binned[f"{feature_col}_len"].fillna(0)

    
    def get_traces(self):
        
        return [
            
            # plot the first distribution: 
            (
                go.Bar(
                    x=self.labels, 
                    y=self.df_binned[f"{self.feature_col}_len"],
                    marker_color=self.colors.get_rgba('secondary_color', opacity = 0.1), 
                    marker_line_color=self.colors.get_rgba('secondary_color'),
                    marker_line_width=1.5, 
                    opacity=0.6,
                    showlegend=False,
                    hoverinfo=self.hoverinfo,
                ),
                
                #share y 
                False
            ),
            
            # plot binned event rate
            (
                go.Scatter(
                    x=self.labels, 
                    y= [self.event_rate] * len(self.labels),
                    mode='lines',
                    line= dict(
                        color = self.colors.get_grey_rgba(),
                        dash = 'dash',
                        width = 1,
                    ),
                    hoverinfo=self.hoverinfo,
                    name=f"General Event Rate: ({'{:.1%}'.format(self.event_rate)})",
                ),
                
                #share y 
                True
            ), 
            
            # plot eventrate baseline
            (
                go.Scatter(
                    x=self.labels, 
                    y=self.df_binned[f"{self.target_col}_mean"],
                    mode='lines+markers',
                    line= dict(
                        color = self.colors.get_rgba(),
                        width = 1,
                    ),
                    hoverinfo=self.hoverinfo,
                    name='Event Rate',
                ),
                #share y 
                True
            ),
        ]
    
    def get_x_axes_layout(self, row, col):
        return dict(
            title_text=f"{self.feature_col}", 
            title_font = {"size": 12},
            row=row, 
            col=col,
            title_standoff = 5, #decrease space between title and plot
            tickangle=22.5
        )
    
    def get_y_axes_layout(self, row, col):
        return dict(
            title_text=f"# Observations", 
            title_font = {"size": 12},
            #range=[0, 1.2 * self.df_binned[f"{self.feature_col}_len"].max()],
            row=row, 
            col=col,
            title_standoff = 5, #decrease space between title and plot
        )
    
    def get_annotations(self, xref, yref):
        return []

In [38]:
@dataclass
class OtaPlotter:
    """
    class implementing a single plot for a single variable 
    
    """
    
    feature_col: str
    target_col : str
    specs: list = field(default_factory=lambda: [[{}, {}], [{"colspan": 2, "secondary_y": True}, None]])
    nan_count_specs: int = field(default_factory=lambda: 0)
    
    def __post_init__(self):
        
        # make these in init later
        self.nrows = 2
        self.ncols = 2
        
        counter = 1
        self.xrefs = [[f'x?' for j, el in enumerate(row)] for i, row in enumerate(specs)]
        self.yrefs = [[f'y{i * self.ncols + j + 1}' for j, el in enumerate(row)] for i, row in enumerate(specs)]

        rows = range(self.nrows)
        cols = range(self.ncols)
        for i in rows:
            for j in cols:
                if self.specs[i][j] == None:
                    self.legend_xref = i
                else:
                    self.xrefs[i][j] = f'x{counter}'
                    #self.yrefs[i][j] = f'y{counter}'
                    counter+=1
        
        # make figure
        self.fig = make_subplots(
                    rows=self.nrows, 
                    cols=self.ncols,
                    vertical_spacing=0.15,   
                    horizontal_spacing=0.1, 
                    specs=self.specs,
                    )
        
        # update layout of figure
        self.fig.update_layout(
            showlegend=True
           ,title = dict(
               text = f"{self.feature_col}: Roc Curve | Densities | Event Rates", 
               font= dict(
                   size=20
               )
           )
            ,margin=dict(t=40)
            ,title_x=0.5
            ,width=1000
            ,height=800
            ,legend=dict(
                x=0.9,
                y=0.16 + (1-self.legend_xref)*0.5,
                xanchor="right",
                yanchor="bottom",
             )
        )
        
    
    def build_subplot(self, subplot, row, col):
        
        # add traces
        for trace, share_y in subplot.get_traces():
            self.fig.add_trace(
                trace,
                row = row, 
                col = col,
                secondary_y=share_y
            )
        
        # add annotations
        for annotation in subplot.get_annotations(self.xrefs[row-1][col-1], self.yrefs[row-1][col-1]):
            self.fig.add_annotation(
                **annotation
            )
        
        # update axes layout if specifed
        if subplot.get_x_axes_layout(row, col) is not None:
            self.fig.update_xaxes(
                **subplot.get_x_axes_layout(row, col)
            )
        
        if subplot.get_y_axes_layout(row, col) is not None:
            self.fig.update_yaxes(
                **subplot.get_y_axes_layout(row, col)
            )
    
    
    def show(self):
        """
        show plot
        """
        
        self.fig.show()
    
    
    def save_fig(self, directory: pathlib.Path):
        """
        save plot
        """
        
        # make directory if not exists
        if not directory.exists():
            raise ValueError(f"directory: {directory} does not exist")
        
        print(f'saving univariate anaylsis for {self.feature_col}')
        
        # save the image in the directory
        self.fig.write_image(directory / f"univariate_plot_{self.feature_col}.jpeg")
    
    def get_fig(self):
        return self.fig

In [50]:
def build_univariate_plot(df, 
                          feature_col: str, 
                          target: str, 
                          colors: PlotColors = PlotColors(), 
                          show_plot: bool = True,
                          hoverinfo='all',
                          n_bins: int = 10,
                          bins: list = None):
    """
    buils standard univariate plot from days 'ye
    
    Returns
    """
    roccurve = RocCurvePlot(hoverinfo=hoverinfo, colors=WEX_COLORS)
    roccurve.do_math(df_sample, feature_col, target)
    
    distcurve = DistPlot(hoverinfo=hoverinfo, colors=WEX_COLORS)
    distcurve.do_math(df_sample, feature_col, target)
    
    eventcurve = BinEvenRatePlot(hoverinfo=hoverinfo, colors=WEX_COLORS, n_bins=n_bins, bins=bins)
    eventcurve.do_math(df_sample, feature_col, target)
    
    specs = [
        [{}, {}],
        [{"colspan": 2, "secondary_y": True}, None]
    ]

    plot = OtaPlotter(feature_col, target, specs)
    plot.build_subplot(roccurve, 1, 1)
    plot.build_subplot(distcurve, 1, 2)
    plot.build_subplot(eventcurve, 2, 1)
    
    if show_plot:
        plot.show()
        
    return plot

In [63]:
@dataclass
class general_config():
    feature_name: str
    n_bins: str
    bins: str
    show_plot: bool
    colors: PlotColors

@dataclass
class feature_config():
    feature_name: str
    n_bins: str
    bins: str
    hoverinfo: str
    show_plot: bool

In [58]:
def build_univariate_plots(df, 
                       features: list, 
                       target: str, 
                       save_directory = pathlib.Path(),
                       colors: PlotColors = PlotColors(), 
                       show_plot: bool = False,
                       hoverinfo='all',
                       n_bins: int = 10,
                       bins: list = None):
    
    if isinstance(features, str):
        features = [features]
    
    for feature in features:
        print(feature)
        if feature not in df.columns:
            raise ValueError(f'{feature} not in columns of dataframe')

        fig = build_univariate_plot(df, 
                          feature, 
                          target, 
                          colors = colors, 
                          show_plot = show_plot,
                          hoverinfo= hoverinfo,
                          n_bins = n_bins,
                          bins = bins)
        
        figure.save_fig(save_directory)

In [40]:
df_sample.columns

Index(['CUSTOMER_ID', 'REF_DATE', 'DATE', 'PHYSICAL_STATE', 'PHYSICAL_CITY',
       'PHYSICAL_POSTAL_CODE', 'A_TENURE_MONTHS_N', 'A_TENURE_GROUP_C',
       'A_YEARS_IN_BUSINESS_N', 'A_NUMBER_OF_DRIVERS_N',
       'A_NUMBER_OF_EMPLOYEES_N', 'A_NUMBER_OF_TRUCKS_N', 'FRAUD_FLAG_FP',
       'FRAUD_FLAG_OPERATIONS', 'THIN_FILE_INDICATOR',
       'SELF_REPORTED_INDICATOR', 'HAS_PARENT', 'PLATFORM',
       'D_N_DELINQUENCIES_6M_N', 'D_N_DELINQUENCIES_1M_N',
       'D_MAX_DAYS_PAST_DUE_6M_N', 'D_MAX_DAYS_PAST_DUE_1M_N',
       'D_DAYS_PAST_DUE_CURRENT_N', 'D_DAYS_SINCE_LAST_30DPD_CURRENT_N',
       'DNB_FAILURE_SCORE_N', 'DNB_DELINQUENCY_SCORE_N',
       'LN_LEXISNEXIS_SCORE_CURRENT_N', 'LN_LEXISNEXIS_SBFE_SCORE_CURRENT_N',
       'LN_LEXISNEXIS_THIN_FILE_FLAG_B', 'LN_LIMITED_CREDIT_HISTORY_FLAG_B',
       'BUSINESS_DATE_NEXT_60_DPD', 'DAYS_UNTIL_NEXT_60_DPD',
       'EXPOSURE_NEXT_60_DPD', 'FLAG_60_DPD_366_DAYS',
       'MC_MON_AMT_USED_FLAG_6M_C', 'T_SUM_GALLONS_1M',
       'T_TRX_AMT_TOTAL_

In [41]:
df_sample.columns
FEATURE = "D_MAX_DAYS_PAST_DUE_6M_N"
target = "FLAG_60_DPD_366_DAYS"

WEX_COLORS = PlotColors(
    primary_color = '231, 30, 87',
    secondary_color = '153, 204, 235',
    tertiary_color = '254, 189, 64',
    grey_tint_color = '110, 111, 115'
)

BCG_COLORS = PlotColors(
    primary_color = '40, 186, 116',
    secondary_color = '41, 94, 126',
    tertiary_color = '153, 204, 235',
    grey_tint_color = '110, 111, 115'
)

In [42]:
roccurve = RocCurvePlot(hoverinfo='all')
roccurve.do_math(df_sample, FEATURE, "FLAG_60_DPD_366_DAYS")

In [43]:
distcurve = DistPlot(hoverinfo='all')
distcurve.do_math(df_sample, FEATURE, "FLAG_60_DPD_366_DAYS")

In [44]:
eventcurve = BinEvenRatePlot(hoverinfo='all', n_bins=5, bins=[0,1,10,20,100])
eventcurve.do_math(df_sample, FEATURE, "FLAG_60_DPD_366_DAYS")

In [45]:
specs = [
    [{}, {}],
    [{"colspan": 2, "secondary_y": True}, None]
]

plot = OtaPlotter(FEATURE, "FLAG_60_DPD_366_DAYS", specs)
plot.build_subplot(roccurve, 1, 1)
plot.build_subplot(distcurve, 1, 2)
plot.build_subplot(eventcurve, 2, 1)
plot.show()

In [46]:
specs = [
    [{"colspan": 2, "secondary_y": True}, None],
    [{}, {}]
]

plot = OtaPlotter(FEATURE, "FLAG_60_DPD_366_DAYS", specs)
plot.build_subplot(roccurve, 2, 1)
plot.build_subplot(distcurve, 2, 2)
plot.build_subplot(eventcurve, 1, 1)
plot.show()

In [47]:
specs = [
    [{"colspan": 2, "secondary_y": True}, None],
    [{}, {}]
]

plot = OtaPlotter(FEATURE, "FLAG_60_DPD_366_DAYS", specs)
plot.build_subplot(roccurve, 2, 2)
plot.build_subplot(distcurve, 2, 1)
plot.build_subplot(eventcurve, 1, 1)
plot.show()

In [48]:
figure = build_univariate_plot(df_sample, FEATURE, "FLAG_60_DPD_366_DAYS", bins=[0, 1, 4, 7, 9, 10])

In [49]:
DATA_DIR = pathlib.Path().resolve().parent / 'data'
IMAGE_DIR = DATA_DIR / 'images'
IMAGE_DIR.mkdir(parents=True, exist_ok=True)

features = ["A_TENURE_MONTHS_N", "D_MAX_DAYS_PAST_DUE_6M_N", "LN_LEXISNEXIS_SBFE_SCORE_CURRENT_N", ]
for feature in features:
    figure = build_univariate_plot(df_sample, feature, "FLAG_60_DPD_366_DAYS", colors=WEX_COLORS)
    figure.save_fig(IMAGE_DIR)

saving univariate anaylsis for A_TENURE_MONTHS_N


saving univariate anaylsis for D_MAX_DAYS_PAST_DUE_6M_N


saving univariate anaylsis for LN_LEXISNEXIS_SBFE_SCORE_CURRENT_N


In [59]:
build_univariate_plots(
    df_sample,
    feature,
    "FLAG_60_DPD_366_DAYS",
    IMAGE_DIR
)

['LN_LEXISNEXIS_SBFE_SCORE_CURRENT_N']
LN_LEXISNEXIS_SBFE_SCORE_CURRENT_N
saving univariate anaylsis for LN_LEXISNEXIS_SBFE_SCORE_CURRENT_N


In [61]:
build_univariate_plots(
    df_sample,
    ["A_TENURE_MONTHS_N", "D_MAX_DAYS_PAST_DUE_6M_N", "LN_LEXISNEXIS_SBFE_SCORE_CURRENT_N", "poop"],
    "FLAG_60_DPD_366_DAYS",
    IMAGE_DIR
)

A_TENURE_MONTHS_N
saving univariate anaylsis for LN_LEXISNEXIS_SBFE_SCORE_CURRENT_N
D_MAX_DAYS_PAST_DUE_6M_N
saving univariate anaylsis for LN_LEXISNEXIS_SBFE_SCORE_CURRENT_N
LN_LEXISNEXIS_SBFE_SCORE_CURRENT_N
saving univariate anaylsis for LN_LEXISNEXIS_SBFE_SCORE_CURRENT_N
poop


ValueError: poop not in columns of dataframe

In [62]:
build_univariate_plots(
    df_sample,
    ["A_TENURE_MONTHS_N", "D_MAX_DAYS_PAST_DUE_6M_N", "LN_LEXISNEXIS_SBFE_SCORE_CURRENT_N", "poop"],
    "FLAG_60_DPD_366_DAYS",
    IMAGE_DIR / "bla"
)

A_TENURE_MONTHS_N


ValueError: directory: /Users/otaniels/Library/CloudStorage/OneDrive-TheBostonConsultingGroup,Inc/Documents/Code/univariate_plotter/data/images/bla does not exist