In [16]:
import pathlib

import sys
# ROOT is the parent of the current working directory.
# NOTE(review): this assumes the notebook is started from a direct subfolder
# of the project root — confirm, since it depends on where the kernel runs.
ROOT = pathlib.Path().resolve().parent

# Put <ROOT>/src on the import path so the project-local modules imported in
# the following cells can be found (presumably plots, plota, config, colors).
sys.path.append(str(ROOT / 'src'))

import pandas as pd

# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

import plotly.io as pio
# Use the "iframe" renderer as the default for plotly figures.
pio.renderers.default = "iframe"

In [2]:
import plots

In [3]:
from plots import RocCurvePlot, DistPlot, BinEventRatePlot
from plota import OtaPlotter

In [4]:
plots.DistPlot

plots.DistPlot.DistPlot

In [5]:
from config import settings
from colors import PlotColors

In [6]:
settings

Settings(file_path=PosixPath('/Users/otaniels/Library/CloudStorage/OneDrive-TheBostonConsultingGroup,Inc/Documents/NielsOta/Code/univariate_plotter/data/test_sample.csv'), output_path=PosixPath('/Users/otaniels/Library/CloudStorage/OneDrive-TheBostonConsultingGroup,Inc/Documents/NielsOta/Code/univariate_plotter/data/images'), features=['A_TENURE_MONTHS_N', 'D_MAX_DAYS_PAST_DUE_6M_N', 'LN_LEXISNEXIS_SBFE_SCORE_CURRENT_N'], target='FLAG_60_DPD_366_DAYS')

In [7]:
# Load the sample dataset from the CSV path configured in `settings`.
df_sample = pd.read_csv(settings.file_path)

In [8]:
def build_univariate_plot(df, 
                          feature_col: str, 
                          target: str, 
                          colors: PlotColors = PlotColors(), 
                          show_plot: bool = True,
                          hoverinfo='all',
                          n_bins: int = 10,
                          bins: list = None):
    """
    Build the standard univariate plot for a single feature.

    Three components are computed from ``df`` and placed on one figure:
    a ``RocCurvePlot`` (row 1, col 1), a ``DistPlot`` (row 1, col 2) and a
    ``BinEventRatePlot`` (row 2, col 1, in a cell declared with
    ``colspan=2`` and ``secondary_y=True``).

    Parameters
    ----------
    df
        Data passed to each component's ``do_math``; must contain
        ``feature_col`` and ``target``.
    feature_col : str
        Name of the feature column to analyse.
    target : str
        Name of the target column.
    colors : PlotColors
        Colour palette handed to every component.
    show_plot : bool
        When True, call ``show()`` on the assembled plot before returning.
    hoverinfo
        Forwarded unchanged to every component.
    n_bins : int
        Forwarded to ``BinEventRatePlot``.
    bins : list, optional
        Forwarded to ``BinEventRatePlot``.

    Returns
    -------
    OtaPlotter
        The assembled plot object with the three subplots added.
    """
    # All components must be computed from the `df` argument. Reading the
    # notebook-level `df_sample` here would silently ignore the caller's data.
    roccurve = RocCurvePlot(hoverinfo=hoverinfo, colors=colors)
    roccurve.do_math(df, feature_col, target)

    distcurve = DistPlot(hoverinfo=hoverinfo, colors=colors)
    distcurve.do_math(df, feature_col, target)

    eventcurve = BinEventRatePlot(hoverinfo=hoverinfo, colors=colors, n_bins=n_bins, bins=bins)
    eventcurve.do_math(df, feature_col, target)

    # Row 1: two default cells. Row 2: one cell with colspan 2 and a
    # secondary y-axis; the trailing None fills the slot it covers.
    specs = [
        [{}, {}],
        [{"colspan": 2, "secondary_y": True}, None]
    ]

    plot = OtaPlotter(feature_col, target, specs)
    plot.build_subplot(roccurve, 1, 1)
    plot.build_subplot(distcurve, 1, 2)
    plot.build_subplot(eventcurve, 2, 1)

    if show_plot:
        plot.show()

    return plot

In [22]:
def build_univariate_plots(df, 
                       features: list, 
                       target: str, 
                       save_directory = pathlib.Path(),
                       colors: PlotColors = PlotColors(), 
                       show_plot: bool = False,
                       hoverinfo='all',
                       n_bins: int = 10,
                       bins: list = None):
    """
    Build and save the standard univariate plot for each requested feature.

    Parameters
    ----------
    df
        Data containing every feature column and the target column.
    features : list or str
        Feature column names; a single name given as a string is accepted
        and treated as a one-element list.
    target : str
        Name of the target column.
    save_directory
        Passed to each figure's ``save_fig``; defaults to ``pathlib.Path()``.
    colors, show_plot, hoverinfo, n_bins, bins
        Forwarded unchanged to ``build_univariate_plot`` for every feature.

    Raises
    ------
    ValueError
        If any requested feature is not a column of ``df``. All features are
        checked before any plot is built, so nothing is plotted or saved
        when the request is invalid.
    """
    if isinstance(features, str):
        features = [features]
    # Materialise once so the names can be validated and then iterated again,
    # even if the caller passed a one-shot iterable.
    features = list(features)

    # Fail fast: a bad name late in the list should not surface only after
    # the earlier plots have already been computed and written.
    for feature in features:
        if feature not in df.columns:
            raise ValueError(f'{feature} not in columns of dataframe')

    for feature in features:
        print(feature)

        fig = build_univariate_plot(df,
                                    feature,
                                    target,
                                    colors=colors,
                                    show_plot=show_plot,
                                    hoverinfo=hoverinfo,
                                    n_bins=n_bins,
                                    bins=bins)

        fig.save_fig(save_directory)

In [10]:
df_sample.columns

Index(['CUSTOMER_ID', 'REF_DATE', 'DATE', 'PHYSICAL_STATE', 'PHYSICAL_CITY',
       'PHYSICAL_POSTAL_CODE', 'A_TENURE_MONTHS_N', 'A_TENURE_GROUP_C',
       'A_YEARS_IN_BUSINESS_N', 'A_NUMBER_OF_DRIVERS_N',
       'A_NUMBER_OF_EMPLOYEES_N', 'A_NUMBER_OF_TRUCKS_N', 'FRAUD_FLAG_FP',
       'FRAUD_FLAG_OPERATIONS', 'THIN_FILE_INDICATOR',
       'SELF_REPORTED_INDICATOR', 'HAS_PARENT', 'PLATFORM',
       'D_N_DELINQUENCIES_6M_N', 'D_N_DELINQUENCIES_1M_N',
       'D_MAX_DAYS_PAST_DUE_6M_N', 'D_MAX_DAYS_PAST_DUE_1M_N',
       'D_DAYS_PAST_DUE_CURRENT_N', 'D_DAYS_SINCE_LAST_30DPD_CURRENT_N',
       'DNB_FAILURE_SCORE_N', 'DNB_DELINQUENCY_SCORE_N',
       'LN_LEXISNEXIS_SCORE_CURRENT_N', 'LN_LEXISNEXIS_SBFE_SCORE_CURRENT_N',
       'LN_LEXISNEXIS_THIN_FILE_FLAG_B', 'LN_LIMITED_CREDIT_HISTORY_FLAG_B',
       'BUSINESS_DATE_NEXT_60_DPD', 'DAYS_UNTIL_NEXT_60_DPD',
       'EXPOSURE_NEXT_60_DPD', 'FLAG_60_DPD_366_DAYS',
       'MC_MON_AMT_USED_FLAG_6M_C', 'T_SUM_GALLONS_1M',
       'T_TRX_AMT_TOTAL_

In [11]:
# Feature and target columns used by the exploratory cells below.
FEATURE = "D_MAX_DAYS_PAST_DUE_6M_N"
target = "FLAG_60_DPD_366_DAYS"

# Colour palettes. Each colour is written as a comma-separated triple string
# (presumably 'R, G, B' — confirm against PlotColors).
SOMEBANK_COLORS = PlotColors(
    primary_color = '231, 30, 87',
    secondary_color = '153, 204, 235',
    tertiary_color = '254, 189, 64',
    grey_tint_color = '110, 111, 115'
)

BCG_COLORS = PlotColors(
    primary_color = '40, 186, 116',
    secondary_color = '41, 94, 126',
    tertiary_color = '153, 204, 235',
    grey_tint_color = '110, 111, 115'
)

In [12]:
# ROC component for FEATURE against the shared `target` column
# (use the constant rather than repeating the column name literal).
roccurve = RocCurvePlot(hoverinfo='all')
roccurve.do_math(df_sample, FEATURE, target)

In [13]:
# Distribution component for FEATURE against the shared `target` column
# (use the constant rather than repeating the column name literal).
distcurve = DistPlot(hoverinfo='all')
distcurve.do_math(df_sample, FEATURE, target)

In [14]:
# Binned event-rate component with explicit bin edges, computed against the
# shared `target` column (constant instead of a repeated literal).
# NOTE(review): both n_bins and bins are supplied — confirm which one
# BinEventRatePlot gives precedence to.
eventcurve = BinEventRatePlot(hoverinfo='all', n_bins=5, bins=[0,1,10,20,100])
eventcurve.do_math(df_sample, FEATURE, target)

In [17]:
# Layout A: ROC and distribution in the first row; event rate in the second
# row, in a cell declared with colspan 2 and a secondary y-axis.
specs = [
    [{}, {}],
    [{"colspan": 2, "secondary_y": True}, None]
]

# Use the shared `target` constant rather than repeating the column name.
plot = OtaPlotter(FEATURE, target, specs)
plot.build_subplot(roccurve, 1, 1)
plot.build_subplot(distcurve, 1, 2)
plot.build_subplot(eventcurve, 2, 1)
plot.show()

In [18]:
# Layout B: event rate in the first row (cell declared with colspan 2 and a
# secondary y-axis); ROC then distribution in the second row.
specs = [
    [{"colspan": 2, "secondary_y": True}, None],
    [{}, {}]
]

# Use the shared `target` constant rather than repeating the column name.
plot = OtaPlotter(FEATURE, target, specs)
plot.build_subplot(roccurve, 2, 1)
plot.build_subplot(distcurve, 2, 2)
plot.build_subplot(eventcurve, 1, 1)
plot.show()

In [19]:
# Layout C: same grid as the previous layout, but with the second-row panels
# swapped — distribution in column 1, ROC in column 2.
specs = [
    [{"colspan": 2, "secondary_y": True}, None],
    [{}, {}]
]

# Use the shared `target` constant rather than repeating the column name.
plot = OtaPlotter(FEATURE, target, specs)
plot.build_subplot(roccurve, 2, 2)
plot.build_subplot(distcurve, 2, 1)
plot.build_subplot(eventcurve, 1, 1)
plot.show()

In [26]:
# Build and save the univariate plot for every feature listed in the
# project settings, writing into the configured output directory.
build_univariate_plots(
    df=df_sample,
    features=settings.features,
    target=settings.target,
    save_directory=settings.output_path,
)

A_TENURE_MONTHS_N
saving univariate anaylsis for A_TENURE_MONTHS_N
D_MAX_DAYS_PAST_DUE_6M_N
saving univariate anaylsis for D_MAX_DAYS_PAST_DUE_6M_N
LN_LEXISNEXIS_SBFE_SCORE_CURRENT_N
saving univariate anaylsis for LN_LEXISNEXIS_SBFE_SCORE_CURRENT_N


In [None]:
# Negative test: "poop" is not a column of df_sample, so
# build_univariate_plots is expected to raise ValueError. The error is caught
# and shown so a Restart & Run All still reaches the end of the notebook.
# The output directory comes from `settings`; `IMAGE_DIR` is not defined
# anywhere in this notebook.
try:
    build_univariate_plots(
        df_sample,
        ["A_TENURE_MONTHS_N", "D_MAX_DAYS_PAST_DUE_6M_N", "LN_LEXISNEXIS_SBFE_SCORE_CURRENT_N", "poop"],
        "FLAG_60_DPD_366_DAYS",
        settings.output_path
    )
except ValueError as err:
    print(f"expected failure: {err}")

In [None]:
# Negative test writing to a "bla" subfolder of the configured output
# directory: "poop" is not a column of df_sample, so a ValueError is
# expected. It is caught and shown so a Restart & Run All still completes.
# The base directory comes from `settings`; `IMAGE_DIR` is not defined
# anywhere in this notebook.
try:
    build_univariate_plots(
        df_sample,
        ["A_TENURE_MONTHS_N", "D_MAX_DAYS_PAST_DUE_6M_N", "LN_LEXISNEXIS_SBFE_SCORE_CURRENT_N", "poop"],
        "FLAG_60_DPD_366_DAYS",
        settings.output_path / "bla"
    )
except ValueError as err:
    print(f"expected failure: {err}")