In [3]:
%load_ext autoreload
%autoreload 2

from skorecard.reporting import create_report 
from skorecard import datasets

import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

from skorecard.bucketers import DecisionTreeBucketer, EqualWidthBucketer, OrdinalCategoricalBucketer

from sklearn.linear_model import LogisticRegression
from plotly.subplots import make_subplots

from dabl import detect_types

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
import dash
import dash_html_components as html
import dash_core_components as dcc
from dash.dependencies import Input, Output
import plotly.express as px
import plotly.graph_objects as go
from jupyter_dash import JupyterDash
from dabl import detect_types
from sklearn.pipeline import make_pipeline
pd.options.plotting.backend = "plotly"

In [131]:
app = JupyterDash(__name__)

app.layout = html.Div([
    html.Div([
        dcc.Graph(id='barplot',
                  config={'displayModeBar': False},
                  animate=True)
    ]),
    
    html.Div([
        dcc.Slider(
            id='n_bin--slider',
            min=2,
            max=40,
            value=2,
            marks={str(i): str(i) for i in range(2, 40, 2)},
            step=None)
        ]),
    html.Div([
        dcc.Dropdown(
                id='dropdown-column',
                options=[{'label': i, 'value': i} for i in X.columns],
                value='LIMIT_BAL'
        )])
            ])


In [132]:
def generate_bucketed(n_bins, column):
    X, y = datasets.load_uci_credit_card(return_X_y=True)
    n_bins = int(n_bins)
    detected_types = detect_types(X)
    cat_columns = X.columns[(detected_types['categorical']==True) | (detected_types['low_card_int']==True)]
    num_columns = X.columns[(detected_types['continuous']==True) | (detected_types['dirty_float']==True)]

    bucket_pipeline = make_pipeline(
        EqualWidthBucketer(bins=n_bins, variables=list(num_columns)),
        OrdinalCategoricalBucketer(variables=list(cat_columns))
    )

    pipeline = Pipeline([
        ('bucketing', bucket_pipeline),
        ('one-hot-encoding', OneHotEncoder()),
        ('lr', LogisticRegression())
    ])

    pipeline.fit(X, y)
    auc = f"AUC = {roc_auc_score(y, pipeline.predict_proba(X)[:,1]):.4f}"
    
    
    bucketer=bucket_pipeline.named_steps['equalwidthbucketer']
    #column='LIMIT_BAL'
    X = X.copy()
    X_transform = bucketer.transform(X)
    df = pd.DataFrame(
        {
            "BUCKET": X_transform[column].value_counts().keys(),
            "NUMBER_IN_BUCKET": X_transform[column].value_counts().values,
            "PERCENTAGE_IN_BUCKET": X_transform[column].value_counts(normalize=True).values,
        }
    )
    X_transform["target"] = y

    # Defaults
    tmp = (
        X_transform.groupby([column])["target"].sum().reset_index().rename(columns={column: "BUCKET", "target": "BADS"})
    )

    # Merge defaults
    df = df.merge(tmp, how="left", on="BUCKET")

    # Default rates
    df["DEFAULT_RATE"] = df["BADS"] / df["NUMBER_IN_BUCKET"]  # todo: can we divide by 0 accidentally?

    # Get aggregations
    X_transform[f"{column}_ORIGINAL"] = X[column]
    tmp = X_transform.groupby([column]).agg({f"{column}_ORIGINAL": ["min", "max", "mean"]}).reset_index()

    # Rename due to .agg()
    tmp.columns = ["_".join(c) for c in tmp.columns.ravel()]
    tmp = tmp.rename(columns={f"{column}_": "BUCKET"})  # todo: there must be a cleaner way than this shit

    # Merge aggregations
    df = df.merge(tmp, how="left", on="BUCKET")
    
    return df, auc


def create_barplot(df, auc):
    bin_number = df.shape[0]
    df = df.sort_values('BUCKET')
    # Create figure with secondary y-axis
    fig = make_subplots(specs=[[{"secondary_y": True}]])
    
#     fig.update_traces(mode='lines+markers')
    fig.update_traces()

    fig.update_xaxes(showgrid=False)

    fig.update_yaxes(showgrid=False)
    
    fig.add_annotation(x=0, y=0.85, xanchor='left', yanchor='bottom',
                       xref='paper', yref='paper', showarrow=False, align='left',
                       bgcolor='rgba(255, 255, 255, 0.5)', text=bin_number)

    # Add traces
    fig.add_trace(
        go.Bar(x=df['BUCKET'], y=df['PERCENTAGE_IN_BUCKET'], name="Percentages"),
        secondary_y=False
    )

    fig.add_trace(
        go.Scatter(x=df['BUCKET'], y=df['DEFAULT_RATE'], name="Default Rates"),
        secondary_y=True
    )
    fig.update_yaxes(title_text="Percentage", secondary_y=False)
    fig.update_yaxes(title_text="Default Rate", secondary_y=True)

    fig.update_layout(
        title=auc,
        xaxis_title="Bucket Number",
        font_family="Courier New"
    )
    return fig

@app.callback(dash.dependencies.Output('barplot', 'figure'),
              [dash.dependencies.Input('n_bin--slider', 'value'),
               dash.dependencies.Input('dropdown-column', 'value')])
def update_plot(n_bins, column):
    ''' Update plot with new n_bins '''

    df, auc = generate_bucketed(n_bins, column)

    return create_barplot(df, auc)

In [133]:
app.run_server(mode='jupyterlab', port = 8890, dev_tools_ui=True, #debug=True,
              dev_tools_hot_reload =True, threaded=True)


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to th

In [96]:
tmp = generate_bucketed(12)
create_barplot(tmp)

TypeError: generate_bucketed() missing 1 required positional argument: 'column'


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to th

In [54]:
tmp

Unnamed: 0,BUCKET,NUMBER_IN_BUCKET,PERCENTAGE_IN_BUCKET,BADS,DEFAULT_RATE,LIMIT_BAL_ORIGINAL_min,LIMIT_BAL_ORIGINAL_max,LIMIT_BAL_ORIGINAL_mean
0,1,1861,0.310167,577,0.310048,10000.0,70000.0,40795.271359
1,2,1066,0.177667,260,0.243902,80000.0,130000.0,100975.609756
2,4,990,0.165,170,0.171717,200000.0,260000.0,223272.727273
3,3,829,0.138167,147,0.177322,140000.0,190000.0,161351.025332
4,5,428,0.071333,61,0.142523,270000.0,320000.0,294088.785047
5,6,350,0.058333,63,0.18,330000.0,380000.0,355342.857143
6,8,258,0.043,36,0.139535,450000.0,510000.0,488798.449612
7,7,173,0.028833,23,0.132948,390000.0,440000.0,411387.283237
8,10,21,0.0035,4,0.190476,580000.0,630000.0,604285.714286
9,9,14,0.002333,3,0.214286,520000.0,570000.0,538571.428571


In [85]:
X, y = datasets.load_uci_credit_card(return_X_y=True)
detected_types = detect_types(X)
cat_columns = X.columns[(detected_types['categorical']==True) | (detected_types['low_card_int']==True)]
num_columns = X.columns[(detected_types['continuous']==True) | (detected_types['dirty_float']==True)]


In [86]:
num_columns

Index(['LIMIT_BAL', 'BILL_AMT1'], dtype='object')

In [7]:
X, y = datasets.load_uci_credit_card(return_X_y=True)
detected_types = detect_types(X)
cat_columns = X.columns[(detected_types['categorical']) | (detected_types['low_card_int']==True)]
num_columns = X.columns[(detected_types['continuous']) | (detected_types['dirty_float']==True)]


In [8]:
cat_columns

Index(['EDUCATION', 'MARRIAGE'], dtype='object')

In [9]:
cat_columns = X.columns[(detected_types['categorical']) | (detected_types['low_card_int'])]

In [10]:
cat_columns

Index(['EDUCATION', 'MARRIAGE'], dtype='object')