In [None]:
%load_ext autoreload
%autoreload 2

from skorecard import datasets

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score


# from skorecard.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegression

# from feature_engine.discretisers import EqualWidthDiscretiser

import pandas as pd
import numpy as np

### Load Data

`skorecard` ships a demo dataset with 4 features (2 categorical and 2 numerical) for demos and testing.<br>
We'll use it here.

In [None]:
# Load the demo data as a features/target pair (return_X_y=True).
X, y = datasets.load_uci_credit_card(return_X_y=True)

In [None]:
# Preview the first rows of the feature frame.
X.head()

In [None]:
# How often each EDUCATION value occurs.
X['EDUCATION'].value_counts()

In [None]:
# How often each MARRIAGE value occurs.
X['MARRIAGE'].value_counts()

In [None]:
# Summary statistics of LIMIT_BAL.
X['LIMIT_BAL'].describe()

In [None]:
# Summary statistics of BILL_AMT1.
X['BILL_AMT1'].describe()

### Specify categorical and numerical columns

#### We'll autodetect using dabl.detect_types

In [None]:
from dabl import detect_types

In [None]:
# Let dabl infer the type flags for X; the result is reused below to split
# the columns into categorical and numerical groups.
detected_types = detect_types(X)
detected_types

In [None]:
# Treat columns flagged as categorical or as low-cardinality integers as categorical.
# The type flags are used directly as a boolean mask; comparing them with
# `== True` is redundant.
cat_columns = X.columns[detected_types['categorical'] | detected_types['low_card_int']]

In [None]:
# Treat columns flagged as continuous or as dirty floats as numerical.
# The type flags are used directly as a boolean mask; comparing them with
# `== True` is redundant.
num_columns = X.columns[detected_types['continuous'] | detected_types['dirty_float']]

# Show both column groups side by side.
print(f"cat_columns = {cat_columns}")
print(f"num_columns = {num_columns}")

# Bucketers

The core of `skorecard` are the bucketers.<br>

All bucketers are `transformer`s; in other words, they can be used in `sklearn` pipelines.

The bucketers rely on `probatus` to perform the bucketing.<br>

The bucketers are:

- EqualWidthBucketer (histogram)
- EqualFrequencyBucketer (quantiles)
- AgglomerativeClusteringBucketer (uses agglomerative clustering to put the data together)
- DecisionTreeBucketer (uses a decision tree to find the optimal bucketing)
- OrdinalCategoricalBucketer (used for categorical features)

### EqualWidthBucketer

In [None]:
from skorecard.bucketers import EqualWidthBucketer

In [None]:
# Equal-width bucketer asked for 5 bins (original presenter note: "show non-int").
EWB = EqualWidthBucketer(bins=5)

In [None]:
# Fitting on a Series (single brackets) is expected to fail: per the original
# note, the bucketer needs a pandas DataFrame. The error is caught and shown
# so the notebook still survives "Restart Kernel -> Run All".
try:
    EWB.fit(X['LIMIT_BAL'])
except Exception as err:  # the exact exception type is skorecard's choice; show whatever is raised
    print(f"{type(err).__name__}: {err}")

In [None]:
# Double brackets select a one-column DataFrame instead of a Series.
EWB.fit(X[['LIMIT_BAL']])

Return the probatus bucketer

In [None]:
# Inspect the underlying bucketer object of the fitted EWB.
EWB.bucketer  # Note the equal (well, almost) widths!

#### One of the main components of `skorecard`: `FeaturesBucketMapping`

`FeaturesBucketMapping` is a dictionary extension that stores the bucketing in a format that is shareable across different components.

After the bucketer is fitted, the attribute `features_bucket_mapping_` is a dictionary with {'feature name': FeaturesBucketMapping}.

In [None]:
# Bucket mapping stored on the fitted equal-width bucketer.
EWB.features_bucket_mapping_

In [None]:
# Apply the fitted equal-width buckets to LIMIT_BAL.
X_transform = EWB.transform(X[['LIMIT_BAL']])

In [None]:
# Show only the first rows, as the later cells do, instead of dumping the whole transformed frame.
X_transform.head()

In [None]:
# Number of rows that landed in each LIMIT_BAL bucket.
X_transform['LIMIT_BAL'].value_counts()

#### Multiple features at the same time

Every bucketer accepts a keyword `variables`, which is the list of columns you want to apply the bucketer to.

In [None]:
# One bucketer handling two columns at once via `variables`
# (original presenter note: "show non-int").
EWB = EqualWidthBucketer(
    bins=5,
    variables=['LIMIT_BAL', 'BILL_AMT1'],
)

# Fit and transform in one go; preview the first rows of the result.
EWB.fit_transform(X).head()

In [None]:
# Show the input frame again, to compare with the bucketed output above.
X.head()

### EqualFrequencyBucketer

In [None]:
from skorecard.bucketers import EqualFrequencyBucketer

In [None]:
# Equal-frequency (quantile) bucketer asked for 5 bins.
EFB = EqualFrequencyBucketer(bins=5)

In [None]:
# Fit on BILL_AMT1, passed as a one-column DataFrame.
EFB.fit(X[['BILL_AMT1']])

In [None]:
# Bucket mapping stored on the fitted equal-frequency bucketer.
EFB.features_bucket_mapping_

In [None]:
# Inspect the underlying bucketer object of the fitted EFB.
EFB.bucketer  # Note the counts

Note the counts!

In [None]:
# Apply the fitted equal-frequency buckets to BILL_AMT1.
X_transform = EFB.transform(X[['BILL_AMT1']])

In [None]:
# Preview the bucketed BILL_AMT1 values.
X_transform.head()

In [None]:
# How often each bucket value occurs in the transformed frame.
X_transform.value_counts()

#### Not always perfect: LIMIT_BAL

In [None]:
# Repeat the equal-frequency experiment on LIMIT_BAL with a fresh bucketer.
EFB = EqualFrequencyBucketer(bins = 5)
EFB.fit(X[['LIMIT_BAL']])
# Display the underlying bucketer to inspect the resulting counts.
EFB.bucketer # Note the counts

When the feature is skewed, it's not ideal!

In [None]:
# Histogram of LIMIT_BAL; the trailing `;` hides the array-of-axes repr
# that would otherwise be printed above the plot.
X[['LIMIT_BAL']].hist();

### Agglomerative

In [None]:
from skorecard.bucketers import AgglomerativeClusteringBucketer

In [None]:
# Agglomerative-clustering bucketer asked for 5 bins.
ACB = AgglomerativeClusteringBucketer(bins=5)

In [None]:
# Fit on BILL_AMT1, passed as a one-column DataFrame.
ACB.fit(X[['BILL_AMT1']])

In [None]:
# Inspect the underlying bucketer object of the fitted ACB.
ACB.bucketer

In [None]:
# Apply the fitted agglomerative buckets to BILL_AMT1.
X_transform = ACB.transform(X[['BILL_AMT1']])

In [None]:
# Preview the bucketed BILL_AMT1 values.
X_transform.head()

In [None]:
# How often each bucket value occurs in the transformed frame.
X_transform.value_counts()

### Categoricals

In [None]:
from skorecard.bucketers import OrdinalCategoricalBucketer

In [None]:
# Categorical bucketer with tol=0.15 and no cap on the number of categories
# (max_n_categories=None).
OCB = OrdinalCategoricalBucketer(tol=0.15, max_n_categories=None)

In [None]:
# Fit on MARRIAGE, passed as a one-column DataFrame.
OCB.fit(X[['MARRIAGE']])

In [None]:
# Apply the fitted categorical buckets to MARRIAGE.
X_transform = OCB.transform(X[['MARRIAGE']])

Starting point

In [None]:
# Relative frequency of each raw MARRIAGE value (normalize=True).
X['MARRIAGE'].value_counts(normalize=True)

After transformation - buckets get merged together

In [None]:
# Relative frequency of each bucket after the transformation (normalize=True).
X_transform.value_counts(normalize=True)

### And if we increase tol?

In [None]:
# Same experiment as above, but with tol raised from 0.15 to 0.50.
OCB = OrdinalCategoricalBucketer(tol=0.50, max_n_categories=None)
OCB.fit(X[['MARRIAGE']])
X_transform = OCB.transform(X[['MARRIAGE']])
# Relative frequency of each resulting bucket.
X_transform.value_counts(normalize=True)

### DecisionTreeBucketer

In [None]:
from skorecard.bucketers import DecisionTreeBucketer

In [None]:
# Decision-tree bucketer with no arguments, i.e. its default settings.
DTB = DecisionTreeBucketer()

In [None]:
# Unlike the bucketers above, this fit also receives the target y.
DTB.fit(X[['LIMIT_BAL']], y)

In [None]:
# Apply the fitted tree-based buckets to LIMIT_BAL.
X_transform = DTB.transform(X[['LIMIT_BAL']])

In [None]:
# Distinct bucket values produced for LIMIT_BAL.
X_transform['LIMIT_BAL'].unique()

In [None]:
# Inspect the underlying bucketer object of the fitted DTB.
DTB.bucketer

### Too many bins!!!

If you look at the decision tree, it used the sklearn default hyperparameters.<br>
Such trees overfit, as they keep growing for as long as possible.

In [None]:
# Rein the tree in so that it cannot keep splitting indefinitely.
DTB = DecisionTreeBucketer(
    max_depth=4,  # at most 2^4 = 16 bins
    min_samples_leaf=0.1,  # no bin may fall below 10% of the rows
)

# The target is needed for training the tree.
DTB.fit(X[['LIMIT_BAL']], y)

Note that no bin's share of the rows drops below 0.1.

In [None]:
# Share of rows per bucket: bucket counts divided by the number of rows in X.
DTB.bucketer.counts / len(X)

In [None]:
# Bucket mapping stored on the fitted decision-tree bucketer.
DTB.features_bucket_mapping_

# Making a sklearn pipeline

Putting it all together: there are multiple options!

Use make pipeline

In [None]:
# Chain two bucketers: equal-width bins for the numerical columns,
# then the categorical bucketer for the categorical columns.
numerical_bucketer = EqualWidthBucketer(bins=5, variables=list(num_columns))
categorical_bucketer = OrdinalCategoricalBucketer(variables=list(cat_columns))
bucket_pipeline = make_pipeline(numerical_bucketer, categorical_bucketer)

In [None]:
# Display the assembled bucketing pipeline.
bucket_pipeline

In [None]:
# make_pipeline names the steps automatically; list them here.
bucket_pipeline.named_steps

In [None]:
# Full model: bucket the features, one-hot encode the buckets,
# then pass them through a logistic regression.
pipeline = Pipeline(
    steps=[
        ('bucketing', bucket_pipeline),
        ('one-hot-encoding', OneHotEncoder()),
        ('lr', LogisticRegression()),
    ]
)

In [None]:
# Display the full modelling pipeline.
pipeline

In [None]:
# The three named steps: 'bucketing', 'one-hot-encoding' and 'lr'.
pipeline.named_steps

In [None]:
# Fit on the full dataset, then score the same rows: this is an in-sample AUC.
pipeline.fit(X, y)
train_scores = pipeline.predict_proba(X)[:, 1]
f"AUC = {roc_auc_score(y, train_scores):.4f}"

### Baseline check

One hot encode the categorical features, do nothing with the numericals

In [None]:
# Baseline: one-hot encode the categorical columns and pass every other
# column through untouched (remainder="passthrough").
ohe_pipeline = ColumnTransformer(
    transformers=[
        ('ohe_cat_preprocessing', OneHotEncoder(), cat_columns),
    ],
    remainder="passthrough",
)

pipeline = Pipeline(
    steps=[
        ('ohe', ohe_pipeline),
        ('lr', LogisticRegression()),
    ]
)

# Fit on the full dataset and report the in-sample AUC.
pipeline.fit(X, y)
train_scores = pipeline.predict_proba(X)[:, 1]
f"AUC = {roc_auc_score(y, train_scores):.4f}"

### Try now with a decision tree bucketer

In [None]:
# Bucketing stage: constrained decision-tree buckets for the numerical
# columns, then the categorical bucketer for the categorical columns.
bucket_pipeline = make_pipeline(
    DecisionTreeBucketer(
        max_depth=4,
        min_samples_leaf=0.1,
        variables=list(num_columns),
    ),
    OrdinalCategoricalBucketer(variables=list(cat_columns)),
)

# Modelling stage: buckets -> one-hot encoding -> logistic regression.
pipeline = Pipeline(
    steps=[
        ('bucketing', bucket_pipeline),
        ('one-hot-encoding', OneHotEncoder()),
        ('lr', LogisticRegression()),
    ]
)

# Fit on the full dataset and report the in-sample AUC.
pipeline.fit(X, y)
train_scores = pipeline.predict_proba(X)[:, 1]
f"AUC = {roc_auc_score(y, train_scores):.4f}"

#### Alternative - use the column transformer

In [None]:
# Same idea expressed with a ColumnTransformer: each bucketer is given its
# own explicit column list; anything not listed is passed through.
bucket_pipeline = ColumnTransformer(
    transformers=[
        (
            'categorical_preprocessing',
            OrdinalCategoricalBucketer(),
            ['EDUCATION', 'MARRIAGE'],
        ),
        (
            'numerical_preprocessing',
            DecisionTreeBucketer(max_depth=4, min_samples_leaf=0.1),
            ['LIMIT_BAL', 'BILL_AMT1'],
        ),
    ],
    remainder="passthrough",
)

# Modelling stage: buckets -> one-hot encoding -> logistic regression.
pipeline = Pipeline(
    steps=[
        ('bucketing', bucket_pipeline),
        ('one-hot-encoding', OneHotEncoder()),
        ('lr', LogisticRegression()),
    ]
)

# Fit on the full dataset and report the in-sample AUC.
pipeline.fit(X, y)
train_scores = pipeline.predict_proba(X)[:, 1]
f"AUC = {roc_auc_score(y, train_scores):.4f}"

### Back to the best-performing pipeline

In [None]:
# Rebuild the make_pipeline variant: tree buckets for the numerical columns,
# categorical bucketer (tol=0.15) for the categorical columns.
bucket_pipeline = make_pipeline(
    DecisionTreeBucketer(
        max_depth=4,
        min_samples_leaf=0.1,
        variables=list(num_columns),
    ),
    OrdinalCategoricalBucketer(
        tol=0.15,
        variables=list(cat_columns),
    ),
)

# Modelling stage: buckets -> one-hot encoding -> logistic regression.
pipeline = Pipeline(
    steps=[
        ('bucketing', bucket_pipeline),
        ('one-hot-encoding', OneHotEncoder()),
        ('lr', LogisticRegression()),
    ]
)

# Fit on the full dataset and report the in-sample AUC.
pipeline.fit(X, y)
train_scores = pipeline.predict_proba(X)[:, 1]
f"AUC = {roc_auc_score(y, train_scores):.4f}"

### Extract the bucketers from the pipeline 
A handy helper function for this is still WIP.

In [None]:
# Collect the bucket mappings of both bucketing steps in one mapping:
# start from a copy of the first step's mapping, then add the second step's.
all_features_bucket_mapping = bucket_pipeline.steps[0][1].features_bucket_mapping_.copy()
all_features_bucket_mapping.update(bucket_pipeline.steps[1][1].features_bucket_mapping_)


In [None]:
# Display the combined mapping of both bucketing steps.
all_features_bucket_mapping

# The user input bucketer

Probably, some manual tweaking might be required. 

Therefore there is a `UserInputBucketer`, which takes a `feature_bucket_mapping` and applies it to the dataframe.

In [None]:
from skorecard.bucketers import UserInputBucketer

In [None]:
# Build a bucketer from the mapping extracted above instead of fitting a new one.
UIbucketers = UserInputBucketer(all_features_bucket_mapping)

Put it in a pipeline...

In [None]:

# Same modelling pipeline as before, with the user-input bucketer as the bucketing step.
UI_pipeline = Pipeline(
    steps=[
        ('bucketing', UIbucketers),
        ('one-hot-encoding', OneHotEncoder()),
        ('lr', LogisticRegression()),
    ]
)

# Fit on the full dataset and report the in-sample AUC.
UI_pipeline.fit(X, y)
train_scores = UI_pipeline.predict_proba(X)[:, 1]
f"AUC = {roc_auc_score(y, train_scores):.4f}"

In [None]:
# Display the mapping again, now after it has been used by the UserInputBucketer pipeline.
all_features_bucket_mapping

### Example of manual tweak

In [None]:
# Tweak a copy so that the key reassigned below does not replace the entry in
# all_features_bucket_mapping (presumably a shallow copy; confirm if values get mutated).
manual_input = all_features_bucket_mapping.copy()

In [None]:
from skorecard.bucket_mapping import BucketMapping

In [None]:
# Hand-written bucket mapping for BILL_AMT1, replacing the fitted one.
bill_amt1_boundaries = [
    -165580.0,
    293.5,
    2559.5,
    13047.0,
    # 21671.5,  # merge the 3rd and 4th bucket together
    36140.5,
    62541.0,
    101520.0,
    610723.0,
]
manual_input['BILL_AMT1'] = BucketMapping(
    feature_name='BILL_AMT1',
    type='numerical',
    map=bill_amt1_boundaries,
    right=True,
)


In [None]:
# Bucketer driven by the manually tweaked mapping.
manualUIbucketers = UserInputBucketer(manual_input)

# Same modelling pipeline, now on top of the manual buckets.
manual_UI_pipeline = Pipeline(
    steps=[
        ('bucketing', manualUIbucketers),
        ('one-hot-encoding', OneHotEncoder()),
        ('lr', LogisticRegression()),
    ]
)

# Fit on the full dataset and report the in-sample AUC.
manual_UI_pipeline.fit(X, y)
train_scores = manual_UI_pipeline.predict_proba(X)[:, 1]
f"AUC = {roc_auc_score(y, train_scores):.4f}"

# The web app - still WIP

A web app is in the works (WIP) that will let you manually inspect and tweak the buckets, and then store them in a BucketMapping object that can be consumed by the model.

In [None]:
from skorecard.apps import ManualBucketerApp

In [None]:
# Build the manual bucketing app from the data and the manually tweaked bucketer.
app = ManualBucketerApp(X, y, manualUIbucketers)

In [None]:
# Start the app's server.
# NOTE(review): this presumably blocks the kernel while the server runs — confirm before "Run All".
app.run_server()