In [None]:
%load_ext autoreload
%autoreload 2

from skorecard import datasets

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score


# from skorecard.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegression

# from feature_engine.discretisers import EqualWidthDiscretiser

import pandas as pd
import numpy as np

### Load Data

In [None]:
# Load the UCI credit card dataset that ships with skorecard; return_X_y=True
# yields the features (X) and the target (y) as two separate objects.
X, y = datasets.load_uci_credit_card(return_X_y=True)

In [None]:
# Preview the first rows of the feature frame.
X.head()

In [None]:
# How often each distinct EDUCATION value occurs.
X['EDUCATION'].value_counts()

In [None]:
# How often each distinct MARRIAGE value occurs.
X['MARRIAGE'].value_counts()

In [None]:
# Summary statistics for LIMIT_BAL.
X['LIMIT_BAL'].describe()

In [None]:
# Summary statistics for BILL_AMT1.
X['BILL_AMT1'].describe()

### Specify categorical and numerical columns

#### We'll autodetect the column types using `dabl.detect_types`

In [None]:
from dabl import detect_types

In [None]:
# Run dabl's type detection on the feature frame and display the result.
# The result is indexed by type name below ('categorical', 'continuous', ...).
detected_types = detect_types(X)
detected_types

In [None]:
# Treat columns that dabl flagged as categorical or as low-cardinality integers as
# categorical. The detection flags are combined directly as a boolean mask instead of
# being compared to True first.
cat_columns = X.columns[detected_types['categorical'] | detected_types['low_card_int']]

In [None]:
# Treat columns that dabl flagged as `continuous` or `dirty_float` as numerical.
# The detection flags are combined directly as a boolean mask instead of being
# compared to True first.
num_columns = X.columns[detected_types['continuous'] | detected_types['dirty_float']]

# Show both column groups so the split can be checked by eye.
print(f"cat_columns = {cat_columns}")
print(f"num_columns = {num_columns}")

## Bucketers

### EqualWidthBucketer

In [None]:
from skorecard.bucketers import EqualWidthBucketer

In [None]:
# Create an equal-width bucketer configured with 5 bins.
EWB = EqualWidthBucketer(bins = 5) # show non-int

In [None]:
# Demonstration of a failure mode: single brackets select a pandas Series, whereas the
# bucketer needs a pandas DataFrame (the next cell shows the working call).
# The error is caught and printed so that "Restart Kernel -> Run All" does not stop here.
try:
    EWB.fit(X['LIMIT_BAL'])
except Exception as err:  # broad on purpose: the exact exception type is not the point
    print(f"{type(err).__name__}: {err}")

In [None]:
# Fit on a one-column DataFrame: double brackets keep LIMIT_BAL as a DataFrame
# instead of collapsing it to a Series.
EWB.fit(X[['LIMIT_BAL']])

In [None]:
# Inspect the `bins` attribute of the fitted bucketer.
EWB.bins

In [None]:
# Inspect the `bucketer` attribute of the fitted bucketer.
EWB.bucketer # Note the equal (well, almost) widths!

In [None]:
# Inspect `features_bucket_mapping_` on the fitted bucketer.
# NOTE(review): presumably holds the learned bucket boundaries per feature — confirm.
EWB.features_bucket_mapping_

In [None]:
# Apply the fitted equal-width bucketer to LIMIT_BAL.
X_transform = EWB.transform(X[['LIMIT_BAL']])

In [None]:
# Display the transformed frame.
X_transform

In [None]:
# Count the rows per distinct value of the transformed LIMIT_BAL column.
X_transform['LIMIT_BAL'].value_counts()

### EqualFrequencyBucketer

In [None]:
from skorecard.bucketers import EqualFrequencyBucketer

In [None]:
# Create an equal-frequency bucketer configured with 5 bins.
EFB = EqualFrequencyBucketer(bins = 5)

In [None]:
# Fit on BILL_AMT1, passed as a one-column DataFrame.
EFB.fit(X[['BILL_AMT1']])

In [None]:
# Inspect the `variables` attribute of the fitted bucketer.
EFB.variables

In [None]:
# Inspect the `bucketer` attribute of the fitted bucketer.
EFB.bucketer # Note the counts

In [None]:
# Apply the fitted equal-frequency bucketer to BILL_AMT1.
X_transform = EFB.transform(X[['BILL_AMT1']])

In [None]:
# Preview the first rows of the transformed frame.
X_transform.head()

In [None]:
# Count the rows per distinct transformed value
# (DataFrame.value_counts counts unique rows of the frame).
X_transform.value_counts()

#### Not always perfect: LIMIT_BAL

In [None]:
# Repeat the equal-frequency fit with a new 5-bin bucketer, this time on LIMIT_BAL.
# Note that this rebinds the notebook-level name EFB.
EFB = EqualFrequencyBucketer(bins = 5)
EFB.fit(X[['LIMIT_BAL']])
EFB.bucketer # Note the counts

### AgglomerativeClusteringBucketer

In [None]:
from skorecard.bucketers import AgglomerativeClusteringBucketer

In [None]:
# Create an agglomerative-clustering bucketer configured with 5 bins.
ACB = AgglomerativeClusteringBucketer(bins = 5)

In [None]:
# Fit on BILL_AMT1, passed as a one-column DataFrame.
ACB.fit(X[['BILL_AMT1']])

In [None]:
# Inspect the `bucketer` attribute of the fitted bucketer.
ACB.bucketer

In [None]:
# Apply the fitted agglomerative-clustering bucketer to BILL_AMT1.
X_transform = ACB.transform(X[['BILL_AMT1']])

In [None]:
# Preview the first rows of the transformed frame.
X_transform.head()

In [None]:
# Count the rows per distinct transformed value
# (DataFrame.value_counts counts unique rows of the frame).
X_transform.value_counts()

### Categoricals

In [None]:
from skorecard.bucketers import OrdinalCategoricalBucketer

In [None]:
# Demonstration of a failure mode: constructing this bucketer with `bins` fails
# (the working cells below configure it through `tol` / `max_n_categories` instead).
# The error is caught and printed so that "Restart Kernel -> Run All" does not stop here.
try:
    OCB = OrdinalCategoricalBucketer(bins=3)
except Exception as err:  # broad on purpose: the exact exception type is not the point
    print(f"{type(err).__name__}: {err}")

In [None]:
# Relative frequency (share of rows) of each MARRIAGE value.
X['MARRIAGE'].value_counts(normalize=True)

In [None]:
# Create the categorical bucketer with tol=0.15 and no cap on the number of categories.
# NOTE(review): tol is presumably a minimum frequency share per category — confirm.
OCB = OrdinalCategoricalBucketer(tol=0.15, max_n_categories=None)

In [None]:
# Fit and apply the categorical bucketer to BILL_AMT1 in one step and display the result.
# NOTE(review): BILL_AMT1 is handled as a numerical column elsewhere in this notebook;
# the purpose of bucketing it categorically here is not stated — confirm.
OCB.fit_transform(X[['BILL_AMT1']])

In [None]:
# Create a fresh bucketer instance with tol=0.15 before fitting it on MARRIAGE.
OCB = OrdinalCategoricalBucketer(tol=0.15, max_n_categories=None)

In [None]:
# Fit on MARRIAGE, passed as a one-column DataFrame.
OCB.fit(X[['MARRIAGE']])

In [None]:
# Apply the fitted categorical bucketer to MARRIAGE.
X_transform = OCB.transform(X[['MARRIAGE']])

In [None]:
# Share of rows per distinct transformed value.
X_transform.value_counts(normalize=True)

### And if we increase `tol`?

In [None]:
# Same MARRIAGE experiment with tol raised from 0.15 to 0.50:
# build a new bucketer, fit it, transform, and show the share of rows per resulting value.
OCB = OrdinalCategoricalBucketer(tol=0.50, max_n_categories=None)
OCB.fit(X[['MARRIAGE']])
X_transform = OCB.transform(X[['MARRIAGE']])
X_transform.value_counts(normalize=True)

### DecisionTreeBucketer

In [None]:
from skorecard.bucketers import DecisionTreeBucketer

In [None]:
# Create a decision-tree bucketer with its default parameters.
DTB = DecisionTreeBucketer()

In [None]:
# Fit on LIMIT_BAL; unlike the bucketers above, this fit is also given the target y.
DTB.fit(X[['LIMIT_BAL']], y)

In [None]:
# Apply the fitted decision-tree bucketer to LIMIT_BAL.
X_transform = DTB.transform(X[['LIMIT_BAL']])

In [None]:
# Count the rows per distinct value of the transformed LIMIT_BAL column.
X_transform['LIMIT_BAL'].value_counts()

In [None]:
# Inspect the `bucketer` attribute of the fitted bucketer.
DTB.bucketer

In [None]:
# Inspect `features_bucket_mapping_` on the fitted bucketer.
# NOTE(review): presumably holds the learned bucket boundaries per feature — confirm.
DTB.features_bucket_mapping_

In [None]:
# Show the bucketer's parameters as returned by get_params().
DTB.get_params()

## Making a pipeline

In [None]:
# Chain two bucketers: equal-width bucketing (5 bins) restricted to the numerical columns,
# followed by the ordinal categorical bucketer restricted to the categorical columns.
# The auto-detected column indexes are converted to plain lists for `variables`.
bucket_pipeline = make_pipeline(
    EqualWidthBucketer(bins=5, variables=list(num_columns)),
    OrdinalCategoricalBucketer(variables=list(cat_columns))
)

In [None]:
# Display the bucketing pipeline.
bucket_pipeline

In [None]:
# Show the pipeline steps by name (make_pipeline generates the step names automatically).
bucket_pipeline.named_steps

In [None]:
# Full model pipeline with explicitly named steps:
# bucket the features, one-hot encode the result, then fit a logistic regression.
pipeline = Pipeline([
    ('bucketing', bucket_pipeline),
    ('one-hot-encoding', OneHotEncoder()),
    ('lr', LogisticRegression())
])

In [None]:
# Display the full pipeline.
pipeline

In [None]:
# Show the full pipeline's steps by name.
pipeline.named_steps

In [None]:
# Fit the full pipeline, then report the AUC on the same data it was fitted on
# (an in-sample score, not a held-out estimate).
pipeline.fit(X, y)
in_sample_scores = pipeline.predict_proba(X)[:, 1]  # second column of predict_proba
f"AUC = {roc_auc_score(y, in_sample_scores):.4f}"

In [None]:
# Same model as before, but the numerical columns are bucketed with a decision tree
# (max_depth=4, min_samples_leaf=0.1) instead of equal-width bins.
numerical_bucketer = DecisionTreeBucketer(
    max_depth=4,
    min_samples_leaf=0.1,
    variables=list(num_columns),
)
categorical_bucketer = OrdinalCategoricalBucketer(variables=list(cat_columns))
bucket_pipeline = make_pipeline(numerical_bucketer, categorical_bucketer)

# Bucketing -> one-hot encoding -> logistic regression, with the same step names as before.
model_steps = [
    ('bucketing', bucket_pipeline),
    ('one-hot-encoding', OneHotEncoder()),
    ('lr', LogisticRegression()),
]
pipeline = Pipeline(model_steps)

# Fit and report the AUC on the data the pipeline was fitted on (in-sample score).
pipeline.fit(X, y)
in_sample_scores = pipeline.predict_proba(X)[:, 1]  # second column of predict_proba
f"AUC = {roc_auc_score(y, in_sample_scores):.4f}"

In [None]:
# Show which columns were auto-detected as categorical.
cat_columns

In [None]:
# Using a ColumnTransformer

# Bucket only the categorical columns; every other column is passed through unchanged.
# The column names are passed as a plain list, as in the make_pipeline cells above.
bucket_pipeline = ColumnTransformer([
    ('categorical_preprocessing', OrdinalCategoricalBucketer(), list(cat_columns)),
], remainder="passthrough")

# ColumnTransformer emits its transformers' outputs in the order they are listed and puts
# the passthrough remainder last, so the bucketed categoricals sit in the first
# len(cat_columns) output positions. Deriving the positions from cat_columns (instead of
# hard-coding [0, 1]) keeps the one-hot encoding aligned with whatever the type detection
# returned. This assumes the bucketer emits one output column per input column.
cat_positions = list(range(len(cat_columns)))

# One-hot encode the bucketed categoricals by position; the rest is passed through.
ohe_pipeline = ColumnTransformer([
    ('ohe_cat_preprocessing', OneHotEncoder(), cat_positions),
], remainder="passthrough")

pipeline = Pipeline([
    ('bucketing', bucket_pipeline),
    ('ohe', ohe_pipeline),
    ('lr', LogisticRegression())
])

# Fit and report the AUC on the data the pipeline was fitted on (in-sample score).
pipeline.fit(X, y)
f"AUC = {roc_auc_score(y, pipeline.predict_proba(X)[:,1]):.4f}"

In [None]:
# Using a ColumnTransformer

# Bucket the categorical columns with the ordinal categorical bucketer and two numerical
# columns (LIMIT_BAL, BILL_AMT1) with a decision-tree bucketer; any other column is passed
# through unchanged. The column names are passed as a plain list, as in the make_pipeline
# cells above.
bucket_pipeline = ColumnTransformer([
    ('categorical_preprocessing', OrdinalCategoricalBucketer(), list(cat_columns)),
    ('numerical_preprocessing', DecisionTreeBucketer(max_depth=4,min_samples_leaf=0.1), ['LIMIT_BAL','BILL_AMT1'])
], remainder="passthrough")

# ColumnTransformer emits its transformers' outputs in the order they are listed, so the
# bucketed categoricals sit in the first len(cat_columns) output positions. Deriving the
# positions from cat_columns (instead of hard-coding [0, 1]) keeps the one-hot encoding
# aligned with whatever the type detection returned. This assumes the bucketer emits one
# output column per input column.
# NOTE(review): the two decision-tree-bucketed columns that follow are still passed through
# un-encoded, exactly as before — confirm whether they should be one-hot encoded too.
cat_positions = list(range(len(cat_columns)))

# One-hot encode the bucketed categoricals by position; the rest is passed through.
ohe_pipeline = ColumnTransformer([
    ('ohe_cat_preprocessing', OneHotEncoder(), cat_positions),
], remainder="passthrough")

pipeline = Pipeline([
    ('bucketing', bucket_pipeline),
    ('ohe', ohe_pipeline),
    ('lr', LogisticRegression())
])

# Fit and report the AUC on the data the pipeline was fitted on (in-sample score).
pipeline.fit(X, y)
f"AUC = {roc_auc_score(y, pipeline.predict_proba(X)[:,1]):.4f}"