In [1]:
import numpy as np

from skorecard import datasets
from skorecard.bucketing.bucketing import BucketTransformer as BT

In [2]:
# Load the UCI credit card dataset bundled with skorecard as a DataFrame
# (as_frame=True) and preview its first 10 rows.
df = datasets.load_uci_credit_card(as_frame=True)
df.head(10)

Unnamed: 0,EDUCATION,MARRIAGE,LIMIT_BAL,BILL_AMT1,default
0,1,2,400000.0,201800.0,0
1,2,2,80000.0,80610.0,0
2,1,2,500000.0,499452.0,0
3,1,1,140000.0,450.0,1
4,2,1,420000.0,56107.0,0
5,1,1,280000.0,47.0,0
6,1,1,210000.0,3035.0,0
7,2,1,50000.0,13226.0,0
8,3,1,50000.0,650.0,0
9,3,1,140000.0,136918.0,0


In [3]:
# Keep only the two continuous columns and hand them to the bucketer as a
# plain NumPy array: column 0 is LIMIT_BAL, column 1 is BILL_AMT1.
numeric_columns = ['LIMIT_BAL', 'BILL_AMT1']
X = df[numeric_columns].values

## Simple Transformer

In [4]:
# Bucket transformer using the 'simple' method with 5 bins per feature.
MyBucketTransformer = BT(method='simple', bin_count=5)

In [5]:
# Fit the bucketer on X only (no target is passed). The fitted per-feature
# bucketers are inspected further down through BucketDict.
MyBucketTransformer.fit(X)

BucketTransformer(bin_count=5, mapping=None, method='simple')

In [6]:
# Apply the fitted transformer to the same data it was fitted on.
# NOTE(review): presumably each value is replaced by its bucket index -- the
# next cell treats the result that way; confirm against the skorecard docs.
X_transformed = MyBucketTransformer.transform(X)

In [7]:
# Count how many rows fall into each bucket for column 1 of X (BILL_AMT1).
# np.bincount only accepts integer input, hence the cast.
np.bincount(X_transformed[:,1].astype(int))

array([   3, 5408,  490,   75,   24])

In [8]:
# Inspect the fitted bucketer stored under the key for feature 0
# (presumably the first column of X, LIMIT_BAL -- verify the key naming).
MyBucketTransformer.BucketDict['Bucketer_simple_feature_0']

SimpleBucketer
	bincount: 5
Results:
	counts: [3285 1783  660  251   21]
	boundaries: [ 10000. 160000. 310000. 460000. 610000. 760000.]

In [9]:
import yaml

In [10]:
# Write the fitted transformer to 'tmpdict.yaml' (relative to the working
# directory); the next cell reads this file back with PyYAML.
MyBucketTransformer.save('tmpdict.yaml')

In [11]:
import yaml

# Read back the YAML file written by MyBucketTransformer.save('tmpdict.yaml').
# A parse failure is deliberately NOT caught: printing the error and carrying
# on would leave `config` undefined (or stale from an earlier run) and make
# the following cell fail with a misleading error. Letting yaml.YAMLError
# propagate stops the notebook at the real cause.
with open('tmpdict.yaml', 'r') as stream:
    config = yaml.safe_load(stream)

In [13]:
# Show the entry saved under 'Feature 0' in the YAML file that was just loaded.
config['Feature 0']

{'bin_count': 5,
 'boundaries': [10000.0, 160000.0, 310000.0, 460000.0, 610000.0, 760000.0],
 'method': 'simple'}

## Agglomerative Transformer

In [None]:
# Same transformer class, now with the 'agglomerative' method and 5 bins.
# This rebinds MyBucketTransformer, replacing any previously fitted instance.
MyBucketTransformer = BT(method='agglomerative', bin_count=5)

In [None]:
# Fit the agglomerative bucketer on X (no target is passed).
MyBucketTransformer.fit(X)

In [None]:
# Transform the data the bucketer was just fitted on; overwrites X_transformed.
X_transformed = MyBucketTransformer.transform(X)

In [None]:
# Rows per bucket for column 1 of X; the cast is needed because np.bincount
# only accepts integer input.
np.bincount(X_transformed[:,1].astype(int))

## Quantile Transformer

In [None]:
# Same transformer class, now with the 'quantile' method and 5 bins.
# This rebinds MyBucketTransformer, replacing any previously fitted instance.
MyBucketTransformer = BT(method='quantile', bin_count=5)

In [None]:
# Fit the quantile bucketer on X (no target is passed).
MyBucketTransformer.fit(X)

In [None]:
# Transform the data the bucketer was just fitted on; overwrites X_transformed.
X_transformed = MyBucketTransformer.transform(X)

In [None]:
# Rows per bucket for column 1 of X; the cast is needed because np.bincount
# only accepts integer input.
np.bincount(X_transformed[:,1].astype(int))

## Save Transformer

In [None]:
# Save whichever transformer MyBucketTransformer currently refers to (the
# quantile one when the notebook is run top to bottom).
# NOTE(review): the target name has no '.yaml' extension, unlike the
# 'tmpdict.yaml' used earlier -- confirm this is intended.
MyBucketTransformer.save('Example_YAML')

## Example Pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [None]:
# Reload the dataset for the pipeline example (rebinds df) and preview the
# first 10 rows.
df = datasets.load_uci_credit_card(as_frame=True)
df.head(10)

In [None]:
# Feature matrix (all four predictors) and binary target as NumPy arrays.
# Note that this rebinds X, which held only two columns earlier on.
feature_columns = ["EDUCATION", "MARRIAGE", "LIMIT_BAL", "BILL_AMT1"]
X = df[feature_columns].values
y = df["default"].values

# Hold out 25% of the rows for evaluation. The seed is fixed so that the split,
# and therefore the score reported at the end of the notebook, is identical on
# every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [None]:
# Logistic regression with scikit-learn's default settings; it becomes the
# final step of the pipeline built in the next cell.
lr = LogisticRegression()

In [None]:
# Two-step pipeline: bucket every feature into 2 bins with the 'simple'
# method, then feed the bucketed features to the logistic regression.
pipeline_steps = [
    ('Bucket', BT(method='simple', bin_count=2)),
    ('logistic', lr),
]
pipe = Pipeline(steps=pipeline_steps)

In [None]:
# Fit both pipeline steps (bucketing, then logistic regression) on the
# training split only.
pipe.fit(X_train, y_train)

In [None]:
# ROC AUC measures how well the model ranks positives above negatives, so it
# must be given a continuous score. Hard 0/1 labels from predict() collapse
# the ROC curve to a single threshold and misstate the AUC. Column 1 of
# predict_proba is the probability of the positive class (default == 1).
roc_auc_score(y_test, pipe.predict_proba(X_test)[:, 1])