In [1]:
import numpy as np

from skorecard import datasets
from skorecard.preprocessing import SimpleBucketTransformer
from skorecard.preprocessing import AgglomerativeBucketTransformer
from skorecard.preprocessing import QuantileBucketTransformer
from skorecard.preprocessing import TreeBucketTransformer

In [2]:
# Load the UCI credit card demo dataset as a DataFrame and preview the first 10 rows.
df = datasets.load_uci_credit_card(as_frame=True)
df.head(10)


Unnamed: 0,EDUCATION,MARRIAGE,LIMIT_BAL,BILL_AMT1,default
0,1,2,400000.0,201800.0,0
1,2,2,80000.0,80610.0,0
2,1,2,500000.0,499452.0,0
3,1,1,140000.0,450.0,1
4,2,1,420000.0,56107.0,0
5,1,1,280000.0,47.0,0
6,1,1,210000.0,3035.0,0
7,2,1,50000.0,13226.0,0
8,3,1,50000.0,650.0,0
9,3,1,140000.0,136918.0,0


In [None]:
# Check the dataset dimensions as (rows, columns).
df.shape

### Simple Example - Apply the same number of buckets to all features

In [None]:
# Take two columns as a plain array (column names are dropped by `.values`)
# and configure a bucketer with a single bin_count shared by all features.
X = df[['LIMIT_BAL', 'BILL_AMT1']].values
MyBucketTransformer = SimpleBucketTransformer(bin_count=5)

In [None]:
# Fit the bucketer on both columns of X.
MyBucketTransformer.fit(X)

In [None]:
# Apply the fitted bucketer to the same data it was fitted on.
MyBucketTransformer.transform(X)

In [None]:
# Inspect the fitted per-feature bucket information
# (entries are looked up as 'Feature_<position>' elsewhere in this notebook).
MyBucketTransformer.BucketDict

### Using a list for a different number of buckets per feature

In [None]:
# Feature matrix with the two columns to bucket (LIMIT_BAL, BILL_AMT1).
X = df[['LIMIT_BAL', 'BILL_AMT1']].values

In [None]:
# A list gives one bin count per feature.
# NOTE(review): presumably matched to the columns of X by position — confirm in the skorecard docs.
MyBucketTransformer = SimpleBucketTransformer(bin_count=[5, 7])

In [None]:
# Fit the per-feature bucketer on X.
MyBucketTransformer.fit(X)

In [None]:
# Apply the fitted per-feature bucketer to X.
MyBucketTransformer.transform(X)

### Practical Example with Pipelines

In [None]:
from sklearn.compose import ColumnTransformer
from skorecard.preprocessing import ManualBucketTransformer

In [None]:
# Reload the dataset for the pipeline example and preview the first 10 rows
# (same loader call as at the top of the notebook).
df = datasets.load_uci_credit_card(as_frame=True)
df.head(10)

In [None]:
# Each step is (name, bucketer, positional column indices). Indices refer to
# the columns of the array passed to fit/transform, not to column names.
# NOTE(review): with df.values, index 0..3 presumably map to EDUCATION,
# MARRIAGE, LIMIT_BAL, BILL_AMT1 — confirm against df.columns.
bucketing_steps = [
    ("simple", SimpleBucketTransformer(bin_count=2), [1]),
    ("agglom", AgglomerativeBucketTransformer(bin_count=4), [0]),
    ("quantile", QuantileBucketTransformer(bin_count=[10, 6]), [2, 3]),
]

# Columns not claimed by any step are passed through untouched.
transformer = ColumnTransformer(
    transformers=bucketing_steps,
    remainder="passthrough",
)

In [None]:
# Fit all steps and transform in one go; `df.values` is a plain array, which is
# why the steps select their columns by position.
transformer.fit_transform(df.values)

In [None]:
# Inspect the fitted bucket information of the 'quantile' step.
transformer.named_transformers_.quantile.BucketDict

In [None]:
# Show the fitted steps, accessible by the names given in the ColumnTransformer.
transformer.named_transformers_

In [None]:
# Collect the fitted bucket boundaries of every step into one mapping:
#   {original column index -> boundaries learned for that column}
#
# Iterate over the steps exactly as they were declared (`transformer.transformers`)
# rather than over `named_transformers_` minus its last entry: the latter only
# works when a trailing 'remainder' entry happens to be present.
example_boundary_dict = {}

for step_name, _, step_columns in transformer.transformers:
    fitted_step = transformer.named_transformers_[step_name]
    # Inside a step, features are keyed by their position within that step's
    # column list ('Feature_0', 'Feature_1', ...), not by the original index.
    for position, column_index in enumerate(step_columns):
        example_boundary_dict[column_index] = (
            fitted_step.BucketDict[f'Feature_{position}'].boundaries
        )

In [None]:
# Show the collected mapping of column index -> bucket boundaries.
example_boundary_dict

In [None]:
# Build a bucketer from explicit boundaries; the dict is keyed by column index.
MBT = ManualBucketTransformer(boundary_dict=example_boundary_dict)

In [None]:
# Work on a copy of the full frame and pass it as a plain array, so the
# integer keys of the boundary dict line up with column positions.
X = df.copy()
MBT.fit(X.values)
# `a` holds the transformed array; it is inspected in the next cell.
a = MBT.transform(X.values)

In [None]:
# Distinct values produced for column index 3 after manual bucketing.
# NOTE(review): index 3 is presumably BILL_AMT1 — confirm against df.columns.
np.unique(a[:,3])

In [None]:
# Tree Bucket Transformer

In [3]:
from skorecard.preprocessing import TreeBucketTransformer

In [4]:
# Two feature columns as a plain array, plus the 'default' column as the target
# passed to the tree-based bucketer's fit.
X = df[['LIMIT_BAL', 'BILL_AMT1']].values
y = df['default']

In [30]:
# Configure the tree-based bucketer.
# NOTE(review): apart from `inf_edges`, the keyword arguments look like
# decision-tree hyperparameters forwarded to the underlying tree — confirm
# in the skorecard docs.
tbt = TreeBucketTransformer(
    inf_edges=False,
    max_depth=2,
    criterion="entropy",
    min_samples_leaf=2000,  # minimum number of entries in the bins
    min_impurity_decrease=0.001,
)

In [31]:
# Fit the tree-based bucketer on the two feature columns using y as the target.
tbt.fit(X, y)

100%|██████████| 2/2 [00:00<00:00, 11290.19it/s]
100%|██████████| 1/1 [00:00<00:00, 6403.52it/s]

(6000, 2)
(6000, 2)





TreeBucketTransformer()

In [36]:
# `tbt` was fitted on two columns (LIMIT_BAL, BILL_AMT1). Passing a third
# column asks for a bucketer that was never fitted, so transform raises a
# KeyError. Catch and display it so the failure is shown on purpose and the
# notebook still runs top to bottom.
try:
    tbt.transform(df[['LIMIT_BAL', 'BILL_AMT1', 'EDUCATION']].values)
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: 'Feature_2'

In [33]:
# Display the transformer's repr.
tbt

TreeBucketTransformer()

In [34]:
# Inspect the fitted buckets per feature (bin count, counts and boundaries).
tbt.BucketDict

{'Feature_0': TreeBucketer
 	bincount: 2
 Results:
 	counts: [3076 2924]
 	boundaries: [10000.0, 145000.0, 760000.0],
 'Feature_1': TreeBucketer
 	bincount: 1
 Results:
 	counts: [6000]
 	boundaries: [-165580.0, 610723.0]}