In [1]:
%load_ext autoreload
%autoreload 2

from skorecard import datasets

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score


# from skorecard.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegression

# from feature_engine.discretisers import EqualWidthDiscretiser

import pandas as pd
import numpy as np

### Load Data

In [2]:
X, y = datasets.load_uci_credit_card(return_X_y=True)

In [3]:
X.head()

Unnamed: 0,EDUCATION,MARRIAGE,LIMIT_BAL,BILL_AMT1
0,1,2,400000.0,201800.0
1,2,2,80000.0,80610.0
2,1,2,500000.0,499452.0
3,1,1,140000.0,450.0
4,2,1,420000.0,56107.0


In [4]:
X['EDUCATION'].value_counts()

2    2725
1    2186
3    1013
5      51
4      13
6      11
0       1
Name: EDUCATION, dtype: int64

In [5]:
X['MARRIAGE'].value_counts()

2    3138
1    2784
3      64
0      14
Name: MARRIAGE, dtype: int64

In [6]:
X['LIMIT_BAL'].describe()

count      6000.000000
mean     168855.000000
std      131970.103911
min       10000.000000
25%       50000.000000
50%      140000.000000
75%      240000.000000
max      760000.000000
Name: LIMIT_BAL, dtype: float64

In [7]:
X['BILL_AMT1'].describe()

count      6000.000000
mean      51958.809500
std       74508.518453
min     -165580.000000
25%        3645.750000
50%       23371.500000
75%       67381.750000
max      610723.000000
Name: BILL_AMT1, dtype: float64

### Specify categorical and numerical columns

#### We'll autodetect using dabl.detect_types

In [8]:
from dabl import detect_types

In [9]:
detected_types = detect_types(X)
detected_types

Unnamed: 0,continuous,dirty_float,low_card_int,categorical,date,free_string,useless
EDUCATION,False,False,True,False,False,False,False
MARRIAGE,False,False,False,True,False,False,False
LIMIT_BAL,True,False,False,False,False,False,False
BILL_AMT1,True,False,False,False,False,False,False


In [10]:
# Categorical features: every column dabl flags as `categorical` or `low_card_int`.
# The flag columns are already boolean, so they can be OR-ed directly
# (no `== True` comparison needed).
cat_columns = X.columns[detected_types['categorical'] | detected_types['low_card_int']]

In [11]:
# Numerical features: every column dabl flags as `continuous` or `dirty_float`.
# The flag columns are already boolean, so they can be OR-ed directly
# (no `== True` comparison needed).
num_columns = X.columns[detected_types['continuous'] | detected_types['dirty_float']]
print(f"cat_columns = {cat_columns}")
print(f"num_columns = {num_columns}")

cat_columns = Index(['EDUCATION', 'MARRIAGE'], dtype='object')
num_columns = Index(['LIMIT_BAL', 'BILL_AMT1'], dtype='object')


## Bucketers

### EqualWidthBucketer

In [15]:
from skorecard.bucketers import EqualWidthBucketer

In [16]:
EWB = EqualWidthBucketer(bins = 5) # show non-int

In [17]:
# Fitting on a Series (single-bracket selection) breaks: the bucketer needs a
# pandas DataFrame. The error is the point of this cell, so catch and display
# it instead of letting it abort a "Restart & Run All".
try:
    EWB.fit(X['LIMIT_BAL'])
except TypeError as exc:
    print(f"{type(exc).__name__}: {exc}")

TypeError: The data set should be a pandas dataframe

In [18]:
EWB.fit(X[['LIMIT_BAL']])

EqualWidthBucketer(bins=5, variables=Index(['LIMIT_BAL'], dtype='object'))

In [19]:
EWB.bins

5

In [20]:
EWB.bucketer # Note the equal (well, almost) widths!

SimpleBucketer
	bincount: 5
Results:
	counts: [3285 1783  660  251   21]
	boundaries: [ 10000. 160000. 310000. 460000. 610000. 760000.]

In [21]:
EWB.features_bucket_mapping_

{'LIMIT_BAL': BucketMapping(feature_name='LIMIT_BAL', type='numerical', map=array([ 10000., 160000., 310000., 460000., 610000., 760000.]), missing_bucket=None, right=True)}

In [22]:
X_transform = EWB.transform(X[['LIMIT_BAL']])

In [23]:
X_transform

Unnamed: 0,LIMIT_BAL
0,3
1,1
2,4
3,1
4,3
...,...
5995,2
5996,1
5997,1
5998,1


In [24]:
X_transform['LIMIT_BAL'].value_counts()

1    3414
2    1705
3     624
4     241
5      16
Name: LIMIT_BAL, dtype: int64

### EqualFrequencyBucketer

In [25]:
from skorecard.bucketers import EqualFrequencyBucketer

In [26]:
EFB = EqualFrequencyBucketer(bins = 5)

In [27]:
EFB.fit(X[['BILL_AMT1']])

EqualFrequencyBucketer(bins=5, variables=Index(['BILL_AMT1'], dtype='object'))

In [28]:
EFB.variables

Index(['BILL_AMT1'], dtype='object')

In [29]:
EFB.bucketer # Note the counts

QuantileBucketer
	bincount: 5
Results:
	counts: [1200 1200 1200 1200 1200]
	boundaries: [-165580.     1894.8   14167.6   38386.2   83379.4  610723. ]

In [30]:
X_transform = EFB.transform(X[['BILL_AMT1']])

In [31]:
X_transform.head()

Unnamed: 0,BILL_AMT1
0,5
1,4
2,5
3,1
4,4


In [32]:
X_transform.value_counts()

BILL_AMT1
5            1200
4            1200
3            1200
2            1200
1            1200
dtype: int64

#### Equal frequencies are not always achievable: LIMIT_BAL

In [34]:
# Refit the equal-frequency bucketer on LIMIT_BAL. Compare the `counts` in the
# output below with the BILL_AMT1 example above: here the buckets are no longer
# all the same size.
EFB = EqualFrequencyBucketer(bins = 5)
EFB.fit(X[['LIMIT_BAL']])
EFB.bucketer # Note the counts: they differ from bucket to bucket

QuantileBucketer
	bincount: 5
Results:
	counts: [1525  985 1200 1185 1105]
	boundaries: [ 10000.  50000. 100000. 180000. 280000. 760000.]

### AgglomerativeClusteringBucketer

In [35]:
from skorecard.bucketers import AgglomerativeClusteringBucketer

In [36]:
ACB = AgglomerativeClusteringBucketer(bins = 5)

In [37]:
ACB.fit(X[['BILL_AMT1']])

AgglomerativeClusteringBucketer(bins=5,
                                variables=Index(['BILL_AMT1'], dtype='object'))

In [38]:
ACB.bucketer

AgglomerativeBucketer
	bincount: 5
Results:
	counts: [ 361   88 3478  683 1390]
	boundaries: [-165580.0, 34211.5, 89007.0, 173337.5, 320945.0, 610723.0]

In [39]:
X_transform = ACB.transform(X[['BILL_AMT1']])

In [40]:
X_transform.head()

Unnamed: 0,BILL_AMT1
0,4
1,2
2,5
3,1
4,2


In [41]:
X_transform.value_counts()

BILL_AMT1
1            3478
2            1390
3             683
4             361
5              88
dtype: int64

### Categoricals

In [42]:
from skorecard.bucketers import OrdinalCategoricalBucketer

In [43]:
# Unlike the numerical bucketers, OrdinalCategoricalBucketer has no `bins`
# argument, so this construction fails. The error is the demonstration: catch
# and display it so the notebook still survives "Restart & Run All".
try:
    OCB = OrdinalCategoricalBucketer(bins=3)
except TypeError as exc:
    print(f"{type(exc).__name__}: {exc}")

TypeError: __init__() got an unexpected keyword argument 'bins'

In [44]:
X['MARRIAGE'].value_counts(normalize=True)

2    0.523000
1    0.464000
3    0.010667
0    0.002333
Name: MARRIAGE, dtype: float64

In [45]:
OCB = OrdinalCategoricalBucketer(tol=0.15, max_n_categories=None)

In [47]:
# Categorical bucketer applied to a *continuous* column: every row displayed
# below ends up in bucket 0.
# NOTE(review): presumably because no single BILL_AMT1 value reaches the
# tol=0.15 frequency threshold -- confirm against the bucketer's documentation.
OCB.fit_transform(X[['BILL_AMT1']])

Unnamed: 0,BILL_AMT1
0,0
1,0
2,0
3,0
4,0
...,...
5995,0
5996,0
5997,0
5998,0


In [55]:
OCB = OrdinalCategoricalBucketer(tol=0.15, max_n_categories=None)

In [56]:
OCB.fit(X[['MARRIAGE']])

OrdinalCategoricalBucketer(tol=0.15,
                           variables=Index(['MARRIAGE'], dtype='object'))

In [50]:
X_transform = OCB.transform(X[['MARRIAGE']])

In [52]:
X_transform.value_counts(normalize=True)

MARRIAGE
1           0.523
2           0.464
0           0.013
dtype: float64

### And if we increase tol?

In [54]:
# Same experiment with tol raised to 0.50. In the output below only one
# category keeps a bucket of its own (52.3% of rows); everything else (47.7%)
# is lumped together in bucket 0.
marriage = X[['MARRIAGE']]
OCB = OrdinalCategoricalBucketer(tol=0.50, max_n_categories=None)
OCB.fit(marriage)
X_transform = OCB.transform(marriage)
X_transform.value_counts(normalize=True)

MARRIAGE
1           0.523
0           0.477
dtype: float64

### DecisionTreeBucketer

In [57]:
from skorecard.bucketers import DecisionTreeBucketer

In [58]:
DTB = DecisionTreeBucketer()

In [59]:
DTB.fit(X[['LIMIT_BAL']], y)

100%|██████████| 62/62 [00:00<00:00, 21445.39it/s]


DecisionTreeBucketer(ccp_alpha=0.0, class_weight=None, criterion='gini',
                     max_depth=None, max_features=None, max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, presort='deprecated',
                     random_state=None, splitter='best')

In [60]:
X_transform = DTB.transform(X[['LIMIT_BAL']])

In [61]:
X_transform['LIMIT_BAL'].value_counts()

5     676
2     396
3     327
8     319
20    302
     ... 
53      3
57      3
56      2
62      2
61      2
Name: LIMIT_BAL, Length: 62, dtype: int64

In [62]:
DTB.bucketer

TreeBucketer
	bincount: 62
Results:
	counts: [ 83 396 327  43 676 168 168 319 113 217 120 156 141 149 209 129 111 185
  46 302 138  98 137 125  81 109  49 100  67 106  51  55  41  44  44 166
  18  37  31  45  24  34  20  19  31  15  16  11  13 163   9   5   3   4
   7   2   3   7   4   6   2   2]
	boundaries: [10000.0, 15000.0, 25000.0, 35000.0, 45000.0, 55000.0, 65000.0, 75000.0, 85000.0, 95000.0, 105000.0, 115000.0, 125000.0, 135000.0, 145000.0, 155000.0, 165000.0, 175000.0, 185000.0, 195000.0, 205000.0, 215000.0, 225000.0, 235000.0, 245000.0, 255000.0, 265000.0, 275000.0, 285000.0, 295000.0, 305000.0, 315000.0, 325000.0, 335000.0, 345000.0, 355000.0, 365000.0, 375000.0, 385000.0, 395000.0, 405000.0, 415000.0, 425000.0, 435000.0, 445000.0, 455000.0, 465000.0, 475000.0, 485000.0, 495000.0, 505000.0, 515000.0, 525000.0, 545000.0, 555000.0, 585000.0, 595000.0, 605000.0, 625000.0, 635000.0, 715000.0, 735000.0, 760000.0]

In [63]:
DTB.features_bucket_mapping_

{'LIMIT_BAL': BucketMapping(feature_name='LIMIT_BAL', type='numerical', map=[10000.0, 15000.0, 25000.0, 35000.0, 45000.0, 55000.0, 65000.0, 75000.0, 85000.0, 95000.0, 105000.0, 115000.0, 125000.0, 135000.0, 145000.0, 155000.0, 165000.0, 175000.0, 185000.0, 195000.0, 205000.0, 215000.0, 225000.0, 235000.0, 245000.0, 255000.0, 265000.0, 275000.0, 285000.0, 295000.0, 305000.0, 315000.0, 325000.0, 335000.0, 345000.0, 355000.0, 365000.0, 375000.0, 385000.0, 395000.0, 405000.0, 415000.0, 425000.0, 435000.0, 445000.0, 455000.0, 465000.0, 475000.0, 485000.0, 495000.0, 505000.0, 515000.0, 525000.0, 545000.0, 555000.0, 585000.0, 595000.0, 605000.0, 625000.0, 635000.0, 715000.0, 735000.0, 760000.0], missing_bucket=None, right=True)}

In [64]:
DTB.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': 'deprecated',
 'random_state': None,
 'splitter': 'best'}

## Making a pipeline

In [65]:
# Two bucketing steps chained together: equal-width bins for the numerical
# columns, then ordinal bucketing for the categorical ones. Each bucketer is
# restricted to its own columns through `variables`.
numerical_bucketer = EqualWidthBucketer(bins=5, variables=list(num_columns))
categorical_bucketer = OrdinalCategoricalBucketer(variables=list(cat_columns))
bucket_pipeline = make_pipeline(numerical_bucketer, categorical_bucketer)

In [66]:
bucket_pipeline

Pipeline(steps=[('equalwidthbucketer',
                 EqualWidthBucketer(bins=5,
                                    variables=['LIMIT_BAL', 'BILL_AMT1'])),
                ('ordinalcategoricalbucketer',
                 OrdinalCategoricalBucketer(variables=['EDUCATION',
                                                       'MARRIAGE']))])

In [67]:
bucket_pipeline.named_steps

{'equalwidthbucketer': EqualWidthBucketer(bins=5, variables=['LIMIT_BAL', 'BILL_AMT1']),
 'ordinalcategoricalbucketer': OrdinalCategoricalBucketer(variables=['EDUCATION', 'MARRIAGE'])}

In [68]:
# Full model: bucket the raw features, one-hot encode the resulting bucket ids,
# then fit a logistic regression on the encoded columns.
pipeline = Pipeline(
    steps=[
        ('bucketing', bucket_pipeline),
        ('one-hot-encoding', OneHotEncoder()),
        ('lr', LogisticRegression()),
    ]
)

In [69]:
pipeline

Pipeline(steps=[('bucketing',
                 Pipeline(steps=[('equalwidthbucketer',
                                  EqualWidthBucketer(bins=5,
                                                     variables=['LIMIT_BAL',
                                                                'BILL_AMT1'])),
                                 ('ordinalcategoricalbucketer',
                                  OrdinalCategoricalBucketer(variables=['EDUCATION',
                                                                        'MARRIAGE']))])),
                ('one-hot-encoding', OneHotEncoder()),
                ('lr', LogisticRegression())])

In [70]:
pipeline.named_steps

{'bucketing': Pipeline(steps=[('equalwidthbucketer',
                  EqualWidthBucketer(bins=5,
                                     variables=['LIMIT_BAL', 'BILL_AMT1'])),
                 ('ordinalcategoricalbucketer',
                  OrdinalCategoricalBucketer(variables=['EDUCATION',
                                                        'MARRIAGE']))]),
 'one-hot-encoding': OneHotEncoder(),
 'lr': LogisticRegression()}

In [71]:
# NOTE: the AUC below is in-sample -- the pipeline is scored on the very same
# X, y it was fitted on, so it is not an estimate of out-of-sample performance.
pipeline.fit(X, y)
scores = pipeline.predict_proba(X)[:, 1]  # second column of predict_proba
f"AUC = {roc_auc_score(y, scores):.4f}"

'AUC = 0.6135'

In [73]:
# Same three-stage model as the equal-width version, except that the numerical
# columns are now bucketed by a shallow decision tree.
tree_bucketer = DecisionTreeBucketer(
    max_depth=4, min_samples_leaf=0.1, variables=list(num_columns)
)
categorical_bucketer = OrdinalCategoricalBucketer(variables=list(cat_columns))
bucket_pipeline = make_pipeline(tree_bucketer, categorical_bucketer)

pipeline = Pipeline(
    steps=[
        ('bucketing', bucket_pipeline),
        ('one-hot-encoding', OneHotEncoder()),
        ('lr', LogisticRegression()),
    ]
)

# NOTE: in-sample AUC -- scored on the same X, y used for fitting.
pipeline.fit(X, y)
scores = pipeline.predict_proba(X)[:, 1]
f"AUC = {roc_auc_score(y, scores):.4f}"

100%|██████████| 8/8 [00:00<00:00, 10679.32it/s]
100%|██████████| 8/8 [00:00<00:00, 26337.86it/s]


'AUC = 0.6418'

In [75]:
cat_columns

Index(['EDUCATION', 'MARRIAGE'], dtype='object')

In [81]:
# Using a ColumnTransformer

# Step 1: bucket only the categorical columns. All other columns are passed
# through unchanged and placed after the bucketed ones.
bucket_pipeline = ColumnTransformer(
    transformers=[
        ('categorical_preprocessing', OrdinalCategoricalBucketer(), cat_columns),
    ],
    remainder="passthrough",
)

# Step 2: one-hot encode the two bucketed categorical columns. They are
# addressed by position (0 and 1) because step 1 emits them first; the
# passed-through numerical columns reach the model as raw values.
ohe_pipeline = ColumnTransformer(
    transformers=[
        ('ohe_cat_preprocessing', OneHotEncoder(), [0, 1]),
    ],
    remainder="passthrough",
)

pipeline = Pipeline(
    steps=[
        ('bucketing', bucket_pipeline),
        ('ohe', ohe_pipeline),
        ('lr', LogisticRegression()),
    ]
)

# NOTE: in-sample AUC -- scored on the same X, y used for fitting.
pipeline.fit(X, y)
scores = pipeline.predict_proba(X)[:, 1]
f"AUC = {roc_auc_score(y, scores):.4f}"

'AUC = 0.6086'

In [84]:
# Using a ColumnTransformer

# Step 1: bucket the categorical columns and, separately, the two numerical
# columns with a shallow decision tree. Output column order follows the
# transformer order: categorical buckets first, then numerical buckets.
bucket_pipeline = ColumnTransformer(
    transformers=[
        ('categorical_preprocessing', OrdinalCategoricalBucketer(), cat_columns),
        (
            'numerical_preprocessing',
            DecisionTreeBucketer(max_depth=4, min_samples_leaf=0.1),
            ['LIMIT_BAL', 'BILL_AMT1'],
        ),
    ],
    remainder="passthrough",
)

# Step 2: one-hot encode positions 0 and 1 only (the categorical buckets).
# NOTE(review): the tree-bucketed numerical columns (positions 2 and 3) are
# passed through as plain bucket ids and are NOT one-hot encoded, unlike the
# make_pipeline variant earlier in this notebook -- confirm this is intended.
ohe_pipeline = ColumnTransformer(
    transformers=[
        ('ohe_cat_preprocessing', OneHotEncoder(), [0, 1]),
    ],
    remainder="passthrough",
)

pipeline = Pipeline(
    steps=[
        ('bucketing', bucket_pipeline),
        ('ohe', ohe_pipeline),
        ('lr', LogisticRegression()),
    ]
)

# NOTE: in-sample AUC -- scored on the same X, y used for fitting.
pipeline.fit(X, y)
scores = pipeline.predict_proba(X)[:, 1]
f"AUC = {roc_auc_score(y, scores):.4f}"

100%|██████████| 8/8 [00:00<00:00, 19553.86it/s]
100%|██████████| 8/8 [00:00<00:00, 23399.19it/s]


'AUC = 0.6212'