In [30]:
%load_ext autoreload
%autoreload 2

from skorecard import datasets
from dabl import detect_types

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

from skorecard.bucketers import DecisionTreeBucketer, EqualWidthBucketer, OrdinalCategoricalBucketer

# from skorecard.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegression

# from feature_engine.discretisers import EqualWidthDiscretiser

import pandas as pd
import numpy as np

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [68]:
# Load the UCI credit card demo data from skorecard.datasets as features X and target y.
X, y = datasets.load_uci_credit_card(return_X_y=True)

In [18]:
# Split the features into categorical and numerical column groups.
# dabl.detect_types(X) returns one row per column of X with a boolean flag per
# detected type, so the flag columns can be combined directly as masks
# (no `== True` comparison needed).
detected_types = detect_types(X)
is_categorical = detected_types['categorical'] | detected_types['low_card_int']
is_numerical = detected_types['continuous'] | detected_types['dirty_float']
cat_columns = X.columns[is_categorical]
num_columns = X.columns[is_numerical]
print(f"cat_columns = {cat_columns}")
print(f"num_columns = {num_columns}")

cat_columns = Index(['EDUCATION', 'MARRIAGE'], dtype='object')
num_columns = Index(['LIMIT_BAL', 'BILL_AMT1'], dtype='object')


In [19]:
# Variant 1: chain the two bucketers with make_pipeline.
# Each bucketer is told which columns it handles through `variables`.
bucket_pipeline = make_pipeline(
    EqualWidthBucketer(bins=5, variables=list(num_columns)),
    OrdinalCategoricalBucketer(variables=list(cat_columns)),
)

# Bucketing -> one-hot encoding of the bucket labels -> logistic regression.
pipeline = Pipeline(steps=[
    ('bucketing', bucket_pipeline),
    ('one-hot-encoding', OneHotEncoder()),
    ('lr', LogisticRegression()),
])
pipeline.fit(X, y)

# AUC on the same data the pipeline was fitted on (in-sample).
train_scores = pipeline.predict_proba(X)[:, 1]
f"AUC = {roc_auc_score(y, train_scores):.4f}"


'AUC = 0.6135'

In [20]:
# Using a ColumnTransformer
# Same bucketing as the make_pipeline variant, but each bucketer is routed to its
# columns by the ColumnTransformer. The column lists reuse cat_columns / num_columns
# (detected earlier) instead of repeating the column names by hand.
bucket_pipeline = ColumnTransformer([
    ('categorical_preprocessing', OrdinalCategoricalBucketer(), list(cat_columns)),
    ('numerical_preprocessing', EqualWidthBucketer(bins=5), list(num_columns)),
], remainder="passthrough")  # columns in neither list are passed through unchanged

# Bucketing -> one-hot encoding of the bucket labels -> logistic regression.
pipeline = Pipeline([
    ('bucketing', bucket_pipeline),
    ('one-hot-encoding', OneHotEncoder()),
    ('lr', LogisticRegression())
])

pipeline.fit(X, y)
# AUC on the same data the pipeline was fitted on (in-sample).
f"AUC = {roc_auc_score(y, pipeline.predict_proba(X)[:,1]):.4f}"

'AUC = 0.6135'

In [69]:
# Take the fitted categorical bucketer out of the ColumnTransformer and run it on X.
# Note: it is applied to the whole of X here, not only to the columns it was
# assigned inside the ColumnTransformer.
categorical_bucketer = bucket_pipeline.named_transformers_['categorical_preprocessing']
X_transform = categorical_bucketer.transform(X)

In [70]:
# Display the frame returned by the categorical bucketer.
X_transform

Unnamed: 0,EDUCATION,MARRIAGE,LIMIT_BAL,BILL_AMT1
0,1,1,400000.0,201800.0
1,3,1,80000.0,80610.0
2,1,1,500000.0,499452.0
3,1,2,140000.0,450.0
4,3,2,420000.0,56107.0
...,...,...,...,...
5995,3,2,170000.0,114007.0
5996,3,2,160000.0,76445.0
5997,1,2,160000.0,79244.0
5998,3,2,100000.0,0.0


In [24]:
# Inspect the fitted transformer registered under 'categorical_preprocessing'.
# NOTE(review): the recorded output below ('EDUCATION') does not look like a transformer
# repr and the execution counts are out of order — probably stale; re-run to confirm.
bucket_pipeline.named_transformers_['categorical_preprocessing']

'EDUCATION'

In [46]:
# Distinct raw EDUCATION values, in the order value_counts() reports them.
X['EDUCATION'].value_counts().index

Int64Index([2, 1, 3, 5, 4, 6, 0], dtype='int64')

In [71]:
# Per-bucket summary of the bucketed EDUCATION column: bucket label, number of rows
# and share of rows. value_counts() is computed once so the three columns are
# aligned by construction instead of relying on separate calls returning the same order.
education_counts = X_transform['EDUCATION'].value_counts()
tmp = pd.DataFrame({'BUCKET': education_counts.index,
                    'NUMBER_IN_BUCKET': education_counts.values,
                    'PERCENTAGE_IN_BUCKET': (education_counts / education_counts.sum()).values})

In [72]:
# Display the per-bucket counts and percentages.
tmp

Unnamed: 0,BUCKET,NUMBER_IN_BUCKET,PERCENTAGE_IN_BUCKET
0,3,2725,0.454167
1,1,2186,0.364333
2,2,1013,0.168833
3,0,76,0.012667


In [73]:
# Attach the target so bad counts can be aggregated per bucket.
# assign() builds a new frame instead of inserting a column in place, so the object
# returned by transform() is left untouched (it may share data with X — not verified)
# and re-running the cell stays safe.
X_transform = X_transform.assign(target=y)

In [74]:
# Sum of the target per EDUCATION bucket, reshaped into a two-column frame
# (BUCKET, BADS) ready to be merged into the bucket summary.
bads_per_bucket = X_transform.groupby('EDUCATION')['target'].sum()
df = bads_per_bucket.rename('BADS').rename_axis('BUCKET').reset_index()

In [75]:
# Add the BADS column to the bucket summary.
# Dropping a pre-existing BADS column first makes the cell safe to re-run (otherwise a
# second run produces BADS_x / BADS_y and later cells fail on tmp['BADS']);
# validate='one_to_one' raises if BUCKET is not unique on either side.
tmp = (
    tmp.drop(columns=['BADS'], errors='ignore')
       .merge(df, how='left', on='BUCKET', validate='one_to_one')
)

In [76]:
# Share of bads per bucket.
# Division by zero: NUMBER_IN_BUCKET was built from value_counts(), which reports
# buckets that actually occur, so the denominator should never be 0 here.
# NOTE(review): a pandas Categorical column can list unused categories with count 0 — confirm dtype.
tmp['DEFAULT_RATE'] = tmp['BADS'].div(tmp['NUMBER_IN_BUCKET'])

In [77]:
# Display the bucket summary including BADS and DEFAULT_RATE.
tmp

Unnamed: 0,BUCKET,NUMBER_IN_BUCKET,PERCENTAGE_IN_BUCKET,BADS,DEFAULT_RATE
0,3,2725,0.454167,688,0.252477
1,1,2186,0.364333,397,0.18161
2,2,1013,0.168833,255,0.251728
3,0,76,0.012667,5,0.065789


In [78]:
# Keep the raw EDUCATION value next to its bucket so the mapping can be inspected.
# assign() returns a new frame (rows aligned on the index) rather than inserting a
# column into the existing object in place, so the cell is safe to re-run.
X_transform = X_transform.assign(EDUCATION_ORIGINAL=X['EDUCATION'])

In [80]:
# Smallest original EDUCATION value found in each bucket.
# NOTE(review): min() shows only one original value per bucket; the raw column has more
# distinct values than there are buckets, so at least one bucket merges several values.
X_transform.groupby('EDUCATION', as_index=False)['EDUCATION_ORIGINAL'].min()

Unnamed: 0,EDUCATION,EDUCATION_ORIGINAL
0,0,0
1,1,1
2,2,3
3,3,2
