In [1]:
%load_ext autoreload
%autoreload 2

from skorecard import datasets
from dabl import detect_types

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

from skorecard.bucketers import DecisionTreeBucketer, EqualWidthBucketer, OrdinalCategoricalBucketer

# from skorecard.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegression

# from feature_engine.discretisers import EqualWidthDiscretiser

import pandas as pd
import numpy as np

In [2]:
# Load the dataset through skorecard's loader; return_X_y=True is requested so
# the features (X) and the target (y) come back as two separate objects.
X, y = datasets.load_uci_credit_card(return_X_y=True)

In [3]:
# Specify categorical and numerical columns.
# We autodetect them with dabl.detect_types and read its per-column flags.
detected_types = detect_types(X)

# The flag columns are used as boolean masks, so they are combined directly;
# comparing each of them with `== True` first is redundant.
# (assumes the flags are plain booleans, as the mask usage implies)
cat_columns = X.columns[detected_types['categorical'] | detected_types['low_card_int']]
num_columns = X.columns[detected_types['continuous'] | detected_types['dirty_float']]
print(f"cat_columns = {cat_columns}")
print(f"num_columns = {num_columns}")

cat_columns = Index(['EDUCATION', 'MARRIAGE'], dtype='object')
num_columns = Index(['LIMIT_BAL', 'BILL_AMT1'], dtype='object')


In [4]:
# Variant 1: chain the bucketers with make_pipeline.
# Each bucketer is told which columns to handle through `variables`.
bucket_pipeline = make_pipeline(
    EqualWidthBucketer(bins=5, variables=list(num_columns)),
    OrdinalCategoricalBucketer(variables=list(cat_columns)),
)

# Bucketing -> one-hot encoding -> logistic regression, fitted right away
# (Pipeline.fit returns the fitted pipeline itself).
pipeline = Pipeline(
    steps=[
        ('bucketing', bucket_pipeline),
        ('one-hot-encoding', OneHotEncoder()),
        ('lr', LogisticRegression()),
    ]
).fit(X, y)

# Note: the AUC is scored on the same X the pipeline was fitted on (in-sample).
f"AUC = {roc_auc_score(y, pipeline.predict_proba(X)[:, 1]):.4f}"


'AUC = 0.6135'

In [5]:
# Variant 2: route columns with a ColumnTransformer.
# The columns for each bucketer are listed in the transformer spec here,
# rather than being passed through the bucketer's `variables` argument.
bucket_pipeline = ColumnTransformer(
    transformers=[
        ('categorical_preprocessing', OrdinalCategoricalBucketer(), ['EDUCATION', 'MARRIAGE']),
        ('numerical_preprocessing', EqualWidthBucketer(bins=5), ['LIMIT_BAL', 'BILL_AMT1']),
    ],
    remainder="passthrough",
)

# To inspect the bucketed matrix on its own:
# bucket_pipeline.fit_transform(X, y)

# Same downstream steps as the make_pipeline variant, fitted right away
# (Pipeline.fit returns the fitted pipeline itself).
pipeline = Pipeline(
    steps=[
        ('bucketing', bucket_pipeline),
        ('one-hot-encoding', OneHotEncoder()),
        ('lr', LogisticRegression()),
    ]
).fit(X, y)

# Note: the AUC is scored on the same X the pipeline was fitted on (in-sample).
f"AUC = {roc_auc_score(y, pipeline.predict_proba(X)[:, 1]):.4f}"

'AUC = 0.6135'

In [6]:
# Pull the fitted categorical bucketer out of the ColumnTransformer and apply it
# to the full X. Per the output displayed below, the result keeps every column of
# X, with EDUCATION / MARRIAGE replaced by bucket ids.
X_transform = bucket_pipeline.named_transformers_['categorical_preprocessing'].transform(X)

In [7]:
X_transform

Unnamed: 0,EDUCATION,MARRIAGE,LIMIT_BAL,BILL_AMT1
0,1,1,400000.0,201800.0
1,3,1,80000.0,80610.0
2,1,1,500000.0,499452.0
3,1,2,140000.0,450.0
4,3,2,420000.0,56107.0
...,...,...,...,...
5995,3,2,170000.0,114007.0
5996,3,2,160000.0,76445.0
5997,1,2,160000.0,79244.0
5998,3,2,100000.0,0.0


In [8]:
bucket_pipeline.named_transformers_['categorical_preprocessing']

OrdinalCategoricalBucketer(variables=Index(['EDUCATION', 'MARRIAGE'], dtype='object'))

In [9]:
X['EDUCATION'].value_counts().keys()

Int64Index([2, 1, 3, 5, 4, 6, 0], dtype='int64')

In [10]:
# Summary table with one row per EDUCATION bucket.
# value_counts() is evaluated once; the share of each bucket is derived from
# those same counts (this is what normalize=True computes) instead of calling
# value_counts() three times and relying on the calls agreeing on row order.
tmp = (
    X_transform['EDUCATION']
    .value_counts()
    .rename_axis('BUCKET')
    .reset_index(name='NUMBER_IN_BUCKET')
    .assign(
        PERCENTAGE_IN_BUCKET=lambda counts: (
            counts['NUMBER_IN_BUCKET'] / counts['NUMBER_IN_BUCKET'].sum()
        )
    )
)

In [11]:
tmp

Unnamed: 0,BUCKET,NUMBER_IN_BUCKET,PERCENTAGE_IN_BUCKET
0,3,2725,0.454167
1,1,2186,0.364333
2,2,1013,0.168833
3,0,76,0.012667


In [12]:
# Put the target next to the bucketed features so it can be aggregated per bucket.
# This mutates X_transform in place; assumes y lines up with X_transform's
# index — TODO confirm.
X_transform['target'] = y

In [13]:
# Sum `target` within each EDUCATION bucket and expose the result as BADS,
# keyed by a BUCKET column so it can be merged onto the summary table.
df = (
    X_transform
    .groupby('EDUCATION')
    .agg(BADS=('target', 'sum'))
    .rename_axis('BUCKET')
    .reset_index()
)

In [14]:
# Attach the columns of df (the per-bucket BADS built just before) to the
# summary table via the BUCKET key; how='left' keeps every row of tmp.
# Not idempotent: tmp is overwritten, so re-running this cell merges the same
# column again and pandas disambiguates the duplicates with _x / _y suffixes.
tmp = tmp.merge(df, how='left', on='BUCKET')

In [15]:
# Share of BADS among the rows in each bucket.
# On dividing by zero: NUMBER_IN_BUCKET comes from value_counts(), which for the
# integer bucket ids shown above only lists buckets that occur, so each count is
# at least 1. (With a categorical dtype, unobserved categories could show up with
# a count of 0; pandas would then yield NaN/inf rather than raise.)
tmp['DEFAULT_RATE'] = tmp['BADS'] / tmp['NUMBER_IN_BUCKET']

In [16]:
tmp

Unnamed: 0,BUCKET,NUMBER_IN_BUCKET,PERCENTAGE_IN_BUCKET,BADS,DEFAULT_RATE
0,3,2725,0.454167,688,0.252477
1,1,2186,0.364333,397,0.18161
2,2,1013,0.168833,255,0.251728
3,0,76,0.012667,5,0.065789


In [17]:
# Keep the raw EDUCATION values next to the bucketed ones so each bucket can be
# described by the original values it contains. Mutates X_transform in place;
# the assignment aligns X['EDUCATION'] on the index.
X_transform['EDUCATION_ORIGINAL'] = X['EDUCATION']

In [21]:
# Smallest original EDUCATION value found in each bucket, exposed as MIN and
# keyed by a BUCKET column for merging onto the summary table.
df = (
    X_transform
    .groupby('EDUCATION')
    .agg(MIN=('EDUCATION_ORIGINAL', 'min'))
    .rename_axis('BUCKET')
    .reset_index()
)

In [22]:
df

Unnamed: 0,BUCKET,MIN
0,0,0
1,1,1
2,2,3
3,3,2


In [23]:
# Attach the columns of df (the per-bucket MIN built just before) to the summary
# table via BUCKET. Re-running without rebuilding tmp duplicates the column
# (pandas adds _x / _y suffixes).
tmp = tmp.merge(df, how='left', on='BUCKET')

In [24]:
tmp

Unnamed: 0,BUCKET,NUMBER_IN_BUCKET,PERCENTAGE_IN_BUCKET,BADS,DEFAULT_RATE,MIN
0,3,2725,0.454167,688,0.252477,2
1,1,2186,0.364333,397,0.18161,1
2,2,1013,0.168833,255,0.251728,3
3,0,76,0.012667,5,0.065789,0


In [25]:
# Largest original EDUCATION value found in each bucket, exposed as MAX and
# keyed by a BUCKET column for merging onto the summary table.
df = (
    X_transform
    .groupby('EDUCATION')
    .agg(MAX=('EDUCATION_ORIGINAL', 'max'))
    .rename_axis('BUCKET')
    .reset_index()
)

In [26]:
# Attach the columns of df (the per-bucket MAX built just before) to the summary
# table via BUCKET. Re-running without rebuilding tmp duplicates the column
# (pandas adds _x / _y suffixes).
tmp = tmp.merge(df, how='left', on='BUCKET')

In [27]:
tmp

Unnamed: 0,BUCKET,NUMBER_IN_BUCKET,PERCENTAGE_IN_BUCKET,BADS,DEFAULT_RATE,MIN,MAX
0,3,2725,0.454167,688,0.252477,2,2
1,1,2186,0.364333,397,0.18161,1,1
2,2,1013,0.168833,255,0.251728,3,3
3,0,76,0.012667,5,0.065789,0,6


In [28]:
# Mean of the original EDUCATION values in each bucket, exposed as MEAN and
# keyed by a BUCKET column for merging onto the summary table.
df = (
    X_transform
    .groupby('EDUCATION')
    .agg(MEAN=('EDUCATION_ORIGINAL', 'mean'))
    .rename_axis('BUCKET')
    .reset_index()
)

In [29]:
# Attach the columns of df (the per-bucket MEAN built just before) to the summary
# table via BUCKET. Re-running without rebuilding tmp duplicates the column
# (pandas adds _x / _y suffixes).
tmp = tmp.merge(df, how='left', on='BUCKET')

In [31]:
tmp

Unnamed: 0,BUCKET,NUMBER_IN_BUCKET,PERCENTAGE_IN_BUCKET,BADS,DEFAULT_RATE,MIN,MAX,MEAN
0,3,2725,0.454167,688,0.252477,2,2,2.0
1,1,2186,0.364333,397,0.18161,1,1,1.0
2,2,1013,0.168833,255,0.251728,3,3,3.0
3,0,76,0.012667,5,0.065789,0,6,4.907895


In [None]:
bucket_pipeline.named_transformers_['categorical_preprocessing']

In [None]:
def create_df(X, y, bucketer):
    X = X.copy()
    X_transform = bucketer.transform(X)
    df = pd.DataFrame({'BUCKET': X_transform['EDUCATION'].value_counts().keys(),
                    'NUMBER_IN_BUCKET': X_transform['EDUCATION'].value_counts().values,
                    'PERCENTAGE_IN_BUCKET': X_transform['EDUCATION'].value_counts(normalize=True).values})
    X_transform['target'] = y
    
    # Default Rates
    tmp = (X_transform.groupby(['EDUCATION'])['target']
           .sum()
           .reset_index()
           .rename(columns={'EDUCATION': 'BUCKET',
                            'target': 'BADS'}))
    
    