In [1]:
# Enable IPython's autoreload extension so that edits to imported modules are
# picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

from skorecard import datasets
from dabl import detect_types

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

from skorecard.bucketers import DecisionTreeBucketer, EqualWidthBucketer, OrdinalCategoricalBucketer

# from skorecard.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegression

# from feature_engine.discretisers import EqualWidthDiscretiser

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Load the UCI credit card data from skorecard's datasets module, unpacked as
# a (features, target) pair.
X, y = datasets.load_uci_credit_card(return_X_y=True)

In [3]:
# Specify categorical and numerical columns.
# The types are auto-detected with dabl.detect_types; the result is indexed
# below as one flag column per detected type, aligned with X.columns.
detected_types = detect_types(X)

# Combine the type flags directly as boolean masks (no `== True` comparison
# needed). Assumes the flag columns are boolean dtype -- confirm against the
# installed dabl version.
# Low-cardinality integers are treated as categorical; dirty floats as numerical.
cat_columns = X.columns[detected_types["categorical"] | detected_types["low_card_int"]]
num_columns = X.columns[detected_types["continuous"] | detected_types["dirty_float"]]
print(f"cat_columns = {cat_columns}")
print(f"num_columns = {num_columns}")

cat_columns = Index(['EDUCATION', 'MARRIAGE'], dtype='object')
num_columns = Index(['LIMIT_BAL', 'BILL_AMT1'], dtype='object')


In [5]:
# Variant 1: chain the bucketers with make_pipeline.
# Each bucketer is told which columns to handle through its `variables` argument.
bucket_pipeline = make_pipeline(
    EqualWidthBucketer(variables=list(num_columns), bins=5),
    OrdinalCategoricalBucketer(variables=list(cat_columns)),
)

# Bucket -> one-hot encode -> logistic regression, fitted on the full data set.
pipeline = Pipeline(
    steps=[
        ("bucketing", bucket_pipeline),
        ("one-hot-encoding", OneHotEncoder()),
        ("lr", LogisticRegression()),
    ]
).fit(X, y)

# AUC on the same data the pipeline was fitted on (in-sample), scored from the
# second column of predict_proba.
f"AUC = {roc_auc_score(y, pipeline.predict_proba(X)[:, 1]):.4f}"


'AUC = 0.6135'

In [7]:
# Variant 2: route columns to the bucketers with a ColumnTransformer.
# The column lists come from the auto-detected `cat_columns` / `num_columns`
# (same source as the make_pipeline variant) instead of being hard-coded, so
# both variants stay in sync if the detection result changes.
bucket_pipeline = ColumnTransformer(
    [
        ("categorical_preprocessing", OrdinalCategoricalBucketer(), list(cat_columns)),
        ("numerical_preprocessing", EqualWidthBucketer(bins=5), list(num_columns)),
    ],
    # Columns not claimed by either bucketer are forwarded unchanged.
    remainder="passthrough",
)

# Bucket -> one-hot encode -> logistic regression, fitted on the full data set.
pipeline = Pipeline([
    ("bucketing", bucket_pipeline),
    ("one-hot-encoding", OneHotEncoder()),
    ("lr", LogisticRegression()),
])

pipeline.fit(X, y)
# AUC on the same data the pipeline was fitted on (in-sample), scored from the
# second column of predict_proba.
f"AUC = {roc_auc_score(y, pipeline.predict_proba(X)[:,1]):.4f}"

'AUC = 0.6135'