In [None]:
%load_ext autoreload
%autoreload 2

from skorecard import datasets

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score


# from skorecard.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegression

# from feature_engine.discretisers import EqualWidthDiscretiser

import pandas as pd
import numpy as np

In [None]:
from skorecard import datasets
from skorecard.bucketers import DecisionTreeBucketer, OptimalBucketer
from skorecard.pipeline import BucketingPipeline, tweak_buckets
from sklearn.pipeline import make_pipeline
# Load the demo credit-card dataset that ships with skorecard.
# `as_frame=True` is requested so the result can be used as a DataFrame
# (the next cells call `df.drop(columns=...)` and `df["default"]` on it).
df = datasets.load_uci_credit_card(as_frame=True)

### Load Data

In `skorecard` there is a demo dataset with 4 features (2 categorical and 2 numerical) for demos and testing.<br>
We'll use that dataset here.

In [None]:
# Feature groups referenced by the bucketers further down
num_cols = ["LIMIT_BAL", "BILL_AMT1"]
cat_cols = ["EDUCATION", "MARRIAGE"]

# Separate the target column from the feature matrix
y = df["default"]
X = df.drop(columns=["default"])

In [None]:
# Preview the selected features side by side with the target.
# A single DataFrame renders as a rich table in the notebook, whereas a
# (DataFrame, Series) tuple falls back to a plain-text repr; `.head()` keeps
# the output to the first few rows instead of dumping both objects.
X[num_cols + cat_cols].join(y).head()

### Quick intro to scikit-learn

### sklearn transformers

- `transformers` are classes in sklearn whose function is to perform a transformation on the data.<br>
- In general, a `transformer` preserves the number of rows in a dataset.<br>
- `transformers` are characterized by two main functions:
    - `fit(X,y=None)` performs the necessary calculations
    - `transform(X,y=None)` applies the transformation to the (new) dataset
    
Example: `MinMaxScaler`: this is a transformer that rescales the input features X to a predefined range (normally -1 to 1 or 0 to 1), depending on the use case.

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Step 1: `fit` computes what the scaler needs from X
mms = MinMaxScaler(feature_range=(0, 1))
mms.fit(X)

# Step 2: `transform` applies the rescaling to the requested (0, 1) range
X_transformed = mms.transform(X)
X_transformed

In [None]:
# Check the range of the first transformed column
first_column = X_transformed[:, 0]
first_column.min(), first_column.max()

In [None]:
# Same scaler, this time targeting the (-2, 2) range
mms = MinMaxScaler(feature_range=(-2, 2))
mms.fit(X)
X_transformed = mms.transform(X)

# Check the range of the first transformed column
first_column = X_transformed[:, 0]
first_column.min(), first_column.max()

## sklearn models

- models are classes that contain the (ML) models and all that comes along.
- A model has three main functions:
    - fit(X,y) - runs the optimization for the specific algorithms
    - predict(X) - returns the predictions for a new dataset
    - predict_proba(X) - returns the probabilities of the fitted model
    
Example: `Logistic Regression`

In [None]:
from sklearn.linear_model import LogisticRegression

# `fit` runs the optimisation for the logistic regression on the features
lr = LogisticRegression()
lr.fit(X, y)

# `predict_proba` returns the probabilities of the fitted model
X_proba = lr.predict_proba(X)
X_proba

## sklearn pipeline - putting it all together

A pipeline is a sequence of steps that chains together transformers and, optionally, one final model.<br>
The pipeline can contain multiple transformers; if it includes a model, the model must be the last step.

In [None]:
from sklearn.pipeline import Pipeline, make_pipeline

# Transformer step: bucket the numerical columns
bucketing_step = OptimalBucketer(variables=num_cols, max_n_bins=10, min_bin_size=0.05)
# Final step: the model
model_step = LogisticRegression()

pipe = make_pipeline(bucketing_step, model_step)

pipe.fit(X, y)

In [None]:
# Score the data with the fitted pipeline: X passes through the pipeline's
# steps in order and `predict_proba` returns the resulting probabilities.
X_proba = pipe.predict_proba(X)
X_proba

## Skorecard - and how it fits in the sklearn API

When we consider the bucketing process, it fits in the concept of sklearn transformers.<br>
Therefore in skorecard, we implemented a set of transformers that map the input data to a set of buckets.

Example: bucket with Decision Trees

In [None]:
# Transformer step: bucket the numerical columns with decision trees
tree_bucketer = DecisionTreeBucketer(variables=num_cols, max_n_bins=6, min_bin_size=0.1)
# Final step: the model
bucket_model = LogisticRegression()

skorecard_pipeline = make_pipeline(tree_bucketer, bucket_model)

In [None]:
# Fit the whole pipeline (bucketer followed by the logistic regression) on X and y
skorecard_pipeline.fit(X,y)

#### Get the details of the bucketers

In [None]:
# `steps` is a list of (name, estimator) pairs: [0] selects the first step,
# [1] selects the estimator object of that pair.
binner = skorecard_pipeline.steps[0][1] # get the first element of the pipeline, which is our bucketer

In [None]:
# Look up the entry stored under the 'LIMIT_BAL' key of the fitted bucketer.
# NOTE(review): presumably this holds the bucket boundaries learned for that
# feature -- confirm against the skorecard documentation.
binner.features_bucket_mapping_['LIMIT_BAL']

In [None]:
from skorecard.reporting import create_report

In [None]:
# Bucket report for the first numerical feature, using the fitted bucketer
create_report(
    X,
    y,
    num_cols[0],
    binner,
    verbose=True,
)

### Automatic bucketing is nice, but some manual tweaking is probably necessary.

In [None]:


# Stage 1: pre-bucket the numerical columns with decision trees (max_n_bins=100)
tree_prebucketer = DecisionTreeBucketer(variables=num_cols, max_n_bins=100, min_bin_size=0.05)
prebucket_pipeline = make_pipeline(tree_prebucketer)

# Stage 2: bucket the numerical columns first, then the categorical ones
numerical_bucketer = OptimalBucketer(variables=num_cols, max_n_bins=10, min_bin_size=0.05)
categorical_bucketer = OptimalBucketer(variables=cat_cols, max_n_bins=10, min_bin_size=0.05)
bucket_pipeline = BucketingPipeline(
    make_pipeline(numerical_bucketer, categorical_bucketer)
)

# Chain both stages and fit them together
pipe = make_pipeline(prebucket_pipeline, bucket_pipeline)
pipe.fit(X, y)


In [None]:
# Hand the fitted bucketing pipeline and the data to skorecard's `tweak_buckets`.
# NOTE(review): presumably this opens an interactive tool for adjusting the
# buckets manually -- confirm against the skorecard documentation.
tweak_buckets(pipe, X, y)