# Examples

In [90]:
%load_ext lab_black

This notebook contains examples using the 'weight of evidence' transformer.



In [91]:
import pandas as pd
import numpy as np

In [92]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [93]:
from weight_of_evidence import weight_of_evidence

Splits are calculated using single-variable-decision trees

In [94]:
single_var_decision_tree = weight_of_evidence.SingleVariableDecisionTreeClassifier(
    min_samples_per_node=1, max_depth=2, min_gini_decrease=1e-3
)

### Gini Decrease

We can find how much Gini impurity decreases by splitting our variable at various points.

Here we would expect largest Gini increase at some point from 5-6, when dataset switches from mostly 0s to mostly 1s

In [95]:
Y = pd.Series([0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,])

In [96]:
gini_decreases, _, _ = single_var_decision_tree._find_gini_decreases(Y)

In [97]:
gini_decreases

0          NaN
1     0.061869
2     0.002778
3     0.041667
4     0.111111
5     0.209921
6     0.125000
7     0.067063
8     0.027778
9     0.004630
10    0.069444
11    0.031566
dtype: float64

### best split

We need to split at a point where X value changes. Here we find the largest Gini decrease when dataset changes to 3 (largest Gini decrease, & change in X value)

In [98]:
X_SORTED = pd.Series([1, 1, 2, 2, 3, 3, 3, 3, 3, 5, 10, 20,])

In [99]:
single_var_decision_tree._best_split(X_SORTED, Y)

3

### Sklearn transformers example

We want to encapsulate this in a scikitlearn transformer, to allow us to combine with logistic regression & weight of evidence transformer, and to allow us to be careful to avoid information leakage by binning on entire dataset

In [100]:
def non_mono_fn(company_age):
    """
    Example - company age has non-linear impact on risk
    """
    if company_age <= 25:
        return 0.25
    elif company_age <= 75:
        return 0.75
    else:
        return 0.25

In [101]:
company_age = np.arange(1, 100, 0.001)

In [102]:
non_mono_risk = np.vectorize(non_mono_fn)(company_age)

In [103]:
def simulate_outcome(risk_vector):
    random_draws = np.random.uniform(size=risk_vector.shape)
    return (risk_vector > random_draws).astype(int)

In [104]:
Y_non_mono = simulate_outcome(non_mono_risk)

In [105]:
company_age

array([ 1.   ,  1.001,  1.002, ..., 99.997, 99.998, 99.999])

In [106]:
Y_non_mono

array([0, 0, 0, ..., 0, 0, 0])

This 'TreeBinner' class is a scikit-learn transformer, with fit & transform methods

In [107]:
tree_binner = weight_of_evidence.TreeBinner(
    min_samples_per_node=1, max_depth=2, min_gini_decrease=1e-3
)

In [108]:
X_non_mono = pd.DataFrame(data=company_age, columns=["company_age"])

In [109]:
X_non_mono

Unnamed: 0,company_age
0,1.000
1,1.001
2,1.002
3,1.003
4,1.004
...,...
98995,99.995
98996,99.996
98997,99.997
98998,99.998


In [110]:
Y = pd.Series(Y_non_mono)

In [111]:
tree_binner.fit(X_non_mono, Y)

TreeBinner(max_depth=2, min_gini_decrease=0.001, min_samples_per_node=1)

In [112]:
tree_binner.splits_

{'company_age': [-inf, 25.000999999997358, 74.99899999999185, inf]}

In [113]:
X_binned = tree_binner.transform(X_non_mono)

In [114]:
X_binned

Unnamed: 0,company_age
0,"(-inf, 25.000999999997358]"
1,"(-inf, 25.000999999997358]"
2,"(-inf, 25.000999999997358]"
3,"(-inf, 25.000999999997358]"
4,"(-inf, 25.000999999997358]"
...,...
98995,"(74.99899999999185, inf]"
98996,"(74.99899999999185, inf]"
98997,"(74.99899999999185, inf]"
98998,"(74.99899999999185, inf]"


This scales binned values according to the log-odds of their bin's average value

In [115]:
logit_scaler = weight_of_evidence.LogitScaler()

In [116]:
logit_scaler.fit(X_binned, Y)

LogitScaler()

In [117]:
logit_scaler.logit_values_

{'company_age': {'(-inf, 25.000999999997358]': -1.1136677754531876,
  '(25.000999999997358, 74.99899999999185]': 1.1060397186830113,
  '(74.99899999999185, inf]': -1.1024559833426726}}

In [118]:
X_logit_scaled = logit_scaler.transform(X_binned)

In [119]:
X_logit_scaled

Unnamed: 0,company_age
0,-1.113668
1,-1.113668
2,-1.113668
3,-1.113668
4,-1.113668
...,...
98995,-1.102456
98996,-1.102456
98997,-1.102456
98998,-1.102456


This encapsulates tree binning, logit-scaling, and standard scaling in single pipeline

In [120]:
woe_bin_regression = Pipeline(
    (
        (
            "tree_bin",
            weight_of_evidence.TreeBinner(
                min_samples_per_node=1, max_depth=2, min_gini_decrease=1e-3
            ),
        ),
        ("logit_scale", weight_of_evidence.LogitScaler()),
        ("standard_scale", StandardScaler()),
        ("log_reg", LogisticRegression(solver="lbfgs")),
    )
)

In [121]:
woe_bin_regression.fit(X_non_mono, Y)

Pipeline(steps=[('tree_bin',
                 TreeBinner(max_depth=2, min_gini_decrease=0.001,
                            min_samples_per_node=1)),
                ('logit_scale', LogitScaler()),
                ('standard_scale', StandardScaler()),
                ('log_reg', LogisticRegression())])

In [122]:
woe_bin_regression["log_reg"].coef_

array([[1.10688483]])

In [123]:
woe_bin_regression.predict_proba(X_non_mono)

array([[0.75280095, 0.24719905],
       [0.75280095, 0.24719905],
       [0.75280095, 0.24719905],
       ...,
       [0.75070873, 0.24929127],
       [0.75070873, 0.24929127],
       [0.75070873, 0.24929127]])