In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [3]:
from weight_of_evidence import weight_of_evidence

In [4]:
# Tree instance whose private helpers (_gini, _find_gini_decreases,
# _best_split) are exercised by the cells that follow.
single_var_decision_tree = weight_of_evidence.SingleVariableDecisionTreeClassifier(
    min_samples_per_node=1,
    max_depth=2,
    min_gini_decrease=1e-3,
)

### Gini

In [None]:
y_1 = np.array([1, 5, 20, 10, 0])
y_c = np.array([2, 20, 50, 10, 10])

In [None]:
single_var_decision_tree._gini(y_1, y_c)

In [None]:
1.0 - (0.4**2) - (0.6**2)

### Gini Decrease

In [None]:
Y = pd.Series([0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,])

In [None]:
gini_decreases, _, _ = single_var_decision_tree._find_gini_decreases(Y)
    

In [None]:
gini_decreases

### Best Split

In [None]:
X_SORTED = pd.Series([1, 1, 2, 2, 3, 3, 3, 3, 3, 5, 10, 20,])

In [None]:
single_var_decision_tree._best_split(X_SORTED, Y)

### Sklearn transformers example

In [5]:
def non_mono_fn(company_age):
    """Return a non-monotonic risk level for a scalar company age.

    Ages in the half-open band (25, 75] map to 0.75; every other age,
    both younger and older, maps to 0.25.
    """
    in_elevated_band = 25 < company_age <= 75
    return 0.75 if in_elevated_band else 0.25

In [6]:
company_age = np.arange(1, 100, 0.001)

In [7]:
non_mono_risk = np.vectorize(non_mono_fn)(company_age)

In [8]:
def simulate_outcome(risk_vector, rng=None):
    """Simulate one binary outcome per entry of ``risk_vector``.

    Every entry is compared with an independent uniform draw on [0, 1);
    the outcome is 1 where the risk is strictly greater than the draw and
    0 otherwise.

    Parameters
    ----------
    risk_vector : array-like
        Per-observation risk values. NumPy arrays, pandas objects and
        plain (nested) lists are accepted.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of the uniform draws. Pass a seeded generator to make the
        simulation reproducible. When omitted, NumPy's global random state
        is used, which matches the previous behaviour.

    Returns
    -------
    The result of ``risk_vector > draws`` cast to ``int``; it has the same
    shape as ``risk_vector``.
    """
    # Fall back to the module-level sampler so existing callers that rely on
    # np.random.seed(...) keep getting the same stream.
    draw_uniform = np.random.uniform if rng is None else rng.uniform
    # np.shape also works for inputs without a .shape attribute (e.g. lists).
    random_draws = draw_uniform(size=np.shape(risk_vector))
    return (risk_vector > random_draws).astype(int)

In [9]:
Y_non_mono = simulate_outcome(non_mono_risk)

In [10]:
company_age

array([ 1.   ,  1.001,  1.002, ..., 99.997, 99.998, 99.999])

In [11]:
Y_non_mono

array([1, 0, 1, ..., 0, 1, 0])

In [12]:
# Binner built with the same tree settings as single_var_decision_tree.
tree_binner = weight_of_evidence.TreeBinner(
    min_samples_per_node=1,
    max_depth=2,
    min_gini_decrease=1e-3,
)

In [13]:
# Single-column feature frame holding the company-age grid.
X_non_mono = pd.DataFrame(data=company_age, columns=["company_age"])

In [14]:
X_non_mono

Unnamed: 0,company_age
0,1.000
1,1.001
2,1.002
3,1.003
4,1.004
...,...
98995,99.995
98996,99.996
98997,99.997
98998,99.998


In [15]:
Y = pd.Series(Y_non_mono)

In [16]:
Y

0        1
1        0
2        1
3        0
4        1
        ..
98995    1
98996    0
98997    0
98998    1
98999    0
Length: 99000, dtype: int64

In [17]:
tree_binner.fit(X_non_mono,Y)

TreeBinner(max_depth=None, min_gini_decrease=None, min_samples_per_node=None)

In [18]:
tree_binner.splits_

{'company_age': [-inf, 25.001999999997356, 75.00099999999185, inf]}

In [19]:
X_binned = tree_binner.transform(X_non_mono)

In [20]:
X_binned

Unnamed: 0,company_age
0,"(-inf, 25.001999999997356]"
1,"(-inf, 25.001999999997356]"
2,"(-inf, 25.001999999997356]"
3,"(-inf, 25.001999999997356]"
4,"(-inf, 25.001999999997356]"
...,...
98995,"(75.00099999999185, inf]"
98996,"(75.00099999999185, inf]"
98997,"(75.00099999999185, inf]"
98998,"(75.00099999999185, inf]"


In [21]:
logit_scaler = weight_of_evidence.LogitScaler()

In [22]:
logit_scaler.fit(X_binned,Y)

LogitScaler(clip_thresh=100000.0)

In [23]:
logit_scaler.logit_values_

{'company_age': {'(-inf, 25.001999999997356]': -1.1085803511573984,
  '(25.001999999997356, 75.00099999999185]': 1.1015745421452388,
  '(75.00099999999185, inf]': -1.0891406060874975}}

In [24]:
X_logit_scaled = logit_scaler.transform(X_binned)

In [25]:
X_logit_scaled

Unnamed: 0,company_age
0,-1.108580
1,-1.108580
2,-1.108580
3,-1.108580
4,-1.108580
...,...
98995,-1.089141
98996,-1.089141
98997,-1.089141
98998,-1.089141


In [26]:
standard_scaler = StandardScaler()

In [27]:
standard_scaler.fit(X_logit_scaled)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [28]:
X_standard_scaled = standard_scaler.transform(X_logit_scaled)

In [29]:
X_standard_scaled

array([[-1.01912761],
       [-1.01912761],
       [-1.01912761],
       ...,
       [-1.00145647],
       [-1.00145647],
       [-1.00145647]])

In [30]:
log_reg = LogisticRegression(solver='lbfgs' )

In [31]:
log_reg.fit(X_standard_scaled, Y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [32]:
# End-to-end estimator chaining the stages demonstrated one by one above:
# tree binning -> logit scaling -> standardisation -> logistic regression.
# The steps are passed as a list, which is the type scikit-learn documents
# for Pipeline's ``steps`` argument (the original passed a tuple).
woe_bin_regression = Pipeline(
    steps=[
        (
            'tree_bin',
            weight_of_evidence.TreeBinner(
                min_samples_per_node=1, max_depth=2, min_gini_decrease=1e-3
            ),
        ),
        ('logit_scale', weight_of_evidence.LogitScaler()),
        ('standard_scale', StandardScaler()),
        ('log_reg', LogisticRegression(solver='lbfgs')),
    ]
)

In [33]:
woe_bin_regression.fit(X_non_mono, Y)

Pipeline(memory=None,
         steps=[('tree_bin',
                 TreeBinner(max_depth=None, min_gini_decrease=None,
                            min_samples_per_node=None)),
                ('logit_scale', LogitScaler(clip_thresh=100000.0)),
                ('standard_scale',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('log_reg',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [34]:
woe_bin_regression['log_reg'].coef_

array([[1.10002486]])

In [35]:
woe_bin_regression.predict_proba(X_non_mono)

array([[0.75185308, 0.24814692],
       [0.75185308, 0.24814692],
       [0.75185308, 0.24814692],
       ...,
       [0.74820868, 0.25179132],
       [0.74820868, 0.25179132],
       [0.74820868, 0.25179132]])