### Minimal Example

In [48]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import sys
sys.path.append('../arne/')  # Necessary to import aughs from parent directory

from sklearn.tree import DecisionTreeRegressor
from copy import deepcopy
from imodels.util.data_util import get_clean_dataset
from imodels import HSTreeRegressor

import numpy as np
from aughs import ShrinkageClassifier, ShrinkageRegressor
from sklearn.model_selection import cross_val_score, cross_validate
import matplotlib.pyplot as plt
import pandas as pd

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


For this simple example I copied the simulation code from the shrinkage paper repository:

In [31]:
def sim_data(n=50, std=1, reg_param=0, shrinkage_scheme_ = 'node_based', show=True, linear_data=False, return_curves=False, seed=None):
    """Simulate a noisy one-dimensional regression data set.

    The feature is drawn uniformly from [0, 4) and sorted in ascending order.
    The response is a ground-truth function of the feature plus Gaussian noise.

    Parameters
    ----------
    n : int
        Number of samples to draw.
    std : float
        Standard deviation of the additive Gaussian noise.
    reg_param, shrinkage_scheme_, show, return_curves
        Not used by this function; kept only so that existing calls with
        these arguments keep working.
    linear_data : bool
        If True the ground truth is the identity ``f(x) = x``; otherwise it is
        a step function that equals 1 for ``x < 1`` and for ``2 <= x < 3`` and
        0 everywhere else.
    seed : int or None
        If given, the draws come from a private ``np.random.RandomState(seed)``
        so the result is reproducible. With ``None`` (the default) the global
        NumPy random stream is used, which is the original behaviour.

    Returns
    -------
    X : numpy.ndarray of shape (n,)
        Sorted feature values.
    y : numpy.ndarray of shape (n,)
        Noisy responses aligned with ``X``.
    """
    if linear_data:
        def gt_func(X):
            return X
    else:
        def gt_func(X):
            # Indicator of [0, 1) united with [2, 3); the original's "0 *"
            # terms never contributed, so they are dropped.
            return ((X < 1) | ((X >= 2) & (X < 3))).astype(float)

    # The np.random module and a RandomState instance expose the same
    # uniform/normal API, so either can serve as the random source.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Draw order (features first, then noise) is the same as before, so a
    # caller that seeds the global stream gets unchanged data.
    X = np.sort(rng.uniform(0, 4, n))
    y = gt_func(X) + rng.normal(0, 1, n) * std

    return X, y


In [32]:
# Alternative real-world data set, kept for reference:
# X, y, feature_names = get_clean_dataset("juvenile_clean", data_source="imodels")

# Draw one simulated sample with the default settings of sim_data.
X, y = sim_data()

# The estimators below are fitted on a 2-D design matrix, so turn the 1-D
# feature vector into a single-column array.
X = X[:, np.newaxis]
X.shape

(50, 1)

#### Bias in the predictions from our aughs code?

For a single tree (no bootstrapping, no forest), I wanted to make sure that the average of the predictions on the training data always equals the mean training response, independent of lambda.
That seems to be the case for the imodels implementation:

In [46]:
# The imodels way: fit one unpruned tree, then wrap it in HSTreeRegressor for
# a grid of regularisation strengths and record the mean in-sample prediction.
shrinkage_scheme_ = 'node_based'
lmbs = np.arange(0, 31, 1)**2  # lambda grid: 0, 1, 4, ..., 900
yHatAvg1 = np.zeros(len(lmbs))

# The base tree depends only on (X, y) and the fixed random_state, not on
# lambda, so it is fitted once instead of once per loop iteration.
m1 = DecisionTreeRegressor(random_state=1)  #, max_leaf_nodes=15)
m1.fit(X, y)

for i, lmb in enumerate(lmbs):
    # Each wrapper receives its own deep copy of the fitted tree, so whatever
    # it does to that copy for one lambda cannot leak into the next.
    mshrunk = HSTreeRegressor(deepcopy(m1), reg_param=lmb, shrinkage_scheme_=shrinkage_scheme_)
    y_pred_shrunk = mshrunk.predict(X)
    yHatAvg1[i] = np.mean(y_pred_shrunk)

np.round(yHatAvg1, 4)

array([0.4277, 0.4277, 0.4277, 0.4277, 0.4277, 0.4277, 0.4277, 0.4277,
       0.4277, 0.4277, 0.4277, 0.4277, 0.4277, 0.4277, 0.4277, 0.4277,
       0.4277, 0.4277, 0.4277, 0.4277, 0.4277, 0.4277, 0.4277, 0.4277,
       0.4277, 0.4277, 0.4277, 0.4277, 0.4277, 0.4277, 0.4277])

The `ShrinkageRegressor`, in contrast, seems to introduce a small, lambda-dependent bias:

In [49]:
# Lambda grid: the squares 0, 1, 4, ..., 900.
lmbs = np.square(np.arange(31))
yHatAvg = np.zeros(lmbs.shape[0])

# Fit a fresh hierarchical-shrinkage regressor for every lambda and store the
# mean of its predictions on the training features.
for i, lmb in enumerate(lmbs):
    reg = ShrinkageRegressor(lmb=lmb, shrink_mode="hs")
    reg.fit(X, y)
    yHat = reg.predict(X)
    yHatAvg[i] = np.mean(yHat)

In [50]:
# Mean prediction per lambda, rounded to four decimals for display.
np.round(yHatAvg, decimals=4)

array([0.4277, 0.4226, 0.4234, 0.4357, 0.4468, 0.4513, 0.4498, 0.4452,
       0.4396, 0.4345, 0.4303, 0.427 , 0.4246, 0.4229, 0.4218, 0.4211,
       0.4207, 0.4205, 0.4205, 0.4206, 0.4208, 0.421 , 0.4212, 0.4215,
       0.4217, 0.422 , 0.4222, 0.4225, 0.4227, 0.4229, 0.4231])

And another angle of the same issue:

In [33]:
# Three regressors to compare: no shrinkage (lmb=0), "hs" shrinkage and
# "hs_entropy" shrinkage, the latter two with lmb=10.
clf0 = ShrinkageRegressor(lmb=0, shrink_mode="hs")
clf1 = ShrinkageRegressor(lmb=10, shrink_mode="hs")
clf2 = ShrinkageRegressor(lmb=10, shrink_mode="hs_entropy")

# Fit all three on the same training data.
clf0.fit(X, y)
clf1.fit(X, y)
clf2.fit(X, y)

#### Unbiased Mean Prediction?

In [34]:
# In-sample predictions of the three fitted regressors, in the order
# clf0, clf1, clf2.
yHat0, yHat1, yHat2 = (model.predict(X) for model in (clf0, clf1, clf2))

In [35]:
# Mean of the training response followed by the mean prediction of each
# model, all rounded to four decimals so they can be compared at a glance.
y_avg, yHat0_avg, yHat1_avg, yHat2_avg = (
    round(np.mean(values), 4) for values in (y, yHat0, yHat1, yHat2)
)

print(y_avg, yHat0_avg, yHat1_avg, yHat2_avg)

0.4277 0.4277 0.4378 0.445


### Ridge Regression
