In [23]:
from scipy.spatial import ConvexHull, Delaunay
from scipy.stats import dirichlet

import pandas as pd
import numpy as np
from numpy.linalg import det



from sklearn.metrics import mean_squared_error
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from xgboost import XGBRFRegressor

In [4]:
def get_ConvexHull(X):
    """Return the rows of X that are vertices of its convex hull.

    Parameters
    ----------
    X : array
        Point cloud with one point per row, passed straight to
        ``scipy.spatial.ConvexHull``.

    Returns
    -------
    array
        The rows of ``X`` selected by ``ConvexHull(X).vertices``, in the
        order that attribute lists them.
    """
    vertex_rows = ConvexHull(X).vertices
    return X[vertex_rows, :]
    

In [5]:
def sampleConvexHull(X, rng=None):
    """Draw one random convex combination of the rows of X.

    One weight per row is drawn independently from U(0, 1) and the weights
    are rescaled to sum to one. The returned point therefore lies in the
    convex hull of the rows of X, but it is NOT uniformly distributed over
    that hull (normalised uniform weights concentrate towards the centroid).

    Input:
        X   ~ array with one point per row (typically the hull vertices)
        rng ~ optional random source exposing ``random(n)`` (for example
              ``np.random.default_rng(seed)``). When None, the global
              ``np.random`` state is used, exactly as before.

    Output:
        x_hat : 1-D array, the weighted combination ``w @ X``
    """
    source = np.random if rng is None else rng

    # One weight per point (row), not per feature.
    n_points = X.shape[0]
    weights = source.random(n_points)
    weights /= weights.sum()

    return weights.dot(X)

In [6]:
def kernelRFdist(X, xstar, rf):
    """Random-forest kernel between one query point and every row of X.

    For each tree in ``rf.estimators_`` the leaf reached by ``xstar`` is
    compared with the leaf reached by each row of ``X``; the kernel value is
    the number of trees in which both land in the same leaf, divided by
    ``rf.n_estimators``.

    Parameters
    ----------
    X : array, one training point per row
    xstar : 1-D array, the query point (reshaped to a single row internally)
    rf : fitted forest exposing ``estimators_`` (each with ``apply``) and
        ``n_estimators``

    Returns
    -------
    array
        Co-occurrence frequencies. When ``apply`` returns 1-D leaf ids (as
        scikit-learn trees do) the result has a leading axis of length one,
        i.e. shape ``(1, n)``; callers flatten it with ``ravel()``.
    """
    query_row = xstar.reshape(1, -1)
    shared_leaf_counts = np.zeros(X.shape[0])

    for member in rf.estimators_:
        train_leaves = member.apply(X)
        query_leaf = member.apply(query_row)
        # Broadcast the single query leaf against all training leaves.
        same_leaf = query_leaf == train_leaves[None, :]
        shared_leaf_counts = shared_leaf_counts + 1 * same_leaf

    return shared_leaf_counts / rf.n_estimators

In [7]:
def boostrap_subset(X, M, size=1000):
    """Return a bootstrap sample of the rows of X restricted to M random columns.

    Parameters
    ----------
    X : 2-D array of shape (n, p)
    M : int
        Number of distinct columns to keep (drawn without replacement).
    size : int, default 1000
        Number of rows to draw (with replacement).

    Returns
    -------
    array of shape (size, M)
        Entry ``[i, j]`` is ``X[row_ids[i], col_ids[j]]``.
    """
    n, p = X.shape

    # Rows are resampled with replacement, columns without.
    row_ids = np.random.choice(n, size=size)
    col_ids = np.random.choice(p, size=M, replace=False)

    # np.ix_ builds the outer (row x column) selection. Indexing with
    # X[row_ids, col_ids] would instead pair the two index arrays
    # element-wise and fail whenever their lengths differ.
    return X[np.ix_(row_ids, col_ids)]

In [8]:
def barf(X, y, mod, use_hull=False, B=1000, ntrees=100, max_hull_dim=7):
    """Fit a random forest that mimics a black-box model on synthetic points.

    The black box ``mod`` is fitted on ``(X, y)``, ``B`` synthetic points Z are
    drawn as random convex combinations of training rows, and a
    ``RandomForestRegressor`` is fitted to ``(Z, mod.predict(Z))``.

    Parameters
    ----------
    X : 2-D array, one training point per row
    y : array of training targets
    mod : estimator with ``fit`` and ``predict``; it is (re)fitted in place
    use_hull : bool, default False
        If True, combine only convex-hull vertices instead of all rows of X.
    B : int, default 1000
        Number of synthetic points to draw.
    ntrees : int, default 100
        Number of trees in the returned forest.
    max_hull_dim : int, default 7
        When X has more columns than this, the hull is computed on a random
        subset of ``max_hull_dim`` columns. The full-dimensional rows at those
        hull vertices are then used, so Z keeps every feature that ``mod`` was
        fitted on.

    Returns
    -------
    RandomForestRegressor
        Forest fitted to the synthetic points and the black-box predictions.
    """
    # Fit the black-box estimator (mutates `mod`).
    mod.fit(X, y)

    # Choose the points whose convex combinations will be sampled.
    if use_hull:
        n_features = X.shape[1]
        if n_features > max_hull_dim:
            # Hull on a random low-dimensional projection. A vertex of the
            # projected hull is also a vertex of the full hull, so the rows
            # selected below are genuine extreme points of X.
            cols = np.random.choice(np.arange(n_features), replace=False, size=max_hull_dim)
            hull_ids = ConvexHull(X[:, cols]).vertices
        else:
            hull_ids = ConvexHull(X).vertices
        support = X[hull_ids, :]
    else:
        support = X

    Z = np.array([sampleConvexHull(support) for _ in range(B)])

    # Black-box predictions at the synthetic points.
    fhat = mod.predict(Z)

    # Fit the surrogate random forest.
    rf = RandomForestRegressor(n_estimators=ntrees)
    rf.fit(Z, fhat)

    return rf

In [9]:
def local_linear(X,y, xstar, kern):
    """Predict at xstar from a kernel-weighted least-squares fit.

    Solves the weighted normal equations
    ``(sum_i k_i x_i x_i^T) beta = sum_i k_i x_i y_i`` and returns
    ``xstar . beta``. No intercept is added; include a constant column in
    ``X`` and ``xstar`` if one is wanted.

    Parameters
    ----------
    X : array of shape (n, p)
    y : array of n targets
    xstar : 1-D array of length p, the query point
    kern : array of n non-negative kernel weights (any shape that flattens
        to length n)

    Returns
    -------
    float
        The fitted value at ``xstar``.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float).reshape(-1)
    kern = np.asarray(kern, dtype=float).reshape(-1)

    weighted_X = X * kern[:, None]
    gamma_n = X.T @ weighted_X      # sum_i k_i x_i x_i^T
    sigma_n = weighted_X.T @ y      # sum_i k_i x_i y_i

    # lstsq instead of an explicit inverse: when only a few points carry
    # weight the Gram matrix is singular, and lstsq then returns the
    # minimum-norm solution rather than raising or amplifying round-off.
    beta, *_ = np.linalg.lstsq(gamma_n, sigma_n, rcond=None)

    return np.dot(xstar, beta)

In [18]:
def barfKernel_reg(X, y, Xstar,  mod, use_hull = False, B=1000, ntrees=100):
    """Predict at each row of Xstar with a forest-kernel weighted linear fit.

    A surrogate forest is built once with ``barf``; then, for every query
    row, the forest kernel against the training rows is computed with
    ``kernelRFdist`` and used as the weights of ``local_linear``.

    Parameters
    ----------
    X, y : training features and targets
    Xstar : iterable of query points (one 1-D array per point)
    mod : black-box estimator handed to ``barf``
    use_hull, B, ntrees : forwarded to ``barf``

    Returns
    -------
    numpy array with one prediction per row of ``Xstar``
    """
    forest = barf(X, y, mod, use_hull, B, ntrees)

    def _predict_one(query):
        # Flatten the kernel before using it as per-row weights.
        weights = kernelRFdist(X, query, forest).ravel()
        return local_linear(X=X, y=y, xstar=query, kern=weights)

    return np.array([_predict_one(query) for query in Xstar])

In [11]:
# Load scikit-learn's diabetes regression dataset (indexed by key below).
diabetes = load_diabetes()

In [12]:
# Feature matrix and regression target taken from the loaded dataset.
X, y = diabetes["data"], diabetes["target"]

In [13]:
# Ordered hold-out split: the first 380 rows train, the remainder test.
X_train, X_test = X[:380], X[380:]
y_train, y_test = y[:380], y[380:]

### Testing functions

In [3]:
# Black-box model 1: a random forest with default arguments (no seed is set).
modRF = RandomForestRegressor()

In [14]:
# Fit on the training split. Note that barf() calls mod.fit again on the data
# it receives, so barfKernel_reg below does not depend on this fit.
modRF.fit(X_train, y_train)

RandomForestRegressor()

In [26]:
# Black-box model 2: xgboost's XGBRFRegressor with default arguments.
modXGB = XGBRFRegressor()

In [27]:
# Test-set predictions from the forest-kernel local regression, once per
# black box. The sampling and the forests are unseeded, so results vary
# between runs.
predsRF = barfKernel_reg(X=X_train, y=y_train, Xstar=X_test, mod=modRF)
predsXGB = barfKernel_reg(X=X_train, y=y_train, Xstar=X_test, mod=modXGB)

In [28]:
# Test-set MSE when the black box is the RandomForestRegressor.
mean_squared_error(y_test, predsRF)

5892.641411573515

In [29]:
# Test-set MSE when the black box is the XGBRFRegressor.
mean_squared_error(y_test, predsXGB)

3117.4206719490444