In [3]:
# importing the relevant packages
import pandas as pd
import numpy as np
from drf import drf

# Seed NumPy's global RNG so the simulated data in this cell is the same on
# every Restart & Run All.
# NOTE(review): this pins only the NumPy draws made here; whether the drf fit
# itself is deterministic cannot be seen from this notebook -- confirm.
SEED = 42
np.random.seed(SEED)

# generate data: n samples, p predictors, d responses
n = 1000
p = 10
d = 2
X = np.random.normal(0, 1, size=(n, p))
Y = np.random.normal(0, 1, size=(n, d))
Y[:, 0] = Y[:, 0] + X[:, 0]  # mean shift of Y1 based on X1
Y[:, 1] = Y[:, 1] * X[:, 1]  # variance shift of Y2 based on X2
X = pd.DataFrame(X)
Y = pd.DataFrame(Y)

# fit model (these settings are reportedly the package defaults)
DRF = drf(min_node_size=15, num_trees=2000, splitting_rule="FourierMMD")
DRF.fit(X, Y)
DRF.info()  # prints a summary of the fitted forest (tree and sample counts in the recorded output)

# generate test data: 100 new points with the same p predictors
X_test = pd.DataFrame(np.random.normal(0, 1, size=(100, p)))

# estimated conditional distribution represented via weights
out = DRF.predict(newdata=X_test)
print(out.weights)

# many distributional functionals are implemented and do not need to be
# manually computed from the weights
out = DRF.predict(newdata=X_test, functional="mean")
print(out.mean)

# covariance matrix at a fixed test point (all p predictors set to 0)
out = DRF.predict(newdata=[0] * p, functional="cov")
print(out.cov[0, :, :])

# we can transform the response beforehand to obtain more complicated quantities
# NOTE(review): the transformation indexes y[1] and y[2] although Y has d = 2
# columns here; indices are kept exactly as written -- confirm the indexing
# convention drf uses when it calls the transformation.
out = DRF.predict(
    newdata=X_test,
    functional="quantile",
    transformation=lambda y: (np.sin(y[1]), y[1] * y[2], y[2] ** 2),
    quantiles=[0.1, 0.9],
)
# 0.1 and 0.9 quantiles for the first test point in newdata and for the
# second component of the transformed y
print(out.quantile[0, 1, :])

# factor variables: per the original note, drf handles them automatically via
# one-hot encoding. Add a categorical predictor and a response that shifts
# when the category is 'a', then refit on the extended frames.
X['cat'] = np.random.choice(['a', 'b', 'c', 'd', 'e'], n, replace=True)
Y['new'] = np.random.normal(0, 1, size=n) + (X['cat'] == 'a')
DRF.fit(X, Y)
DRF.info()

DRF forest object
Number of trees: 2000 
Number of training samples: 1000 
[[7.94504921e-04 0.00000000e+00 0.00000000e+00 ... 5.02583928e-04
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 4.32193131e-03 7.84548269e-05 ... 0.00000000e+00
  3.44083682e-03 0.00000000e+00]
 [0.00000000e+00 1.40029900e-02 1.50297619e-04 ... 0.00000000e+00
  6.04873313e-04 3.81880163e-03]
 ...
 [7.79666064e-04 6.69829680e-04 0.00000000e+00 ... 0.00000000e+00
  3.02938189e-03 0.00000000e+00]
 [8.04020585e-03 0.00000000e+00 1.59304299e-04 ... 7.69235778e-04
  0.00000000e+00 0.00000000e+00]
 [5.00621345e-04 0.00000000e+00 0.00000000e+00 ... 1.24175824e-04
  7.45016789e-04 0.00000000e+00]]
[[-8.67912765e-01 -1.26032264e-02]
 [ 3.51985575e-01  1.17479221e-01]
 [ 1.07460264e+00 -3.73910151e-02]
 [-4.80459503e-01  8.30323351e-02]
 [-1.21894997e-01  1.31281434e-01]
 [-7.35486760e-01 -1.29171479e-02]
 [-1.02916305e-01 -9.86734580e-04]
 [-6.35535480e-01 -3.05823531e-03]
 [-2.35792841e-01 -8.32216283e-03]
 [-4.03697

In [1]:
import pandas as pd
import numpy as np
from drf import drf

# Seed NumPy's global RNG so the simulated data (and every later NumPy draw in
# a sequential run) is the same on each Restart & Run All.
# NOTE(review): this pins only the NumPy draws; whether the drf fit itself is
# deterministic cannot be seen from this notebook -- confirm.
SEED = 42
np.random.seed(SEED)

# generate data: n samples, p predictors, d responses
n = 1000
p = 10
d = 2
X = np.random.normal(0, 1, size=(n, p))
Y = np.random.normal(0, 1, size=(n, d))
Y[:, 0] = Y[:, 0] + X[:, 0]  # mean shift of Y1 based on X1
Y[:, 1] = Y[:, 1] * X[:, 1]  # variance shift of Y2 based on X2
X = pd.DataFrame(X)
Y = pd.DataFrame(Y)

# fit model (these settings are reportedly the package defaults)
DRF = drf(min_node_size=15, num_trees=2000, splitting_rule="FourierMMD")
DRF.fit(X, Y)
DRF.info()  # prints a summary of the fitted forest (tree and sample counts in the recorded output)



DRF forest object
Number of trees: 2000 
Number of training samples: 1000 


In [2]:

# Draw 100 fresh test points with the same number of predictors (p) as the
# training data.
X_test = pd.DataFrame(np.random.normal(0, 1, size=(100, p)))

# With no functional requested, the prediction exposes the estimated
# conditional distribution through its weights.
out = DRF.predict(newdata=X_test)
print(out.weights)

# Built-in functionals save computing summaries from the weights by hand;
# here, the conditional mean.
out = DRF.predict(newdata=X_test, functional="mean")
print(out.mean)

# Covariance matrix at a single fixed test point: all p predictors set to 0.
origin = [0] * p
out = DRF.predict(newdata=origin, functional="cov")
print(out.cov[0, :, :])

# The response can be transformed before the functional is computed, which
# gives access to more complicated quantities.
# NOTE(review): the transformation indexes y[1] and y[2]; kept exactly as
# written -- confirm the indexing convention drf uses when calling it.
quantile_levels = [0.1, 0.9]
out = DRF.predict(
    newdata=X_test,
    functional="quantile",
    transformation=lambda y: (np.sin(y[1]), y[1] * y[2], y[2] ** 2),
    quantiles=quantile_levels,
)
# 0.1 and 0.9 quantiles for the first test point in newdata and for the
# second component of the transformed y.
print(out.quantile[0, 1, :])

# Factor variables: per the original note, these are handled automatically
# via one-hot encoding. Add a categorical predictor plus a response that is
# shifted whenever the category is 'a', then refit on the extended frames.
X['cat'] = np.random.choice(['a', 'b', 'c', 'd', 'e'], n, replace=True)
is_level_a = X['cat'] == 'a'
Y['new'] = np.random.normal(0, 1, size=n) + is_level_a
DRF.fit(X, Y)
DRF.info()

[[1.03024194e-03 2.32845171e-03 0.00000000e+00 ... 4.46632088e-04
  2.77858313e-03 2.03489383e-03]
 [1.67875180e-04 0.00000000e+00 1.76864746e-03 ... 1.70661028e-04
  3.93518519e-05 0.00000000e+00]
 [3.57142857e-05 1.12117666e-02 0.00000000e+00 ... 4.25431023e-04
  6.29849217e-05 1.73068851e-03]
 ...
 [2.94117647e-05 0.00000000e+00 9.74997173e-04 ... 6.85989953e-04
  8.23488266e-04 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 5.97771618e-04 ... 7.87878788e-05
  1.75000000e-04 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 2.09624145e-04 ... 7.48663102e-05
  0.00000000e+00 0.00000000e+00]]
[[-8.85731422e-01  2.77982429e-02]
 [-2.52956911e-01  1.57938483e-02]
 [-1.26308230e+00 -6.01654250e-02]
 [-8.53293982e-01  1.23491263e-02]
 [-2.17160554e-01  2.17156831e-02]
 [ 8.12199209e-02  3.43677118e-03]
 [-1.48044522e-02 -5.67703748e-02]
 [-5.86000592e-01  3.98756196e-02]
 [-3.06919540e-01  3.52048485e-02]
 [ 1.05983270e+00 -3.01723435e-03]
 [ 9.41300168e-01 -1.40038322e-02]
 [-1.08569570e+

  weighted_quantiles /= np.sum(sample_weight)


[-0.47351725  0.87475032]
DRF forest object
Number of trees: 2000 
Number of training samples: 1000 
