In [1]:
import pandas as pd
import numpy as np
from drf import drf

# End-to-end DRF demo: simulate data, fit a forest, and query several
# functionals of the estimated conditional distribution of Y given X.

# Seed NumPy's global RNG so the simulated data below is identical on every
# Restart & Run All.
# NOTE(review): this only pins the NumPy draws made in this cell; any
# randomness inside drf itself may need its own seed — confirm.
SEED = 0
np.random.seed(SEED)

# generate data
n = 1000  # number of training samples
p = 10    # number of predictors (columns of X)
d = 2     # number of response components (columns of Y)
X = np.random.normal(0, 1, size=(n, p))
Y = np.random.normal(0, 1, size=(n, d))
Y[:,0] = Y[:,0] + X[:,0] #mean shift of Y1 based on X1
Y[:,1] = Y[:,1] * X[:,1] #variance shift of Y2 based on X2
X = pd.DataFrame(X)
Y = pd.DataFrame(Y)

# fit model
DRF = drf(min_node_size = 15, num_trees = 2000, splitting_rule = "FourierMMD") #those are the default values
DRF.fit(X, Y)
DRF.info() #prints variable importance

#generate test data
X_test = pd.DataFrame(np.random.normal(0, 1, size=(100, p)))

# estimated conditional distribution represented via weights
out = DRF.predict(newdata = X_test)
print(out.weights)

# many distributional functionals are implemented and do not need to be manually computed from the weights
out = DRF.predict(newdata = X_test, functional = "mean")
print(out.mean)

# covariance matrix at a fixed test point
out = DRF.predict(newdata = [0]*p, functional = "cov")
print(out.cov[0,:,:])

# we can transform the response beforehand to obtain more complicated quantities
# Y has d = 2 columns labelled 0 and 1, so the response components are y[0]
# and y[1]; there is no component with index 2.
# NOTE(review): assumes drf calls the transformation with one response row at
# a time — confirm against the drf implementation.
out = DRF.predict(
    newdata = X_test,
    functional = "quantile",
    transformation = lambda y: (np.sin(y[0]), y[0]*y[1], y[1]**2),
    quantiles=[0.1, 0.9],
)
print(out.quantile[0,1,:]) # 0.1 and 0.9 quantiles for first test point in newdata and for the second component of transformed y

# we automatically handle factor variables by using one-hot encoding
# Add a categorical predictor and a third response whose mean is shifted by 1
# when the category is 'a', then refit on the extended data.
X['cat'] = np.random.choice(['a', 'b', 'c', 'd', 'e'], n, replace=True)
Y['new'] = np.random.normal(0, 1, size=n) + (X['cat']=='a')
DRF.fit(X, Y)
DRF.info()



DRF forest object
Number of trees: 2000 
Number of training samples: 1000 
Variable importance: 
    0     1     2     3     4     5     6     7     8     9 
0.778 0.141 0.009 0.009 0.010 0.008 0.014 0.012 0.010 0.009 
[[9.03133903e-05 3.12500000e-05 3.34361597e-03 ... 0.00000000e+00
  3.25657911e-03 0.00000000e+00]
 [9.44902429e-04 3.58999772e-04 3.14682233e-04 ... 0.00000000e+00
  0.00000000e+00 7.04545455e-05]
 [0.00000000e+00 0.00000000e+00 1.01750179e-02 ... 0.00000000e+00
  3.14705479e-03 0.00000000e+00]
 ...
 [1.48719336e-04 9.58333333e-05 0.00000000e+00 ... 4.85129555e-03
  0.00000000e+00 1.67384368e-03]
 [4.31563779e-04 0.00000000e+00 3.33333333e-05 ... 1.05072464e-04
  0.00000000e+00 6.27141070e-04]
 [6.02006689e-05 1.05555556e-04 1.12380528e-03 ... 0.00000000e+00
  3.03224551e-03 0.00000000e+00]]
[[-1.11780031e+00 -6.27962774e-02]
 [-7.20268007e-02 -1.16588535e-01]
 [-1.33346508e+00 -1.34843357e-01]
 [ 1.15696939e+00  4.86337609e-02]
 [-8.22497460e-02  1.16148964e-02]
 [ 2.3

  weighted_quantiles /= np.sum(sample_weight)


DRF forest object
Number of trees: 2000 
Number of training samples: 1000 
Variable importance: 
    0     1     2     3     4     5     6     7     8     9 cat_a cat_b cat_c 
0.633 0.121 0.012 0.015 0.012 0.015 0.014 0.022 0.015 0.015 0.123 0.001 0.001 
cat_d cat_e 
0.001 0.001 
