In [1]:
from typing import Union

import catboost
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import torch.nn as nn

from sklearn.model_selection import train_test_split
from nflows.distributions import ConditionalDiagonalNormal

from src.probabilistic_flow_boosting.tfboost.tree import EmbeddableCatBoostPriorNormal
from src.probabilistic_flow_boosting.tfboost.tfboost import TreeFlowBoost
from src.probabilistic_flow_boosting.tfboost.flow import ContinuousNormalizingFlow
from src.probabilistic_flow_boosting.pipelines.reporting.nodes import calculate_nll

from src.probabilistic_flow_boosting.pipelines.modeling.utils import setup_random_seed

# Single seed for the notebook; reused below as random_state for the data splits and the tree model.
RANDOM_SEED = 1

# Project helper -- presumably seeds the RNGs used during training; confirm in modeling.utils.
setup_random_seed(RANDOM_SEED)

In [2]:
# Load the avocado dataset; the path is relative to the kernel's working directory.
# index_col=0 uses the first CSV column as the DataFrame index instead of as a feature.
df = pd.read_csv('data/01_raw/CatData/avocado/avocado.csv', index_col=0)

In [3]:
# Features: every column except the raw date and the regression target.
x = df.drop(columns = ['Date', 'AveragePrice'])
# Double brackets keep the target as a one-column DataFrame, so y.values is 2-D.
y = df[['AveragePrice']]

In [4]:
# Hold out 20% of the rows as the test set.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state=RANDOM_SEED)
# Split the training rows again (80/20): x_tr/y_tr for fitting, x_val/y_val for validation.
# x_train/y_train are left intact and therefore still contain the validation rows.
x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, test_size = 0.2, random_state=RANDOM_SEED)

In [5]:
# Sanity check of the outer split: (rows, feature columns) for the train and test frames.
x_train.shape, x_test.shape

((14599, 11), (3650, 11))

In [6]:
class TreeFlowWithoutShallow(TreeFlowBoost):
    """TreeFlowBoost variant that installs an identity context encoder on the flow.

    Only ``fit`` is overridden: before the flow model is trained, its context
    encoder is set to ``nn.Identity()``, a module that returns its input
    unchanged. Presumably this stands in for a learned shallow encoder used by
    the parent class -- verify against ``TreeFlowBoost.fit``.
    """

    def fit(self, X: np.ndarray, y: np.ndarray, X_val: Union[np.ndarray, None] = None,
            y_val: Union[np.ndarray, None] = None, n_epochs: int = 100, batch_size: int = 1000, verbose: bool = False):
        """Fit the tree model, then fit the flow model on the tree's outputs.

        Args:
            X: Training features.
            y: Training targets. The tree model receives them as given; for the
                flow, anything that is not 2-D is reshaped to a single column.
            X_val: Optional validation features.
            y_val: Optional validation targets. Validation data is used only
                when both ``X_val`` and ``y_val`` are provided; otherwise the
                flow is fitted with ``None`` in all three validation slots.
            n_epochs: Forwarded to ``flow_model.fit``.
            batch_size: Forwarded to ``flow_model.fit``.
            verbose: Forwarded to ``flow_model.fit``.

        Returns:
            ``self``.
        """
        # The tree model sees the targets exactly as passed in, before any reshape.
        self.tree_model.fit(X, y)

        train_context = self.tree_model.embed(X)
        train_params = self.tree_model.pred_dist_param(X)
        train_target = y.reshape(-1, 1) if len(y.shape) != 2 else y

        # Validation inputs default to None and are filled in only when both pieces are present.
        val_target = val_context = val_params = None
        if not (X_val is None or y_val is None):
            val_context = self.tree_model.embed(X_val)
            val_params = self.tree_model.pred_dist_param(X_val)
            val_target = y_val.reshape(-1, 1) if len(y_val.shape) != 2 else y_val

        # nn.Identity returns its input unchanged, so no transformation is learned by the encoder.
        self.flow_model.setup_context_encoder(nn.Identity())

        self.flow_model.fit(
            train_target, train_context, train_params,
            val_target, val_context, val_params,
            n_epochs=n_epochs, batch_size=batch_size, verbose=verbose,
        )
        return self


In [7]:
# Tree ensemble size; the flow's context width below is derived from these two values.
depth = 4
num_trees = 200
# 2**depth entries per tree -- presumably one slot per leaf of each depth-`depth` tree; TODO confirm.
context_dim = num_trees * (2 ** depth)

tree = EmbeddableCatBoostPriorNormal(
    # Positional indices into the columns of x; they must match the column order
    # of the array passed to fit.
    cat_features=[8, 9, 10],
    loss_function="RMSEWithUncertainty",
    depth=depth,
    num_trees=num_trees,
    random_state=RANDOM_SEED,
)

flow = ContinuousNormalizingFlow(
    input_dim=1,  # the target is a single column
    hidden_dims=(200, 200, 100, 50),
    num_blocks=5,
    context_dim=context_dim,
    conditional=True,
)

# embedding_size equals context_dim, matching the flow's context width.
treeflow = TreeFlowWithoutShallow(tree, flow, embedding_size=context_dim)

In [8]:
# Fit on the inner training split, with the validation split passed alongside it.
# .values hands plain NumPy arrays to fit; %time reports the cost of the whole call.
%time treeflow.fit(x_tr.values, y_tr.values, x_val.values, y_val.values, n_epochs=50, batch_size=2048, verbose=True)

0:	learn: 0.4892131	total: 49.9ms	remaining: 9.93s
1:	learn: 0.4702040	total: 52.4ms	remaining: 5.19s
2:	learn: 0.4544002	total: 55.8ms	remaining: 3.66s
3:	learn: 0.4362039	total: 57.8ms	remaining: 2.83s
4:	learn: 0.4213793	total: 59.6ms	remaining: 2.33s
5:	learn: 0.4113954	total: 61.9ms	remaining: 2s
6:	learn: 0.3993450	total: 66.3ms	remaining: 1.83s
7:	learn: 0.3846910	total: 68.3ms	remaining: 1.64s
8:	learn: 0.3729961	total: 70.7ms	remaining: 1.5s
9:	learn: 0.3596475	total: 72.8ms	remaining: 1.38s
10:	learn: 0.3505021	total: 75.3ms	remaining: 1.29s
11:	learn: 0.3393932	total: 77.3ms	remaining: 1.21s
12:	learn: 0.3286371	total: 79.1ms	remaining: 1.14s
13:	learn: 0.3183802	total: 81.1ms	remaining: 1.08s
14:	learn: 0.3108530	total: 83.5ms	remaining: 1.03s
15:	learn: 0.2997345	total: 85.6ms	remaining: 984ms
16:	learn: 0.2912839	total: 88.1ms	remaining: 949ms
17:	learn: 0.2822854	total: 90.5ms	remaining: 915ms
18:	learn: 0.2744765	total: 93.1ms	remaining: 887ms
19:	learn: 0.2648132	total

162:	learn: -0.1042417	total: 446ms	remaining: 101ms
163:	learn: -0.1046429	total: 448ms	remaining: 98.3ms
164:	learn: -0.1051321	total: 450ms	remaining: 95.5ms
165:	learn: -0.1058995	total: 452ms	remaining: 92.6ms
166:	learn: -0.1069022	total: 454ms	remaining: 89.7ms
167:	learn: -0.1075469	total: 457ms	remaining: 87ms
168:	learn: -0.1084916	total: 459ms	remaining: 84.3ms
169:	learn: -0.1091531	total: 462ms	remaining: 81.5ms
170:	learn: -0.1096737	total: 464ms	remaining: 78.7ms
171:	learn: -0.1107126	total: 467ms	remaining: 76ms
172:	learn: -0.1112295	total: 469ms	remaining: 73.2ms
173:	learn: -0.1118014	total: 472ms	remaining: 70.5ms
174:	learn: -0.1126708	total: 474ms	remaining: 67.7ms
175:	learn: -0.1134526	total: 476ms	remaining: 64.9ms
176:	learn: -0.1138200	total: 478ms	remaining: 62.1ms
177:	learn: -0.1142904	total: 480ms	remaining: 59.3ms
178:	learn: -0.1149249	total: 482ms	remaining: 56.6ms
179:	learn: -0.1152136	total: 485ms	remaining: 53.9ms
180:	learn: -0.1159609	total: 487

TreeFlowWithoutShallow(embedding_size=3200,
                       flow_model=<src.probabilistic_flow_boosting.tfboost.flow.flow.ContinuousNormalizingFlow object at 0x7f941fec62e8>,
                       tree_model=<src.probabilistic_flow_boosting.tfboost.tree.ecatboost.EmbeddableCatBoostPriorNormal object at 0x7f941fec62b0>)

In [9]:
# Context width given to the flow: num_trees * 2**depth.
print(context_dim)

3200


In [10]:
# Score on the full outer training split. Note that x_train still contains the validation rows
# (x_val was carved out of it), and that DataFrames are passed here rather than .values as in fit.
# calculate_nll is a project helper -- presumably it returns the negative log-likelihood; confirm in reporting.nodes.
%time calculate_nll(treeflow, x_train, y_train, batch_size = 1024)

CPU times: user 16.9 s, sys: 800 ms, total: 17.7 s
Wall time: 11.3 s


-0.9891092

In [11]:
# Score on the held-out test split, which was never passed to fit.
%time calculate_nll(treeflow, x_test, y_test, batch_size = 1024)

CPU times: user 7.8 s, sys: 376 ms, total: 8.18 s
Wall time: 3.13 s


-0.49962202