In [1]:
from typing import Union

import catboost
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import torch.nn as nn

from sklearn.model_selection import train_test_split
from nflows.distributions import ConditionalDiagonalNormal

from src.probabilistic_flow_boosting.tfboost.tree import EmbeddableCatBoostPriorNormal
from src.probabilistic_flow_boosting.tfboost.tfboost import TreeFlowBoost
from src.probabilistic_flow_boosting.tfboost.flow import ContinuousNormalizingFlow
from src.probabilistic_flow_boosting.pipelines.reporting.nodes import calculate_nll

from src.probabilistic_flow_boosting.pipelines.modeling.utils import setup_random_seed

# Seed used by the helper call below and passed as `random_state` to the CatBoost tree model.
RANDOM_SEED = 1

# Project helper from pipelines.modeling.utils.
# NOTE(review): presumably seeds the RNGs used during training (numpy / torch) — confirm.
setup_random_seed(RANDOM_SEED)

In [2]:
# Raw Sydney house price table; the path is relative to the working directory.
SYDNEY_HOUSE_PRICES_CSV = 'data/01_raw/CatData/sydney_house/SydneyHousePrices.csv'
df = pd.read_csv(SYDNEY_HOUSE_PRICES_CSV)

In [3]:
# Columns excluded from the feature matrix: 'Date', 'Id' and the target 'sellPrice'.
non_feature_columns = ['Date', 'Id', 'sellPrice']
x = df.drop(columns=non_feature_columns)

# Target on a log10 scale; the double brackets keep it a one-column DataFrame.
y = df[['sellPrice']].pipe(np.log10)

In [4]:
# Both splits hold out 20% with the same fixed seed.
split_options = dict(test_size=0.2, random_state=42)

# First split: full data -> train / test.
x_train, x_test, y_train, y_test = train_test_split(x, y, **split_options)
# Second split: train -> fitting subset / validation subset.
x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, **split_options)

In [5]:
# Sanity check: (rows, columns) of the train and test feature frames.
tuple(frame.shape for frame in (x_train, x_test))

((159603, 6), (39901, 6))

In [6]:
class TreeFlowWithoutShallow(TreeFlowBoost):
    """TreeFlowBoost variant whose flow uses ``nn.Identity()`` as its context encoder.

    NOTE(review): presumably this replaces the shallow encoder that the parent
    class puts in front of the tree embedding — confirm against
    ``TreeFlowBoost.fit``.
    """

    def fit(self, X: np.ndarray, y: np.ndarray, X_val: Union[np.ndarray, None] = None,
            y_val: Union[np.ndarray, None] = None, n_epochs: int = 100, batch_size: int = 1000, verbose: bool = False):
        """Fit the tree model, then fit the flow on the tree's embedding and parameters.

        Args:
            X: Training features, passed to the tree model.
            y: Training targets; anything that is not 2-D is reshaped to a column.
            X_val: Optional validation features.
            y_val: Optional validation targets. Validation data is used only
                when both ``X_val`` and ``y_val`` are given.
            n_epochs: Forwarded to the flow model's ``fit``.
            batch_size: Forwarded to the flow model's ``fit``.
            verbose: Forwarded to the flow model's ``fit``.

        Returns:
            This instance, to allow call chaining.
        """
        # The tree model sees the targets exactly as passed in (before any reshape).
        self.tree_model.fit(X, y)

        train_context = self.tree_model.embed(X)
        train_params = self.tree_model.pred_dist_param(X)
        if len(y.shape) != 2:
            y = y.reshape(-1, 1)

        val_context = None
        val_params = None
        if X_val is None or y_val is None:
            # A partial validation set is dropped entirely.
            y_val = None
        else:
            val_context = self.tree_model.embed(X_val)
            val_params = self.tree_model.pred_dist_param(X_val)
            if len(y_val.shape) != 2:
                y_val = y_val.reshape(-1, 1)

        # Identity module: returns its input unchanged.
        self.flow_model.setup_context_encoder(nn.Identity())

        self.flow_model.fit(
            y, train_context, train_params,
            y_val, val_context, val_params,
            n_epochs=n_epochs, batch_size=batch_size, verbose=verbose,
        )
        return self


In [7]:
# Tree ensemble size; these two values also determine the flow's context width.
depth = 4
num_trees = 200
# 2**depth entries for each of the num_trees trees.
context_dim = num_trees * (2 ** depth)

tree = EmbeddableCatBoostPriorNormal(
    cat_features=[0, 1, 5],  # positional column indices treated as categorical
    loss_function="RMSEWithUncertainty",
    depth=depth,
    num_trees=num_trees,
    random_state=RANDOM_SEED,
)

flow = ContinuousNormalizingFlow(
    input_dim=1,
    hidden_dims=(200, 200, 100, 50),
    num_blocks=5,
    context_dim=context_dim,
    conditional=True,
)

# The wrapper's embedding size is set to the same width as the flow's context.
treeflow = TreeFlowWithoutShallow(tree, flow, embedding_size=context_dim)

In [8]:
# Width of the context vector handed to the flow (num_trees * 2**depth).
# Left as the cell's last expression so the notebook displays it as the cell output.
context_dim

3200


In [9]:
%time treeflow.fit(x_tr.values, y_tr.values, x_val.values, y_val.values, n_epochs=30, batch_size=1024, verbose=True)

0:	learn: 0.0035320	total: 67.4ms	remaining: 13.4s
1:	learn: -0.0151538	total: 86.4ms	remaining: 8.55s
2:	learn: -0.0336699	total: 104ms	remaining: 6.84s
3:	learn: -0.0495297	total: 117ms	remaining: 5.72s
4:	learn: -0.0647242	total: 130ms	remaining: 5.05s
5:	learn: -0.0762334	total: 145ms	remaining: 4.69s
6:	learn: -0.0890393	total: 157ms	remaining: 4.32s
7:	learn: -0.1007857	total: 167ms	remaining: 4s
8:	learn: -0.1102937	total: 177ms	remaining: 3.76s
9:	learn: -0.1220026	total: 190ms	remaining: 3.62s
10:	learn: -0.1324600	total: 203ms	remaining: 3.48s
11:	learn: -0.1430619	total: 217ms	remaining: 3.39s
12:	learn: -0.1520155	total: 228ms	remaining: 3.28s
13:	learn: -0.1621167	total: 243ms	remaining: 3.22s
14:	learn: -0.1714296	total: 258ms	remaining: 3.19s
15:	learn: -0.1800285	total: 271ms	remaining: 3.11s
16:	learn: -0.1886189	total: 284ms	remaining: 3.06s
17:	learn: -0.1968616	total: 302ms	remaining: 3.05s
18:	learn: -0.2041539	total: 311ms	remaining: 2.97s
19:	learn: -0.2099215	to

171:	learn: -0.4760313	total: 2.13s	remaining: 346ms
172:	learn: -0.4765562	total: 2.14s	remaining: 334ms
173:	learn: -0.4769260	total: 2.15s	remaining: 321ms
174:	learn: -0.4773330	total: 2.16s	remaining: 308ms
175:	learn: -0.4780509	total: 2.17s	remaining: 296ms
176:	learn: -0.4788050	total: 2.18s	remaining: 283ms
177:	learn: -0.4796816	total: 2.19s	remaining: 271ms
178:	learn: -0.4804328	total: 2.2s	remaining: 259ms
179:	learn: -0.4810728	total: 2.22s	remaining: 246ms
180:	learn: -0.4817289	total: 2.23s	remaining: 234ms
181:	learn: -0.4820343	total: 2.24s	remaining: 222ms
182:	learn: -0.4821689	total: 2.25s	remaining: 209ms
183:	learn: -0.4829828	total: 2.26s	remaining: 197ms
184:	learn: -0.4831353	total: 2.27s	remaining: 184ms
185:	learn: -0.4839128	total: 2.29s	remaining: 172ms
186:	learn: -0.4842209	total: 2.3s	remaining: 160ms
187:	learn: -0.4843860	total: 2.31s	remaining: 148ms
188:	learn: -0.4848666	total: 2.33s	remaining: 135ms
189:	learn: -0.4853992	total: 2.34s	remaining: 1

TreeFlowWithoutShallow(embedding_size=3200,
                       flow_model=<src.probabilistic_flow_boosting.tfboost.flow.flow.ContinuousNormalizingFlow object at 0x7efcd994a908>,
                       tree_model=<src.probabilistic_flow_boosting.tfboost.tree.ecatboost.EmbeddableCatBoostPriorNormal object at 0x7efcd994a8d0>)

In [10]:
# Timed NLL on the full training split. x_train / y_train are the 80% split that
# x_tr and x_val were carved from, so this includes the validation rows.
%time calculate_nll(treeflow, x_train, y_train, batch_size = 1024)

CPU times: user 8min 8s, sys: 5.33 s, total: 8min 14s
Wall time: 7min 59s


-0.78106695

In [12]:
# Timed NLL on the held-out test split (the 20% set aside by the first train_test_split).
%time calculate_nll(treeflow, x_test, y_test, batch_size = 1024)

CPU times: user 2min 19s, sys: 1.64 s, total: 2min 21s
Wall time: 2min 11s


-0.7062259