In [1]:
from typing import Union

import catboost
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import torch.nn as nn

from sklearn.model_selection import train_test_split
from nflows.distributions import ConditionalDiagonalNormal

from src.probabilistic_flow_boosting.tfboost.tree import EmbeddableCatBoostPriorNormal
from src.probabilistic_flow_boosting.tfboost.tfboost import TreeFlowBoost
from src.probabilistic_flow_boosting.tfboost.flow import ContinuousNormalizingFlow
from src.probabilistic_flow_boosting.pipelines.reporting.nodes import calculate_nll

from src.probabilistic_flow_boosting.pipelines.modeling.utils import setup_random_seed

# Seed handed to the project's seeding helper and, further down, to the CatBoost
# model's `random_state`. The train/test splits use their own literal
# `random_state=42`, so changing this value does not change the splits.
RANDOM_SEED = 1

# NOTE(review): presumably seeds the RNGs of the libraries used for training —
# confirm in pipelines.modeling.utils.
setup_random_seed(RANDOM_SEED)

In [2]:
# Load the raw PakWheels CSV; its first column becomes the DataFrame index.
df = pd.read_csv(
    'data/01_raw/CatData/pak-wheels/PakWheelsDataSet.csv',
    index_col=0,
)

In [3]:
# Predictors: every column except 'Name' and the target column 'Price'.
x = df.drop(['Name', 'Price'], axis=1)
# Target: base-10 log of 'Price', kept two-dimensional (a one-column DataFrame).
y = np.log10(df.loc[:, ['Price']])

In [4]:
# Outer split: 80% train / 20% test.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)
# Inner split: 20% of the training portion is set aside as validation data.
# `x_train` / `y_train` are left intact, so they still contain the validation rows.
x_tr, x_val, y_tr, y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Display the (rows, columns) of the outer train and test feature frames.
(x_train.shape, x_test.shape)

((61352, 7), (15338, 7))

In [6]:
class TreeFlowWithoutShallow(TreeFlowBoost):
    """TreeFlowBoost variant whose flow context encoder is ``nn.Identity()``.

    ``fit`` trains the tree model, derives the context and distribution
    parameters from it, installs an identity context encoder on the flow and
    then trains the flow.

    NOTE(review): the parent class presumably installs a learned ("shallow")
    encoder at this point instead — confirm in ``TreeFlowBoost.fit``.
    """

    def fit(self, X: np.ndarray, y: np.ndarray, X_val: Union[np.ndarray, None] = None,
            y_val: Union[np.ndarray, None] = None, n_epochs: int = 100, batch_size: int = 1000, verbose: bool = False):
        """Fit the tree model on ``(X, y)``, then fit the flow on the tree's outputs.

        Args:
            X: Training features, passed to ``tree_model.fit`` / ``embed`` /
                ``pred_dist_param``.
            y: Training targets. A non-2-D array is reshaped to one column
                before it is handed to the flow.
            X_val: Optional validation features.
            y_val: Optional validation targets. Validation data is used only
                when both ``X_val`` and ``y_val`` are given; if exactly one is
                given a ``UserWarning`` is emitted and both are ignored.
            n_epochs: Forwarded to ``flow_model.fit``.
            batch_size: Forwarded to ``flow_model.fit``.
            verbose: Forwarded to ``flow_model.fit``.

        Returns:
            ``self``, to allow call chaining.
        """
        import warnings

        has_x_val = X_val is not None
        has_y_val = y_val is not None
        if has_x_val != has_y_val:
            # Previously this case silently trained without validation data.
            warnings.warn(
                "X_val and y_val must be provided together; "
                "validation data is ignored for this fit.",
                UserWarning,
                stacklevel=2,
            )

        # The tree is trained on the training data only; validation data is
        # used solely by the flow stage below.
        self.tree_model.fit(X, y)

        context = self.tree_model.embed(X)
        params = self.tree_model.pred_dist_param(X)
        if len(y.shape) != 2:
            y = y.reshape(-1, 1)

        if has_x_val and has_y_val:
            context_val = self.tree_model.embed(X_val)
            params_val = self.tree_model.pred_dist_param(X_val)
            if len(y_val.shape) != 2:
                y_val = y_val.reshape(-1, 1)
        else:
            context_val = None
            params_val = None
            y_val = None

        # Identity encoder: no additional transformation module is installed
        # between the tree embedding and the flow.
        self.flow_model.setup_context_encoder(nn.Identity())

        self.flow_model.fit(
            y, context, params,
            y_val, context_val, params_val,
            n_epochs=n_epochs,
            batch_size=batch_size,
            verbose=verbose,
        )
        return self


In [7]:
# Tree ensemble hyperparameters; together they determine the context width.
depth = 5
num_trees = 100
# 100 * 2**5 = 3200.
# NOTE(review): presumably one slot per leaf of every tree — confirm against
# the tree model's embed().
context_dim = num_trees * (2 ** depth)

tree = EmbeddableCatBoostPriorNormal(
    # NOTE(review): positional indices into x's columns — confirm they match
    # the column order of the feature frame.
    cat_features=[0, 1, 2, 5],
    loss_function="RMSEWithUncertainty",
    depth=depth,
    num_trees=num_trees,
    random_state=RANDOM_SEED,
)
flow = ContinuousNormalizingFlow(
    input_dim=1,
    hidden_dims=(200, 200, 100, 50),
    num_blocks=5,
    context_dim=context_dim,
    conditional=True,
)

# The flow's context_dim and the wrapper's embedding_size are set to the same value.
treeflow = TreeFlowWithoutShallow(tree, flow, embedding_size=context_dim)

In [8]:
# Sanity check of the context width computed as num_trees * 2**depth.
print(context_dim)

3200


In [9]:
# Train on the inner training split; the validation split is passed along and is
# used only by the flow stage (the tree model is fit on x_tr / y_tr alone).
# The recorded run of this cell took roughly 2h 12min of wall time.
%time treeflow.fit(x_tr.values, y_tr.values, x_val.values, y_val.values, n_epochs=50, batch_size=2048, verbose=True)

0:	learn: 0.2671206	total: 52.5ms	remaining: 5.2s
1:	learn: 0.2335249	total: 58.3ms	remaining: 2.86s
2:	learn: 0.1999514	total: 63ms	remaining: 2.04s
3:	learn: 0.1708209	total: 68.4ms	remaining: 1.64s
4:	learn: 0.1432203	total: 72.9ms	remaining: 1.39s
5:	learn: 0.1176186	total: 77.6ms	remaining: 1.22s
6:	learn: 0.0947260	total: 82.7ms	remaining: 1.1s
7:	learn: 0.0703019	total: 87.8ms	remaining: 1.01s
8:	learn: 0.0476655	total: 92.3ms	remaining: 933ms
9:	learn: 0.0261943	total: 96.9ms	remaining: 872ms
10:	learn: 0.0044785	total: 101ms	remaining: 821ms
11:	learn: -0.0152340	total: 106ms	remaining: 778ms
12:	learn: -0.0347889	total: 111ms	remaining: 742ms
13:	learn: -0.0547275	total: 116ms	remaining: 710ms
14:	learn: -0.0736284	total: 120ms	remaining: 682ms
15:	learn: -0.0938057	total: 125ms	remaining: 658ms
16:	learn: -0.1121780	total: 130ms	remaining: 634ms
17:	learn: -0.1316324	total: 135ms	remaining: 614ms
18:	learn: -0.1489333	total: 139ms	remaining: 595ms
19:	learn: -0.1667219	total

CPU times: user 2h 22min 23s, sys: 1min 7s, total: 2h 23min 30s
Wall time: 2h 12min 10s


TreeFlowWithoutShallow(embedding_size=3200,
                       flow_model=<src.probabilistic_flow_boosting.tfboost.flow.flow.ContinuousNormalizingFlow object at 0x7f615c267e80>,
                       tree_model=<src.probabilistic_flow_boosting.tfboost.tree.ecatboost.EmbeddableCatBoostPriorNormal object at 0x7f615c267e48>)

In [10]:
# Score on the full outer training portion; x_train still contains the rows that
# were split off as x_val, so this is not purely the data the tree was fit on.
# NOTE(review): calculate_nll is presumably a mean negative log-likelihood —
# confirm in pipelines.reporting.nodes.
%time calculate_nll(treeflow, x_train, y_train, batch_size = 1024)

CPU times: user 1min 6s, sys: 1.89 s, total: 1min 8s
Wall time: 58.1 s


-1.7180606

In [11]:
# Score on the held-out test split, which was never passed to treeflow.fit.
%time calculate_nll(treeflow, x_test, y_test, batch_size = 1024)

CPU times: user 21.3 s, sys: 722 ms, total: 22 s
Wall time: 15.7 s


-1.5729513