In [1]:
from typing import Union

import catboost
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import torch.nn as nn

from sklearn.model_selection import train_test_split
from nflows.distributions import ConditionalDiagonalNormal

from src.probabilistic_flow_boosting.tfboost.tree import EmbeddableCatBoostPriorNormal
from src.probabilistic_flow_boosting.tfboost.tfboost import TreeFlowBoost
from src.probabilistic_flow_boosting.tfboost.flow import ContinuousNormalizingFlow
from src.probabilistic_flow_boosting.pipelines.reporting.nodes import calculate_nll

from src.probabilistic_flow_boosting.pipelines.modeling.utils import setup_random_seed

# Single seed for the notebook: handed to setup_random_seed here and reused as
# the CatBoost `random_state` further down.
# NOTE(review): the train/test splits below hard-code random_state=42 instead
# of using this constant.
RANDOM_SEED = 1

# presumably seeds the global RNGs used by the modeling code -- verify in
# pipelines.modeling.utils.setup_random_seed
setup_random_seed(RANDOM_SEED)

In [2]:
# Load the raw wine-review CSV (first column is the index), replace missing
# `country` / `province` values with the empty string, and keep only the rows
# that have a `price`.
df = (
    pd.read_csv('data/01_raw/CatData/wine_reviews/winemag-data_first150k.csv', index_col=0)
    .assign(
        country=lambda frame: frame['country'].fillna(''),
        province=lambda frame: frame['province'].fillna(''),
    )
    .dropna(subset=['price'])
)

In [3]:
# Feature frame: every column of `df` except the target (`price`) and the five
# other columns listed here.
x = df.drop(
    columns=[
        'description',
        'price',
        'designation',
        'region_1',
        'region_2',
        'winery',
    ]
)
# Target: double brackets keep `price` as a one-column DataFrame, not a Series.
y = df[['price']]

In [4]:
# First split: hold out 20% of all rows as the test set.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)
# Second split: carve 20% of the training rows off as a validation set, so
# x_tr / x_val / x_test hold roughly 64% / 16% / 20% of the data.
x_tr, x_val, y_tr, y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Sanity check: (rows, columns) of the train and test feature frames.
x_train.shape, x_test.shape

((109788, 4), (27447, 4))

In [6]:
class TreeFlowWithoutShallow(TreeFlowBoost):
    """TreeFlowBoost subclass whose flow uses ``nn.Identity()`` as context encoder.

    Only ``fit`` is overridden here; everything else is inherited.
    NOTE(review): the name suggests the parent puts a shallow encoder between
    the tree embedding and the flow -- confirm against ``TreeFlowBoost.fit``.
    """

    def fit(self, X: np.ndarray, y: np.ndarray, X_val: Union[np.ndarray, None] = None,
            y_val: Union[np.ndarray, None] = None, n_epochs: int = 100, batch_size: int = 1000, verbose: bool = False):
        """Fit the tree model, then fit the flow model on the tree's outputs.

        Args:
            X: Training features; passed to the tree model's ``fit``, ``embed``
                and ``pred_dist_param``.
            y: Training targets; reshaped to ``(-1, 1)`` unless already 2-D.
            X_val: Optional validation features.
            y_val: Optional validation targets, reshaped like ``y``.
                Validation data is used only when both ``X_val`` and ``y_val``
                are given; otherwise ``None`` is passed to the flow for all
                three validation inputs.
            n_epochs: Forwarded to ``flow_model.fit``.
            batch_size: Forwarded to ``flow_model.fit``.
            verbose: Forwarded to ``flow_model.fit``.

        Returns:
            ``self``.
        """

        def as_column(target):
            # Leave 2-D input untouched; turn anything else into one column.
            if len(target.shape) == 2:
                return target
            return target.reshape(-1, 1)

        self.tree_model.fit(X, y)

        # Tree outputs that condition the flow on the training set.
        train_context = self.tree_model.embed(X)
        train_params = self.tree_model.pred_dist_param(X)
        y = as_column(y)

        if X_val is None or y_val is None:
            # No usable validation pair: drop whichever half was supplied.
            val_context = val_params = y_val = None
        else:
            val_context = self.tree_model.embed(X_val)
            val_params = self.tree_model.pred_dist_param(X_val)
            y_val = as_column(y_val)

        # Pass-through encoder: the flow receives the tree embedding as-is.
        self.flow_model.setup_context_encoder(nn.Identity())

        self.flow_model.fit(
            y, train_context, train_params,
            y_val, val_context, val_params,
            n_epochs=n_epochs, batch_size=batch_size, verbose=verbose,
        )
        return self


In [7]:
# Tree ensemble size. The flow's context width is derived from it:
# 2**depth entries per tree times num_trees trees (= 800 with these values).
# presumably one entry per leaf of each depth-`depth` tree -- verify against
# EmbeddableCatBoostPriorNormal.embed
depth = 3
num_trees = 100
context_dim = num_trees * (2 ** depth)

tree = EmbeddableCatBoostPriorNormal(
    # Positional indices of the categorical columns in `x`.
    cat_features=[0, 2, 3],
    loss_function="RMSEWithUncertainty",
    depth=depth,
    num_trees=num_trees,
    random_state=RANDOM_SEED,
)
flow = ContinuousNormalizingFlow(
    input_dim=1,
    hidden_dims=(200, 200, 100, 50),
    num_blocks=5,
    context_dim=context_dim,
    conditional=True,
)

# embedding_size equals context_dim, matching the identity context encoder
# that TreeFlowWithoutShallow.fit installs.
treeflow = TreeFlowWithoutShallow(tree, flow, embedding_size=context_dim)

In [8]:
# Show the context width computed as num_trees * 2**depth.
print(context_dim)

800


In [9]:
# Train on the x_tr / y_tr split, with x_val / y_val as validation data.
# `.values` passes plain NumPy arrays, matching the np.ndarray annotations on
# fit. `%time` reports how long this single call takes.
%time treeflow.fit(x_tr.values, y_tr.values, x_val.values, y_val.values, n_epochs=10, batch_size=2048, verbose=True)

0:	learn: 4.8688980	total: 54.8ms	remaining: 5.42s
1:	learn: 4.8202157	total: 61.3ms	remaining: 3s
2:	learn: 4.7966345	total: 66.7ms	remaining: 2.16s
3:	learn: 4.7723665	total: 72.2ms	remaining: 1.73s
4:	learn: 4.7528980	total: 79.2ms	remaining: 1.5s
5:	learn: 4.7374487	total: 85.4ms	remaining: 1.34s
6:	learn: 4.7239708	total: 91.1ms	remaining: 1.21s
7:	learn: 4.7077963	total: 97ms	remaining: 1.11s
8:	learn: 4.6959009	total: 103ms	remaining: 1.04s
9:	learn: 4.6824217	total: 109ms	remaining: 978ms
10:	learn: 4.6691157	total: 115ms	remaining: 927ms
11:	learn: 4.6575242	total: 120ms	remaining: 882ms
12:	learn: 4.6476490	total: 128ms	remaining: 855ms
13:	learn: 4.6366159	total: 135ms	remaining: 828ms
14:	learn: 4.6277618	total: 140ms	remaining: 796ms
15:	learn: 4.6170499	total: 147ms	remaining: 770ms
16:	learn: 4.6090884	total: 153ms	remaining: 745ms
17:	learn: 4.5992576	total: 158ms	remaining: 722ms
18:	learn: 4.5899134	total: 164ms	remaining: 701ms
19:	learn: 4.5834340	total: 172ms	remai

TreeFlowWithoutShallow(embedding_size=800,
                       flow_model=<src.probabilistic_flow_boosting.tfboost.flow.flow.ContinuousNormalizingFlow object at 0x7f10e9d1d550>,
                       tree_model=<src.probabilistic_flow_boosting.tfboost.tree.ecatboost.EmbeddableCatBoostPriorNormal object at 0x7f10e5a85eb8>)

In [10]:
# NLL on the full training portion. x_train is the union of x_tr and x_val,
# so this figure includes the rows used for validation during fitting.
# NOTE(review): DataFrames are passed here whereas fit() received `.values`
# -- confirm calculate_nll handles both.
%time calculate_nll(treeflow, x_train, y_train, batch_size = 1024)

CPU times: user 54.9 s, sys: 1.33 s, total: 56.2 s
Wall time: 47.3 s


3.8344364

In [11]:
# NLL on the held-out test split (rows never passed to treeflow.fit).
%time calculate_nll(treeflow, x_test, y_test, batch_size = 1024)

CPU times: user 18.3 s, sys: 569 ms, total: 18.9 s
Wall time: 12.2 s


3.8115282