In [1]:
import catboost
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import sweetviz

from sklearn.model_selection import train_test_split
from nflows.distributions import ConditionalDiagonalNormal

from src.probabilistic_flow_boosting.tfboost.tree import EmbeddableCatBoostPriorNormal, EmbeddableOneHotEncoder
from src.probabilistic_flow_boosting.tfboost.tfboost import TreeFlowBoost
from src.probabilistic_flow_boosting.tfboost.flow import ContinuousNormalizingFlow
from src.probabilistic_flow_boosting.pipelines.reporting.nodes import calculate_nll

from src.probabilistic_flow_boosting.pipelines.modeling.utils import setup_random_seed

# Single seed for the whole notebook: passed to setup_random_seed below and reused
# as random_state for both train_test_split calls further down.
RANDOM_SEED = 5

# NOTE(review): presumably seeds the RNGs of the libraries used for training —
# confirm in pipelines.modeling.utils.
setup_random_seed(RANDOM_SEED)

  the_open = pkg_resources.open_text("sweetviz", 'sweetviz_defaults.ini')


In [2]:
# Load the raw PakWheels CSV; the file's first column becomes the DataFrame index.
df = pd.read_csv(
    'data/01_raw/CatData/pak-wheels/PakWheelsDataSet.csv',
    index_col=0,
)

In [3]:
# Features: every column except 'Name' and the target source column 'Price'.
x = df.drop(columns=['Name', 'Price'])

# log10 maps a zero price to -inf and a negative or missing price to NaN; either
# would silently corrupt the regression target, so fail fast before transforming.
assert (df['Price'] > 0).all(), "Price must be strictly positive before taking log10"

# Target: log10(Price). The double brackets keep y a one-column DataFrame.
y = np.log10(df[['Price']])

In [4]:
# Both splits hold out 20% and use the same seed.
split_opts = dict(test_size=0.2, random_state=RANDOM_SEED)

# Stage 1: hold out the test set from the full data.
x_train, x_test, y_train, y_test = train_test_split(x, y, **split_opts)

# Stage 2: carve the validation set out of the training portion.
# x_train / y_train are left intact, so they still contain the validation rows.
x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, **split_opts)

In [5]:
# Display (rows, columns) of the train and test feature frames as a check on the split.
x_train.shape, x_test.shape

((61352, 7), (15338, 7))

In [6]:
# The original passed the literal 100 both as the flow's context_dim and as
# TreeFlowBoost's embedding_size; one constant keeps the two values in step.
# NOTE(review): confirm the two are actually required to match.
EMBEDDING_SIZE = 100

# Encoder used as the "tree" component of the model.
# NOTE(review): handle_unknown='ignore' presumably follows sklearn's OneHotEncoder
# semantics — confirm in tfboost.tree.
tree = EmbeddableOneHotEncoder(handle_unknown='ignore')

# Conditional flow over a 1-dimensional input (y is a single column).
flow = ContinuousNormalizingFlow(
    input_dim=1,
    hidden_dims=(100, 100, 50),
    context_dim=EMBEDDING_SIZE,
    conditional=True,
)

# Combine the encoder and the flow into the final estimator.
treeflow = TreeFlowBoost(tree, flow, embedding_size=EMBEDDING_SIZE)

In [7]:
# Fit on the inner training split (x_tr / y_tr) and pass the validation split alongside;
# with verbose=True the run prints 'train loss' / 'val loss' lines.
# Inputs are handed over as raw NumPy arrays (.values); %time reports the cost of the call.
%time treeflow.fit(x_tr.values, y_tr.values, x_val.values, y_val.values, n_epochs=50, batch_size=2048, verbose=True)

train loss: 0.44481128454208374
val loss: 0.34301191568374634
train loss: 0.038846708834171295
val loss: 0.05016196519136429
train loss: -0.46047359704971313
val loss: -0.4586866497993469
train loss: -0.8465131521224976
val loss: -0.8040804266929626
train loss: -1.1236224174499512
val loss: -1.0377390384674072
train loss: -1.2605621814727783
val loss: -1.1550791263580322
train loss: -1.3348548412322998
val loss: -1.2264596223831177
train loss: -1.3685139417648315
val loss: -1.2428936958312988
train loss: -1.410986304283142
val loss: -1.2683095932006836
train loss: -1.4426792860031128
val loss: -1.2816362380981445
train loss: -1.4583978652954102
val loss: -1.2891217470169067
train loss: -1.487503170967102
val loss: -1.2996892929077148
train loss: -1.4975303411483765
val loss: -1.2928766012191772
train loss: -1.492586612701416
val loss: -1.3095659017562866
train loss: -1.5088372230529785
val loss: -1.314454197883606
train loss: -1.5323435068130493
val loss: -1.3472540378570557
train loss

TreeFlowBoost(embedding_size=100,
              flow_model=<src.probabilistic_flow_boosting.tfboost.flow.flow.ContinuousNormalizingFlow object at 0x7fee3cda6fd0>,
              tree_model=EmbeddableOneHotEncoder(handle_unknown='ignore'))

In [8]:
# Inspect the fitted flow's context encoder by displaying its repr (layer structure).
treeflow.flow_model.context_encoder

Sequential(
  (0): Linear(in_features=6382, out_features=100, bias=True)
  (1): Tanh()
)

In [9]:
# NLL on the full training portion. x_train includes the rows used as validation
# data during fitting (x_val was split out of x_train), so this is not a pure
# "seen during gradient updates" score.
%time calculate_nll(treeflow, x_train, y_train, batch_size = 1024)

CPU times: user 14.5 s, sys: 2.02 s, total: 16.5 s
Wall time: 7.68 s


-1.7017101

In [10]:
# NLL on the held-out test split, which was not passed to fit.
%time calculate_nll(treeflow, x_test, y_test, batch_size = 1024)

CPU times: user 4.79 s, sys: 544 ms, total: 5.34 s
Wall time: 1.92 s


-1.3777525