In [1]:
import catboost
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import sweetviz

from sklearn.model_selection import train_test_split
from nflows.distributions import ConditionalDiagonalNormal

from src.probabilistic_flow_boosting.tfboost.tree import EmbeddableCatBoostPriorNormal, EmbeddableOneHotEncoder
from src.probabilistic_flow_boosting.tfboost.tfboost import TreeFlowBoost
from src.probabilistic_flow_boosting.tfboost.flow import ContinuousNormalizingFlow
from src.probabilistic_flow_boosting.pipelines.reporting.nodes import calculate_nll

from src.probabilistic_flow_boosting.pipelines.modeling.utils import setup_random_seed

# Single seed for the whole notebook: passed to the project's seeding helper
# here and reused as random_state by both train_test_split calls further down.
RANDOM_SEED = 5

# NOTE(review): presumably seeds the RNGs of the libraries used for training;
# the helper's body is not visible in this notebook -- confirm which ones.
setup_random_seed(RANDOM_SEED)

In [2]:
# Load the raw CSV and replace missing 'Outlet_Size' entries with an empty
# string; every other column is left exactly as read.
df = pd.read_csv('data/01_raw/CatData/bigmart/bigmart.csv').assign(
    Outlet_Size=lambda frame: frame['Outlet_Size'].fillna('')
)

In [3]:
# Features: every column except 'Item_Identifier' and the sales target.
x = df.drop(['Item_Identifier', 'Item_Outlet_Sales'], axis=1)
# Target: base-10 log of 'Item_Outlet_Sales', kept as a one-column DataFrame
# (list selector) rather than a Series.
y = np.log10(df.loc[:, ['Item_Outlet_Sales']])

In [4]:
# Both splits use the same 80/20 ratio and the same seed.
split_kwargs = dict(test_size=0.2, random_state=RANDOM_SEED)

# Outer split: hold out a test set.
x_train, x_test, y_train, y_test = train_test_split(x, y, **split_kwargs)
# Inner split: carve a validation set out of the training portion.
x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, **split_kwargs)

In [5]:
# Sanity check: (rows, columns) of the outer train and test feature frames.
x_train.shape, x_test.shape

((6818, 10), (1705, 10))

In [6]:
# The original passes the same value (100) as the flow's context_dim and as
# TreeFlowBoost's embedding_size; name it once so the two stay in step.
embedding_dim = 100

# Feature-embedding stage; handle_unknown='ignore' is forwarded to the encoder.
tree = EmbeddableOneHotEncoder(handle_unknown='ignore')

# Conditional flow over the 1-D target.
flow = ContinuousNormalizingFlow(
    input_dim=1,
    hidden_dims=(200, 200, 100, 50),
    num_blocks=5,
    context_dim=embedding_dim,
    conditional=True,
)

# Combine the encoder and the flow into a single estimator.
treeflow = TreeFlowBoost(tree, flow, embedding_size=embedding_dim)

In [7]:
# Fit on the inner training split and monitor on the validation split; the
# frames are converted to NumPy arrays with .values before being passed in.
# NOTE(review): in the recorded log, val loss is lowest (~0.40) at the 6th
# report and rises afterwards while train loss keeps falling -- this looks like
# overfitting. Consider fewer epochs or early stopping if fit() supports it
# (its signature is not visible here).
%time treeflow.fit(x_tr.values, y_tr.values, x_val.values, y_val.values, n_epochs=20, batch_size=1024, verbose=True)

train loss: 1.8014585971832275
val loss: 0.6603795289993286
train loss: 0.7233642339706421
val loss: 0.7000609040260315
train loss: 0.5211040377616882
val loss: 0.5605458617210388
train loss: 0.46064162254333496
val loss: 0.525473415851593
train loss: 0.3091070055961609
val loss: 0.46055570244789124
train loss: 0.053367119282484055
val loss: 0.4002683758735657
train loss: -0.16838619112968445
val loss: 0.5585408806800842
train loss: -0.4493491053581238
val loss: 0.7991998791694641
train loss: -0.8942146897315979
val loss: 1.11782968044281
train loss: -1.0229706764221191
val loss: 1.1887085437774658
train loss: -0.9945207834243774
val loss: 1.3554311990737915
train loss: -1.1414368152618408
val loss: 1.626350998878479
train loss: -1.2368595600128174
val loss: 1.9534368515014648
train loss: -1.245989203453064
val loss: 1.5981853008270264
train loss: -1.2558151483535767
val loss: 1.4987298250198364
train loss: -1.2536827325820923
val loss: 1.4748587608337402
train loss: -1.380138993263244

TreeFlowBoost(embedding_size=100,
              flow_model=<src.probabilistic_flow_boosting.tfboost.flow.flow.ContinuousNormalizingFlow object at 0x7f603e19e080>,
              tree_model=EmbeddableOneHotEncoder(handle_unknown='ignore'))

In [8]:
# Display the fitted flow's context encoder to inspect its layer sizes.
treeflow.flow_model.context_encoder

Sequential(
  (0): Linear(in_features=9831, out_features=100, bias=True)
  (1): Tanh()
)

In [9]:
# Score the fitted model on the outer training split (presumably negative
# log-likelihood, going by the helper's name -- confirm in reporting.nodes).
# Note: x_train/y_train still include the x_val/y_val rows that were only used
# for validation during fit, so this is not purely a "seen during training" score.
calculate_nll(treeflow, x_train, y_train, batch_size = 1024)

-0.6209551

In [10]:
# Same metric on the held-out test split, which was never passed to fit().
calculate_nll(treeflow, x_test, y_test, batch_size = 1024)

1.1821038