In [1]:
import catboost
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import sweetviz

from sklearn.model_selection import train_test_split
from nflows.distributions import ConditionalDiagonalNormal

from src.probabilistic_flow_boosting.tfboost.tree import EmbeddableCatBoostPriorNormal, EmbeddableOneHotEncoder
from src.probabilistic_flow_boosting.tfboost.tfboost import TreeFlowBoost
from src.probabilistic_flow_boosting.tfboost.flow import ContinuousNormalizingFlow
from src.probabilistic_flow_boosting.pipelines.reporting.nodes import calculate_nll

from src.probabilistic_flow_boosting.pipelines.modeling.utils import setup_random_seed

# Single seed shared by the project's RNG setup and by both train_test_split calls below.
RANDOM_SEED = 5

# Project helper from pipelines.modeling.utils.
# NOTE(review): presumably seeds the Python / NumPy / torch RNGs — confirm in the helper.
setup_random_seed(RANDOM_SEED)

  the_open = pkg_resources.open_text("sweetviz", 'sweetviz_defaults.ini')


In [2]:
# Location of the raw avocado CSV, relative to the working directory.
AVOCADO_CSV = 'data/01_raw/CatData/avocado/avocado.csv'

# The file's first column is used as the row index rather than as a feature.
df = pd.read_csv(AVOCADO_CSV, index_col=0)

In [3]:
# Column the model is asked to predict.
TARGET_COLUMN = 'AveragePrice'

# Features: every column except the raw date and the target itself.
x = df.drop(columns=['Date', TARGET_COLUMN])
# A list selector keeps y as a one-column DataFrame instead of a Series.
y = df[[TARGET_COLUMN]]

In [4]:
# Both splits use the same 20% hold-out fraction and the same seed.
split_options = dict(test_size=0.2, random_state=RANDOM_SEED)

# First split: hold out the test set.
x_train, x_test, y_train, y_test = train_test_split(x, y, **split_options)
# Second split: carve a validation set out of the remaining training rows.
x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, **split_options)

In [5]:
# Sanity check of the first split: (rows, columns) of the train and test feature frames.
x_train.shape, x_test.shape

((14599, 11), (3650, 11))

In [6]:
# Size of the context vector. The same value is given to the flow as
# context_dim and to TreeFlowBoost as embedding_size.
# NOTE(review): presumably these two must agree — confirm in tfboost.
EMBEDDING_SIZE = 100

# Feature encoder; categories unseen during fitting are ignored rather than raising.
tree = EmbeddableOneHotEncoder(handle_unknown='ignore')

# Conditional flow over a one-dimensional target.
flow = ContinuousNormalizingFlow(
    input_dim=1,
    hidden_dims=(200, 200, 100, 50),
    num_blocks=5,
    context_dim=EMBEDDING_SIZE,
    conditional=True,
)

# Combined model: encoder output conditions the flow.
treeflow = TreeFlowBoost(tree, flow, embedding_size=EMBEDDING_SIZE)

In [7]:
# Fit on the inner training split (x_tr / y_tr) with x_val / y_val passed as validation data;
# .values hands plain NumPy arrays to fit(). %time is a line magic, so the call stays on one line.
# NOTE(review): in the saved log the val loss is lowest at the third report (~0.435) and rises
# afterwards while the train loss keeps falling — looks like overfitting; confirm whether fit()
# keeps the best-validation weights.
%time treeflow.fit(x_tr.values, y_tr.values, x_val.values, y_val.values, n_epochs=50, batch_size=2048, verbose=True)

train loss: 0.7906872034072876
val loss: 0.5152111649513245
train loss: 0.41929954290390015
val loss: 0.46670013666152954
train loss: 0.11511001735925674
val loss: 0.43532973527908325
train loss: -0.3087172210216522
val loss: 0.8091915845870972
train loss: -0.7739049196243286
val loss: 1.4377697706222534
train loss: -1.3526742458343506
val loss: 2.2293899059295654
train loss: -1.0905277729034424
val loss: 1.1892861127853394
train loss: -0.9489802718162537
val loss: 1.0897233486175537
train loss: -1.143115520477295
val loss: 1.49009108543396
train loss: -1.338901400566101
val loss: 1.7425740957260132
train loss: -1.4264782667160034
val loss: 1.709668755531311
train loss: -1.3676371574401855
val loss: 1.5509556531906128
train loss: -1.3279303312301636
val loss: 1.5048450231552124
train loss: -1.2836774587631226
val loss: 1.5272753238677979
train loss: -1.3537825345993042
val loss: 1.6036006212234497
train loss: -1.3392995595932007
val loss: 1.6194915771484375
train loss: -1.3009791374206

TreeFlowBoost(embedding_size=100,
              flow_model=<src.probabilistic_flow_boosting.tfboost.flow.flow.ContinuousNormalizingFlow object at 0x7f299caaf710>,
              tree_model=EmbeddableOneHotEncoder(handle_unknown='ignore'))

In [8]:
# Display the flow model's context encoder module after fitting.
# NOTE(review): the saved output shows Linear(in_features=78795, out_features=100); presumably the
# one-hot encoder is also expanding the continuous columns — confirm this is intended.
treeflow.flow_model.context_encoder

Sequential(
  (0): Linear(in_features=78795, out_features=100, bias=True)
  (1): Tanh()
)

In [9]:
# Timed NLL evaluation on x_train / y_train. Note that x_train is the full training split,
# i.e. it contains both the rows used for fitting (x_tr) and the validation rows (x_val).
%time calculate_nll(treeflow, x_train, y_train, batch_size = 1024)

CPU times: user 8.08 s, sys: 5.12 s, total: 13.2 s
Wall time: 5.29 s


-0.9765982

In [10]:
# Timed NLL evaluation on the held-out test split, which was never passed to fit().
%time calculate_nll(treeflow, x_test, y_test, batch_size = 1024)

CPU times: user 4.37 s, sys: 1.32 s, total: 5.69 s
Wall time: 1.46 s


1.4423742