In [1]:
import catboost
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import sweetviz

from sklearn.model_selection import train_test_split
from nflows.distributions import ConditionalDiagonalNormal

from src.probabilistic_flow_boosting.tfboost.tree import EmbeddableCatBoostPriorNormal, EmbeddableOneHotEncoder
from src.probabilistic_flow_boosting.tfboost.tfboost import TreeFlowBoost
from src.probabilistic_flow_boosting.tfboost.flow import ContinuousNormalizingFlow
from src.probabilistic_flow_boosting.pipelines.reporting.nodes import calculate_nll

from src.probabilistic_flow_boosting.pipelines.modeling.utils import setup_random_seed

# Single seed reused by setup_random_seed and by both train_test_split calls.
RANDOM_SEED = 5

# NOTE(review): presumably seeds the random number generators used by the
# project code — confirm in pipelines.modeling.utils.
setup_random_seed(RANDOM_SEED)

In [2]:
# Load the raw diamonds table; the first CSV column becomes the row index.
df = pd.read_csv(
    'data/01_raw/CatData/diamonds/diamonds.csv',
    index_col=0,
)

In [3]:
# Target: base-10 log of price. Double brackets keep it a one-column
# DataFrame rather than a Series.
y = np.log10(df[['price']])

# Features: every remaining column once the target column is dropped.
x = df.drop(columns=['price'])

In [4]:
# First split: hold out 20% of all rows as the test set.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=RANDOM_SEED,
)

# Second split: carve 20% out of the training rows (same seed) so that
# x_tr/y_tr and x_val/y_val are disjoint parts of x_train/y_train.
x_tr, x_val, y_tr, y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=RANDOM_SEED,
)

In [5]:
# Sanity check: display the (rows, columns) shape of the train and test features.
x_train.shape, x_test.shape

((43152, 9), (10788, 9))

In [6]:
# The same width is passed to the flow (context_dim) and to TreeFlowBoost
# (embedding_size), so it is named once here and reused for both.
EMBEDDING_SIZE = 100

# Despite the variable name, the "tree" component in this notebook is a
# one-hot encoder, not a boosted-tree model.
tree = EmbeddableOneHotEncoder(handle_unknown='ignore')

# Conditional flow over a 1-dimensional target.
flow = ContinuousNormalizingFlow(
    input_dim=1,
    hidden_dims=(200, 100, 100, 50),
    num_blocks=5,
    context_dim=EMBEDDING_SIZE,
    conditional=True,
)

treeflow = TreeFlowBoost(tree, flow, embedding_size=EMBEDDING_SIZE)

In [7]:
# Fit on the x_tr/y_tr part, passing plain NumPy arrays (.values) rather than
# DataFrames. verbose=True prints a train/val loss pair as training proceeds.
# NOTE(review): x_val/y_val are presumably what the printed "val loss" is
# computed on — confirm in TreeFlowBoost.fit.
treeflow.fit(
    x_tr.values,
    y_tr.values,
    x_val.values,
    y_val.values,
    n_epochs=30,
    batch_size=1024,
    verbose=True,
)

train loss: 0.25159910321235657
val loss: 0.2535964548587799
train loss: -1.2872874736785889
val loss: -1.2271534204483032
train loss: -1.5062850713729858
val loss: -1.4050979614257812
train loss: -1.576640009880066
val loss: -1.4800983667373657
train loss: -1.6313048601150513
val loss: -1.5697342157363892
train loss: -1.6580137014389038
val loss: -1.5726337432861328
train loss: -1.681809902191162
val loss: -1.5855497121810913
train loss: -1.6887626647949219
val loss: -1.5832712650299072
train loss: -1.7148464918136597
val loss: -1.6190495491027832
train loss: -1.710597038269043
val loss: -1.624446988105774
train loss: -1.7448806762695312
val loss: -1.6535016298294067
train loss: -1.7591785192489624
val loss: -1.6457291841506958
train loss: -1.7624907493591309
val loss: -1.6599860191345215
train loss: -1.7715531587600708
val loss: -1.6820719242095947
train loss: -1.8196821212768555
val loss: -1.7107113599777222
train loss: -1.8270525932312012
val loss: -1.6838397979736328
train loss: -

TreeFlowBoost(embedding_size=100,
              flow_model=<src.probabilistic_flow_boosting.tfboost.flow.flow.ContinuousNormalizingFlow object at 0x7f590ae76358>,
              tree_model=EmbeddableOneHotEncoder(handle_unknown='ignore'))

In [8]:
# Inspect the fitted flow's context encoder (displayed via its repr).
treeflow.flow_model.context_encoder

Sequential(
  (0): Linear(in_features=2010, out_features=100, bias=True)
  (1): Tanh()
)

In [9]:
# Score the fitted model on the full training split (x_train covers both the
# x_tr and x_val rows).
# NOTE(review): going by its name, calculate_nll returns a negative
# log-likelihood — confirm in pipelines.reporting.nodes.
calculate_nll(
    treeflow,
    x_train,
    y_train,
    batch_size=1024,
)

-1.8927356

In [10]:
# Score the fitted model on the held-out test split, which was not passed
# to fit().
# NOTE(review): going by its name, calculate_nll returns a negative
# log-likelihood — confirm in pipelines.reporting.nodes.
calculate_nll(
    treeflow,
    x_test,
    y_test,
    batch_size=1024,
)

-1.7069154