In [1]:
import catboost
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import sweetviz

from sklearn.model_selection import train_test_split
from nflows.distributions import ConditionalDiagonalNormal

from src.probabilistic_flow_boosting.tfboost.tree import EmbeddableCatBoostPriorNormal, EmbeddableOneHotEncoder
from src.probabilistic_flow_boosting.tfboost.tfboost import TreeFlowBoost
from src.probabilistic_flow_boosting.tfboost.flow import ContinuousNormalizingFlow
from src.probabilistic_flow_boosting.pipelines.reporting.nodes import calculate_nll

from src.probabilistic_flow_boosting.pipelines.modeling.utils import setup_random_seed

# Seed for this run; handed to the project's seeding helper below.
RANDOM_SEED = 5

# NOTE(review): presumably seeds the RNGs used during training -- confirm in
# pipelines.modeling.utils. The train_test_split calls in this notebook pass
# their own hard-coded random_state and do not read RANDOM_SEED.
setup_random_seed(RANDOM_SEED)

In [2]:
# Raw diamonds dataset CSV; the path is relative to the working directory.
raw_csv_path = 'data/01_raw/CatData/diamonds2/diamonds_dataset.csv'
df = pd.read_csv(raw_csv_path)

In [3]:
# Columns that are not model inputs: the target 'price' plus 'id', 'url' and
# 'date_fetched' (presumably record metadata -- confirm against the dataset).
excluded_columns = ['id', 'url', 'price', 'date_fetched']
x = df.drop(columns=excluded_columns)

# Target is log10(price); the double brackets keep it a one-column DataFrame.
target = df[['price']]
y = np.log10(target)

In [4]:
# Both splits hold out 20% and use the same fixed random_state.
split_options = dict(test_size=0.2, random_state=42)

# First split: full data -> train / test.
x_train, x_test, y_train, y_test = train_test_split(x, y, **split_options)

# Second split: train -> fitting subset / validation subset.
# x_train and y_train are left intact, so they still contain the validation rows.
x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, **split_options)

In [5]:
# Sanity check: display the shapes of the train and test feature frames.
x_train.shape, x_test.shape

((95445, 7), (23862, 7))

In [6]:
# Single name for the value 100 that is passed both as the flow's context_dim
# and as TreeFlowBoost's embedding_size.
# NOTE(review): presumably these two must agree -- confirm in TreeFlowBoost.
embedding_dim = 100

# Named `tree` because it is passed as TreeFlowBoost's tree model, although
# here it is a one-hot encoder.
tree = EmbeddableOneHotEncoder(handle_unknown='ignore')

flow = ContinuousNormalizingFlow(
    input_dim=1,
    hidden_dims=(100, 100, 50),
    context_dim=embedding_dim,
    num_blocks=5,
    conditional=True,
)

treeflow = TreeFlowBoost(tree, flow, embedding_size=embedding_dim)

In [7]:
# Fit on the x_tr/y_tr subset, with x_val/y_val passed alongside; the frames
# are handed over as raw arrays via .values. %time is a line magic, so the
# whole call has to stay on this one line.
%time treeflow.fit(x_tr.values, y_tr.values, x_val.values, y_val.values, n_epochs=30, batch_size=2048, verbose=True)

train loss: 0.49053284525871277
val loss: 0.4877220392227173
train loss: -0.12036976963281631
val loss: -0.09005609154701233
train loss: -1.3124052286148071
val loss: -1.2509196996688843
train loss: -1.465296983718872
val loss: -1.4238975048065186
train loss: -1.5932127237319946
val loss: -1.5700442790985107
train loss: -1.6743978261947632
val loss: -1.6016372442245483
train loss: -1.7202945947647095
val loss: -1.6876670122146606
train loss: -1.7367184162139893
val loss: -1.6935948133468628
train loss: -1.7682219743728638
val loss: -1.6992747783660889
train loss: -1.7774919271469116
val loss: -1.7305521965026855
train loss: -1.8075511455535889
val loss: -1.754086971282959
train loss: -1.822334885597229
val loss: -1.7573074102401733
train loss: -1.8080549240112305
val loss: -1.7386784553527832
train loss: -1.8101378679275513
val loss: -1.7380850315093994
train loss: -1.8186490535736084
val loss: -1.8118560314178467
train loss: -1.8749128580093384
val loss: -1.767313003540039
train loss:

TreeFlowBoost(embedding_size=100,
              flow_model=<src.probabilistic_flow_boosting.tfboost.flow.flow.ContinuousNormalizingFlow object at 0x7f932a6f7c18>,
              tree_model=EmbeddableOneHotEncoder(handle_unknown='ignore'))

In [8]:
# Display the context encoder attached to the fitted model's flow.
treeflow.flow_model.context_encoder

Sequential(
  (0): Linear(in_features=502, out_features=100, bias=True)
  (1): Tanh()
)

In [9]:
# calculate_nll on the full training split. x_train/y_train were split again
# into x_tr/x_val for fitting, so this figure also covers the validation rows.
%time calculate_nll(treeflow, x_train, y_train, batch_size = 1024)

CPU times: user 18 s, sys: 522 ms, total: 18.6 s
Wall time: 14.7 s


-1.9325382

In [10]:
# Same metric on the test split, which was not passed to treeflow.fit.
%time calculate_nll(treeflow, x_test, y_test, batch_size = 1024)

CPU times: user 6.8 s, sys: 181 ms, total: 6.98 s
Wall time: 3.71 s


-1.8576276