In [1]:
import catboost
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import sweetviz

from sklearn.model_selection import train_test_split
from nflows.distributions import ConditionalDiagonalNormal

from src.probabilistic_flow_boosting.tfboost.tree import EmbeddableCatBoostPriorNormal, EmbeddableOneHotEncoder
from src.probabilistic_flow_boosting.tfboost.tfboost import TreeFlowBoost
from src.probabilistic_flow_boosting.tfboost.flow import ContinuousNormalizingFlow
from src.probabilistic_flow_boosting.pipelines.reporting.nodes import calculate_nll

from src.probabilistic_flow_boosting.pipelines.modeling.utils import setup_random_seed

# Seed value handed to the project's seeding helper for this notebook run.
# NOTE(review): the train_test_split calls further down pass their own
# random_state=42 and do not read this constant.
RANDOM_SEED = 6

# Project helper from pipelines.modeling.utils; presumably seeds the RNGs of the
# libraries used for training — TODO confirm which ones it covers.
setup_random_seed(RANDOM_SEED)

  the_open = pkg_resources.open_text("sweetviz", 'sweetviz_defaults.ini')


In [2]:
# Load the raw Sydney house price table; the path is relative to the
# directory the notebook is executed from.
raw_prices_csv = 'data/01_raw/CatData/sydney_house/SydneyHousePrices.csv'
df = pd.read_csv(raw_prices_csv)

In [3]:
# Features: every column except the sale date, the row id and the target itself.
excluded_columns = ['Date', 'Id', 'sellPrice']
x = df.drop(columns=excluded_columns)

# Target: base-10 logarithm of the sale price. The double brackets select a
# one-column DataFrame, so y stays two-dimensional.
sell_price = df[['sellPrice']]
y = np.log10(sell_price)

In [4]:
# Both splits hold out 20% of their input and use the same fixed seed.
split_options = dict(test_size=0.2, random_state=42)

# First split: full data -> train / test.
x_train, x_test, y_train, y_test = train_test_split(x, y, **split_options)

# Second split: train -> fitting subset (x_tr) / validation subset (x_val).
x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, **split_options)

In [5]:
# Sanity check: (rows, columns) of the train and test feature frames.
tuple(features.shape for features in (x_train, x_test))

((159603, 6), (39901, 6))

In [6]:
# One value feeds both the flow's context_dim and TreeFlowBoost's embedding_size
# (both were the literal 100). Presumably the two must agree, since the flow is
# conditioned on the embedding — TODO confirm against the tfboost sources.
EMBEDDING_SIZE = 100

# Feature encoder; handle_unknown='ignore' is forwarded unchanged.
tree = EmbeddableOneHotEncoder(handle_unknown='ignore')

# Conditional continuous normalizing flow over the one-dimensional target.
flow_settings = dict(
    input_dim=1,
    hidden_dims=(200, 100, 100, 50),
    num_blocks=5,
    context_dim=EMBEDDING_SIZE,
    conditional=True,
)
flow = ContinuousNormalizingFlow(**flow_settings)

# Combine encoder and flow into the model that is fitted below.
treeflow = TreeFlowBoost(tree, flow, embedding_size=EMBEDDING_SIZE)

In [7]:
# Fit the model on the x_tr / y_tr subset, also passing the x_val / y_val subset;
# `.values` hands the underlying NumPy arrays (not the DataFrames) to fit.
# Runs 30 epochs with batches of 1024; with verbose=True the recorded output
# shows "train loss" / "val loss" lines.
# `%time` is an IPython line magic: the timed call has to stay on this one line.
%time treeflow.fit(x_tr.values, y_tr.values, x_val.values, y_val.values, n_epochs=30, batch_size=1024, verbose=True)

train loss: -0.5944143533706665
val loss: -0.5773606896400452
train loss: -0.6990615129470825
val loss: -0.6973952651023865
train loss: -0.6980648636817932
val loss: -0.702696681022644
train loss: -0.7341952919960022
val loss: -0.7150513529777527
train loss: -0.7289525866508484
val loss: -0.7285265326499939
train loss: -0.7520254254341125
val loss: -0.7270306944847107
train loss: -0.7512805461883545
val loss: -0.7472221851348877
train loss: -0.7484387755393982
val loss: -0.7244043946266174
train loss: -0.7585622668266296
val loss: -0.7461666464805603
train loss: -0.7700321674346924
val loss: -0.7539765238761902
train loss: -0.76412433385849
val loss: -0.7553876042366028
train loss: -0.7613753080368042
val loss: -0.7377872467041016
train loss: -0.7848649024963379
val loss: -0.75248122215271
train loss: -0.7779479622840881
val loss: -0.7524102330207825
train loss: -0.7632037997245789
val loss: -0.7426278591156006
train loss: -0.7723843455314636
val loss: -0.7416003346443176
train loss: -

TreeFlowBoost(embedding_size=100,
              flow_model=<src.probabilistic_flow_boosting.tfboost.flow.flow.ContinuousNormalizingFlow object at 0x7f93bb412fd0>,
              tree_model=EmbeddableOneHotEncoder(handle_unknown='ignore'))

In [8]:
# Display the context encoder held by the flow model of the fitted TreeFlowBoost.
fitted_flow = treeflow.flow_model
fitted_flow.context_encoder

Sequential(
  (0): Linear(in_features=983, out_features=100, bias=True)
  (1): Tanh()
)

In [9]:
# Score the fitted model on x_train / y_train with the project's calculate_nll
# helper (presumably a negative log-likelihood — confirm in reporting.nodes).
# x_train is the full training split, i.e. it contains both the rows used for
# fitting (x_tr) and the validation rows (x_val).
# y is log10(sellPrice), so the score refers to the log10-price scale.
# NOTE(review): DataFrames are passed here, whereas fit received `.values` arrays.
%time calculate_nll(treeflow, x_train, y_train, batch_size = 1024)

CPU times: user 38.3 s, sys: 1.31 s, total: 39.6 s
Wall time: 33.7 s


-0.78461576

In [10]:
# Score the fitted model on the held-out x_test / y_test split with the project's
# calculate_nll helper (presumably a negative log-likelihood — confirm in
# reporting.nodes), evaluated with batch_size=1024.
# y is log10(sellPrice), so the score refers to the log10-price scale.
# NOTE(review): DataFrames are passed here, whereas fit received `.values` arrays.
%time calculate_nll(treeflow, x_test, y_test, batch_size = 1024)

CPU times: user 10.7 s, sys: 405 ms, total: 11.2 s
Wall time: 7.41 s


-0.74705976