In [1]:
import catboost
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import sweetviz

from sklearn.model_selection import train_test_split
from nflows.distributions import ConditionalDiagonalNormal

from src.probabilistic_flow_boosting.tfboost.tree import EmbeddableCatBoostPriorNormal, EmbeddableOneHotEncoder
from src.probabilistic_flow_boosting.tfboost.tfboost import TreeFlowBoost
from src.probabilistic_flow_boosting.tfboost.flow import ContinuousNormalizingFlow
from src.probabilistic_flow_boosting.pipelines.reporting.nodes import calculate_nll

from src.probabilistic_flow_boosting.pipelines.modeling.utils import setup_random_seed

# Seed value handed to the project's seeding helper below.
RANDOM_SEED = 5

# Seed the run before any data splitting or model training happens.
# NOTE(review): which RNGs (numpy / torch / random) this helper seeds is not
# visible from this notebook — confirm in pipelines.modeling.utils.
setup_random_seed(RANDOM_SEED)

  the_open = pkg_resources.open_text("sweetviz", 'sweetviz_defaults.ini')


In [2]:
# Load the wine reviews CSV (first column is the index), blank out missing
# country/province values, and keep only rows that have a price.
df = (
    pd.read_csv(
        'data/01_raw/CatData/wine_reviews/winemag-data_first150k.csv',
        index_col=0,
    )
    .fillna({'country': '', 'province': ''})
    .dropna(subset=['price'])
)

In [3]:
# Target: the price column, kept as a one-column DataFrame.
y = df[['price']]

# Features: every remaining column except the target and the columns
# excluded below.
x = df.drop(
    columns=[
        'description',
        'price',
        'designation',
        'region_1',
        'region_2',
        'winery',
    ]
)

In [4]:
# Hold out 20% of the rows as the test set.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Split a further 20% off the training rows to use as the validation set.
# NOTE(review): both splits use the literal seed 42 rather than RANDOM_SEED —
# confirm this is intentional.
x_tr, x_val, y_tr, y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Sanity check: (rows, columns) of the train and test feature frames.
x_train.shape, x_test.shape

((109788, 4), (27447, 4))

In [6]:
# Feature-embedding component handed to TreeFlowBoost as its tree model.
tree = EmbeddableOneHotEncoder(handle_unknown='ignore')

# Conditional flow over the one-dimensional target.
# context_dim here and embedding_size below are both 100 — presumably they
# have to agree; confirm against TreeFlowBoost before changing either.
flow = ContinuousNormalizingFlow(
    input_dim=1,
    hidden_dims=(100, 100, 50),
    context_dim=100,
    conditional=True,
)

# Combine the embedder and the flow into a single estimator.
treeflow = TreeFlowBoost(
    tree,
    flow,
    embedding_size=100,
)

In [7]:
# Train for 10 epochs on the inner training split, passing the validation
# split alongside; inputs are converted to plain arrays via `.values`.
# NOTE(review): the logged "Loading model from epoch 8." suggests the
# validation loss drives checkpoint selection — confirm in TreeFlowBoost.fit.
%time treeflow.fit(x_tr.values, y_tr.values, x_val.values, y_val.values, n_epochs=10, batch_size=2048, verbose=True)

train loss: 5.142801284790039
val loss: 4.985147476196289
train loss: 4.636468410491943
val loss: 4.588541030883789
train loss: 3.9806461334228516
val loss: 3.963313579559326
train loss: 3.880934476852417
val loss: 3.8397951126098633
train loss: 3.8730380535125732
val loss: 3.8558921813964844
train loss: 3.828885078430176
val loss: 3.8043487071990967
train loss: 3.800755023956299
val loss: 3.7826995849609375
train loss: 3.810332775115967
val loss: 3.7889678478240967
train loss: 3.791163682937622
val loss: 3.768038749694824
train loss: 3.7791450023651123
val loss: 3.7761735916137695
Loading model from epoch 8.
CPU times: user 5min 6s, sys: 8.96 s, total: 5min 15s
Wall time: 3min 24s


TreeFlowBoost(embedding_size=100,
              flow_model=<src.probabilistic_flow_boosting.tfboost.flow.flow.ContinuousNormalizingFlow object at 0x7f9e94bf2b38>,
              tree_model=EmbeddableOneHotEncoder(handle_unknown='ignore'))

In [8]:
# Inspect the flow's context encoder module to see its layers and sizes.
treeflow.flow_model.context_encoder

Sequential(
  (0): Linear(in_features=1064, out_features=100, bias=True)
  (1): Tanh()
)

In [9]:
# Score the fitted model on the full training split (x_train is the union of
# the x_tr and x_val rows used during fitting).
# NOTE(review): DataFrames are passed here, whereas fit() received `.values`
# arrays — confirm calculate_nll handles the conversion.
%time calculate_nll(treeflow, x_train, y_train, batch_size = 1024)

CPU times: user 16.8 s, sys: 894 ms, total: 17.7 s
Wall time: 12.9 s


3.8103957

In [10]:
# Score the fitted model on the held-out test split.
# NOTE(review): DataFrames are passed here, whereas fit() received `.values`
# arrays — confirm calculate_nll handles the conversion.
%time calculate_nll(treeflow, x_test, y_test, batch_size = 1024)

CPU times: user 6.39 s, sys: 337 ms, total: 6.72 s
Wall time: 3.3 s


3.8055122