In [1]:
import catboost
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import sweetviz

from sklearn.model_selection import train_test_split
from nflows.distributions import ConditionalDiagonalNormal

from src.probabilistic_flow_boosting.tfboost.tree import EmbeddableCatBoostPriorNormal, EmbeddableOneHotEncoder
from src.probabilistic_flow_boosting.tfboost.tfboost import TreeFlowBoost
from src.probabilistic_flow_boosting.tfboost.flow import ContinuousNormalizingFlow
from src.probabilistic_flow_boosting.pipelines.reporting.nodes import calculate_nll

from src.probabilistic_flow_boosting.pipelines.modeling.utils import setup_random_seed

# Seed handed to the project's seeding helper below.
RANDOM_SEED = 5

# Project helper; presumably seeds the RNGs used during training — TODO confirm which libraries it covers.
setup_random_seed(RANDOM_SEED)

In [2]:
# Load the raw laptop price table; the first CSV column is used as the index
# and parsing goes through pandas' Python engine.
df = pd.read_csv(
    'data/01_raw/CatData/laptop/laptop_price.csv',
    index_col=0,
    engine='python',
)

In [3]:
def _strip_unit(column, unit):
    """Return ``column`` as a numeric Series with the literal ``unit`` suffix removed.

    Parameters
    ----------
    column : pandas.Series
        Either raw strings such as ``'1.37kg'`` or an already-numeric Series.
    unit : str
        Literal text to delete from every string value (no regex semantics).

    Returns
    -------
    pandas.Series
        The values converted with ``pd.to_numeric``.

    Notes
    -----
    The ``.str`` accessor raises ``AttributeError`` on numeric data, so the
    stripping step is skipped when the column is already numeric. This makes
    the cell safe to re-run after ``df`` has been converted once.
    """
    if not pd.api.types.is_numeric_dtype(column):
        column = column.str.replace(unit, '', regex=False)
    return pd.to_numeric(column)


# Convert the unit-suffixed text columns to plain numbers (idempotent on re-run).
df['Weight'] = _strip_unit(df['Weight'], 'kg')
df['Ram'] = _strip_unit(df['Ram'], 'GB')

In [4]:
# Features: everything except the 'Product' column and the price column.
x = df.drop(['Product', 'Price_euros'], axis=1)

# Target: base-10 log of the price; the double brackets keep it a one-column DataFrame.
price_frame = df[['Price_euros']]
y = np.log10(price_frame)

In [5]:
# Both splits hold out 20% and use the same fixed random_state (42, not RANDOM_SEED).
split_kwargs = dict(test_size=0.2, random_state=42)

# First split: train vs. held-out test set.
x_train, x_test, y_train, y_test = train_test_split(x, y, **split_kwargs)

# Second split: carve a validation set out of the training portion.
x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, **split_kwargs)

In [6]:
# Sanity check: (rows, columns) of the train and test feature frames.
(x_train.shape, x_test.shape)

((1042, 10), (261, 10))

In [7]:
# One width is used both as the flow's context_dim and as TreeFlowBoost's
# embedding_size (presumably these must agree — TODO confirm).
embedding_dim = 100

# Feature encoder configured with handle_unknown='ignore'.
tree = EmbeddableOneHotEncoder(handle_unknown='ignore')

# Conditional flow over a 1-D target (y has a single column).
flow = ContinuousNormalizingFlow(
    input_dim=1,
    hidden_dims=(100, 100, 50),
    context_dim=embedding_dim,
    conditional=True,
)

# Combine the encoder and the flow into the model that gets fitted below.
treeflow = TreeFlowBoost(tree, flow, embedding_size=embedding_dim)

In [8]:
# Fit on the x_tr/y_tr split, with x_val/y_val passed as the 3rd/4th positional
# arguments (presumably the validation set behind the "val loss" lines — TODO confirm).
# All inputs are handed over as NumPy arrays via .values; verbose=True prints the loss log.
# The %time line magic must stay on a single line.
%time treeflow.fit(x_tr.values, y_tr.values, x_val.values, y_val.values, n_epochs=100, batch_size=1024, verbose=True)

train loss: 3.2222917079925537
val loss: 1.7729004621505737
train loss: 0.38600438833236694
val loss: 0.30256685614585876
train loss: 0.7481980919837952
val loss: 1.1910451650619507
train loss: 1.443959355354309
val loss: 1.712947964668274
train loss: 1.1559293270111084
val loss: 1.1653051376342773
train loss: 0.5228358507156372
val loss: 0.494785338640213
train loss: 0.16350610554218292
val loss: 0.15999746322631836
train loss: 0.06433457136154175
val loss: 0.06044280156493187
train loss: 0.0651116594672203
val loss: 0.05712459236383438
train loss: 0.08496250957250595
val loss: 0.07111535966396332
train loss: 0.09465315192937851
val loss: 0.07099626213312149
train loss: 0.08225344866514206
val loss: 0.04929893836379051
train loss: 0.051612358540296555
val loss: 0.03311000391840935
train loss: 0.02245158702135086
val loss: 0.00802470650523901
train loss: -0.009251898154616356
val loss: -0.015717990696430206
train loss: -0.039429083466529846
val loss: -0.04586835205554962
train loss: -0

TreeFlowBoost(embedding_size=100,
              flow_model=<src.probabilistic_flow_boosting.tfboost.flow.flow.ContinuousNormalizingFlow object at 0x7f36599b20f0>,
              tree_model=EmbeddableOneHotEncoder(handle_unknown='ignore'))

In [9]:
# Display the flow's context encoder module to inspect its layer sizes.
treeflow.flow_model.context_encoder

Sequential(
  (0): Linear(in_features=470, out_features=100, bias=True)
  (1): Tanh()
)

In [10]:
# Timed NLL on the full training split. x_train contains both x_tr and x_val,
# i.e. all rows seen during fitting. The target is log10(price), so the value
# is on that scale. DataFrames are passed directly here (fit received .values).
%time calculate_nll(treeflow, x_train, y_train, batch_size = 1024)

CPU times: user 3.24 s, sys: 56.8 ms, total: 3.3 s
Wall time: 213 ms


-1.4508755

In [11]:
# Timed NLL on the held-out test split (rows not passed to fit).
# The target is log10(price), so the value is on that scale.
%time calculate_nll(treeflow, x_test, y_test, batch_size = 1024)

CPU times: user 1.46 s, sys: 32.5 ms, total: 1.5 s
Wall time: 97.2 ms


-0.40516832