In [1]:
import warnings

warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import ngboost as ng
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch import optim
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.datasets import load_diabetes

from models.flow import build_model
from tfboost.flow import ContinuousNormalizingFlow
from tfboost.tree.engboost import EmbeddableNGBoost, EmbeddableNGBoost2, EmbeddableNGBoostDecisionPath
from tfboost.tree.ecatboost import EmbeddableCatBoost
from tfboost.tfboost import TreeFlowBoost

In [2]:
x, y  = load_diabetes(return_X_y=True)

x_train, x_test, y_train, y_test = train_test_split(x, y)

# Embeddable CatBoost - N(0, 1) prior

In [3]:
flow = ContinuousNormalizingFlow(build_model(
    input_dim=1,
    hidden_dims=(80, 40),
    context_dim=100,
    conditional=True,
))

tree = EmbeddableCatBoost(max_depth=3)

tfb = TreeFlowBoost(flow_model=flow, tree_model = tree, embedding_size=100)

In [4]:
tfb.fit(x_train, y_train, n_epochs=200)

Learning rate set to 0.032058
0:	learn: 76.5800587	total: 47ms	remaining: 46.9s
1:	learn: 75.6487922	total: 47.4ms	remaining: 23.7s
2:	learn: 74.8746550	total: 47.7ms	remaining: 15.9s
3:	learn: 74.0891555	total: 48ms	remaining: 12s
4:	learn: 73.4654056	total: 48.4ms	remaining: 9.63s
5:	learn: 72.7896600	total: 48.7ms	remaining: 8.07s
6:	learn: 72.0796583	total: 49ms	remaining: 6.94s
7:	learn: 71.3634633	total: 49.2ms	remaining: 6.1s
8:	learn: 70.6331105	total: 49.5ms	remaining: 5.45s
9:	learn: 70.0569238	total: 49.8ms	remaining: 4.92s
10:	learn: 69.4172650	total: 50ms	remaining: 4.5s
11:	learn: 68.9826579	total: 50.3ms	remaining: 4.14s
12:	learn: 68.4232806	total: 50.5ms	remaining: 3.83s
13:	learn: 67.8043415	total: 50.8ms	remaining: 3.57s
14:	learn: 67.3234113	total: 51ms	remaining: 3.35s
15:	learn: 66.7294199	total: 51.3ms	remaining: 3.15s
16:	learn: 66.2373244	total: 51.6ms	remaining: 2.98s
17:	learn: 65.7778657	total: 51.9ms	remaining: 2.83s
18:	learn: 65.4015511	total: 52.2ms	rema

502:	learn: 40.3958077	total: 169ms	remaining: 167ms
503:	learn: 40.3507114	total: 169ms	remaining: 167ms
504:	learn: 40.3235752	total: 170ms	remaining: 166ms
505:	learn: 40.2927164	total: 170ms	remaining: 166ms
506:	learn: 40.2367368	total: 170ms	remaining: 165ms
507:	learn: 40.2288048	total: 170ms	remaining: 165ms
508:	learn: 40.1949172	total: 171ms	remaining: 165ms
509:	learn: 40.1716763	total: 171ms	remaining: 164ms
510:	learn: 40.1606036	total: 171ms	remaining: 164ms
511:	learn: 40.1408086	total: 171ms	remaining: 163ms
512:	learn: 40.1083191	total: 172ms	remaining: 163ms
513:	learn: 40.0902370	total: 172ms	remaining: 163ms
514:	learn: 40.0753794	total: 172ms	remaining: 162ms
515:	learn: 40.0618336	total: 172ms	remaining: 162ms
516:	learn: 40.0426540	total: 173ms	remaining: 161ms
517:	learn: 40.0385259	total: 173ms	remaining: 161ms
518:	learn: 39.9944937	total: 173ms	remaining: 160ms
519:	learn: 39.9699110	total: 173ms	remaining: 160ms
520:	learn: 39.9252769	total: 174ms	remaining:

5.692433834075928: 100%|██████████| 200/200 [01:02<00:00,  3.20it/s] 


TreeFlowBoost(embedding_size=100,
              flow_model=<tfboost.flow.ContinuousNormalizingFlow object at 0x7fe9e0e32a50>,
              tree_model=<tfboost.tree.ecatboost.EmbeddableCatBoost object at 0x7fe9e0e32650>)

In [5]:
print("Train")
y_hat_train_tree = tfb.tree_model.predict(x_train)
print(mean_squared_error(y_train, y_hat_train_tree))

y_hat_train_tfb = tfb.predict(x_train, num_samples=50)
print(mean_squared_error(y_train, y_hat_train_tfb))

print("Test")
y_hat_test_tree = tfb.tree_model.predict(x_test)
print(mean_squared_error(y_test, y_hat_test_tree))

y_hat_test_tfb = tfb.predict(x_test, num_samples=50)
print(mean_squared_error(y_test, y_hat_test_tfb))

Train
947.2042611963451
6130.9361412945045
Test
2557.9949756826877
6154.22866811467


# Embeddable NGBoost - N(0, 1) prior

In [3]:
flow = ContinuousNormalizingFlow(build_model(
    input_dim=1,
    hidden_dims=(80, 40),
    context_dim=100,
    conditional=True,
))

tree = EmbeddableNGBoost()

tfb = TreeFlowBoost(flow_model=flow, tree_model = tree, embedding_size=100)

In [4]:
tfb.fit(x_train, y_train, n_epochs=200)

[iter 0] loss=5.7648 val_loss=0.0000 scale=1.0000 norm=65.9487
[iter 100] loss=5.3968 val_loss=0.0000 scale=1.0000 norm=44.9431
[iter 200] loss=5.1340 val_loss=0.0000 scale=2.0000 norm=71.2632
[iter 300] loss=4.9770 val_loss=0.0000 scale=1.0000 norm=31.9472
[iter 400] loss=4.8716 val_loss=0.0000 scale=1.0000 norm=29.6985


4.881969451904297: 100%|██████████| 200/200 [01:12<00:00,  2.76it/s] 


TreeFlowBoost(embedding_size=100,
              flow_model=<tfboost.flow.ContinuousNormalizingFlow object at 0x7f4f964e7650>,
              tree_model=<tfboost.tree.engboost.EmbeddableNGBoost object at 0x7f4f964e7250>)

In [5]:
print("Train")
y_hat_train_tree = tfb.tree_model.predict(x_train)
print(mean_squared_error(y_train, y_hat_train_tree))

y_hat_train_tfb = tfb.predict(x_train, num_samples=50)
print(mean_squared_error(y_train, y_hat_train_tfb))

print("Test")
y_hat_test_tree = tfb.tree_model.predict(x_test)
print(mean_squared_error(y_test, y_hat_test_tree))

y_hat_test_tfb = tfb.predict(x_test, num_samples=50)
print(mean_squared_error(y_test, y_hat_test_tfb))

Train
1170.4305421255933
1242.875211415677
Test
3571.737176190544
6231.042573464837


# Embeddable NGBoost - N(mu, sigma) prior

In [6]:
flow = ContinuousNormalizingFlow(build_model(
    input_dim=1,
    hidden_dims=(80, 40),
    context_dim=100,
    conditional=True,
))

tree = EmbeddableNGBoost2()

tfb = TreeFlowBoost(flow_model=flow, tree_model = tree, embedding_size=100)

In [7]:
tfb.fit(x, y, n_epochs=200)

[iter 0] loss=5.7628 val_loss=0.0000 scale=1.0000 norm=65.7709
[iter 100] loss=5.4201 val_loss=0.0000 scale=1.0000 norm=46.1239
[iter 200] loss=5.1934 val_loss=0.0000 scale=2.0000 norm=75.8693
[iter 300] loss=5.0698 val_loss=0.0000 scale=2.0000 norm=69.4080
[iter 400] loss=4.9891 val_loss=0.0000 scale=1.0000 norm=32.9669


19.550642013549805: 100%|██████████| 200/200 [02:12<00:00,  1.51it/s]


TreeFlowBoost(embedding_size=100,
              flow_model=<tfboost.flow.ContinuousNormalizingFlow object at 0x7f4f8ff13dd0>,
              tree_model=<tfboost.tree.engboost.EmbeddableNGBoost2 object at 0x7f4f8ff13350>)

In [8]:
print("Train")
y_hat_train_tree = tfb.tree_model.predict(x_train)
print(mean_squared_error(y_train, y_hat_train_tree))

y_hat_train_tfb = tfb.predict(x_train, num_samples=50)
print(mean_squared_error(y_train, y_hat_train_tfb))

print("Test")
y_hat_test_tree = tfb.tree_model.predict(x_test)
print(mean_squared_error(y_test, y_hat_test_tree))

y_hat_test_tfb = tfb.predict(x_test, num_samples=50)
print(mean_squared_error(y_test, y_hat_test_tfb))

Train
1496.8357786709573
23312660619.641136
Test
1598.5462321733182
23006777951.940598


# Embeddable NGBoost - Decision Path

In [9]:
flow = ContinuousNormalizingFlow(build_model(
    input_dim=1,
    hidden_dims=(80, 40),
    context_dim=100,
    conditional=True,
))

tree = EmbeddableNGBoostDecisionPath()

tfb = TreeFlowBoost(flow_model=flow, tree_model = tree, embedding_size=100)

In [10]:
tfb.fit(x, y, n_epochs=200)

[iter 0] loss=5.7628 val_loss=0.0000 scale=1.0000 norm=65.7709
[iter 100] loss=5.4201 val_loss=0.0000 scale=1.0000 norm=46.1239
[iter 200] loss=5.1934 val_loss=0.0000 scale=2.0000 norm=75.8693
[iter 300] loss=5.0698 val_loss=0.0000 scale=2.0000 norm=69.4080
[iter 400] loss=4.9891 val_loss=0.0000 scale=1.0000 norm=32.9669


5.684892654418945: 100%|██████████| 200/200 [01:12<00:00,  2.77it/s] 


TreeFlowBoost(embedding_size=100,
              flow_model=<tfboost.flow.ContinuousNormalizingFlow object at 0x7f4f8edde090>,
              tree_model=<tfboost.tree.engboost.EmbeddableNGBoostDecisionPath object at 0x7f4f8ed83bd0>)

In [11]:
print("Train")
y_hat_train_tree = tfb.tree_model.predict(x_train)
print(mean_squared_error(y_train, y_hat_train_tree))

y_hat_train_tfb = tfb.predict(x_train, num_samples=50)
print(mean_squared_error(y_train, y_hat_train_tfb))

print("Test")
y_hat_test_tree = tfb.tree_model.predict(x_test)
print(mean_squared_error(y_test, y_hat_test_tree))

y_hat_test_tfb = tfb.predict(x_test, num_samples=50)
print(mean_squared_error(y_test, y_hat_test_tfb))

Train
1496.8357786709573
5964.25567875597
Test
1598.546232173318
5961.48335143899
