In [1]:
from typing import Tuple, List, Union, Any, Optional, Dict, Literal, Callable, Type
import abc

from tqdm import tqdm
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim
import torch.utils.data
from torch import Tensor
from sklearn.linear_model import RidgeClassifierCV
import xgboost as xgb

from models.ridge_ALOOCV import fit_ridge_ALOOCV
from models.sandwiched_least_squares import sandwiched_LS_dense, sandwiched_LS_diag, sandwiched_LS_scalar

In [2]:
# Make regression data X, y
N = 1000
N_test = 1000
D = 10
d = 3
X = torch.randn(N, D)
X_test = torch.randn(N_test, D)
w_true = torch.randn(D, d)
y = (X @ w_true)**2 + torch.randn(N, d) * 0.1  # Adding some noise
y_test = (X_test @ w_true)**2 + torch.randn(N_test, d) * 0.1  # Adding some noise

In [31]:
from models.models import RidgeCVModule

#dense      
model = RidgeCVModule(
        lower_alpha=1e-6,
        upper_alpha=1e6,
        n_alphas=10,
     )

results = []
for i in range(10):
    model.fit(X, y)
    out = model(X)
    out_test = model(X_test)
    rmse = torch.sqrt(F.mse_loss(out, y))
    rmse_test = torch.sqrt(F.mse_loss(out_test, y_test))
    results.append(torch.tensor([rmse, rmse_test]))
results = torch.stack(results)
print("train rmse", results[:, 0].mean(), "std", results[:, 0].std())
print("test rmse", results[:, 1].mean(), "std", results[:, 1].std())
print("train", results[:, 0])
print("test", results[:, 1])

train rmse tensor(17.5370) std tensor(0.)
test rmse tensor(16.4296) std tensor(0.)
train tensor([17.5370, 17.5370, 17.5370, 17.5370, 17.5370, 17.5370, 17.5370, 17.5370,
        17.5370, 17.5370])
test tensor([16.4296, 16.4296, 16.4296, 16.4296, 16.4296, 16.4296, 16.4296, 16.4296,
        16.4296, 16.4296])


In [None]:
from models.models import GreedyRandFeatBoostRegression

#dense      
model = GreedyRandFeatBoostRegression(
     hidden_dim=128, 
     bottleneck_dim=128, 
     out_dim=d, 
     n_layers=5, 
     l2_reg=0.1, 
     feature_type="SWIM", 
     upscale="dense", 
     sandwich_solver="dense"
     )

results = []
for i in range(10):
    model.fit(X, y)
    out = model(X)
    out_test = model(X_test)
    rmse = torch.sqrt(F.mse_loss(out, y))
    rmse_test = torch.sqrt(F.mse_loss(out_test, y_test))
    results.append(torch.tensor([rmse, rmse_test]))
results = torch.stack(results)
print("train rmse", results[:, 0].mean(), "std", results[:, 0].std())
print("test rmse", results[:, 1].mean(), "std", results[:, 1].std())
print("train", results[:, 0])
print("test", results[:, 1])

train rmse tensor(4.8036) std tensor(0.9972)
test rmse tensor(6.3538) std tensor(1.4504)
train tensor([4.2472, 5.7213, 3.8636, 5.3439, 6.8142, 4.6236, 5.3510, 4.5071, 3.5925,
        3.9714])
test tensor([5.6311, 7.1638, 4.7403, 6.8273, 9.6347, 6.3930, 6.7078, 6.2518, 4.4417,
        5.7469])


In [None]:
#diag
model = GreedyRandFeatBoostRegression(
     hidden_dim=512, 
     bottleneck_dim=512, 
     out_dim=d, 
     n_layers=3, 
     l2_reg=0.1, 
     feature_type="SWIM", 
     upscale="dense", 
     sandwich_solver="diag"
     )

results = []
for i in range(10):
    model.fit(X, y)
    out = model(X)
    out_test = model(X_test)
    rmse = torch.sqrt(F.mse_loss(out, y))
    rmse_test = torch.sqrt(F.mse_loss(out_test, y_test))
    results.append(torch.tensor([rmse, rmse_test]))
results = torch.stack(results)
print("train rmse", results[:, 0].mean(), "std", results[:, 0].std())
print("test rmse", results[:, 1].mean(), "std", results[:, 1].std())
print("train", results[:, 0])
print("test", results[:, 1])

train rmse tensor(2.7414) std tensor(0.7641)
test rmse tensor(3.6085) std tensor(0.7143)
train tensor([2.5947, 2.1840, 3.1344, 2.2468, 2.1898, 3.9471, 2.1478, 1.7762, 3.4990,
        3.6942])
test tensor([3.5613, 2.9309, 4.1656, 3.0828, 3.1777, 4.7995, 2.9935, 2.8063, 4.3402,
        4.2271])


In [None]:
#scalar
model = GreedyRandFeatBoostRegression(
     hidden_dim=512, 
     bottleneck_dim=512, 
     out_dim=d, 
     n_layers=3, 
     l2_reg=0.0001, 
     feature_type="SWIM", 
     upscale="dense", 
     sandwich_solver="scalar"
     )

results = []
for i in range(10):
    model.fit(X, y)
    out = model(X)
    out_test = model(X_test)
    rmse = torch.sqrt(F.mse_loss(out, y))
    rmse_test = torch.sqrt(F.mse_loss(out_test, y_test))
    results.append(torch.tensor([rmse, rmse_test]))
results = torch.stack(results)
print("train rmse", results[:, 0].mean(), "std", results[:, 0].std())
print("test rmse", results[:, 1].mean(), "std", results[:, 1].std())
print("train", results[:, 0])
print("test", results[:, 1])

train rmse tensor(3.1384) std tensor(3.2530)
test rmse tensor(4.0387) std tensor(2.9482)
train tensor([ 1.2765,  1.7262,  1.1376, 11.4514,  5.5821,  1.4210,  1.2141,  1.1346,
         3.2988,  3.1421])
test tensor([ 2.1791,  2.8977,  2.2073, 11.5325,  5.9572,  2.5953,  2.1592,  1.9598,
         4.4281,  4.4707])


In [None]:
from models.models import GradientRandFeatBoostRegression
        
model = GradientRandFeatBoostRegression(
     hidden_dim=32, 
     bottleneck_dim=512, 
     out_dim=d, 
     n_layers=5, 
     feature_type="SWIM", 
     upscale="dense", 
     )

results = []
for i in range(10):
    model.fit(X, y)
    out = model(X)
    out_test = model(X_test)
    rmse = torch.sqrt(F.mse_loss(out, y))
    rmse_test = torch.sqrt(F.mse_loss(out_test, y_test))
    results.append(torch.tensor([rmse, rmse_test]))
results = torch.stack(results)
print("train rmse", results[:, 0].mean(), "std", results[:, 0].std())
print("test rmse", results[:, 1].mean(), "std", results[:, 1].std())
print("train", results[:, 0])
print("test", results[:, 1])

train rmse tensor(3.5296) std tensor(0.4859)
test rmse tensor(5.3978) std tensor(0.6923)
train tensor([4.6294, 3.4208, 3.5516, 2.9593, 3.5966, 3.9041, 3.6188, 3.0283, 3.4702,
        3.1173])
test tensor([6.8492, 5.0806, 5.6701, 4.6747, 5.6699, 5.6498, 5.9215, 4.5661, 4.9716,
        4.9247])


# End2End

In [None]:
from models.models import End2EndMLPResNet

model = End2EndMLPResNet(
        in_dim=D,
        hidden_dim=128,
        bottleneck_dim=32,
        out_dim=d,
        n_blocks=3,
        loss = "mse",
        lr = 0.1,
        n_epochs = 30,
        end_lr_factor= 0.1,
        weight_decay = 0.001,
        batch_size = 64,
        )

results = []
for i in range(5):
    model.fit(X, y)
    out = model(X)
    out_test = model(X_test)
    rmse = torch.sqrt(F.mse_loss(out, y))
    rmse_test = torch.sqrt(F.mse_loss(out_test, y_test))
    results.append(torch.tensor([rmse, rmse_test]))
results = torch.stack(results)
print("train rmse", results[:, 0].mean(), "std", results[:, 0].std())
print("test rmse", results[:, 1].mean(), "std", results[:, 1].std())
print("train", results[:, 0])
print("test", results[:, 1])

100%|██████████| 30/30 [00:02<00:00, 11.93it/s]
100%|██████████| 30/30 [00:02<00:00, 12.71it/s]
100%|██████████| 30/30 [00:02<00:00, 12.49it/s]
100%|██████████| 30/30 [00:02<00:00, 12.47it/s]
100%|██████████| 30/30 [00:02<00:00, 11.87it/s]

train rmse tensor(2.6174) std tensor(1.8957)
test rmse tensor(2.8031) std tensor(1.5548)
train tensor([1.1134, 4.5657, 1.3029, 4.8146, 1.2905])
test tensor([1.6028, 3.8519, 1.7320, 5.0321, 1.7966])





# Next model f(x_t, x_0)

In [47]:
from typing import Tuple, List, Union, Any, Optional, Dict, Literal, Callable, Type
import abc

from tqdm import tqdm
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim
import torch.utils.data
from torch import Tensor

from models.ridge_ALOOCV import fit_ridge_ALOOCV
from models.sandwiched_least_squares import sandwiched_LS_dense, sandwiched_LS_diag, sandwiched_LS_scalar
from models.models import FittableModule, create_layer


class GradientRandFeatBoostRegression_fxtx0(FittableModule):
    def __init__(self, 
                 hidden_dim: int = 128,
                 bottleneck_dim: int = 128,
                 out_dim: int = 1,
                 n_layers: int = 5,
                 activation: nn.Module = nn.Tanh(),
                #  l2_reg: float = 0.01,   #TODO ALOOCV or fixed l2_reg
                 feature_type = "SWIM", # "dense", identity
                 boost_lr: float = 1.0,
                 upscale: Optional[str] = "dense",
                 concat_phi_t_x0: bool = False,
                 ):
        super(GradientRandFeatBoostRegression_fxtx0, self).__init__()
        self.hidden_dim = hidden_dim
        self.bottleneck_dim = bottleneck_dim
        self.out_dim = out_dim
        self.n_layers = n_layers
        self.activation = activation
        # self.l2_reg = l2_reg
        self.feature_type = feature_type
        self.boost_lr = boost_lr
        self.upscale = upscale
        self.concat_phi_t_x0 = concat_phi_t_x0


    def fit(self, X: Tensor, y: Tensor):
        in_dim = X.shape[1]
        with torch.no_grad():
            X0 = X
            #optional upscale
            if self.upscale == "dense":
                self.upscale_fun = create_layer(self.upscale, X.shape[1], self.hidden_dim, None)
                X = self.upscale_fun.fit_transform(X, y)
            elif self.upscale == "SWIM":
                self.upscale_fun = create_layer(self.upscale, X.shape[1], self.hidden_dim, self.activation)
                X = self.upscale_fun.fit_transform(X, y)

            # Create regressor W_0
            self.W, self.b, _ = fit_ridge_ALOOCV(X, y)
            self.layers = []
            self.deltas = []

            # Layerwise boosting
            N = X.size(0)
            for t in range(self.n_layers):
                Xt0 = X
                #Step 0: do we want f(x_t) or f(x_t, x_0)?
                if self.concat_phi_t_x0:
                    X = torch.cat([X, X0], dim=1) 
                    X = (X-X.mean(dim=0, keepdim=True)) / torch.std(X, dim=0, keepdim=True) #TODO keep normalization?
                else:
                    in_dim = 0 # hack

                # Step 1: Create random feature layer
                layer = create_layer(self.feature_type, self.hidden_dim+in_dim, self.bottleneck_dim, self.activation)
                F = layer.fit_transform(X, y)

                # Step 2: Obtain activation gradient and learn Delta
                # X shape (N, D) --- ResNet neurons
                # F shape (N, p) --- random features
                # y shape (N, d) --- target
                # W shape (D, d) --- top level classifier
                # G shape (N, D) --- gradient of neurons
                # r shape (N, d) --- residual at currect boosting iteration

                r = y - Xt0 @ self.W - self.b
                G = r @ self.W.T
                
                # fit to negative gradient (finding functional direction)
                Delta, Delta_b, _ = fit_ridge_ALOOCV(F, G)
                Ghat = F @ Delta + Delta_b

                # Line search closed form risk minimization of R(W_t, Phi_{t+1})
                linesearch = sandwiched_LS_scalar(r, self.W, Ghat, 0.00001)


                # Step 3: Learn top level classifier
                X = Xt0 + self.boost_lr * linesearch * Ghat
                self.W, self.b, _ = fit_ridge_ALOOCV(X, y)

                #update Delta scale
                Delta = Delta * linesearch
                Delta_b = Delta_b * linesearch

                # store
                self.layers.append(layer)
                self.deltas.append((Delta, Delta_b))

            return X @ self.W + self.b
        

    def forward(self, X: Tensor) -> Tensor:
        with torch.no_grad():
            X0 = X
            if self.upscale is not None:
                X = self.upscale_fun(X)
            for layer, (Delta, Delta_b) in zip(self.layers, self.deltas):
                Xt0 = X
                if self.concat_phi_t_x0:
                    X = torch.cat([X, X0], dim=1) 
                    X = (X-X.mean(dim=0, keepdim=True)) / torch.std(X, dim=0, keepdim=True) #TODO keep normalization?
                X = Xt0 + self.boost_lr * (layer(X) @ Delta + Delta_b)
            return X @ self.W + self.b
        

model = GradientRandFeatBoostRegression_fxtx0(
     hidden_dim=32, 
     bottleneck_dim=1000, 
     out_dim=d, 
     n_layers=10, 
     feature_type="SWIM", 
     upscale="dense",
     concat_phi_t_x0=True,
     boost_lr=1.0
     )

results = []
for i in range(5):
    model.fit(X, y)
    out = model(X)
    out_test = model(X_test)
    rmse = torch.sqrt(F.mse_loss(out, y))
    rmse_test = torch.sqrt(F.mse_loss(out_test, y_test))
    results.append(torch.tensor([rmse, rmse_test]))
results = torch.stack(results)
print("train rmse", results[:, 0].mean(), "std", results[:, 0].std())
print("test rmse", results[:, 1].mean(), "std", results[:, 1].std())
print("train", results[:, 0])
print("test", results[:, 1])

train rmse tensor(1.3204) std tensor(0.4021)
test rmse tensor(2.7518) std tensor(0.5856)
train tensor([1.7848, 1.7316, 1.0005, 1.0945, 0.9907])
test tensor([3.3208, 3.4563, 2.3362, 2.2485, 2.3973])


# another option: concat[f(x_t), h(x_0)]. Do this next

In [9]:
# TODO batch normalization??? Wolfe-Franke???

In [119]:
class CONCATFEATGradientRandFeatBoostRegression_fxtx0(FittableModule):
    def __init__(self, 
                 hidden_dim: int = 128,
                 randfeat_xt_dim: int = 128,
                 randfeat_x0_dim: int = 128,
                 out_dim: int = 1,
                 n_layers: int = 5,
                 activation: nn.Module = nn.Tanh(),
                 #  l2_reg: float = 0.01,   #TODO ALOOCV or fixed l2_reg
                 feature_type = "SWIM", # "dense", identity
                 boost_lr: float = 1.0,
                 upscale: Optional[str] = "dense",
                 ):
        super(CONCATFEATGradientRandFeatBoostRegression_fxtx0, self).__init__()
        self.hidden_dim = hidden_dim
        self.randfeat_xt_dim = randfeat_xt_dim
        self.randfeat_x0_dim = randfeat_x0_dim
        self.out_dim = out_dim
        self.n_layers = n_layers
        self.activation = activation
        # self.l2_reg = l2_reg
        self.feature_type = feature_type
        self.boost_lr = boost_lr
        self.upscale = upscale


    def fit(self, X: Tensor, y: Tensor):
        with torch.no_grad():
            X0 = X
            #optional upscale
            if self.upscale == "dense":
                self.upscale_fun = create_layer(self.upscale, X.shape[1], self.hidden_dim, None)
                X = self.upscale_fun.fit_transform(X, y)
            elif self.upscale == "SWIM":
                self.upscale_fun = create_layer(self.upscale, X.shape[1], self.hidden_dim, self.activation)
                X = self.upscale_fun.fit_transform(X, y)

            # Create regressor W_0
            W, b, _ = fit_ridge_ALOOCV(X, y)
            self.layers_fxt = []
            self.layers_fx0 = []
            self.deltas = []
            self.Ws = [W]
            self.bs = [b]

            # Layerwise boosting
            N = X.size(0)
            for t in range(self.n_layers):
                # X0 shape (N, raw_in_dim) --- raw input data
                # X  shape (N, D) --- ResNet neurons
                # F  shape (N, p) --- random features
                # y  shape (N, d) --- target data
                # W  shape (D, d) --- top level classifier
                # G  shape (N, D) --- gradient of neurons
                # r  shape (N, d) --- residual at currect boosting iteration
                # Delta shape (p, D) --- random feature weights

                # Step 1: Create random feature layer
                fxt_fun = create_layer(self.feature_type, self.hidden_dim, self.randfeat_xt_dim, self.activation)
                fx0_fun = create_layer(self.feature_type, X0.size(1), self.randfeat_x0_dim, self.activation)
                Fxt = fxt_fun.fit_transform(X, y)
                Fx0 = fx0_fun.fit_transform(X0, y)
                F = torch.cat([Fxt, Fx0], dim=1)

                # Step 2: Obtain activation gradient and learn Delta
                r = y - X @ W - b
                G = r @ W.T
                Gnorm = torch.norm(G)
                # fit to negative gradient (finding functional direction)
                Delta, Delta_b, _ = fit_ridge_ALOOCV(F, G)
                print("alpha", _)
                Ghat = (F @ Delta + Delta_b)
                # line search closed form risk minimization of R(W_t, Phi_{t+1})
                linesearch = sandwiched_LS_scalar(r, W, Ghat, 0.0001)
                print("linesearch", linesearch)


                # Step 3: Learn top level classifier
                X = X + self.boost_lr * linesearch * Ghat
                W, b, _ = fit_ridge_ALOOCV(X, y)
                # update Delta magnitude
                Delta = Delta * linesearch
                Delta_b = Delta_b * linesearch
                # store
                self.layers_fxt.append(fxt_fun)
                self.layers_fx0.append(fx0_fun)
                self.deltas.append((Delta, Delta_b))
                self.Ws.append(W)
                self.bs.append(b)

            return X @ W + b
        

    def forward(self, X: Tensor) -> Tensor:
        with torch.no_grad():
            X0 = X
            if self.upscale is not None:
                X = self.upscale_fun(X)
            for fxt_fun, fx0_fun, (Delta, Delta_b) in zip(self.layers_fxt, self.layers_fx0, self.deltas):
                features = torch.cat([fxt_fun(X), fx0_fun(X0)], dim=1)
                X = X + self.boost_lr * (features @ Delta + Delta_b)
            return X @ self.Ws[-1] + self.bs[-1]


model = CONCATFEATGradientRandFeatBoostRegression_fxtx0(
     hidden_dim=32, 
     randfeat_xt_dim = 32,
     randfeat_x0_dim = 1028,
     out_dim=d, 
     n_layers=20, 
     feature_type="SWIM", 
     upscale="dense",
     boost_lr=1.0
     )

results = []
for i in range(5):
    model.fit(X, y)
    out = model(X)
    out_test = model(X_test)
    rmse = torch.sqrt(F.mse_loss(out, y))
    rmse_test = torch.sqrt(F.mse_loss(out_test, y_test))
    results.append(torch.tensor([rmse, rmse_test]))
results = torch.stack(results)
print("train rmse", results[:, 0].mean(), "std", results[:, 0].std())
print("test rmse", results[:, 1].mean(), "std", results[:, 1].std())
print("train", results[:, 0])
print("test", results[:, 1])

alpha 0.009999999776482582
linesearch tensor(0.2441)
alpha 0.009999999776482582
linesearch tensor(3.3613e-10)
alpha 0.0010000000474974513
linesearch tensor(5.0127e-05)
alpha 10.0
linesearch tensor(6.8426e-08)
alpha 10.0
linesearch tensor(1.6636e-07)
alpha 10.0
linesearch tensor(1.4780e-06)
alpha 10.0
linesearch tensor(5.4926e-06)
alpha 10.0
linesearch tensor(1.7899e-07)
alpha 10.0
linesearch tensor(4.7869e-06)
alpha 10.0
linesearch tensor(6.2879e-06)
alpha 10.0
linesearch tensor(7.5734e-08)
alpha 10.0
linesearch tensor(3.7390e-06)
alpha 10.0
linesearch tensor(2.2127e-07)
alpha 10.0
linesearch tensor(1.1747e-07)
alpha 10.0
linesearch tensor(1.3316e-06)
alpha 10.0
linesearch tensor(7.7201e-06)
alpha 10.0
linesearch tensor(1.2717e-06)
alpha 10.0
linesearch tensor(3.9584e-07)
alpha 10.0
linesearch tensor(1.3475e-06)
alpha 10.0
linesearch tensor(1.0122e-05)
alpha 0.009999999776482582
linesearch tensor(0.2619)
alpha 0.10000000149011612
linesearch tensor(0.0004)
alpha 10.0
linesearch tensor(1

In [None]:
# train rmse tensor(0.9585) std tensor(0.1429)
# test rmse tensor(1.9235) std tensor(0.1405)
# train tensor([0.9936, 1.1145, 0.7242, 0.9691, 0.9910])
# test tensor([2.0114, 2.0840, 1.7119, 1.9109, 1.8996])