In [1]:
from typing import Tuple, List, Union, Any, Optional, Dict, Literal, Callable, Type
import abc

from tqdm import tqdm
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim
import torch.utils.data
from torch import Tensor
from sklearn.linear_model import RidgeClassifierCV
import xgboost as xgb

from models.ridge_ALOOCV import fit_ridge_ALOOCV
from models.sandwiched_least_squares import sandwiched_LS_dense, sandwiched_LS_diag, sandwiched_LS_scalar

In [2]:
# Make regression data X, y
N = 1000
N_test = 1000
D = 10
d = 3
X = torch.randn(N, D)
X_test = torch.randn(N_test, D)
w_true = torch.randn(D, d)
y = (X @ w_true)**2 + torch.randn(N, d) * 0.1  # Adding some noise
y_test = (X_test @ w_true)**2 + torch.randn(N_test, d) * 0.1  # Adding some noise

In [3]:
from models.models import RidgeCVModule

#dense      
model = RidgeCVModule(
        lower_alpha=1e-6,
        upper_alpha=1e6,
        n_alphas=10,
     )

results = []
for i in range(10):
    model.fit(X, y)
    out = model(X)
    out_test = model(X_test)
    rmse = torch.sqrt(F.mse_loss(out, y))
    rmse_test = torch.sqrt(F.mse_loss(out_test, y_test))
    results.append(torch.tensor([rmse, rmse_test]))
results = torch.stack(results)
print("train rmse", results[:, 0].mean(), "std", results[:, 0].std())
print("test rmse", results[:, 1].mean(), "std", results[:, 1].std())
print("train", results[:, 0])
print("test", results[:, 1])

train rmse tensor(8.6430) std tensor(0.)
test rmse tensor(9.0828) std tensor(0.)
train tensor([8.6430, 8.6430, 8.6430, 8.6430, 8.6430, 8.6430, 8.6430, 8.6430, 8.6430,
        8.6430])
test tensor([9.0828, 9.0828, 9.0828, 9.0828, 9.0828, 9.0828, 9.0828, 9.0828, 9.0828,
        9.0828])


In [4]:
from models.models import GreedyRandFeatBoostRegression

#dense      
model = GreedyRandFeatBoostRegression(
     hidden_dim=128, 
     bottleneck_dim=128, 
     out_dim=d, 
     n_layers=5, 
     l2_reg=0.1, 
     feature_type="SWIM", 
     upscale="dense", 
     sandwich_solver="dense"
     )

results = []
for i in range(10):
    model.fit(X, y)
    out = model(X)
    out_test = model(X_test)
    rmse = torch.sqrt(F.mse_loss(out, y))
    rmse_test = torch.sqrt(F.mse_loss(out_test, y_test))
    results.append(torch.tensor([rmse, rmse_test]))
results = torch.stack(results)
print("train rmse", results[:, 0].mean(), "std", results[:, 0].std())
print("test rmse", results[:, 1].mean(), "std", results[:, 1].std())
print("train", results[:, 0])
print("test", results[:, 1])

train rmse tensor(2.3312) std tensor(0.5462)
test rmse tensor(3.3370) std tensor(0.6041)
train tensor([2.5459, 1.9770, 2.1338, 3.6567, 1.9788, 2.5264, 2.1827, 1.9025, 2.6022,
        1.8058])
test tensor([3.7197, 2.8087, 2.9656, 4.6065, 2.9207, 3.3118, 3.1447, 3.0470, 4.0548,
        2.7906])


In [5]:
#diag
model = GreedyRandFeatBoostRegression(
     hidden_dim=512, 
     bottleneck_dim=512, 
     out_dim=d, 
     n_layers=3, 
     l2_reg=0.1, 
     feature_type="SWIM", 
     upscale="dense", 
     sandwich_solver="diag"
     )

results = []
for i in range(10):
    model.fit(X, y)
    out = model(X)
    out_test = model(X_test)
    rmse = torch.sqrt(F.mse_loss(out, y))
    rmse_test = torch.sqrt(F.mse_loss(out_test, y_test))
    results.append(torch.tensor([rmse, rmse_test]))
results = torch.stack(results)
print("train rmse", results[:, 0].mean(), "std", results[:, 0].std())
print("test rmse", results[:, 1].mean(), "std", results[:, 1].std())
print("train", results[:, 0])
print("test", results[:, 1])

train rmse tensor(1.8218) std tensor(0.5824)
test rmse tensor(2.4492) std tensor(0.5188)
train tensor([1.3506, 1.9186, 1.5029, 1.6868, 1.7180, 1.5808, 1.7199, 3.3880, 1.9279,
        1.4242])
test tensor([1.9153, 2.5489, 2.1412, 2.3750, 2.3784, 2.3791, 2.3258, 3.8210, 2.5010,
        2.1065])


In [6]:
#scalar
model = GreedyRandFeatBoostRegression(
     hidden_dim=512, 
     bottleneck_dim=512, 
     out_dim=d, 
     n_layers=3, 
     l2_reg=0.0001, 
     feature_type="SWIM", 
     upscale="dense", 
     sandwich_solver="scalar"
     )

results = []
for i in range(10):
    model.fit(X, y)
    out = model(X)
    out_test = model(X_test)
    rmse = torch.sqrt(F.mse_loss(out, y))
    rmse_test = torch.sqrt(F.mse_loss(out_test, y_test))
    results.append(torch.tensor([rmse, rmse_test]))
results = torch.stack(results)
print("train rmse", results[:, 0].mean(), "std", results[:, 0].std())
print("test rmse", results[:, 1].mean(), "std", results[:, 1].std())
print("train", results[:, 0])
print("test", results[:, 1])

train rmse tensor(2.3644) std tensor(2.2127)
test rmse tensor(3.0741) std tensor(2.2342)
train tensor([2.1745, 0.8754, 5.7361, 0.6286, 6.2912, 0.9110, 0.6166, 1.4423, 4.2110,
        0.7568])
test tensor([3.1005, 1.5892, 6.5635, 1.3099, 6.9387, 1.5568, 1.3191, 2.1315, 4.8798,
        1.3522])


In [7]:
from models.models import GradientRandFeatBoostRegression
        
model = GradientRandFeatBoostRegression(
     hidden_dim=32, 
     bottleneck_dim=512, 
     out_dim=d, 
     n_layers=5, 
     feature_type="SWIM", 
     upscale="dense", 
     )

results = []
for i in range(10):
    model.fit(X, y)
    out = model(X)
    out_test = model(X_test)
    rmse = torch.sqrt(F.mse_loss(out, y))
    rmse_test = torch.sqrt(F.mse_loss(out_test, y_test))
    results.append(torch.tensor([rmse, rmse_test]))
results = torch.stack(results)
print("train rmse", results[:, 0].mean(), "std", results[:, 0].std())
print("test rmse", results[:, 1].mean(), "std", results[:, 1].std())
print("train", results[:, 0])
print("test", results[:, 1])

train rmse tensor(1.6803) std tensor(0.4142)
test rmse tensor(2.7276) std tensor(0.4814)
train tensor([1.8557, 1.3188, 1.9709, 2.5846, 1.5602, 1.6036, 1.9472, 1.3233, 1.3398,
        1.2990])
test tensor([2.8761, 2.1755, 2.8845, 3.6712, 2.7031, 2.6914, 3.0732, 2.1563, 2.9168,
        2.1276])


# End2End

In [8]:
from models.models import End2EndMLPResNet

model = End2EndMLPResNet(
        in_dim=D,
        hidden_dim=128,
        bottleneck_dim=32,
        out_dim=d,
        n_blocks=3,
        loss = "mse",
        lr = 0.1,
        n_epochs = 30,
        end_lr_factor= 0.1,
        weight_decay = 0.001,
        batch_size = 64,
        )

results = []
for i in range(5):
    model.fit(X, y)
    out = model(X)
    out_test = model(X_test)
    rmse = torch.sqrt(F.mse_loss(out, y))
    rmse_test = torch.sqrt(F.mse_loss(out_test, y_test))
    results.append(torch.tensor([rmse, rmse_test]))
results = torch.stack(results)
print("train rmse", results[:, 0].mean(), "std", results[:, 0].std())
print("test rmse", results[:, 1].mean(), "std", results[:, 1].std())
print("train", results[:, 0])
print("test", results[:, 1])

100%|██████████| 30/30 [00:02<00:00, 14.57it/s]
100%|██████████| 30/30 [00:02<00:00, 14.31it/s]
100%|██████████| 30/30 [00:02<00:00, 13.12it/s]
100%|██████████| 30/30 [00:02<00:00, 13.97it/s]
100%|██████████| 30/30 [00:02<00:00, 14.81it/s]

train rmse tensor(1.4970) std tensor(1.2154)
test rmse tensor(2.3131) std tensor(0.9893)
train tensor([0.5836, 2.8102, 0.6247, 2.8462, 0.6202])
test tensor([1.4892, 3.4227, 1.7266, 3.3623, 1.5646])





# Next model f(x_t, x_0)

In [9]:
from typing import Tuple, List, Union, Any, Optional, Dict, Literal, Callable, Type
import abc

from tqdm import tqdm
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim
import torch.utils.data
from torch import Tensor

from models.ridge_ALOOCV import fit_ridge_ALOOCV
from models.sandwiched_least_squares import sandwiched_LS_dense, sandwiched_LS_diag, sandwiched_LS_scalar
from models.models import FittableModule, create_layer


class GradientRandFeatBoostRegression_fxtx0(FittableModule):
    def __init__(self, 
                 hidden_dim: int = 128,
                 bottleneck_dim: int = 128,
                 out_dim: int = 1,
                 n_layers: int = 5,
                 activation: nn.Module = nn.Tanh(),
                #  l2_reg: float = 0.01,   #TODO ALOOCV or fixed l2_reg
                 feature_type = "SWIM", # "dense", identity
                 boost_lr: float = 1.0,
                 upscale: Optional[str] = "dense",
                 concat_phi_t_x0: bool = False,
                 ):
        super(GradientRandFeatBoostRegression_fxtx0, self).__init__()
        self.hidden_dim = hidden_dim
        self.bottleneck_dim = bottleneck_dim
        self.out_dim = out_dim
        self.n_layers = n_layers
        self.activation = activation
        # self.l2_reg = l2_reg
        self.feature_type = feature_type
        self.boost_lr = boost_lr
        self.upscale = upscale
        self.concat_phi_t_x0 = concat_phi_t_x0


    def fit(self, X: Tensor, y: Tensor):
        in_dim = X.shape[1]
        with torch.no_grad():
            X0 = X
            #optional upscale
            if self.upscale == "dense":
                self.upscale_fun = create_layer(self.upscale, X.shape[1], self.hidden_dim, None)
                X = self.upscale_fun.fit_transform(X, y)
            elif self.upscale == "SWIM":
                self.upscale_fun = create_layer(self.upscale, X.shape[1], self.hidden_dim, self.activation)
                X = self.upscale_fun.fit_transform(X, y)

            # Create regressor W_0
            self.W, self.b, _ = fit_ridge_ALOOCV(X, y)
            self.layers = []
            self.deltas = []

            # Layerwise boosting
            N = X.size(0)
            for t in range(self.n_layers):
                Xt0 = X
                #Step 0: do we want f(x_t) or f(x_t, x_0)?
                if self.concat_phi_t_x0:
                    X = torch.cat([X, X0], dim=1) 
                    X = (X-X.mean(dim=0, keepdim=True)) / torch.std(X, dim=0, keepdim=True) #TODO keep normalization?
                else:
                    in_dim = 0 # hack

                # Step 1: Create random feature layer
                layer = create_layer(self.feature_type, self.hidden_dim+in_dim, self.bottleneck_dim, self.activation)
                F = layer.fit_transform(X, y)

                # Step 2: Obtain activation gradient and learn Delta
                # X shape (N, D) --- ResNet neurons
                # F shape (N, p) --- random features
                # y shape (N, d) --- target
                # W shape (D, d) --- top level classifier
                # G shape (N, D) --- gradient of neurons
                # r shape (N, d) --- residual at currect boosting iteration

                r = y - Xt0 @ self.W - self.b
                G = r @ self.W.T
                
                # fit to negative gradient (finding functional direction)
                Delta, Delta_b, _ = fit_ridge_ALOOCV(F, G)
                Ghat = F @ Delta + Delta_b

                # Line search closed form risk minimization of R(W_t, Phi_{t+1})
                linesearch = sandwiched_LS_scalar(r, self.W, Ghat, 0.00001)


                # Step 3: Learn top level classifier
                X = Xt0 + self.boost_lr * linesearch * Ghat
                self.W, self.b, _ = fit_ridge_ALOOCV(X, y)

                #update Delta scale
                Delta = Delta * linesearch
                Delta_b = Delta_b * linesearch

                # store
                self.layers.append(layer)
                self.deltas.append((Delta, Delta_b))

            return X @ self.W + self.b
        

    def forward(self, X: Tensor) -> Tensor:
        with torch.no_grad():
            X0 = X
            if self.upscale is not None:
                X = self.upscale_fun(X)
            for layer, (Delta, Delta_b) in zip(self.layers, self.deltas):
                Xt0 = X
                if self.concat_phi_t_x0:
                    X = torch.cat([X, X0], dim=1) 
                    X = (X-X.mean(dim=0, keepdim=True)) / torch.std(X, dim=0, keepdim=True) #TODO keep normalization?
                X = Xt0 + self.boost_lr * (layer(X) @ Delta + Delta_b)
            return X @ self.W + self.b
        

model = GradientRandFeatBoostRegression_fxtx0(
     hidden_dim=32, 
     bottleneck_dim=1000, 
     out_dim=d, 
     n_layers=10, 
     feature_type="SWIM", 
     upscale="dense",
     concat_phi_t_x0=True,
     boost_lr=1.0
     )

results = []
for i in range(5):
    model.fit(X, y)
    out = model(X)
    out_test = model(X_test)
    rmse = torch.sqrt(F.mse_loss(out, y))
    rmse_test = torch.sqrt(F.mse_loss(out_test, y_test))
    results.append(torch.tensor([rmse, rmse_test]))
results = torch.stack(results)
print("train rmse", results[:, 0].mean(), "std", results[:, 0].std())
print("test rmse", results[:, 1].mean(), "std", results[:, 1].std())
print("train", results[:, 0])
print("test", results[:, 1])

train rmse tensor(0.5150) std tensor(0.0308)
test rmse tensor(1.6631) std tensor(0.0689)
train tensor([0.4694, 0.5369, 0.4967, 0.5335, 0.5384])
test tensor([1.6296, 1.7383, 1.5621, 1.6821, 1.7036])


# another option: concat[f(x_t), h(x_0)]. Do this next

In [10]:
# TODO batch normalization??? Wolfe-Franke???

In [11]:
from typing import Tuple, List, Union, Any, Optional, Dict, Literal, Callable, Type
import abc

from tqdm import tqdm
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim
import torch.utils.data
from torch import Tensor
from sklearn.linear_model import RidgeClassifierCV
import xgboost as xgb

from models.ridge_ALOOCV import fit_ridge_ALOOCV
from models.sandwiched_least_squares import sandwiched_LS_dense, sandwiched_LS_diag, sandwiched_LS_scalar

X_testing = torch.randn(100, 128)
test = nn.Linear(128, 0)

out = test(X_testing)



In [None]:
from models.models import GradientRandFeatBoostReg

model = GradientRandFeatBoostReg(
     hidden_dim=32, 
     randfeat_xt_dim = 128,
     randfeat_x0_dim = 512,
     out_dim=d, 
     n_layers=5, 
     feature_type="dense", 
     upscale="dense",
     boost_lr=1.0
     )

results = []
for i in range(9):
    model.fit(X, y)
    out = model(X)
    out_test = model(X_test)
    rmse = torch.sqrt(F.mse_loss(out, y))
    rmse_test = torch.sqrt(F.mse_loss(out_test, y_test))
    results.append(torch.tensor([rmse, rmse_test]))
results = torch.stack(results)
print("train rmse", results[:, 0].mean(), "std", results[:, 0].std())
print("test rmse", results[:, 1].mean(), "std", results[:, 1].std())
print("train", results[:, 0])
print("test", results[:, 1])

torch.Size([1000, 10])
torch.Size([1000, 3])
train rmse tensor(0.9708) std tensor(0.3264)
test rmse tensor(3.2590) std tensor(0.1887)
train tensor([0.8672, 0.7679, 0.6892, 0.8056, 0.8406, 1.1227, 1.7298, 1.1538, 0.7605])
test tensor([3.3239, 3.1751, 3.0082, 3.1010, 3.1505, 3.2738, 3.5422, 3.5585, 3.1980])


# TODO make the GreedyRandFeatReg also use both xtx0. (here it almost makes more sense to concat f([x0,xt]))