In [1]:
from typing import Tuple, List, Union, Any, Optional, Dict, Literal, Callable
import time
import collections
import os
import sys
sys.path.append(os.path.dirname(os.getcwd()))
sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))

from tqdm import tqdm
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor, tensor
import pandas as pd
import openml

#from aeon.regression.sklearn import RotationForestRegressor
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

from preprocessing.stream_transforms import normalize_mean_std_traindata, normalize_streams, augment_time, add_basepoint_zero
from utils.utils import print_name, print_shape
from models import ResNet, NeuralEulerODE, RidgeClassifierCVModule, E2EResNet, LogisticRegressionSGD, RandFeatBoost

np.set_printoptions(precision=3, threshold=5) # Print options
device = "cuda" #torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# MNIST

In [2]:
from torchvision import datasets, transforms

# Define a transform to normalize the data
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

# Download and load the training data
mnist_path = "/home/nikita/hdd/MNIST"
trainset = datasets.MNIST(mnist_path, download=True, train=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=len(trainset), shuffle=False)

# Download and load the test data
testset = datasets.MNIST(mnist_path, download=True, train=False, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=len(testset), shuffle=False)

# Flatten the data
X_train, y_train_cat = next(iter(trainloader))
X_train = X_train.view(len(trainset), -1).to(device)
X_test, y_test_cat = next(iter(testloader))
X_test = X_test.view(len(testset), -1).to(device)

# Convert train and test labels to one-hot encoding
y_train = F.one_hot(y_train_cat, num_classes=10).float().to(device)
y_test = F.one_hot(y_test_cat, num_classes=10).float().to(device)
y_train_cat = y_train_cat.to(device)
y_test_cat = y_test_cat.to(device)

# Normalize by mean and std
X_train, X_test = normalize_mean_std_traindata(X_train, X_test)
print(f"Train data shape: {X_train.shape}")
print(f"Train labels shape: {y_train.shape}")
print(f"Test data shape: {X_test.shape}")
print(f"Test labels shape: {y_test.shape}")

Train data shape: torch.Size([60000, 784])
Train labels shape: torch.Size([60000, 10])
Test data shape: torch.Size([10000, 784])
Test labels shape: torch.Size([10000, 10])


# Logistic Regression

In [8]:
from models import FittableModule, Dense, kaiming_normal_with_generator

class LogisticRegression(FittableModule):
    def __init__(self, 
                 generator: torch.Generator,
                 in_dim: int = 784,
                 out_dim: int = 10,
                 l2_reg: float = 1.0,
                 lr: float = 1.0,
                 ):
        super(LogisticRegression, self).__init__()
        self.generator = generator
        self.linear = nn.Linear(in_dim, out_dim)
        self.l2_reg = l2_reg
        self.lr = lr

        if out_dim > 1:
            self.loss = F.cross_entropy #this is with logits
        else:
            self.loss = F.binary_cross_entropy_with_logits


    def fit(self, 
            X: Tensor, 
            y: Tensor,
            init_W_b: Optional[Tuple[Tensor, Tensor]] = None,
            ) -> Tuple[Tensor, Tensor]:
        
        # No onehot encoding
        if y.dim() > 1:
            y_labels = torch.argmax(y, dim=1)
        else:
            y_labels = y

        # Put model on device
        device = X.device
        self.to(device)

        # Initialize weights and bias
        if init_W_b is not None:
            W, b = init_W_b
            self.linear.weight = W
            self.linear.bias = b
        else:
            kaiming_normal_with_generator(self.linear.weight, self.generator)
            nn.init.zeros_(self.linear.bias)
        
        with torch.enable_grad():
            # Optimize
            optimizer = torch.optim.LBFGS(self.linear.parameters(), lr=self.lr)
            def closure():
                optimizer.zero_grad()
                logits = self.linear(X)
                loss = self.loss(logits, y_labels)
                loss += self.l2_reg * torch.linalg.norm(self.linear.weight)**2
                loss.backward()
                print(loss)
                return loss
            optimizer.step(closure)
        return self(X), y

    def forward(self, X: Tensor) -> Tensor:
        return self.linear(X)


N = 100
D = 50
C = 10
gen = torch.Generator().manual_seed(42)
X = torch.randn(N, D, generator=gen)
y = torch.randint(0, C, size=(N,), generator=gen)
model = LogisticRegression(
        gen,
        in_dim = D,
        out_dim = C,
        l2_reg = 1.0,
    )
_, _ = model.fit(X, y)

tensor(24.9071, grad_fn=<AddBackward0>)
tensor(24.3590, grad_fn=<AddBackward0>)
tensor(2.1914, grad_fn=<AddBackward0>)
tensor(2.1779, grad_fn=<AddBackward0>)
tensor(2.1737, grad_fn=<AddBackward0>)
tensor(2.1485, grad_fn=<AddBackward0>)
tensor(2.1441, grad_fn=<AddBackward0>)
tensor(2.1407, grad_fn=<AddBackward0>)
tensor(2.1404, grad_fn=<AddBackward0>)
tensor(2.1402, grad_fn=<AddBackward0>)
tensor(2.1398, grad_fn=<AddBackward0>)
tensor(2.1396, grad_fn=<AddBackward0>)
tensor(2.1394, grad_fn=<AddBackward0>)
tensor(2.1394, grad_fn=<AddBackward0>)
tensor(2.1394, grad_fn=<AddBackward0>)
tensor(2.1394, grad_fn=<AddBackward0>)
tensor(2.1394, grad_fn=<AddBackward0>)
tensor(2.1394, grad_fn=<AddBackward0>)
tensor(2.1394, grad_fn=<AddBackward0>)
tensor(2.1394, grad_fn=<AddBackward0>)


# Gradient Random Feature Boosting

In [None]:
from models import FittableModule, create_layer
from ridge_ALOOCV import fit_ridge_ALOOCV
    

def fit_logistic_regression(
        X: Tensor, 
        y: Tensor, 
        l2_reg: float = 1,
        init_W_b: Optional[Tuple[Tensor, Tensor]] = None,
    ):
    """Uses L-BFGS to fit a logistic regression model to the data."""
    with torch.enable_grad():
        # Initialize weights
        if init_W_b is None:
            W = torch.randn(X.shape[1], y.shape[1], requires_grad=True, device=X.device)
            b = torch.randn(y.shape[1], requires_grad=True, device=X.device)
        else:
            W, b = init_W_b
        
        # Define loss function
        def loss_fn(W, b):
            logits = X @ W + b
            loss = F.cross_entropy_with_logits(logits, y)
            return loss
        
        # Optimize
        optimizer = torch.optim.LBFGS([W, b], lr=0.01)
        def closure():
            optimizer.zero_grad()
            loss = loss_fn(W, b)
            loss.backward()
            return loss
        optimizer.step(closure)
    
    return W, b




class GradientRandFeatBoost(FittableModule):
    def __init__(self, 
                 generator: torch.Generator, 
                 hidden_dim: int = 128, # TODO
                 bottleneck_dim: int = 128,
                 out_dim: int = 1,
                 n_layers: int = 5,
                 activation: nn.Module = nn.Tanh(),
                 l2_reg: float = 0.01,
                 feature_type = "SWIM", # "dense", identity
                 boost_lr: float = 1.0,
                 upscale: Optional[str] = "dense",
                 ):
        super(GradientRandFeatBoost, self).__init__()
        self.generator = generator
        self.hidden_dim = hidden_dim
        self.bottleneck_dim = bottleneck_dim
        self.out_dim = out_dim
        self.n_layers = n_layers
        self.activation = activation
        self.l2_reg = l2_reg
        self.feature_type = feature_type
        self.boost_lr = boost_lr
        self.upscale = upscale

        # save for now. for more memory efficient implementation, we can remove a lot of this
        self.W = []
        self.b = []
        self.alphas = []
        self.layers = []
        self.deltas = []


    def fit(self, X: Tensor, y: Tensor):
        # with torch.no_grad():

        #optional upscale
        if self.upscale == "dense":
            self.upscale = create_layer(self.generator, self.upscale, X.shape[1], self.hidden_dim, None)
            X, y = self.upscale.fit(X, y)
        elif self.upscale == "SWIM":
            self.upscale = create_layer(self.generator, self.upscale, X.shape[1], self.hidden_dim, self.activation)
            X, y = self.upscale.fit(X, y)

        # Create classifier W_0
        W, b = fit_logistic_regression(X, y, self.l2_reg, init_W_b=None)

        # Layerwise boosting
        for t in range(self.n_layers):
            # Step 1: Create random feature layer   
            layer = create_layer(self.generator, self.feature_type, self.hidden_dim, self.bottleneck_dim, self.activation)
            F, y = layer.fit(X, y)

            # Step 2: Obtain activation gradient
            # X shape (N, D) --- ResNet neurons
            # F shape (N, p) --- random features
            # y shape (N, d) --- target
            # r shape (N, D) --- residual at currect boosting iteration
            # W shape (D, d) --- top level classifier
            r = y - X @ W - b   # G = (y - X @ W - b) @ W.T TODO TODO TODO CLASSFICATION
            SW, U = torch.linalg.eigh(W @ W.T)
            SF, V = torch.linalg.eigh(F.T @ F)
            Delta = (U.T @ W @ r.T @ F @ V) / (N*self.l2_reg + SW[:, None]*SF[None, :])
            Delta = (U @ Delta @ V.T).T
            #TODO de-center F and r, and include an intercept. How to do this for my special equation?

            # Step 3: Learn top level classifier
            X = X + self.boost_lr * F @ Delta
            W, b, alpha = fit_ridge_ALOOCV(X, y)

            # store
            self.layers.append(layer)
            self.deltas.append(Delta)
            self.W.append(W)
            self.b.append(b)
            self.alphas.append(alpha)

        return X @ W + b, y


    def forward(self, X: Tensor) -> Tensor:
        with torch.no_grad():
            if self.upscale is not None:
                X = self.upscale(X)
            for layer, Delta in zip(self.layers, self.deltas):
                X = X + self.boost_lr * layer(X) @ Delta
            return X @ self.W[-1] + self.b[-1]
        


N = 100
D = 50
C = 10
bottleneck_dim = 70
gen = torch.Generator().manual_seed(42)
X = torch.randn(N, D, generator=gen)
y = torch.randint(0, C, size=(N,), generator=gen)
model = GradientRandFeatBoost(
        gen,
        hidden_dim = D,
        bottleneck_dim = bottleneck_dim,
        out_dim = C,
        n_layers = 1,
        upscale = "dense",
        feature_type = "dense",
    )
_, _ = model.fit(X, y)

# experiments

In [22]:
def run_allmodels_1dataset(
        generator: torch.Generator,
        X_train: Tensor,
        y_train: Tensor,
        X_test: Tensor,
        y_test: Tensor,
        ):
    
    D = X_train.shape[1]
    hidden_size = 512
    bottleneck_dim = 1*hidden_size
    num_epochs = 50
    batch_size = 128

    # (name, model, kwargs). kwargs separate to save memory
    model_list = [
        # ["T=10 RandFeatureBoost", RandFeatBoost,
        #         {"generator": generator,
        #          "in_dim": D,
        #          "hidden_size": hidden_size,
        #          "out_dim": 10,
        #          "n_blocks": 9,
        #          "activation": nn.Tanh(),
        #          "loss_fn": nn.CrossEntropyLoss(),
        #          "adam_lr": 1e-2,
        #          "boost_lr": 1.0,
        #          "epochs": num_epochs,
        #          "batch_size": batch_size,
        #          "upscale_type": "SWIM",  # "dense", "identity"
        #          }],

        # ["Tabular Ridge", RidgeClassifierCVModule, {}],

        # ["Logistic SGD", LogisticRegressionSGD, 
        #         {"generator": generator,
        #          "num_epochs": num_epochs,
        #          "batch_size": batch_size,
        #          }],

        # ["Logistic L-BFSG", LogisticRegression, 
        #         {"generator": generator,
        #          "in_dim": D,
        #          "out_dim": 10,
        #          }],

        # ["T=1 Dense", ResNet,
        #         {"generator": generator,
        #          "in_dim": D,
        #          "hidden_size": hidden_size,
        #          "bottleneck_dim": None,
        #          "n_blocks": 0,
        #          "upsample_layer": "dense",
        #          "output_layer": "logistic regression",
        #          }],

        # ["T=1 SWIM Grad", ResNet,
        #         {"generator": generator,
        #         "in_dim": D,
        #         "hidden_size": hidden_size,
        #         "bottleneck_dim": None,
        #         "n_blocks": 0,
        #         "upsample_layer": "SWIM",
        #         "output_layer": "logistic regression",
        #         }],
        
        # ["T=1 SWIM Unif", ResNet,
        #         {"generator": generator,
        #         "in_dim": D,
        #         "hidden_size": hidden_size,
        #         "bottleneck_dim": None,
        #         "n_blocks": 0,
        #         "upsample_layer": "SWIM",
        #         "sampling_method": "uniform",
        #         "output_layer": "logistic regression",
        #         }],

    ]
    for lr in [1.0]:
        for l2_reg in [1, 0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001]:
            model_list += [
                [f"Logistic L-BFSG, l2={l2_reg} lr={lr}", 
                    LogisticRegression, 
                    {"generator": generator,
                    "in_dim": D,
                    "out_dim": 10,
                    "l2_reg": l2_reg,
                    "lr": lr,
                    }],
            ]

    # for n_blocks in [4]:
    #     model_list += [
    #     [f"T={n_blocks+1} End2End", E2EResNet,
    #             {"generator": generator,
    #             "in_dim": D,
    #             "hidden_size": hidden_size,
    #             "bottleneck_dim": bottleneck_dim,
    #             "out_dim": 10,
    #             "n_blocks": n_blocks,
    #             "activation": nn.Tanh(),
    #             "loss": nn.CrossEntropyLoss(),
    #             "lr": 1e-2,
    #             "epochs": num_epochs,
    #             "batch_size": batch_size}
    #             ],

    #     [f"T={n_blocks+1} ResSWIM Grad-dense", ResNet,
    #             {"generator": generator,
    #             "in_dim": D,
    #             "hidden_size": hidden_size,
    #             "bottleneck_dim": bottleneck_dim,
    #             "n_blocks": n_blocks,
    #             "upsample_layer": "SWIM",
    #             "res_layer1": "SWIM",
    #             "res_layer2": "dense",
    #             "output_layer": "logistic regression",
    #             }
    #             ],

    #     [f"T={n_blocks+1} ResSWIM Grad-id", ResNet,
    #             {"generator": generator,
    #             "in_dim": D,
    #             "hidden_size": hidden_size,
    #             "bottleneck_dim": hidden_size,
    #             "n_blocks": n_blocks,
    #             "upsample_layer": "SWIM",
    #             "res_layer1": "SWIM",
    #             "res_layer2": "identity",
    #             "output_layer": "logistic regression",
    #             }
    #             ],

    #     [f"T={n_blocks+1} ResDense", ResNet,
    #             {"generator": generator,
    #             "in_dim": D,
    #             "hidden_size": hidden_size,
    #             "bottleneck_dim": hidden_size,
    #             "n_blocks": n_blocks,
    #             "upsample_layer": "dense",
    #             "res_layer1": "dense",
    #             "res_layer2": "identity",
    #             "output_layer": "logistic regression",
    #             }
    #             ],
    # ]
    
    results = []
    model_names = []
    for name, model, model_args in model_list:
        print(name)
        t0 = time.perf_counter()
        model = model(**model_args).to(X_train.device)
        pred_train, _ = model.fit(X_train, y_train)
        t1 = time.perf_counter()
        pred_test = model(X_test)
        t2 = time.perf_counter()
        
        #convert to class predictions:
        if len(pred_train.shape) == 2:
            pred_train = torch.argmax(pred_train, dim=1)
            pred_test = torch.argmax(pred_test, dim=1)
        acc_train = (pred_train == y_train_cat).float().mean().item()
        acc_test = (pred_test == y_test_cat).float().mean().item()

        result = np.array( [acc_train, acc_test, t1-t0, t2-t1] )
        results.append( result )
        model_names.append( name )

    return model_names, results



def run_all_experiments(
        name_save: str = "PLACEHOLDER",
        ):
    # Fetch and process each dataset
    experiments = {}
    generator = torch.Generator(device=device).manual_seed(999)
    results = run_allmodels_1dataset(
        generator, X_train, y_train, X_test, y_test, 
        )
    experiments["MNIST"] = results

    # Save results
    # Assuming experiments is a dict where keys are dataset names and values are tuples (model_names, results)
    attributes = ["acc_train", "acc_test", "t_fit", "t_feat"]
    data_list = []
    # Process the data
    for dataset_name, (model_names, results) in experiments.items():
        dataset_data = {}
        for attr_idx, attribute in enumerate(attributes):
            for model_idx, model_name in enumerate(model_names):
                dataset_data[(attribute, model_name)] = results[model_idx][attr_idx]
        data_list.append(pd.DataFrame(dataset_data, index=[dataset_name]))

    # Combine all datasets into a single DataFrame
    df = pd.concat(data_list)
    df = df.sort_index(axis=1)
    print(df)
    df.to_pickle(f"MNIST_{name_save}.pkl")
    return df

In [23]:
run_all_experiments(name_save="TESTING")

Logistic L-BFSG, l2=1 lr=1.0
tensor(23.0018, device='cuda:0', grad_fn=<AddBackward0>)
tensor(22.8644, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.0741, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.8592, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.4502, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.4365, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.4337, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.4332, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.4329, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.4325, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.4316, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.4305, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.4297, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.4294, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.4293, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.4293, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.4293, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.4293, de

Unnamed: 0_level_0,acc_test,acc_test,acc_test,acc_test,acc_test,acc_test,acc_test,acc_train,acc_train,acc_train,...,t_feat,t_feat,t_feat,t_fit,t_fit,t_fit,t_fit,t_fit,t_fit,t_fit
Unnamed: 0_level_1,"Logistic L-BFSG, l2=0.0001 lr=1.0","Logistic L-BFSG, l2=0.001 lr=1.0","Logistic L-BFSG, l2=0.01 lr=1.0","Logistic L-BFSG, l2=0.1 lr=1.0","Logistic L-BFSG, l2=1 lr=1.0","Logistic L-BFSG, l2=1e-05 lr=1.0","Logistic L-BFSG, l2=1e-06 lr=1.0","Logistic L-BFSG, l2=0.0001 lr=1.0","Logistic L-BFSG, l2=0.001 lr=1.0","Logistic L-BFSG, l2=0.01 lr=1.0",...,"Logistic L-BFSG, l2=1 lr=1.0","Logistic L-BFSG, l2=1e-05 lr=1.0","Logistic L-BFSG, l2=1e-06 lr=1.0","Logistic L-BFSG, l2=0.0001 lr=1.0","Logistic L-BFSG, l2=0.001 lr=1.0","Logistic L-BFSG, l2=0.01 lr=1.0","Logistic L-BFSG, l2=0.1 lr=1.0","Logistic L-BFSG, l2=1 lr=1.0","Logistic L-BFSG, l2=1e-05 lr=1.0","Logistic L-BFSG, l2=1e-06 lr=1.0"
MNIST,0.9238,0.9231,0.9174,0.8991,0.848,0.9243,0.922,0.931,0.930167,0.918317,...,5.9e-05,5.4e-05,5.6e-05,0.192694,0.197226,0.192606,0.19768,0.232776,0.191077,0.199203


In [24]:
df = pd.read_pickle("MNIST_TESTING.pkl")
df["acc_test"].mean().sort_values(ascending=False)

Logistic L-BFSG, l2=1e-05 lr=1.0     0.9243
Logistic L-BFSG, l2=0.0001 lr=1.0    0.9238
Logistic L-BFSG, l2=0.001 lr=1.0     0.9231
Logistic L-BFSG, l2=1e-06 lr=1.0     0.9220
Logistic L-BFSG, l2=0.01 lr=1.0      0.9174
Logistic L-BFSG, l2=0.1 lr=1.0       0.8991
Logistic L-BFSG, l2=1 lr=1.0         0.8480
dtype: float64

In [None]:
# T=5 End2End        0.9717
# T=1 Dense          0.9215
# T=1 SWIM Unif      0.9207
# T=1 SWIM Grad      0.9204
# Logistic SGD       0.8990
# Tabular Ridge      0.8606
# Logistic L-BFSG    0.8480
# dtype: float64

In [None]:
df["acc_train"].mean().sort_values(ascending=False)

In [None]:
# Time to experiment with RandFeatureBoost

generator = torch.Generator(device=device).manual_seed(999)
D = X_train.shape[1]
hidden_size = 800
num_epochs = 50
batch_size = 128
n_blocks = 10
adam_lr=1e-2
boost_lr=0.9

model = RandFeatBoost(
    generator=generator,
    in_dim=D,
    hidden_size=hidden_size,
    out_dim=10,
    n_blocks=n_blocks,
    activation=nn.Tanh(),
    loss_fn=nn.CrossEntropyLoss(),
    adam_lr=adam_lr,
    boost_lr=boost_lr,
    epochs=num_epochs,
    batch_size=batch_size,
    upscale_type="SWIM",
    ).to(device)

pred_train, _ = model.fit(X_train, y_train)
pred_test = model(X_test)
pred_train = torch.argmax(pred_train, dim=1)
pred_test = torch.argmax(pred_test, dim=1)
acc_train = (pred_train == y_train_cat).float().mean().item()
acc_test = (pred_test == y_test_cat).float().mean().item()

In [None]:
print(acc_train)
print(acc_test)

In [None]:
model.deltas

In [None]:
def print_all_accuracies(X_train, X_test):
    X_train = model.upscale(X_train)
    X_test  = model.upscale(X_test)

    for t, (layer, DELTA, classifier) in enumerate(zip(model.layers, model.deltas, model.classifiers)):
        X_train = X_train + model.boost_lr * DELTA * (layer(X_train) - X_train)
        X_test = X_test + model.boost_lr * DELTA * (layer(X_test) - X_test)
        
        pred_train = classifier(X_train)
        pred_test = classifier(X_test)
        pred_train = torch.argmax(pred_train, dim=1)
        pred_test = torch.argmax(pred_test, dim=1)
        acc_train = (pred_train == y_train_cat).float().mean().item()
        acc_test = (pred_test == y_test_cat).float().mean().item()

        print(f"Block {t}: Train acc: {acc_train}, Test acc: {acc_test}")


print_all_accuracies(X_train, X_test)

In [None]:
# Test SWIM-ID vs DENSE-ID vs SWIM-DENSE 
# implement 'finding gradient direction' gradient boosting

# Test whether this is actually better than non-boost with same hidden size !!!!!!!!!!!!!!!!!!!!!

In [None]:
# experiment with DENSE

generator = torch.Generator(device=device).manual_seed(999)
D = X_train.shape[1]
hidden_size = 800
num_epochs = 50
batch_size = 128
n_blocks = 10
adam_lr=1e-2
boost_lr=0.9

model = RandFeatBoost(
    generator=generator,
    in_dim=D,
    hidden_size=hidden_size,
    out_dim=10,
    n_blocks=n_blocks,
    activation=nn.Tanh(),
    loss_fn=nn.CrossEntropyLoss(),
    adam_lr=adam_lr,
    boost_lr=boost_lr,
    epochs=num_epochs,
    batch_size=batch_size,
    upscale_type="dense",
    second_in_resblock="identity",
    ).to(device)

pred_train, _ = model.fit(X_train, y_train)
pred_test = model(X_test)
pred_train = torch.argmax(pred_train, dim=1)
pred_test = torch.argmax(pred_test, dim=1)
acc_train = (pred_train == y_train_cat).float().mean().item()
acc_test = (pred_test == y_test_cat).float().mean().item()

In [None]:
def print_all_accuracies(X_train, X_test):
    X_train = model.upscale(X_train)
    X_test  = model.upscale(X_test)

    for t, (layer, DELTA, classifier) in enumerate(zip(model.layers, model.deltas, model.classifiers)):
        X_train = X_train + model.boost_lr * DELTA * (layer(X_train) - X_train)
        X_test = X_test + model.boost_lr * DELTA * (layer(X_test) - X_test)
        
        pred_train = classifier(X_train)
        pred_test = classifier(X_test)
        pred_train = torch.argmax(pred_train, dim=1)
        pred_test = torch.argmax(pred_test, dim=1)
        acc_train = (pred_train == y_train_cat).float().mean().item()
        acc_test = (pred_test == y_test_cat).float().mean().item()

        print(f"Block {t}: Train acc: {acc_train}, Test acc: {acc_test}")

print_all_accuracies(X_train, X_test)