In [1]:
from typing import Tuple, List, Union, Any, Optional, Dict, Literal, Callable
import time
import collections
import os
import sys
sys.path.append(os.path.dirname(os.getcwd()))
sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))

from tqdm import tqdm
import numpy as np
import torch
import torch.nn as nn
from torch import Tensor, tensor
import pandas as pd
import openml

#from aeon.regression.sklearn import RotationForestRegressor
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split


np.set_printoptions(precision=3, threshold=5) # Print options
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so the
# notebook still runs top-to-bottom on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# MNIST

In [2]:
from torchvision import datasets, transforms


def normalize_mean_std_traindata(X_train: Tensor, X_test: Tensor, eps: float = 0.0) -> Tuple[Tensor, Tensor]:
    """Standardize both splits with training-set statistics, then clip to [-5, 5].

    The per-feature mean and standard deviation are computed over dim 0 of
    ``X_train`` only and applied to both tensors, so no test-set statistics
    leak into the preprocessing.

    Args:
        X_train: Training features; statistics are taken over dim 0.
        X_test: Test features, scaled with the training statistics.
        eps: Features whose training std is not strictly greater than ``eps``
            are treated as constant and left unscaled (divided by 1).

    Returns:
        The standardized and clipped ``(X_train, X_test)`` pair.
    """
    mean = X_train.mean(dim=0)
    std = X_train.std(dim=0)
    # A constant feature has zero std (NaN for a single training row), and
    # dividing by it would produce NaN/inf that clipping cannot repair.
    # Use unit scale there so the feature is only centred.
    std = torch.where(std > eps, std, torch.ones_like(std))

    X_train = (X_train - mean) / std
    X_test = (X_test - mean) / std

    X_train = torch.clip(X_train, -5, 5)
    X_test = torch.clip(X_test, -5, 5)
    return X_train, X_test


# Define a transform to normalize the data
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

# Dataset root. Set the MNIST_PATH environment variable to point somewhere else;
# the default keeps the location this notebook was originally run with.
mnist_path = os.environ.get("MNIST_PATH", "/home/nikita/hdd/MNIST")

# Download and load the training data (one batch holding the whole split)
trainset = datasets.MNIST(mnist_path, download=True, train=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=len(trainset), shuffle=False)

# Download and load the test data (one batch holding the whole split)
testset = datasets.MNIST(mnist_path, download=True, train=False, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=len(testset), shuffle=False)

# Flatten the data: each image becomes one row
X_train, y_train_cat = next(iter(trainloader))
X_train = X_train.view(len(trainset), -1).to(device)
X_test, y_test_cat = next(iter(testloader))
X_test = X_test.view(len(testset), -1).to(device)

# Convert train and test labels to one-hot encoding; the integer labels are
# kept in y_*_cat for accuracy computations.
y_train = nn.functional.one_hot(y_train_cat, num_classes=10).float().to(device)
y_test = nn.functional.one_hot(y_test_cat, num_classes=10).float().to(device)
y_train_cat = y_train_cat.to(device)
y_test_cat = y_test_cat.to(device)

# Normalize by mean and std (training-set statistics applied to both splits)
X_train, X_test = normalize_mean_std_traindata(X_train, X_test)
print(f"Train data shape: {X_train.shape}")
print(f"Train labels shape: {y_train.shape}")
print(f"Test data shape: {X_test.shape}")
print(f"Test labels shape: {y_test.shape}")

Train data shape: torch.Size([60000, 784])
Train labels shape: torch.Size([60000, 10])
Test data shape: torch.Size([10000, 784])
Test labels shape: torch.Size([10000, 10])


# Logistic Regression

In [3]:
from models.models import LogisticRegression, FittableModule


class LogisticRegression(FittableModule):
    """Linear classifier on raw logits, fitted full-batch with L-BFGS.

    With ``out_dim > 1`` the loss is softmax cross-entropy on integer class
    labels; with ``out_dim == 1`` it is binary cross-entropy with logits.
    An L2 penalty ``l2_reg * ||W||^2`` on the weight matrix (not the bias) is
    added to the loss.

    NOTE: this definition shadows the ``LogisticRegression`` name imported
    from ``models.models`` in the same cell.

    Args:
        in_dim: Number of input features.
        out_dim: Number of output logits.
        l2_reg: Coefficient of the squared-norm penalty on the weights.
        lr: Step size passed to ``torch.optim.LBFGS``.
        max_iter: ``max_iter`` passed to ``torch.optim.LBFGS``.
    """
    def __init__(self, 
                 in_dim: int,
                 out_dim: int = 10,
                 l2_reg: float = 0.001,
                 lr: float = 1.0,
                 max_iter: int = 100,
                 ):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(in_dim, out_dim)
        self.l2_reg = l2_reg
        self.lr = lr
        self.max_iter = max_iter

        if out_dim > 1:
            self.loss = nn.functional.cross_entropy #this is with logits
        else:
            self.loss = nn.functional.binary_cross_entropy_with_logits


    def fit(self, 
            X: Tensor, 
            y: Tensor,
            init_W_b: Optional[Tuple[Tensor, Tensor]] = None,
            ):
        """Fit the linear layer on ``(X, y)`` with a single L-BFGS ``step`` call.

        Args:
            X: Input features; the module is moved to ``X.device``.
            y: Targets. Multiclass: integer labels (1-D) or one-hot / score
                rows (2-D, reduced with argmax). Binary (``out_dim == 1``):
                0/1 targets with one value per sample.
            init_W_b: Optional ``(weight, bias)`` warm start. When omitted the
                weight is Kaiming-normal initialised and the bias zeroed.

        Returns:
            ``self``, to allow chaining.
        """
        if self.linear.out_features > 1:
            # cross_entropy expects class indices, so undo any one-hot encoding
            if y.dim() > 1:
                targets = torch.argmax(y, dim=1)
            else:
                targets = y
        else:
            # binary_cross_entropy_with_logits needs float targets with the same
            # shape as the (N, 1) logits; class indices would raise here.
            targets = y.reshape(-1, 1).to(X.dtype)

        # Put model on device
        device = X.device
        self.to(device)

        # Initialize weights and bias
        if init_W_b is not None:
            W, b = init_W_b
            # Copy into the existing parameters rather than rebinding `.data`,
            # so shape mismatches are caught and the caller's tensors are not
            # aliased by the parameters.
            with torch.no_grad():
                self.linear.weight.copy_(W)
                self.linear.bias.copy_(b)
        else:
            nn.init.kaiming_normal_(self.linear.weight)
            nn.init.zeros_(self.linear.bias)
            
        # fit() may be called from inside a torch.no_grad() block, so gradients
        # are re-enabled explicitly for the optimisation.
        with torch.enable_grad():
            # Optimize
            optimizer = torch.optim.LBFGS(self.linear.parameters(), lr=self.lr, max_iter=self.max_iter)
            def closure():
                optimizer.zero_grad()
                logits = self.linear(X)
                loss = self.loss(logits, targets)
                loss += self.l2_reg * torch.linalg.norm(self.linear.weight)**2
                loss.backward()
                return loss
            optimizer.step(closure)
        return self


    def forward(self, X: Tensor) -> Tensor:
        """Return the raw logits ``X @ W.T + b``."""
        return self.linear(X)


# Linear baseline: logistic regression directly on the 784 flattened pixels.
model = LogisticRegression(in_dim=784, out_dim=10, l2_reg=0.0001, max_iter=100)
X_train_pred = model.fit_transform(X_train, y_train_cat)
X_test_pred = model(X_test)

print("X_test_pred", X_test_pred)

# Accuracy: fraction of rows whose highest logit matches the integer label.
train_accuracy = X_train_pred.argmax(dim=1).eq(y_train_cat).float().mean().item()
test_accuracy = X_test_pred.argmax(dim=1).eq(y_test_cat).float().mean().item()

print(f"Train accuracy: {train_accuracy}")
print(f"Test accuracy: {test_accuracy}")

X_test_pred tensor([[  0.4175, -11.0949,   1.2466,  ...,  12.8043,   1.6586,   5.1637],
        [  8.1309,   3.6030,  16.4524,  ..., -24.1260,   5.9731, -13.4601],
        [ -6.7998,   7.4932,   3.1322,  ...,   1.7728,   1.4733,  -1.5069],
        ...,
        [ -8.2448,  -8.1426,  -2.6560,  ...,   3.7084,   5.7502,   6.2365],
        [ -2.6307,  -1.3099,  -2.6204,  ...,  -4.6006,   9.3278,  -2.5876],
        [  4.1184, -11.7016,   6.9620,  ...,  -6.8436,   2.0377,  -5.2548]],
       device='cuda:0', grad_fn=<AddmmBackward0>)
Train accuracy: 0.9338833689689636
Test accuracy: 0.9279999732971191


# GradientRFBoost

In [33]:
from models.models import FittableModule, create_layer, fit_ridge_ALOOCV, Identity

def line_search_cross_entropy(cls, X, y, G_hat):
    """Find the scalar step size along ``G_hat`` that minimises the risk.

    Solves the one-dimensional problem ``min_a CE(cls(X + a * G_hat), y)``
    for multiclass cross-entropy, starting from ``a = 0`` and running one
    ``torch.optim.LBFGS`` step call with its default settings.

    Args:
        cls: Callable mapping features to logits.
        X: Current features.
        y: Integer class labels (1-D) or one-hot rows (2-D, reduced by argmax).
        G_hat: Update direction, broadcastable against ``X``.

    Returns:
        The optimised step size as a Python float.
    """
    # cross_entropy wants class indices, so collapse one-hot targets
    targets = torch.argmax(y, dim=1) if y.dim() > 1 else y

    # Gradients are enabled explicitly because callers may be inside no_grad
    with torch.enable_grad():
        step = torch.zeros(1, device=X.device, dtype=X.dtype, requires_grad=True)
        lbfgs = torch.optim.LBFGS([step])

        def evaluate_risk():
            lbfgs.zero_grad()
            risk = nn.functional.cross_entropy(cls(X + step * G_hat), targets)
            risk.backward()
            return risk

        lbfgs.step(evaluate_risk)

    return step.detach().item()


class GradientRFBoostClassifier(FittableModule):
    """Layer-wise gradient boosting of a residual feature map for classification.

    Training (``fit_transform``) proceeds as follows:

    1. Optionally map the input to ``hidden_dim`` features (``upscale``).
    2. Fit a logistic-regression head W_0 on that representation.
    3. For each of ``n_layers`` rounds: build random features of the current
       representation and of the raw input, ridge-regress the normalised
       negative loss gradient onto them, choose a step size by line search,
       add the scaled update to the representation, and refit the head.

    Args:
        hidden_dim: Width of the boosted representation (overwritten with the
            input width when ``upscale == "identity"``).
        randfeat_xt_dim: Number of random features built from the current
            representation in every round.
        randfeat_x0_dim: Number of random features built from the raw input
            in every round.
        out_dim: Number of classes.
        n_layers: Number of boosting rounds.
        activation: Activation handed to ``create_layer``.
        l2_reg: L2 coefficient of the logistic-regression heads.
        feature_type: Layer type handed to ``create_layer`` for the random
            feature maps.
        boost_lr: Multiplier applied to every boosting update.
        upscale: ``"dense"``, ``"SWIM"`` or ``"identity"``.
        max_iter: L-BFGS ``max_iter`` of the logistic-regression heads.
        ridge_l2: Ridge penalty passed to ``fit_ridge_ALOOCV``.
    """
    def __init__(self, 
                 hidden_dim: int = 128, # TODO
                 randfeat_xt_dim: int = 128,
                 randfeat_x0_dim: int = 128,
                 out_dim: int = 10,
                 n_layers: int = 5,
                 activation: nn.Module = nn.Tanh(),
                 l2_reg: float = 1,
                 feature_type = "SWIM", # "dense", identity
                 boost_lr: float = 1.0,
                 upscale: Optional[str] = "dense",
                 max_iter: int = 100,
                 ridge_l2: float = 0.001,
                 ):
        super(GradientRFBoostClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.randfeat_xt_dim = randfeat_xt_dim
        self.randfeat_x0_dim = randfeat_x0_dim
        self.out_dim = out_dim
        self.n_layers = n_layers
        self.activation = activation
        self.l2_reg = l2_reg
        self.feature_type = feature_type
        self.boost_lr = boost_lr
        self.upscale = upscale
        self.max_iter = max_iter
        self.ridge_l2 = ridge_l2
    

    def fit_transform(self, X: Tensor, y: Tensor):
        """Fit all boosting rounds and return the final training logits.

        Args:
            X: Training features, one row per sample.
            y: One-hot targets of shape (N, out_dim), or integer class labels
                of shape (N,), which are one-hot encoded internally.

        Returns:
            Logits of the last classifier on the boosted training
            representation, computed without gradient tracking.

        Raises:
            ValueError: If ``self.upscale`` is not a recognised option.
        """
        with torch.no_grad():
            X0 = X

            # The functional gradient (y - probs) needs one-hot targets; expand
            # plain integer labels so both encodings are accepted.
            if y.dim() == 1:
                y = nn.functional.one_hot(y.long(), num_classes=self.out_dim).to(X.dtype)

            #optional upscale
            if self.upscale == "dense":
                self.upscale_fun = create_layer(self.upscale, X.shape[1], self.hidden_dim, None)
                X = self.upscale_fun.fit_transform(X, y)
            elif self.upscale == "SWIM":
                self.upscale_fun = create_layer(self.upscale, X.shape[1], self.hidden_dim, self.activation)
                X = self.upscale_fun.fit_transform(X, y)
            elif self.upscale == "identity":
                self.upscale_fun = Identity()
                self.hidden_dim = X.size(1)
            else:
                raise ValueError(f"Parameter not recoginized. Given: {self.upscale}")


            # Create classifier W_0
            cls = LogisticRegression(
                in_dim = self.hidden_dim,
                out_dim = self.out_dim,
                l2_reg = self.l2_reg,
                max_iter = self.max_iter,
            ).to(X.device)
            cls.fit(X, y)
            # save for now. for more memory efficient implementation, we can remove a lot of this
            self.classifiers = [cls]
            self.layers_fxt = []
            self.layers_fx0 = []
            self.deltas = []

            # Layerwise boosting
            N = X.size(0)
            # Warm start: the first round reuses W_0 only for the identity
            # upscale; every later round starts from the previous head.
            prev_cls = None if self.upscale != "identity" else cls
            for t in range(self.n_layers):
                # Shapes used below:
                # X shape (N, D) --- ResNet neurons
                # F shape (N, p) --- random features
                # y shape (N, d) --- one-hot target
                # G shape (N, D) --- negative gradient at current boosting iteration
                # W shape (d, D) --- top level classifier weight (nn.Linear layout)
                # probs shape (N, d) --- predicted probabilities


                # Step 1: Create random feature layer   
                fxt_fun = create_layer(self.feature_type, self.hidden_dim, self.randfeat_xt_dim, self.activation)
                fx0_fun = create_layer(self.feature_type, X0.size(1), self.randfeat_x0_dim, self.activation)
                Fxt = fxt_fun.fit_transform(X, y)
                Fx0 = fx0_fun.fit_transform(X0, y)
                F = torch.cat([Fxt, Fx0], dim=1)


                # Step 2: Obtain activation gradient and learn Delta
                probs = nn.functional.softmax(cls(X), dim=1)
                G = (y - probs) @ cls.linear.weight #negative gradient TODO divide by N?
                # Normalize to unit L2(mu) norm. The clamp prevents 0/0 when the
                # gradient vanishes (perfectly fitted data); G then stays zero.
                G_norm = torch.norm(G).clamp_min(torch.finfo(G.dtype).tiny)
                G = G / G_norm * N**0.5
                # fit Least Squares to negative gradient (finding functional direction)
                Delta, Delta_b, _ = fit_ridge_ALOOCV(F, G, alphas=[self.ridge_l2])
                # Line search for risk minimization of R(W_t, Phi_t + linesearch * G_hat)
                G_hat = F @ Delta + Delta_b
                linesearch = line_search_cross_entropy(cls, X, y, G_hat)
                print("Linesearch", linesearch)
                print("t", t, "Gradient hat norm", torch.linalg.norm(G_hat))


                # Step 3: Learn top level classifier
                X = X + self.boost_lr * linesearch * G_hat
                cls = LogisticRegression(
                    in_dim = self.hidden_dim,
                    out_dim = self.out_dim,
                    l2_reg = self.l2_reg,
                    max_iter = self.max_iter,
                ).to(X.device)
                cls.fit(
                    X, 
                    y, 
                    init_W_b = (
                        (prev_cls.linear.weight.detach().clone(), 
                        prev_cls.linear.bias.detach().clone())
                        if prev_cls is not None else None
                        ) 
                )
                prev_cls = cls

                #update Delta scale: fold the step size into the stored weights
                #so forward() only has to apply boost_lr
                Delta = Delta * linesearch
                Delta_b = Delta_b * linesearch

                # store
                self.layers_fxt.append(fxt_fun)
                self.layers_fx0.append(fx0_fun)
                self.deltas.append((Delta, Delta_b))
                self.classifiers.append(cls)

            # Evaluated inside no_grad so the result matches forward() and does
            # not keep an autograd graph over the whole training set alive.
            return cls(X)


    def forward(self, X: Tensor) -> Tensor:
        """Replay the fitted upscale and boosting updates, then apply the last head.

        Args:
            X: Input features in the same layout used for fitting.

        Returns:
            Logits of the final classifier, computed without gradient tracking.
        """
        with torch.no_grad():
            X0 = X
            if self.upscale is not None:
                X = self.upscale_fun(X)
            for fxt_fun, fx0_fun, (Delta, Delta_b) in zip(self.layers_fxt, self.layers_fx0, self.deltas):
                features = torch.cat([fxt_fun(X), fx0_fun(X0)], dim=1)
                X = X + self.boost_lr * (features @ Delta + Delta_b)
            return self.classifiers[-1](X)
        

# Boosted model: SWIM upscale to 128 units, then 20 boosting rounds that each
# use 256 + 256 SWIM random features.
model = GradientRFBoostClassifier(
    hidden_dim=128,
    randfeat_xt_dim=256,
    randfeat_x0_dim=256,
    out_dim=10,
    n_layers=20,
    l2_reg=1e-6,
    ridge_l2=1e-9,
    feature_type="SWIM",
    upscale="SWIM",
    max_iter=300,
    boost_lr=1.0,
)
X_train_pred = model.fit_transform(X_train, y_train)
X_test_pred = model(X_test)

print("X_test_pred", X_test_pred)

# Accuracy: fraction of rows whose highest logit matches the integer label.
train_accuracy = X_train_pred.argmax(dim=1).eq(y_train_cat).float().mean().item()
test_accuracy = X_test_pred.argmax(dim=1).eq(y_test_cat).float().mean().item()

print(f"Train accuracy: {train_accuracy}")
print(f"Test accuracy: {test_accuracy}")

#TODO NEXT: add xtx0 to the classification case

Linesearch 0.04402843490242958
t 0 Gradient hat norm tensor(254.2389, device='cuda:0')
Linesearch 0.007105377968400717
t 1 Gradient hat norm tensor(459.0419, device='cuda:0')
Linesearch 0.13384537398815155
t 2 Gradient hat norm tensor(67.2621, device='cuda:0')
Linesearch 0.1739414483308792
t 3 Gradient hat norm tensor(46.2600, device='cuda:0')
Linesearch 0.23741832375526428
t 4 Gradient hat norm tensor(33.9673, device='cuda:0')
Linesearch 0.2939264476299286
t 5 Gradient hat norm tensor(26.8308, device='cuda:0')
Linesearch 0.299819678068161
t 6 Gradient hat norm tensor(23.1804, device='cuda:0')
Linesearch 0.20965401828289032
t 7 Gradient hat norm tensor(26.7779, device='cuda:0')
Linesearch 0.2635953724384308
t 8 Gradient hat norm tensor(22.8784, device='cuda:0')
Linesearch 0.33549538254737854
t 9 Gradient hat norm tensor(18.4807, device='cuda:0')
Linesearch 0.23749461770057678
t 10 Gradient hat norm tensor(20.3547, device='cuda:0')
Linesearch 0.07007607817649841
t 11 Gradient hat norm t

In [34]:
def see_results_for_every_layer(X_train, X_test):
    """Print train/test accuracy of the global ``model`` after every boosting layer.

    Replays the fitted upscale and each stored boosting update on both splits
    and scores the classifier saved for that depth against the global
    ``y_train_cat`` / ``y_test_cat`` labels. Layer 0 is the representation
    before any boosting update.
    """
    def _accuracy(classifier, feats, labels):
        # share of rows whose highest logit equals the integer label
        return torch.argmax(classifier(feats), dim=1).eq(labels).float().mean()

    with torch.no_grad():
        raw_train, raw_test = X_train, X_test

        if model.upscale is not None:
            X_train = model.upscale_fun(raw_train)
            X_test = model.upscale_fun(raw_test)

        head = model.classifiers[0]
        print(f"Train acc at layer 0: {_accuracy(head, X_train, y_train_cat)}")
        print(f"Test acc at layer 0: {_accuracy(head, X_test, y_test_cat)}")
        print()

        stored_layers = zip(model.layers_fxt, model.layers_fx0, model.deltas)
        for depth, (fxt_fun, fx0_fun, (Delta, Delta_b)) in enumerate(stored_layers, start=1):
            # random features of the current representation and of the raw input
            feats_train = torch.cat([fxt_fun(X_train), fx0_fun(raw_train)], dim=1)
            feats_test = torch.cat([fxt_fun(X_test), fx0_fun(raw_test)], dim=1)
            X_train = X_train + model.boost_lr * (feats_train @ Delta + Delta_b)
            X_test = X_test + model.boost_lr * (feats_test @ Delta + Delta_b)

            head = model.classifiers[depth]
            print(f"Train acc at layer {depth}: {_accuracy(head, X_train, y_train_cat)}")
            print(f"Test acc at layer {depth}: {_accuracy(head, X_test, y_test_cat)}")
            print()


# Report per-layer accuracies of the currently fitted `model` on the global train/test tensors.
see_results_for_every_layer(X_train, X_test)

Train acc at layer 0: 0.9188666939735413
Test acc at layer 0: 0.91839998960495

Train acc at layer 1: 0.9416000247001648
Test acc at layer 1: 0.9378999471664429

Train acc at layer 2: 0.9431833624839783
Test acc at layer 2: 0.9405999779701233

Train acc at layer 3: 0.9627000093460083
Test acc at layer 3: 0.9580999612808228

Train acc at layer 4: 0.9702666997909546
Test acc at layer 4: 0.9627999663352966

Train acc at layer 5: 0.9768500328063965
Test acc at layer 5: 0.9673999547958374

Train acc at layer 6: 0.9818500280380249
Test acc at layer 6: 0.9684000015258789

Train acc at layer 7: 0.9851000308990479
Test acc at layer 7: 0.9702000021934509

Train acc at layer 8: 0.987416684627533
Test acc at layer 8: 0.9699999690055847

Train acc at layer 9: 0.9891000390052795
Test acc at layer 9: 0.9703999757766724

Train acc at layer 10: 0.9913666844367981
Test acc at layer 10: 0.9720999598503113

Train acc at layer 11: 0.9924499988555908
Test acc at layer 11: 0.9718999862670898

Train acc at la

# End2End

In [6]:
from models.models import End2EndMLPResNet

# End-to-end trained MLP ResNet, for comparison with the boosted models above.
model = End2EndMLPResNet(
    in_dim=X_train.size(1),
    hidden_dim=128,
    bottleneck_dim=32,
    out_dim=10,
    n_blocks=4,
    lr=0.01,
    end_lr_factor=0.01,
    n_epochs=20,
    weight_decay=0.001,
    batch_size=512,
)
X_train_pred = model.fit_transform(X_train, y_train)
X_test_pred = model(X_test)

print("X_test_pred", X_test_pred)

# Accuracy: fraction of rows whose highest logit matches the integer label.
train_accuracy = X_train_pred.argmax(dim=1).eq(y_train_cat).float().mean().item()
test_accuracy = X_test_pred.argmax(dim=1).eq(y_test_cat).float().mean().item()

print(f"Train accuracy: {train_accuracy}")
print(f"Test accuracy: {test_accuracy}")

  0%|          | 0/20 [00:00<?, ?it/s]

 80%|████████  | 16/20 [00:17<00:04,  1.12s/it]


KeyboardInterrupt: 

# experiments

In [None]:
# NOTE(review): `run_all_experiments` is neither defined nor imported in the visible notebook;
# the recorded output of this cell is a NameError. Import or define it before running.
_ = run_all_experiments(name_save="TESTING")

NameError: name 'run_all_experiments' is not defined

In [None]:
# NOTE(review): pd.read_pickle unpickles arbitrary objects; only load files from a trusted source.
# Presumably this file is produced by run_all_experiments(name_save="TESTING") — TODO confirm.
df = pd.read_pickle("MNIST_TESTING.pkl")
# Mean test accuracy per model, best first.
df["acc_test"].mean().sort_values(ascending=False)

T=16 Gradient GRFBoost               0.9690
T=11 Gradient GRFBoost               0.9684
T=6 Gradient GRFBoost                0.9649
Logistic L-BFSG, l2=1e-05 lr=1.0     0.9243
Logistic L-BFSG, l2=0.0001 lr=1.0    0.9238
Logistic L-BFSG, l2=0.001 lr=1.0     0.9231
T=1 Gradient GRFBoost                0.9226
Logistic L-BFSG, l2=1e-06 lr=1.0     0.9220
Logistic L-BFSG, l2=0.01 lr=1.0      0.9174
Logistic L-BFSG, l2=0.1 lr=1.0       0.8991
Logistic L-BFSG, l2=1 lr=1.0         0.8480
dtype: float64

In [None]:
# T=5 End2End        0.9717
# T=1 Dense          0.9215
# T=1 SWIM Unif      0.9207
# T=1 SWIM Grad      0.9204
# Logistic SGD       0.8990
# Tabular Ridge      0.8606
# Logistic L-BFSG    0.8480
# dtype: float64

In [None]:
# Mean train accuracy per model, best first (relies on `df` loaded in an earlier cell).
df["acc_train"].mean().sort_values(ascending=False)

T=16 Gradient GRFBoost               0.978417
T=11 Gradient GRFBoost               0.976083
T=6 Gradient GRFBoost                0.973017
Logistic L-BFSG, l2=1e-05 lr=1.0     0.931450
Logistic L-BFSG, l2=1e-06 lr=1.0     0.931200
Logistic L-BFSG, l2=0.0001 lr=1.0    0.931000
Logistic L-BFSG, l2=0.001 lr=1.0     0.930167
T=1 Gradient GRFBoost                0.922150
Logistic L-BFSG, l2=0.01 lr=1.0      0.918317
Logistic L-BFSG, l2=0.1 lr=1.0       0.894617
Logistic L-BFSG, l2=1 lr=1.0         0.838183
dtype: float64

# Experiment layer by layer with Gradient Boosting

In [None]:
# Time to experiment with RandFeatureBoost


# NOTE(review): `GradientRandomFeatureBoostingClassification` is not imported or
# defined in the visible notebook; make sure it is in scope before running this cell.
generator = torch.Generator(device=device).manual_seed(999)
D = X_train.shape[1]
hidden_size = 512
n_blocks = 20
boost_lr = 1.0

model = GradientRandomFeatureBoostingClassification(
    generator=generator,
    hidden_dim=hidden_size,
    out_dim=10,
    n_layers=n_blocks,
    l2_reg=0.0001,
    boost_lr=boost_lr,
    feature_type="SWIM",
    upscale="SWIM",
).to(device)

# fit() returns a pair; only its first element (training outputs) is used here.
pred_train, _ = model.fit(X_train, y_train)
pred_train = pred_train.argmax(dim=1)
pred_test = model(X_test).argmax(dim=1)

# Accuracy: share of predicted class indices that equal the integer labels.
acc_train = pred_train.eq(y_train_cat).float().mean().item()
acc_test = pred_test.eq(y_test_cat).float().mean().item()

alpha 0.009999999776482582
linesearch loss tensor(0.3351, device='cuda:0', grad_fn=<NllLossBackward0>)
linesearch loss tensor(0.2944, device='cuda:0', grad_fn=<NllLossBackward0>)
linesearch loss tensor(0.2499, device='cuda:0', grad_fn=<NllLossBackward0>)
linesearch loss tensor(0.2431, device='cuda:0', grad_fn=<NllLossBackward0>)
linesearch loss tensor(0.2429, device='cuda:0', grad_fn=<NllLossBackward0>)
linesearch loss tensor(0.2429, device='cuda:0', grad_fn=<NllLossBackward0>)
linesearch loss tensor(0.2429, device='cuda:0', grad_fn=<NllLossBackward0>)
Linesearch 0.33945271372795105
Gradient hat norm tensor(198.0060, device='cuda:0')
alpha 0.009999999776482582
linesearch loss tensor(0.2977, device='cuda:0', grad_fn=<NllLossBackward0>)
linesearch loss tensor(0.2372, device='cuda:0', grad_fn=<NllLossBackward0>)
linesearch loss tensor(0.2181, device='cuda:0', grad_fn=<NllLossBackward0>)
linesearch loss tensor(0.2154, device='cuda:0', grad_fn=<NllLossBackward0>)
linesearch loss tensor(0.21

In [None]:
# Show the train and test accuracy computed in the previous cell.
print(acc_train)
print(acc_test)

0.9626833200454712
0.9571999907493591


In [None]:
def print_all_accuracies(X_train, X_test):
    """Print train/test accuracy of the global ``model`` after every boosting block.

    Applies ``model.upscale`` to both splits, then replays each stored block
    update ``layer(X) @ Delta + Delta_b`` (scaled by ``model.boost_lr``) and
    scores the classifier stored for that block against the global
    ``y_train_cat`` / ``y_test_cat`` labels. ``model.classifiers[0]`` is
    skipped: block ``t`` is paired with ``model.classifiers[t + 1]``.
    """
    with torch.no_grad():
        X_train = model.upscale(X_train)
        X_test  = model.upscale(X_test)

        for t, (layer, (Delta, Delta_b), cls) in enumerate(zip(model.layers, model.deltas, model.classifiers[1:])):
            # Out-of-place update: an in-place `+=` would overwrite the
            # caller's tensors if `model.upscale` returned its input unchanged.
            X_train = X_train + model.boost_lr * (layer(X_train) @ Delta + Delta_b)
            X_test = X_test + model.boost_lr * (layer(X_test) @ Delta + Delta_b)

            pred_train = torch.argmax(cls(X_train), dim=1)
            pred_test = torch.argmax(cls(X_test), dim=1)
            acc_train = (pred_train == y_train_cat).float().mean().item()
            acc_test = (pred_test == y_test_cat).float().mean().item()

            print(f"Block {t}: Train acc: {acc_train}, Test acc: {acc_test}")
            # Optional diagnostics:
            # print("delta norm", torch.linalg.norm(Delta).item())
            # print("X_train norm", torch.linalg.norm(X_train).item() / X_train.size(0))
            # print("X_test norm", torch.linalg.norm(X_test).item() / X_test.size(0))
            

# Report per-block accuracies of the currently fitted `model` on the global train/test tensors.
print_all_accuracies(X_train, X_test)

Block 0: Train acc: 0.9140666723251343, Test acc: 0.9178999662399292
Block 1: Train acc: 0.9391999840736389, Test acc: 0.9405999779701233
Block 2: Train acc: 0.949066698551178, Test acc: 0.9469999670982361
Block 3: Train acc: 0.9535666704177856, Test acc: 0.9508000016212463
Block 4: Train acc: 0.95660001039505, Test acc: 0.9531999826431274
Block 5: Train acc: 0.9593999981880188, Test acc: 0.9551999568939209
Block 6: Train acc: 0.9609000086784363, Test acc: 0.9566999673843384
Block 7: Train acc: 0.9617166519165039, Test acc: 0.9573999643325806
Block 8: Train acc: 0.9622666835784912, Test acc: 0.9566999673843384
Block 9: Train acc: 0.9624500274658203, Test acc: 0.9565999507904053
Block 10: Train acc: 0.9620500206947327, Test acc: 0.9563999772071838
Block 11: Train acc: 0.9622333645820618, Test acc: 0.9565999507904053
Block 12: Train acc: 0.9622666835784912, Test acc: 0.9563999772071838
Block 13: Train acc: 0.9622833728790283, Test acc: 0.9565999507904053
Block 14: Train acc: 0.9623667001

In [None]:
# idea: normalize the gradient before fitting the next layer. This is to find the optimal direction. Then do line search

# rand feat boost

In [None]:
# Time to experiment with RandFeatureBoost

# NOTE(review): `RandFeatBoost` is not imported or defined in the visible
# notebook; make sure it is in scope before running this cell.
generator = torch.Generator(device=device).manual_seed(999)
D = X_train.shape[1]
hidden_size = 128
num_epochs = 30
batch_size = 512
n_blocks = 5
adam_lr = 1e-2
boost_lr = 0.9

model = RandFeatBoost(
    generator=generator,
    in_dim=D,
    hidden_size=hidden_size,
    out_dim=10,
    n_blocks=n_blocks,
    activation=nn.Tanh(),
    loss_fn=nn.CrossEntropyLoss(),
    adam_lr=adam_lr,
    boost_lr=boost_lr,
    epochs=num_epochs,
    batch_size=batch_size,
    upscale_type="SWIM",
).to(device)

# fit() returns a pair; only its first element (training outputs) is used here.
pred_train, _ = model.fit(X_train, y_train)
pred_train = pred_train.argmax(dim=1)
pred_test = model(X_test).argmax(dim=1)

# Accuracy: share of predicted class indices that equal the integer labels.
acc_train = pred_train.eq(y_train_cat).float().mean().item()
acc_test = pred_test.eq(y_test_cat).float().mean().item()

 17%|█▋        | 5/30 [00:03<00:19,  1.27it/s]


KeyboardInterrupt: 

In [None]:
# Show the current `acc_train` / `acc_test` values.
# NOTE(review): the recorded output of the training cell above is a KeyboardInterrupt,
# so these names may still hold values from an earlier cell — confirm before trusting them.
print(acc_train)
print(acc_test)

In [None]:
# Display the `deltas` attribute of the current `model` as the cell output.
model.deltas

In [None]:
def print_all_accuracies(X_train, X_test):
    """Print train/test accuracy of the global ``model`` after every residual block.

    Applies ``model.upscale`` to both splits, then for each block replays the
    update ``X + boost_lr * DELTA * (layer(X) - X)`` and scores the classifier
    stored for that block against the global ``y_train_cat`` / ``y_test_cat``
    labels.
    """
    # Evaluation only: disable autograd, as the other per-layer evaluation
    # helpers in this notebook do, so no graph is kept for the full data set.
    with torch.no_grad():
        X_train = model.upscale(X_train)
        X_test  = model.upscale(X_test)

        for t, (layer, DELTA, classifier) in enumerate(zip(model.layers, model.deltas, model.classifiers)):
            X_train = X_train + model.boost_lr * DELTA * (layer(X_train) - X_train)
            X_test = X_test + model.boost_lr * DELTA * (layer(X_test) - X_test)

            pred_train = torch.argmax(classifier(X_train), dim=1)
            pred_test = torch.argmax(classifier(X_test), dim=1)
            acc_train = (pred_train == y_train_cat).float().mean().item()
            acc_test = (pred_test == y_test_cat).float().mean().item()

            print(f"Block {t}: Train acc: {acc_train}, Test acc: {acc_test}")


# Report per-block accuracies of the currently fitted `model` on the global train/test tensors.
print_all_accuracies(X_train, X_test)

In [None]:
# Test SWIM-ID vs DENSE-ID vs SWIM-DENSE 
# implement 'finding gradient direction' gradient boosting

# Test whether this is actually better than non-boost with same hidden size !!!!!!!!!!!!!!!!!!!!!

In [None]:
# experiment with DENSE

# NOTE(review): `RandFeatBoost` is not imported or defined in the visible
# notebook; make sure it is in scope before running this cell.
generator = torch.Generator(device=device).manual_seed(999)
D = X_train.shape[1]
hidden_size = 800
num_epochs = 50
batch_size = 128
n_blocks = 10
adam_lr = 1e-2
boost_lr = 0.9

model = RandFeatBoost(
    generator=generator,
    in_dim=D,
    hidden_size=hidden_size,
    out_dim=10,
    n_blocks=n_blocks,
    activation=nn.Tanh(),
    loss_fn=nn.CrossEntropyLoss(),
    adam_lr=adam_lr,
    boost_lr=boost_lr,
    epochs=num_epochs,
    batch_size=batch_size,
    upscale_type="dense",
    second_in_resblock="identity",
).to(device)

# fit() returns a pair; only its first element (training outputs) is used here.
pred_train, _ = model.fit(X_train, y_train)
pred_train = pred_train.argmax(dim=1)
pred_test = model(X_test).argmax(dim=1)

# Accuracy: share of predicted class indices that equal the integer labels.
acc_train = pred_train.eq(y_train_cat).float().mean().item()
acc_test = pred_test.eq(y_test_cat).float().mean().item()

In [None]:
def print_all_accuracies(X_train, X_test):
    """Print train/test accuracy of the global ``model`` after every residual block.

    Same procedure as the identically named helper defined in an earlier
    cell: upscale both splits, replay each block update
    ``X + boost_lr * DELTA * (layer(X) - X)``, and score the classifier stored
    for that block against the global ``y_train_cat`` / ``y_test_cat`` labels.
    """
    # Evaluation only: disable autograd so no graph is kept for the full data set.
    with torch.no_grad():
        X_train = model.upscale(X_train)
        X_test  = model.upscale(X_test)

        for t, (layer, DELTA, classifier) in enumerate(zip(model.layers, model.deltas, model.classifiers)):
            X_train = X_train + model.boost_lr * DELTA * (layer(X_train) - X_train)
            X_test = X_test + model.boost_lr * DELTA * (layer(X_test) - X_test)

            pred_train = torch.argmax(classifier(X_train), dim=1)
            pred_test = torch.argmax(classifier(X_test), dim=1)
            acc_train = (pred_train == y_train_cat).float().mean().item()
            acc_test = (pred_test == y_test_cat).float().mean().item()

            print(f"Block {t}: Train acc: {acc_train}, Test acc: {acc_test}")

# Report per-block accuracies of the currently fitted `model` on the global train/test tensors.
print_all_accuracies(X_train, X_test)

In [None]:
# NEXT TIME: TODO TODO TODO TODO TODO

# do gradient boosting for BINARY CLASSIFICATION
 
# do f(x_t, x_0) and not just f(x_t)

# xgboost model

# optuna (with xgboost to start with?)