In [66]:
from typing import Tuple, List, Union, Any, Optional, Dict, Literal, Callable
import time
import collections
import os
import sys
sys.path.append(os.path.dirname(os.getcwd()))
sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))

from tqdm import tqdm
import numpy as np
import torch
import torch.nn as nn
from torch import Tensor, tensor
import pandas as pd
import openml

#from aeon.regression.sklearn import RotationForestRegressor
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

from preprocessing.stream_transforms import normalize_mean_std_traindata, normalize_streams, augment_time, add_basepoint_zero
from utils.utils import print_name, print_shape
from models import ResNet, NeuralEulerODE, RidgeClassifierCVModule, E2EResNet, LogisticRegressionSGD, RandFeatBoost

np.set_printoptions(precision=3, threshold=5) # Print options
device = "cuda" #torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# MNIST

In [3]:
from torchvision import datasets, transforms

# Define a transform to normalize the data
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

# Download and load the training data
mnist_path = "/home/nikita/hdd/MNIST"
trainset = datasets.MNIST(mnist_path, download=True, train=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=len(trainset), shuffle=False)

# Download and load the test data
testset = datasets.MNIST(mnist_path, download=True, train=False, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=len(testset), shuffle=False)

# Flatten the data
X_train, y_train_cat = next(iter(trainloader))
X_train = X_train.view(len(trainset), -1).to(device)
X_test, y_test_cat = next(iter(testloader))
X_test = X_test.view(len(testset), -1).to(device)

# Convert train and test labels to one-hot encoding
y_train = nn.functional.one_hot(y_train_cat, num_classes=10).float().to(device)
y_test = nn.functional.one_hot(y_test_cat, num_classes=10).float().to(device)
y_train_cat = y_train_cat.to(device)
y_test_cat = y_test_cat.to(device)

# Normalize by mean and std
X_train, X_test = normalize_mean_std_traindata(X_train, X_test)
print(f"Train data shape: {X_train.shape}")
print(f"Train labels shape: {y_train.shape}")
print(f"Test data shape: {X_test.shape}")
print(f"Test labels shape: {y_test.shape}")

Train data shape: torch.Size([60000, 784])
Train labels shape: torch.Size([60000, 10])
Test data shape: torch.Size([10000, 784])
Test labels shape: torch.Size([10000, 10])


# Logistic Regression

In [4]:
from models import LogisticRegression

N = 100
D = 50
C = 10
gen = torch.Generator().manual_seed(42)
X = torch.randn(N, D, generator=gen)
y = torch.randint(0, C, size=(N,), generator=gen)
model = LogisticRegression(
        gen,
        in_dim = D,
        out_dim = C,
        l2_reg = 1.0,
        max_iter = 100,
    )
_, _ = model.fit(X, y)

# Gradient Random Feature Boosting

In [46]:
from models import FittableModule, create_layer
from ridge_ALOOCV import fit_ridge_ALOOCV


def line_search_cross_entropy(cls, X, y, G_hat):
    """Solves the line search risk minimizatin problem
    R(W, X + a * g) for mutliclass cross entropy loss"""
    # No onehot encoding
    if y.dim() > 1:
        y_labels = torch.argmax(y, dim=1)
    else:
        y_labels = y

    # Optimize
    with torch.enable_grad():
        alpha = torch.tensor([0.0], requires_grad=True, device=X.device, dtype=X.dtype)
        optimizer = torch.optim.LBFGS([alpha])
        def closure():
            optimizer.zero_grad()
            logits = cls(X + alpha * G_hat)
            loss = nn.functional.cross_entropy(logits, y_labels)
            loss.backward()
            return loss
        optimizer.step(closure)

    return alpha.detach().item()



class GradientRandomFeatureBoostingClassification(FittableModule):
    def __init__(self, 
                 generator: torch.Generator, 
                 hidden_dim: int = 128, # TODO
                 bottleneck_dim: int = 128,
                 out_dim: int = 1,
                 n_layers: int = 5,
                 activation: nn.Module = nn.Tanh(),
                 l2_reg: float = 1,
                 feature_type = "SWIM", # "dense", identity
                 boost_lr: float = 1.0,
                 upscale: Optional[str] = "dense",
                 ):
        super(GradientRandomFeatureBoostingClassification, self).__init__()
        self.generator = generator
        self.hidden_dim = hidden_dim
        self.bottleneck_dim = bottleneck_dim
        self.out_dim = out_dim
        self.n_layers = n_layers
        self.activation = activation
        self.l2_reg = l2_reg
        self.feature_type = feature_type
        self.boost_lr = boost_lr
        self.upscale = upscale

        # save for now. for more memory efficient implementation, we can remove a lot of this
        self.classifiers = []
        self.alphas = []
        self.layers = []
        self.deltas = []


    def fit(self, X: Tensor, y: Tensor):
        with torch.no_grad():

            #optional upscale
            if self.upscale == "dense":
                self.upscale = create_layer(self.generator, self.upscale, X.shape[1], self.hidden_dim, None)
                X, y = self.upscale.fit(X, y)
            elif self.upscale == "SWIM":
                self.upscale = create_layer(self.generator, self.upscale, X.shape[1], self.hidden_dim, self.activation)
                X, y = self.upscale.fit(X, y)

            # Create classifier W_0
            cls = LogisticRegression(
                self.generator,
                in_dim = self.hidden_dim,
                out_dim = self.out_dim,
                l2_reg = self.l2_reg,
                max_iter = 100,
            ).to(X.device)
            cls.fit(X, y)
            self.classifiers.append(cls)

            # Layerwise boosting
            N = X.size(0)
            for t in range(self.n_layers):
                # Step 1: Create random feature layer   
                layer = create_layer(self.generator, self.feature_type, self.hidden_dim, self.bottleneck_dim, self.activation)
                F, y = layer.fit(X, y)

                # Step 2: Obtain activation gradient
                # X shape (N, D) --- ResNet neurons
                # F shape (N, p) --- random features
                # y shape (N, d) --- one-hot target
                # r shape (N, D) --- residual at currect boosting iteration
                # W shape (D, d) --- top level classifier
                # probs shape (N, d) --- predicted probabilities

                probs = nn.functional.softmax(cls(X), dim=1)
                G = (y - probs) @ cls.linear.weight #negative gradient TODO divide by N?

                # fit Least Squares to negative gradient (finding functional direction)
                Delta, Delta_b, _ = fit_ridge_ALOOCV(F, G)

                # Line search for risk minimization of R(W_t, Phi_t + linesearch * G_hat)
                G_hat = F @ Delta + Delta_b
                linesearch = line_search_cross_entropy(cls, X, y, G_hat)
                print("Linesearch", linesearch)

                # Step 3: Learn top level classifier
                X = X + self.boost_lr * linesearch * G_hat
                cls = LogisticRegression(
                    self.generator,
                    in_dim = self.hidden_dim,
                    out_dim = self.out_dim,
                    l2_reg = self.l2_reg,
                    max_iter = 20,
                ).to(X.device)
                cls.fit(
                    X, 
                    y, 
                    init_W_b = (cls.linear.weight.detach().clone(), cls.linear.bias.detach().clone()) #TODO do i want this? or start from scratch?
                )

                #update Delta scale
                Delta = Delta * linesearch
                Delta_b = Delta_b * linesearch

                # store
                self.layers.append(layer)
                self.deltas.append((Delta, Delta_b))
                self.classifiers.append(cls)

        return cls(X), y


    def forward(self, X: Tensor) -> Tensor:
        with torch.no_grad():
            if self.upscale is not None:
                X = self.upscale(X)
            for layer, (Delta, Delta_b) in zip(self.layers, self.deltas):
                X = X + self.boost_lr * (layer(X) @ Delta + Delta_b)
            return self.classifiers[-1](X)
        


N = 100
D = 50
C = 10
bottleneck_dim = 70
gen = torch.Generator().manual_seed(42)
X = torch.randn(N, D, generator=gen)
y = torch.randint(0, C, size=(N,), generator=gen)
y = nn.functional.one_hot(y, num_classes=C).float()
model = GradientRandomFeatureBoostingClassification(
        gen,
        hidden_dim = D,
        bottleneck_dim = bottleneck_dim,
        out_dim = C,
        n_layers = 2,
        upscale = "dense",
        feature_type = "dense",
    )
_, _ = model.fit(X, y)

Linesearch 0.060540515929460526
Linesearch 0.06334822624921799


# experiments

In [47]:
def run_allmodels_1dataset(
        generator: torch.Generator,
        X_train: Tensor,
        y_train: Tensor,
        X_test: Tensor,
        y_test: Tensor,
        ):
    
    D = X_train.shape[1]
    hidden_size = 1024
    bottleneck_dim = 1*hidden_size
    num_epochs = 50
    batch_size = 128

    # (name, model, kwargs). kwargs separate to save memory
    model_list = [
        # ["T=10 RandFeatureBoost", RandFeatBoost,
        #         {"generator": generator,
        #          "in_dim": D,
        #          "hidden_size": hidden_size,
        #          "out_dim": 10,
        #          "n_blocks": 9,
        #          "activation": nn.Tanh(),
        #          "loss_fn": nn.CrossEntropyLoss(),
        #          "adam_lr": 1e-2,
        #          "boost_lr": 1.0,
        #          "epochs": num_epochs,
        #          "batch_size": batch_size,
        #          "upscale_type": "SWIM",  # "dense", "identity"
        #          }],

        # ["Tabular Ridge", RidgeClassifierCVModule, {}],

        # ["Logistic SGD", LogisticRegressionSGD, 
        #         {"generator": generator,
        #          "num_epochs": num_epochs,
        #          "batch_size": batch_size,
        #          }],

        # ["Logistic L-BFSG", LogisticRegression, 
        #         {"generator": generator,
        #          "in_dim": D,
        #          "out_dim": 10,
        #          }],

        # ["T=1 Dense", ResNet,
        #         {"generator": generator,
        #          "in_dim": D,
        #          "hidden_size": hidden_size,
        #          "bottleneck_dim": None,
        #          "n_blocks": 0,
        #          "upsample_layer": "dense",
        #          "output_layer": "logistic regression",
        #          }],

        # ["T=1 SWIM Grad", ResNet,
        #         {"generator": generator,
        #         "in_dim": D,
        #         "hidden_size": hidden_size,
        #         "bottleneck_dim": None,
        #         "n_blocks": 0,
        #         "upsample_layer": "SWIM",
        #         "output_layer": "logistic regression",
        #         }],
        
        # ["T=1 SWIM Unif", ResNet,
        #         {"generator": generator,
        #         "in_dim": D,
        #         "hidden_size": hidden_size,
        #         "bottleneck_dim": None,
        #         "n_blocks": 0,
        #         "upsample_layer": "SWIM",
        #         "sampling_method": "uniform",
        #         "output_layer": "logistic regression",
        #         }],

    ]
    for lr in [1.0]:
        for l2_reg in [1, 0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001]:
            model_list += [
                [f"Logistic L-BFSG, l2={l2_reg} lr={lr}", 
                    LogisticRegression, 
                    {"generator": generator,
                    "in_dim": D,
                    "out_dim": 10,
                    "l2_reg": l2_reg,
                    "lr": lr,
                    }],
            ]
    
    for n_blocks in range(0, 20, 5):
        model_list += [
            [f"T={n_blocks+1} Gradient GRFBoost", 
             GradientRandomFeatureBoostingClassification,
                {"generator": generator,
                "hidden_dim": hidden_size,
                "bottleneck_dim": bottleneck_dim,
                "out_dim": 10,
                "n_layers": n_blocks,
                "activation": nn.Tanh(),
                "l2_reg": 0.00001,
                "feature_type": "SWIM",
                "boost_lr": 0.5,
                "upscale": "SWIM",
                },
                ],
        ]

    # for n_blocks in [4]:
    #     model_list += [
    #     [f"T={n_blocks+1} End2End", E2EResNet,
    #             {"generator": generator,
    #             "in_dim": D,
    #             "hidden_size": hidden_size,
    #             "bottleneck_dim": bottleneck_dim,
    #             "out_dim": 10,
    #             "n_blocks": n_blocks,
    #             "activation": nn.Tanh(),
    #             "loss": nn.CrossEntropyLoss(),
    #             "lr": 1e-2,
    #             "epochs": num_epochs,
    #             "batch_size": batch_size}
    #             ],

    #     [f"T={n_blocks+1} ResSWIM Grad-dense", ResNet,
    #             {"generator": generator,
    #             "in_dim": D,
    #             "hidden_size": hidden_size,
    #             "bottleneck_dim": bottleneck_dim,
    #             "n_blocks": n_blocks,
    #             "upsample_layer": "SWIM",
    #             "res_layer1": "SWIM",
    #             "res_layer2": "dense",
    #             "output_layer": "logistic regression",
    #             }
    #             ],

    #     [f"T={n_blocks+1} ResSWIM Grad-id", ResNet,
    #             {"generator": generator,
    #             "in_dim": D,
    #             "hidden_size": hidden_size,
    #             "bottleneck_dim": hidden_size,
    #             "n_blocks": n_blocks,
    #             "upsample_layer": "SWIM",
    #             "res_layer1": "SWIM",
    #             "res_layer2": "identity",
    #             "output_layer": "logistic regression",
    #             }
    #             ],

    #     [f"T={n_blocks+1} ResDense", ResNet,
    #             {"generator": generator,
    #             "in_dim": D,
    #             "hidden_size": hidden_size,
    #             "bottleneck_dim": hidden_size,
    #             "n_blocks": n_blocks,
    #             "upsample_layer": "dense",
    #             "res_layer1": "dense",
    #             "res_layer2": "identity",
    #             "output_layer": "logistic regression",
    #             }
    #             ],
    # ]
    
    results = []
    model_names = []
    for name, model, model_args in model_list:
        print(name)
        t0 = time.perf_counter()
        model = model(**model_args).to(X_train.device)
        pred_train, _ = model.fit(X_train, y_train)
        t1 = time.perf_counter()
        pred_test = model(X_test)
        t2 = time.perf_counter()
        
        #convert to class predictions:
        if len(pred_train.shape) == 2:
            pred_train = torch.argmax(pred_train, dim=1)
            pred_test = torch.argmax(pred_test, dim=1)
        acc_train = (pred_train == y_train_cat).float().mean().item()
        acc_test = (pred_test == y_test_cat).float().mean().item()

        result = np.array( [acc_train, acc_test, t1-t0, t2-t1] )
        results.append( result )
        model_names.append( name )

    return model_names, results



def run_all_experiments(
        name_save: str = "PLACEHOLDER",
        ):
    # Fetch and process each dataset
    experiments = {}
    generator = torch.Generator(device=device).manual_seed(999)
    results = run_allmodels_1dataset(
        generator, X_train, y_train, X_test, y_test, 
        )
    experiments["MNIST"] = results

    # Save results
    # Assuming experiments is a dict where keys are dataset names and values are tuples (model_names, results)
    attributes = ["acc_train", "acc_test", "t_fit", "t_feat"]
    data_list = []
    # Process the data
    for dataset_name, (model_names, results) in experiments.items():
        dataset_data = {}
        for attr_idx, attribute in enumerate(attributes):
            for model_idx, model_name in enumerate(model_names):
                dataset_data[(attribute, model_name)] = results[model_idx][attr_idx]
        data_list.append(pd.DataFrame(dataset_data, index=[dataset_name]))

    # Combine all datasets into a single DataFrame
    df = pd.concat(data_list)
    df = df.sort_index(axis=1)
    print(df)
    df.to_pickle(f"MNIST_{name_save}.pkl")
    return df

In [19]:
_ = run_all_experiments(name_save="TESTING")

Logistic L-BFSG, l2=1 lr=1.0
Logistic L-BFSG, l2=0.1 lr=1.0
Logistic L-BFSG, l2=0.01 lr=1.0
Logistic L-BFSG, l2=0.001 lr=1.0
Logistic L-BFSG, l2=0.0001 lr=1.0
Logistic L-BFSG, l2=1e-05 lr=1.0
Logistic L-BFSG, l2=1e-06 lr=1.0
T=1 Gradient GRFBoost
T=6 Gradient GRFBoost
Linesearch 0.12934249639511108
Linesearch 0.5913510322570801
Linesearch 0.8685545325279236
Linesearch 1.1419506072998047
Linesearch 1.4398552179336548
T=11 Gradient GRFBoost
Linesearch 0.12393397837877274
Linesearch 0.5805110931396484
Linesearch 0.8636003732681274
Linesearch 1.0384536981582642
Linesearch 1.3703676462173462
Linesearch 1.4415912628173828
Linesearch 1.8288248777389526
Linesearch 2.098085880279541
Linesearch 1.590389370918274
Linesearch 2.4286677837371826
T=16 Gradient GRFBoost
Linesearch 0.1339123398065567
Linesearch 0.5738614201545715
Linesearch 0.8393223881721497
Linesearch 1.082709789276123
Linesearch 1.1137275695800781
Linesearch 1.108267068862915
Linesearch 1.8491078615188599
Linesearch 1.58532702922821

In [54]:
df = pd.read_pickle("MNIST_TESTING.pkl")
df["acc_test"].mean().sort_values(ascending=False)

T=16 Gradient GRFBoost               0.9699
T=11 Gradient GRFBoost               0.9677
T=6 Gradient GRFBoost                0.9622
T=1 Gradient GRFBoost                0.9245
Logistic L-BFSG, l2=1e-05 lr=1.0     0.9243
Logistic L-BFSG, l2=0.0001 lr=1.0    0.9238
Logistic L-BFSG, l2=0.001 lr=1.0     0.9231
Logistic L-BFSG, l2=1e-06 lr=1.0     0.9220
Logistic L-BFSG, l2=0.01 lr=1.0      0.9174
Logistic L-BFSG, l2=0.1 lr=1.0       0.8991
Logistic L-BFSG, l2=1 lr=1.0         0.8480
dtype: float64

In [None]:
# T=5 End2End        0.9717
# T=1 Dense          0.9215
# T=1 SWIM Unif      0.9207
# T=1 SWIM Grad      0.9204
# Logistic SGD       0.8990
# Tabular Ridge      0.8606
# Logistic L-BFSG    0.8480
# dtype: float64

In [55]:
df["acc_train"].mean().sort_values(ascending=False)

T=16 Gradient GRFBoost               0.979667
T=11 Gradient GRFBoost               0.976983
T=6 Gradient GRFBoost                0.968950
Logistic L-BFSG, l2=1e-05 lr=1.0     0.931450
Logistic L-BFSG, l2=1e-06 lr=1.0     0.931200
Logistic L-BFSG, l2=0.0001 lr=1.0    0.931000
Logistic L-BFSG, l2=0.001 lr=1.0     0.930167
T=1 Gradient GRFBoost                0.924967
Logistic L-BFSG, l2=0.01 lr=1.0      0.918317
Logistic L-BFSG, l2=0.1 lr=1.0       0.894617
Logistic L-BFSG, l2=1 lr=1.0         0.838183
dtype: float64

# Experiment layer by layer with Gradient Boosting

In [48]:
# Time to experiment with RandFeatureBoost


generator = torch.Generator(device=device).manual_seed(999)
D = X_train.shape[1]
hidden_size = 512
n_blocks = 20
boost_lr=1.0

model = GradientRandomFeatureBoostingClassification(
    generator=generator,
    hidden_dim=hidden_size,
    out_dim=10,
    n_layers=n_blocks,
    l2_reg=0.0001,
    boost_lr=boost_lr,
    feature_type="SWIM",
    upscale="SWIM",
    ).to(device)

pred_train, _ = model.fit(X_train, y_train)
pred_test = model(X_test)
pred_train = torch.argmax(pred_train, dim=1)
pred_test = torch.argmax(pred_test, dim=1)
acc_train = (pred_train == y_train_cat).float().mean().item()
acc_test = (pred_test == y_test_cat).float().mean().item()

Linesearch 0.33945271372795105
Linesearch 0.6494695544242859
Linesearch 1.0083847045898438
Linesearch 1.5123884677886963
Linesearch 1.8697398900985718
Linesearch 1.9759445190429688
Linesearch 3.2475860118865967
Linesearch 3.6875827312469482
Linesearch 5.643091678619385
Linesearch 0.017211027443408966
Linesearch 0.01666221395134926
Linesearch 0.01779244653880596
Linesearch 0.01595654897391796
Linesearch 7.312465667724609
Linesearch 0.01290623564273119
Linesearch 0.013040311634540558
Linesearch 0.014813247136771679
Linesearch 0.01566963642835617
Linesearch 0.013982666656374931
Linesearch 0.012906176969408989


In [65]:
print(acc_train)
print(acc_test)

0.9619166851043701
0.9574999809265137


In [64]:
def print_all_accuracies(X_train, X_test):
    with torch.no_grad():
        X_train = model.upscale(X_train)
        X_test  = model.upscale(X_test)

        for t, (layer, (Delta, Delta_b), cls) in enumerate(zip(model.layers, model.deltas, model.classifiers[1:])):
            X_train += model.boost_lr * (layer(X_train)@Delta + Delta_b )
            X_test +=  model.boost_lr * (layer(X_test)@Delta + Delta_b )

            #delta norm

            pred_train = cls(X_train)
            pred_test = cls(X_test)
            pred_train = torch.argmax(pred_train, dim=1)
            pred_test = torch.argmax(pred_test, dim=1)
            acc_train = (pred_train == y_train_cat).float().mean().item()
            acc_test = (pred_test == y_test_cat).float().mean().item()

            print(f"Block {t}: Train acc: {acc_train}, Test acc: {acc_test}")
            # print("delta norm", torch.linalg.norm(Delta).item())
            # print("X_train norm", torch.linalg.norm(X_train).item() / X_train.size(0))
            # print("X_test norm", torch.linalg.norm(X_test).item() / X_test.size(0))
            

print_all_accuracies(X_train, X_test)

OutOfMemoryError: CUDA out of memory. Tried to allocate 118.00 MiB. GPU 0 has a total capacity of 7.91 GiB of which 53.94 MiB is free. Including non-PyTorch memory, this process has 7.43 GiB memory in use. Of the allocated memory 7.12 GiB is allocated by PyTorch, and 167.58 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# idea: normalize the gradient before fitting the next layer. This is to find the optimal direction. Then do line search

# rand feat boost

In [41]:
# Time to experiment with RandFeatureBoost

generator = torch.Generator(device=device).manual_seed(999)
D = X_train.shape[1]
hidden_size = 128
num_epochs = 30
batch_size = 512
n_blocks = 5
adam_lr=1e-2
boost_lr=0.9

model = RandFeatBoost(
    generator=generator,
    in_dim=D,
    hidden_size=hidden_size,
    out_dim=10,
    n_blocks=n_blocks,
    activation=nn.Tanh(),
    loss_fn=nn.CrossEntropyLoss(),
    adam_lr=adam_lr,
    boost_lr=boost_lr,
    epochs=num_epochs,
    batch_size=batch_size,
    upscale_type="SWIM",
    ).to(device)

pred_train, _ = model.fit(X_train, y_train)
pred_test = model(X_test)
pred_train = torch.argmax(pred_train, dim=1)
pred_test = torch.argmax(pred_test, dim=1)
acc_train = (pred_train == y_train_cat).float().mean().item()
acc_test = (pred_test == y_test_cat).float().mean().item()

100%|██████████| 30/30 [00:19<00:00,  1.54it/s]
100%|██████████| 30/30 [00:20<00:00,  1.48it/s]
100%|██████████| 30/30 [00:19<00:00,  1.56it/s]
100%|██████████| 30/30 [00:20<00:00,  1.49it/s]
100%|██████████| 30/30 [00:19<00:00,  1.53it/s]


In [42]:
print(acc_train)
print(acc_test)

0.9162166714668274
0.9185000061988831


In [43]:
model.deltas

[Parameter containing:
 tensor([[-3.3867]], device='cuda:0', requires_grad=True),
 Parameter containing:
 tensor([[-0.0875]], device='cuda:0', requires_grad=True),
 Parameter containing:
 tensor([[0.1648]], device='cuda:0', requires_grad=True),
 Parameter containing:
 tensor([[0.0923]], device='cuda:0', requires_grad=True),
 Parameter containing:
 tensor([[-0.1482]], device='cuda:0', requires_grad=True)]

In [44]:
def print_all_accuracies(X_train, X_test):
    X_train = model.upscale(X_train)
    X_test  = model.upscale(X_test)

    for t, (layer, DELTA, classifier) in enumerate(zip(model.layers, model.deltas, model.classifiers)):
        X_train = X_train + model.boost_lr * DELTA * (layer(X_train) - X_train)
        X_test = X_test + model.boost_lr * DELTA * (layer(X_test) - X_test)
        
        pred_train = classifier(X_train)
        pred_test = classifier(X_test)
        pred_train = torch.argmax(pred_train, dim=1)
        pred_test = torch.argmax(pred_test, dim=1)
        acc_train = (pred_train == y_train_cat).float().mean().item()
        acc_test = (pred_test == y_test_cat).float().mean().item()

        print(f"Block {t}: Train acc: {acc_train}, Test acc: {acc_test}")


print_all_accuracies(X_train, X_test)

Block 0: Train acc: 0.9017000198364258, Test acc: 0.9041999578475952
Block 1: Train acc: 0.9082333445549011, Test acc: 0.9110999703407288
Block 2: Train acc: 0.9126999974250793, Test acc: 0.9138000011444092
Block 3: Train acc: 0.912766695022583, Test acc: 0.9143999814987183
Block 4: Train acc: 0.9162166714668274, Test acc: 0.9185000061988831


In [None]:
# Test SWIM-ID vs DENSE-ID vs SWIM-DENSE 
# implement 'finding gradient direction' gradient boosting

# Test whether this is actually better than non-boost with same hidden size !!!!!!!!!!!!!!!!!!!!!

In [None]:
# experiment with DENSE

generator = torch.Generator(device=device).manual_seed(999)
D = X_train.shape[1]
hidden_size = 800
num_epochs = 50
batch_size = 128
n_blocks = 10
adam_lr=1e-2
boost_lr=0.9

model = RandFeatBoost(
    generator=generator,
    in_dim=D,
    hidden_size=hidden_size,
    out_dim=10,
    n_blocks=n_blocks,
    activation=nn.Tanh(),
    loss_fn=nn.CrossEntropyLoss(),
    adam_lr=adam_lr,
    boost_lr=boost_lr,
    epochs=num_epochs,
    batch_size=batch_size,
    upscale_type="dense",
    second_in_resblock="identity",
    ).to(device)

pred_train, _ = model.fit(X_train, y_train)
pred_test = model(X_test)
pred_train = torch.argmax(pred_train, dim=1)
pred_test = torch.argmax(pred_test, dim=1)
acc_train = (pred_train == y_train_cat).float().mean().item()
acc_test = (pred_test == y_test_cat).float().mean().item()

In [None]:
def print_all_accuracies(X_train, X_test):
    X_train = model.upscale(X_train)
    X_test  = model.upscale(X_test)

    for t, (layer, DELTA, classifier) in enumerate(zip(model.layers, model.deltas, model.classifiers)):
        X_train = X_train + model.boost_lr * DELTA * (layer(X_train) - X_train)
        X_test = X_test + model.boost_lr * DELTA * (layer(X_test) - X_test)
        
        pred_train = classifier(X_train)
        pred_test = classifier(X_test)
        pred_train = torch.argmax(pred_train, dim=1)
        pred_test = torch.argmax(pred_test, dim=1)
        acc_train = (pred_train == y_train_cat).float().mean().item()
        acc_test = (pred_test == y_test_cat).float().mean().item()

        print(f"Block {t}: Train acc: {acc_train}, Test acc: {acc_test}")

print_all_accuracies(X_train, X_test)

In [None]:
# NEXT TIME: TODO TODO TODO TODO TODO

# do gradient boosting for classification

# xgboost model

# optuna (with xgboost to start with?)