In [1]:
from typing import Tuple, List, Union, Any, Optional, Dict, Literal, Callable
import time
import collections
import os
import sys
sys.path.append(os.path.dirname(os.getcwd()))
sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))

from tqdm import tqdm
import numpy as np
import torch
import torch.nn as nn
from torch import Tensor, tensor
import pandas as pd
import openml

#from aeon.regression.sklearn import RotationForestRegressor
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split


np.set_printoptions(precision=3, threshold=5) # Print options
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA instead of failing at the first `.to(device)` call.
device = "cuda" if torch.cuda.is_available() else "cpu"

# MNIST

In [2]:
from torchvision import datasets, transforms


def normalize_mean_std_traindata(X_train: Tensor, X_test: Tensor, eps: float = 1e-6) -> Tuple[Tensor, Tensor]:
    """Standardise both splits using per-feature statistics of the training split.

    The mean and standard deviation are computed over dim 0 of ``X_train``
    only and then applied to ``X_train`` and ``X_test`` alike. The
    standardised values are finally clipped to the interval [-5, 5].

    Args:
        X_train: Training data; statistics are taken along dim 0.
        X_test: Test data, transformed with the training statistics.
        eps: Features whose training std is not greater than ``eps``
            (including a NaN std, which ``Tensor.std`` yields for a single
            row) are treated as constant and scaled by 1 instead.

    Returns:
        The tuple ``(X_train, X_test)`` of standardised, clipped tensors.
    """
    mean = X_train.mean(dim=0)
    std = X_train.std(dim=0)
    # A constant feature has std == 0; dividing by it produces NaN (0/0) or
    # inf (x/0), and torch.clip does not remove NaN. Use unit scale there so
    # such features map to (x - mean) instead.
    std = torch.where(std > eps, std, torch.ones_like(std))

    X_train = torch.clip((X_train - mean) / std, -5, 5)
    X_test = torch.clip((X_test - mean) / std, -5, 5)
    return X_train, X_test


# Transform applied to every image while loading: convert to a tensor, then
# shift/scale with the constants 0.1307 / 0.3081
# (presumably the global MNIST pixel mean and std — TODO confirm).
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

# Dataset root. Overridable through the MNIST_PATH environment variable so the
# notebook is not tied to one machine; the original location stays the default.
mnist_path = os.environ.get("MNIST_PATH", "/home/nikita/hdd/MNIST")

# Download and load the training data (one batch holding the whole split)
trainset = datasets.MNIST(mnist_path, download=True, train=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=len(trainset), shuffle=False)

# Download and load the test data (one batch holding the whole split)
testset = datasets.MNIST(mnist_path, download=True, train=False, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=len(testset), shuffle=False)

# Pull the single full-size batch from each loader and flatten every image
# into one row of features
X_train, y_train_cat = next(iter(trainloader))
X_train = X_train.view(len(trainset), -1).to(device)
X_test, y_test_cat = next(iter(testloader))
X_test = X_test.view(len(testset), -1).to(device)

# Convert train and test labels to one-hot encoding; the integer labels are
# kept as well (``*_cat``) for the accuracy computations further down
y_train = nn.functional.one_hot(y_train_cat, num_classes=10).float().to(device)
y_test = nn.functional.one_hot(y_test_cat, num_classes=10).float().to(device)
y_train_cat = y_train_cat.to(device)
y_test_cat = y_test_cat.to(device)

# Normalize by per-feature mean and std of the training split
X_train, X_test = normalize_mean_std_traindata(X_train, X_test)
print(f"Train data shape: {X_train.shape}")
print(f"Train labels shape: {y_train.shape}")
print(f"Test data shape: {X_test.shape}")
print(f"Test labels shape: {y_test.shape}")

Train data shape: torch.Size([60000, 784])
Train labels shape: torch.Size([60000, 10])
Test data shape: torch.Size([10000, 784])
Test labels shape: torch.Size([10000, 10])


# Logistic Regression

In [3]:
from models.base import LogisticRegression

# Logistic-regression baseline on the flattened, standardised pixels.
# It is fitted on the integer class labels (y_train_cat), not the one-hot ones.
model = LogisticRegression(n_classes=10, l2_lambda=0.001, max_iter=300)

# NOTE(review): fit_transform presumably fits the model and returns its
# outputs on the training data — confirm in models.base.
X_train_pred = model.fit_transform(X_train, y_train_cat)
X_test_pred = model(X_test)

print("X_test_pred", X_test_pred)

# Accuracy: fraction of rows whose arg-max column equals the integer label.
train_accuracy = X_train_pred.argmax(dim=1).eq(y_train_cat).float().mean().item()
test_accuracy = X_test_pred.argmax(dim=1).eq(y_test_cat).float().mean().item()

print(f"Train accuracy: {train_accuracy}")
print(f"Test accuracy: {test_accuracy}")

X_test_pred tensor([[ -0.2628, -10.2585,   0.3933,  ...,  11.1302,   0.0346,   3.3285],
        [  5.8857,   1.3818,  13.1877,  ..., -18.7712,   4.5571, -11.9242],
        [ -5.7085,   6.3627,   1.9223,  ...,   0.8333,   0.3595,  -1.5743],
        ...,
        [ -7.5470,  -7.3161,  -2.6649,  ...,   2.3265,   4.0623,   4.8242],
        [ -2.7713,  -1.8789,  -3.1412,  ...,  -4.0734,   6.4926,  -3.2449],
        [  2.6914, -10.5134,   4.8309,  ...,  -7.0523,  -0.4899,  -4.2300]],
       device='cuda:0', grad_fn=<AddmmBackward0>)
Train accuracy: 0.9334999918937683
Test accuracy: 0.9266999959945679


# GradientRFRBoost

In [4]:
from models.random_feature_representation_boosting import GradientRFRBoostClassifier

# Gradient RFRBoost classifier, trained on the one-hot labels (y_train).
# NOTE(review): the meaning of the hyper-parameters below is only implied by
# their names — confirm against models.random_feature_representation_boosting.
model = GradientRFRBoostClassifier(
    in_dim = X_train.shape[1],  # taken from the data (784 for flattened MNIST), as in the End2End cell
    hidden_dim = 128,
    n_classes = 10,
    randfeat_xt_dim = 256,
    randfeat_x0_dim = 256,
    n_layers = 5,
    l2_cls = 0.000001,
    l2_ghat = 0.000001,
    feature_type="SWIM",
    upscale_type = "SWIM",
    lbfgs_max_iter = 300,
    boost_lr = 1.0,
    )
X_train_pred = model.fit_transform(X_train, y_train)
X_test_pred = model(X_test)

print("X_test_pred", X_test_pred)

# Accuracy: fraction of rows whose arg-max column equals the integer label.
train_accuracy = (torch.argmax(X_train_pred, dim=1) == y_train_cat).float().mean().item()
test_accuracy = (torch.argmax(X_test_pred, dim=1) == y_test_cat).float().mean().item()

print(f"Train accuracy: {train_accuracy}")
print(f"Test accuracy: {test_accuracy}")

#TODO NEXT: add xtx0 to the classification case

linesearch loss tensor(0.2828, device='cuda:0', grad_fn=<NllLossBackward0>)
linesearch loss tensor(1.0485, device='cuda:0', grad_fn=<NllLossBackward0>)
linesearch loss tensor(0.2482, device='cuda:0', grad_fn=<NllLossBackward0>)
linesearch loss tensor(0.1925, device='cuda:0', grad_fn=<NllLossBackward0>)
linesearch loss tensor(0.1737, device='cuda:0', grad_fn=<NllLossBackward0>)
linesearch loss tensor(0.1721, device='cuda:0', grad_fn=<NllLossBackward0>)
linesearch loss tensor(0.1720, device='cuda:0', grad_fn=<NllLossBackward0>)
linesearch loss tensor(0.1720, device='cuda:0', grad_fn=<NllLossBackward0>)
linesearch loss tensor(0.1496, device='cuda:0', grad_fn=<NllLossBackward0>)
linesearch loss tensor(0.1112, device='cuda:0', grad_fn=<NllLossBackward0>)
linesearch loss tensor(0.1101, device='cuda:0', grad_fn=<NllLossBackward0>)
linesearch loss tensor(0.1101, device='cuda:0', grad_fn=<NllLossBackward0>)
linesearch loss tensor(0.1101, device='cuda:0', grad_fn=<NllLossBackward0>)
linesearch l

In [8]:
def _print_layer_metrics(layer_idx, pred_train, y_train, pred_test, y_test, loss_fn):
    """Print train/test loss and accuracy for the predictions at `layer_idx`."""
    ce = loss_fn(pred_train, y_train)
    ce_test = loss_fn(pred_test, y_test)
    # Both predictions and targets are reduced with argmax over dim 1, so the
    # targets must be 2-D class scores (e.g. one-hot), not integer labels.
    acc = (pred_train.argmax(1) == y_train.argmax(1)).float().mean()
    acc_test = (pred_test.argmax(1) == y_test.argmax(1)).float().mean()
    print(f"Train ce at layer {layer_idx}: {ce}")
    print(f"Test ce at layer {layer_idx}: {ce_test}")
    print(f"Train acc at layer {layer_idx}: {acc}")
    print(f"Test acc at layer {layer_idx}: {acc_test}")
    print()


def see_results_for_every_layer(X_train, y_train, X_test, y_test, model, loss_fn):
    """Print loss and accuracy of `model` after the upscale step and after every boosting layer.

    The representation is rebuilt layer by layer from the model's parts:
    ``model.upscale`` gives the layer-0 representation, then each step adds
    ``model.boost_lr * ghat_layer(feat_layer(X, X0))`` to it. After each stage
    the matching entry of ``model.top_level_modules`` produces predictions,
    which are scored on both splits. Everything runs under ``torch.no_grad()``.

    Args:
        X_train: Training inputs; also passed unchanged to every feature layer as X0.
        y_train: Training targets, 2-D with classes along dim 1 (argmax is taken over dim 1).
        X_test: Test inputs, handled exactly like ``X_train``.
        y_test: Test targets, same layout as ``y_train``.
        model: Object exposing ``upscale``, ``top_level_modules``,
            ``random_feature_layers``, ``ghat_boosting_layers`` and ``boost_lr``.
        loss_fn: Callable ``loss_fn(predictions, targets)`` whose result is printed as "ce".

    Returns:
        None. Results are printed only.
    """
    with torch.no_grad():
        X0_train = X_train
        X0_test = X_test

        X_train = model.upscale(X0_train)
        X_test = model.upscale(X0_test)

        pred_train = model.top_level_modules[0](X_train)
        pred_test = model.top_level_modules[0](X_test)
        _print_layer_metrics(0, pred_train, y_train, pred_test, y_test, loss_fn)

        boosting_steps = zip(model.random_feature_layers,
                             model.ghat_boosting_layers,
                             model.top_level_modules[1:])
        for t, (feat_layer, ghat_layer, classifier) in enumerate(boosting_steps, start=1):
            features_train = feat_layer(X_train, X0_train)
            features_test = feat_layer(X_test, X0_test)
            # Out-of-place update: an in-place `+=` would overwrite the caller's
            # tensors whenever model.upscale returns its input (and would also
            # corrupt X0_*, which aliases that same input).
            X_train = X_train + model.boost_lr * ghat_layer(features_train)
            X_test = X_test + model.boost_lr * ghat_layer(features_test)

            pred_train = classifier(X_train)
            pred_test = classifier(X_test)
            _print_layer_metrics(t, pred_train, y_train, pred_test, y_test, loss_fn)


# Layer-by-layer report for the model fitted above, scored against the
# one-hot targets with cross-entropy.
see_results_for_every_layer(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    model=model,
    loss_fn=nn.functional.cross_entropy,
)

Train ce at layer 0: 0.2827766239643097
Test ce at layer 0: 0.28456902503967285
Train acc at layer 0: 0.9189833402633667
Test acc at layer 0: 0.9185000061988831

Train ce at layer 1: 0.14964938163757324
Test ce at layer 1: 0.16896314918994904
Train acc at layer 1: 0.9573500156402588
Test acc at layer 1: 0.9526000022888184

Train ce at layer 2: 0.10423363745212555
Test ce at layer 2: 0.13513188064098358
Train acc at layer 2: 0.9700333476066589
Test acc at layer 2: 0.9598999619483948

Train ce at layer 3: 0.08233209699392319
Test ce at layer 3: 0.11896318942308426
Train acc at layer 3: 0.9764500260353088
Test acc at layer 3: 0.9642999768257141

Train ce at layer 4: 0.0681803748011589
Test ce at layer 4: 0.11446826905012131
Train acc at layer 4: 0.9803500175476074
Test acc at layer 4: 0.965999960899353

Train ce at layer 5: 0.058496713638305664
Test ce at layer 5: 0.10933632403612137
Train acc at layer 5: 0.9836166501045227
Test acc at layer 5: 0.9668999910354614



# End2End

In [None]:
from models.models import End2EndMLPResNet

# End-to-end trained MLP ResNet, fitted on the one-hot labels (y_train).
# NOTE(review): the meaning of the hyper-parameters is only implied by their
# names — confirm against models.models.End2EndMLPResNet.
model = End2EndMLPResNet(
    in_dim=X_train.shape[1],
    hidden_dim=128,
    bottleneck_dim=32,
    out_dim=10,
    n_blocks=4,
    lr=0.01,
    end_lr_factor=0.01,
    n_epochs=20,
    weight_decay=0.001,
    batch_size=512,
)
X_train_pred = model.fit_transform(X_train, y_train)
X_test_pred = model(X_test)

print("X_test_pred", X_test_pred)

# Accuracy: fraction of rows whose arg-max column equals the integer label.
train_accuracy = X_train_pred.argmax(dim=1).eq(y_train_cat).float().mean().item()
test_accuracy = X_test_pred.argmax(dim=1).eq(y_test_cat).float().mean().item()

print(f"Train accuracy: {train_accuracy}")
print(f"Test accuracy: {test_accuracy}")