In [1]:
from typing import Tuple, List, Union, Any, Optional, Dict, Literal, Callable
import time
import collections
import os
import sys
sys.path.append(os.path.dirname(os.getcwd()))
sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))

from tqdm import tqdm
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor, tensor
import pandas as pd
import openml

#from aeon.regression.sklearn import RotationForestRegressor
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

from preprocessing.stream_transforms import normalize_mean_std_traindata, normalize_streams, augment_time, add_basepoint_zero
from utils.utils import print_name, print_shape
from models import ResNet, NeuralEulerODE, RidgeClassifierCVModule, E2EResNet, LogisticRegressionModule, RandFeatBoost

# Compact numpy printing: 3 decimals, and summarise arrays with more than 5 elements.
np.set_printoptions(precision=3, threshold=5)
# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# MNIST

In [2]:
from torchvision import datasets, transforms

# Convert images to tensors and normalize them with mean 0.1307 and std 0.3081
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

# Dataset root directory. Set the MNIST_PATH environment variable to override
# the default location without editing the notebook.
mnist_path = os.environ.get("MNIST_PATH", "/home/nikita/hdd/MNIST")

# Download (if needed) and load the training data as one single batch
trainset = datasets.MNIST(mnist_path, download=True, train=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=len(trainset), shuffle=False)

# Download (if needed) and load the test data as one single batch
testset = datasets.MNIST(mnist_path, download=True, train=False, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=len(testset), shuffle=False)

# Pull the full datasets out of the loaders and flatten each image to a vector
X_train, y_train_cat = next(iter(trainloader))
X_train = X_train.view(len(trainset), -1).to(device)
X_test, y_test_cat = next(iter(testloader))
X_test = X_test.view(len(testset), -1).to(device)

# One-hot float targets (10 classes); the integer labels are kept in *_cat
y_train = F.one_hot(y_train_cat, num_classes=10).float().to(device)
y_test = F.one_hot(y_test_cat, num_classes=10).float().to(device)
y_train_cat = y_train_cat.to(device)
y_test_cat = y_test_cat.to(device)

# Normalize by mean and std (project helper; takes the train and test inputs)
X_train, X_test = normalize_mean_std_traindata(X_train, X_test)
print(f"Train data shape: {X_train.shape}")
print(f"Train labels shape: {y_train.shape}")
print(f"Test data shape: {X_test.shape}")
print(f"Test labels shape: {y_test.shape}")

Train data shape: torch.Size([60000, 784])
Train labels shape: torch.Size([60000, 10])
Test data shape: torch.Size([10000, 784])
Test labels shape: torch.Size([10000, 10])


# Experiments

In [None]:
def run_allmodels_1dataset(
        generator: torch.Generator,
        X_train: Tensor,
        y_train: Tensor,
        X_test: Tensor,
        y_test: Tensor,
        ):
    """Fit every benchmark model on one dataset and record accuracy and timings.

    Each model is built from its kwargs, moved to ``X_train.device``, fitted
    with ``model.fit(X_train, y_train)`` and then applied to ``X_test``.

    Args:
        generator: Random generator handed to every model that takes a
            ``"generator"`` keyword argument.
        X_train: Training inputs; ``X_train.shape[1]`` is used as the model
            input dimension.
        y_train: Training targets passed to ``model.fit``. Either 2-D
            (one-hot / per-class scores, reduced with argmax over dim 1 for
            scoring) or 1-D class indices.
        X_test: Test inputs.
        y_test: Test targets, same layout as ``y_train``.

    Returns:
        ``(model_names, results)`` where ``results[i]`` is a numpy array
        ``[acc_train, acc_test, t_fit, t_predict]`` for ``model_names[i]``.
        ``t_fit`` spans model construction plus fitting; ``t_predict`` spans
        the forward pass on ``X_test``. Both are in seconds.
    """
    D = X_train.shape[1]
    hidden_size = 800
    bottleneck_dim = 1*hidden_size
    num_epochs = 29
    batch_size = 64

    # Score against labels derived from the arguments instead of the
    # notebook-level globals, so the function is correct for any dataset.
    labels_train = y_train.argmax(dim=1) if y_train.dim() == 2 else y_train
    labels_test = y_test.argmax(dim=1) if y_test.dim() == 2 else y_test

    def _sync() -> None:
        # CUDA kernels run asynchronously; wait for them to finish so the
        # wall-clock readings cover the actual computation.
        if X_train.is_cuda:
            torch.cuda.synchronize(X_train.device)

    # (name, model, kwargs). kwargs separate to save memory
    model_list = [
        # ["T=10 RandFeatureBoost", RandFeatBoost,
        #         {"generator": generator,
        #          "in_dim": D,
        #          "hidden_size": hidden_size,
        #          "out_dim": 10,
        #          "n_blocks": 9,
        #          "activation": nn.Tanh(),
        #          "loss_fn": nn.CrossEntropyLoss(),
        #          "adam_lr": 1e-1,
        #          "boost_lr": 0.5,
        #          "epochs": num_epochs,
        #          "batch_size": batch_size,
        #          "upscale_type": "SWIM",  # "dense", "identity"
        #          }],

        ["Tabular Ridge", RidgeClassifierCVModule, {}],

        ["Logistic Regression", LogisticRegressionModule,
                {"generator": generator,
                 "num_epochs": num_epochs,
                 "batch_size": batch_size,
                 }],

        ["T=1 Dense", ResNet,
                {"generator": generator,
                 "in_dim": D,
                 "hidden_size": hidden_size,
                 "bottleneck_dim": None,
                 "n_blocks": 0,
                 "upsample_layer": "dense",
                 "output_layer": "logistic regression",
                 }],

        ["T=1 SWIM Grad", ResNet,
                {"generator": generator,
                 "in_dim": D,
                 "hidden_size": hidden_size,
                 "bottleneck_dim": None,
                 "n_blocks": 0,
                 "upsample_layer": "SWIM",
                 "output_layer": "logistic regression",
                 }],

        ["T=1 SWIM Unif", ResNet,
                {"generator": generator,
                 "in_dim": D,
                 "hidden_size": hidden_size,
                 "bottleneck_dim": None,
                 "n_blocks": 0,
                 "upsample_layer": "SWIM",
                 "sampling_method": "uniform",
                 "output_layer": "logistic regression",
                 }],

    ]

    # Deeper variants: one entry of each kind per residual depth
    for n_blocks in [2]:
        model_list += [
            [f"T={n_blocks+1} End2End", E2EResNet,
                    {"generator": generator,
                     "in_dim": D,
                     "hidden_size": hidden_size,
                     "bottleneck_dim": bottleneck_dim,
                     "out_dim": 10,
                     "n_blocks": n_blocks,
                     "activation": nn.Tanh(),
                     "loss": nn.CrossEntropyLoss(),
                     "lr": 1e-3,
                     "epochs": num_epochs,
                     "batch_size": batch_size,
                     }],

            [f"T={n_blocks+1} ResSWIM Grad-dense", ResNet,
                    {"generator": generator,
                     "in_dim": D,
                     "hidden_size": hidden_size,
                     "bottleneck_dim": bottleneck_dim,
                     "n_blocks": n_blocks,
                     "upsample_layer": "SWIM",
                     "res_layer1": "SWIM",
                     "res_layer2": "dense",
                     "output_layer": "logistic regression",
                     }],

            [f"T={n_blocks+1} ResSWIM Grad-id", ResNet,
                    {"generator": generator,
                     "in_dim": D,
                     "hidden_size": hidden_size,
                     "bottleneck_dim": hidden_size,
                     "n_blocks": n_blocks,
                     "upsample_layer": "SWIM",
                     "res_layer1": "SWIM",
                     "res_layer2": "identity",
                     "output_layer": "logistic regression",
                     }],

            [f"T={n_blocks+1} ResDense", ResNet,
                    {"generator": generator,
                     "in_dim": D,
                     "hidden_size": hidden_size,
                     "bottleneck_dim": hidden_size,
                     "n_blocks": n_blocks,
                     "upsample_layer": "dense",
                     "res_layer1": "dense",
                     "res_layer2": "identity",
                     "output_layer": "logistic regression",
                     }],
        ]

    results = []
    model_names = []
    for name, model_cls, model_args in model_list:
        _sync()
        t0 = time.perf_counter()
        model = model_cls(**model_args).to(X_train.device)
        pred_train, _ = model.fit(X_train, y_train)
        _sync()
        t1 = time.perf_counter()
        pred_test = model(X_test)
        _sync()
        t2 = time.perf_counter()

        # Convert per-class scores to class predictions
        if len(pred_train.shape) == 2:
            pred_train = torch.argmax(pred_train, dim=1)
            pred_test = torch.argmax(pred_test, dim=1)
        acc_train = (pred_train == labels_train).float().mean().item()
        acc_test = (pred_test == labels_test).float().mean().item()

        results.append(np.array([acc_train, acc_test, t1-t0, t2-t1]))
        model_names.append(name)

    return model_names, results



def run_all_experiments(
        name_save: str = "PLACEHOLDER",
        ):
    """Run the MNIST benchmark, tabulate the results and pickle the table.

    Uses the notebook-level ``X_train``, ``y_train``, ``X_test``, ``y_test``
    and ``device``, with a generator seeded to 999.

    Args:
        name_save: Suffix of the output file, written as
            ``MNIST_{name_save}.pkl`` in the current directory.

    Returns:
        DataFrame with one row per dataset and columns keyed by
        ``(attribute, model_name)``, sorted along the column axis.
    """
    seeded_generator = torch.Generator(device=device).manual_seed(999)

    # dataset name -> (model_names, results)
    experiments = {
        "MNIST": run_allmodels_1dataset(
            seeded_generator, X_train, y_train, X_test, y_test,
        ),
    }

    # Position of each attribute inside a per-model result array
    attributes = ["acc_train", "acc_test", "t_fit", "t_feat"]

    per_dataset_frames = []
    for dataset_name, (model_names, results) in experiments.items():
        row = {
            (attribute, model_name): results[model_idx][attr_idx]
            for attr_idx, attribute in enumerate(attributes)
            for model_idx, model_name in enumerate(model_names)
        }
        per_dataset_frames.append(pd.DataFrame(row, index=[dataset_name]))

    # Stack all datasets into one table and order the columns
    df = pd.concat(per_dataset_frames).sort_index(axis=1)
    print(df)
    df.to_pickle(f"MNIST_{name_save}.pkl")
    return df

In [None]:
# Run the benchmark with the default name_save, which prints the results table
# and writes it to MNIST_PLACEHOLDER.pkl.
run_all_experiments()

In [None]:
# Reload the table written by run_all_experiments() under its default name.
df = pd.read_pickle("MNIST_PLACEHOLDER.pkl")
# Test accuracy per model, averaged over the rows (datasets), best first.
df["acc_test"].mean().sort_values(ascending=False)

In [None]:
# Train accuracy per model, averaged over the rows (datasets), best first.
# Relies on `df` loaded from the pickle in the previous cell.
df["acc_train"].mean().sort_values(ascending=False)

In [4]:
# Stand-alone RandFeatBoost run on MNIST.
# `model`, `acc_train` and `acc_test` are read by the cells that follow.

# Hyperparameters
D = X_train.shape[1]
hidden_size = 800
n_blocks = 10
num_epochs = 50
batch_size = 128
adam_lr = 1e-2
boost_lr = 0.9

# Seeded generator so the run is repeatable
generator = torch.Generator(device=device).manual_seed(999)

model = RandFeatBoost(
    generator=generator,
    in_dim=D,
    hidden_size=hidden_size,
    out_dim=10,
    n_blocks=n_blocks,
    activation=nn.Tanh(),
    loss_fn=nn.CrossEntropyLoss(),
    adam_lr=adam_lr,
    boost_lr=boost_lr,
    epochs=num_epochs,
    batch_size=batch_size,
    upscale_type="SWIM",
)
model = model.to(device)

# Fit on the training set, then predict on the held-out test set
pred_train, _ = model.fit(X_train, y_train)
pred_test = model(X_test)

# Reduce per-class scores to class indices and score against the integer labels
pred_train = pred_train.argmax(dim=1)
pred_test = pred_test.argmax(dim=1)
acc_train = pred_train.eq(y_train_cat).float().mean().item()
acc_test = pred_test.eq(y_test_cat).float().mean().item()

100%|██████████| 50/50 [01:01<00:00,  1.22s/it]
100%|██████████| 50/50 [01:00<00:00,  1.20s/it]
100%|██████████| 50/50 [01:00<00:00,  1.20s/it]
100%|██████████| 50/50 [01:00<00:00,  1.20s/it]
100%|██████████| 50/50 [01:00<00:00,  1.20s/it]
100%|██████████| 50/50 [00:53<00:00,  1.06s/it]
100%|██████████| 50/50 [00:50<00:00,  1.00s/it]
100%|██████████| 50/50 [00:50<00:00,  1.01s/it]
100%|██████████| 50/50 [00:50<00:00,  1.01s/it]
100%|██████████| 50/50 [00:50<00:00,  1.02s/it]


In [5]:
print(acc_train)
print(acc_test)

0.9501833319664001
0.9465999603271484


In [6]:
# Inspect the `deltas` attribute of the fitted RandFeatBoost model
# (the output below shows a list of Parameters).
model.deltas

[Parameter containing:
 tensor([[-2.4999e-01,  1.5847e-01, -9.6917e-02,  5.1325e-01,  2.3316e+00,
          -9.8329e-02, -5.6916e-01, -7.2943e-01,  2.7951e-02, -2.4800e-01,
          -2.0586e-02,  2.0850e-01, -1.8774e-01, -5.2455e-01,  9.8108e-01,
           2.4364e-01,  1.0746e+00,  3.6436e+00,  8.6639e-01, -5.8941e-01,
          -1.1042e-02, -7.6476e-01, -2.7303e-01,  4.9569e-01, -8.0262e-01,
          -6.3513e-01, -6.4203e-01, -2.2794e-02,  3.1683e+00, -3.0356e-01,
           3.6337e-01,  2.0556e+00, -6.1344e-01, -2.5678e-01,  4.3111e-01,
           2.0943e-03,  1.9952e-01, -3.8509e-01,  3.3255e-01,  2.8514e-01,
           2.2267e-01, -1.7652e-01, -5.0319e-01,  1.5762e-01,  3.6704e+00,
          -4.1711e-01,  2.3676e+00, -4.3934e-01,  1.8680e-01,  1.4247e+00,
           4.2982e-01,  1.7370e-01,  8.0422e-01, -4.9304e-02, -9.3388e-02,
          -8.6865e-03,  4.4489e-01,  3.6154e-01, -3.3481e-01, -1.6179e-01,
           2.6611e+00,  4.8760e-01, -1.1688e+00,  3.7667e-02, -9.4025e-02,
  

In [7]:
def print_all_accuracies(X_train, X_test):
    """Print train and test accuracy after each boosting block of the global ``model``.

    The inputs are passed through ``model.upscale``. Then, for each
    (layer, delta, classifier) triple, the features are updated as
    ``x + boost_lr * delta * (layer(x) - x)`` and scored with that block's
    classifier against the notebook-level ``y_train_cat`` / ``y_test_cat``.

    Args:
        X_train: Training inputs accepted by ``model.upscale``.
        X_test: Test inputs accepted by ``model.upscale``.
    """
    def _accuracy(scores, labels):
        # Fraction of rows whose highest-scoring class equals the label
        return (torch.argmax(scores, dim=1) == labels).float().mean().item()

    feats_train = model.upscale(X_train)
    feats_test = model.upscale(X_test)

    blocks = zip(model.layers, model.deltas, model.classifiers)
    for t, (layer, DELTA, classifier) in enumerate(blocks):
        # Boosting step applied identically to both feature sets
        feats_train = feats_train + model.boost_lr * DELTA * (layer(feats_train) - feats_train)
        feats_test = feats_test + model.boost_lr * DELTA * (layer(feats_test) - feats_test)

        scores_train = classifier(feats_train)
        scores_test = classifier(feats_test)
        acc_train = _accuracy(scores_train, y_train_cat)
        acc_test = _accuracy(scores_test, y_test_cat)

        print(f"Block {t}: Train acc: {acc_train}, Test acc: {acc_test}")


# Report per-block accuracy of the fitted model on the MNIST train/test tensors.
print_all_accuracies(X_train, X_test)

Block 0: Train acc: 0.8996666669845581, Test acc: 0.9000999927520752
Block 1: Train acc: 0.9217000007629395, Test acc: 0.9202999472618103
Block 2: Train acc: 0.9368166923522949, Test acc: 0.9350999593734741
Block 3: Train acc: 0.9419666528701782, Test acc: 0.9393999576568604
Block 4: Train acc: 0.9418833255767822, Test acc: 0.9406999945640564
Block 5: Train acc: 0.942466676235199, Test acc: 0.9407999515533447
Block 6: Train acc: 0.938800036907196, Test acc: 0.9328999519348145
Block 7: Train acc: 0.9495999813079834, Test acc: 0.9476999640464783
Block 8: Train acc: 0.9479666948318481, Test acc: 0.9457999467849731
Block 9: Train acc: 0.9501833319664001, Test acc: 0.9465999603271484


In [None]:
# TODO: compare SWIM-ID vs DENSE-ID vs SWIM-DENSE.
# TODO: implement the 'finding gradient direction' variant of gradient boosting.

# TODO (important): test whether this is actually better than a non-boosted model with the same hidden size.