In [1]:
from typing import Tuple, List, Union, Any, Optional, Dict, Literal, Callable
import time
import collections
import os
import sys
sys.path.append(os.path.dirname(os.getcwd()))
sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))

from tqdm import tqdm
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor, tensor
import pandas as pd
import torchvision
import torchvision.transforms as transforms
from torchvision.models import resnet18
from torch.utils.data import DataLoader
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

from preprocessing.stream_transforms import normalize_mean_std_traindata, normalize_streams, augment_time, add_basepoint_zero
from utils.utils import print_name, print_shape
from models import ResNet, NeuralEulerODE, RidgeClassifierCVModule, E2EResNet, LogisticRegressionModule, RandFeatBoost

np.set_printoptions(precision=3, threshold=5) # Print options
# Prefer the GPU, but fall back to the CPU so the notebook still runs on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load CIFAR-10 Dataset

In [2]:
def extract_features(data_loader, model, device):
    """Run `model` over every batch of `data_loader` and gather the flattened outputs.

    Args:
        data_loader: Iterable yielding ``(images, target)`` tensor pairs.
        model: Callable applied to each image batch after it is moved to `device`.
        device: Device the image batches are moved to before the forward pass.

    Returns:
        ``(features, labels)`` NumPy arrays, concatenated over all batches
        along axis 0. Each model output is flattened to one row per sample.
    """
    feature_batches, label_batches = [], []
    # Gradients are not needed for feature extraction.
    with torch.no_grad():
        for batch_images, batch_targets in tqdm(data_loader):
            batch_out = model(batch_images.to(device))
            n_samples = batch_out.size(0)
            feature_batches.append(batch_out.view(n_samples, -1).cpu().numpy())
            label_batches.append(batch_targets.cpu().numpy())

    return (
        np.concatenate(feature_batches, axis=0),
        np.concatenate(label_batches, axis=0),
    )


def load_cifar10(
        data_path = "/home/nikita/hdd/cifar10/",
        train_name = "resnet18_train_features.csv",
        test_name = "resnet18_test_features.csv",
        ):
    """Return ResNet18 features of CIFAR-10 as two NumPy arrays ``(train, test)``.

    If both CSV caches already exist inside ``data_path`` they are read back
    and returned. Otherwise CIFAR-10 is downloaded into ``data_path``, every
    image is passed through an ImageNet-pretrained ResNet18 whose last child
    module (the classification head) has been removed, and the resulting
    features plus labels are written to the two CSV files.

    Args:
        data_path: Directory holding the dataset and the CSV caches. A
            trailing path separator is optional.
        train_name: File name of the training-feature cache in ``data_path``.
        test_name: File name of the test-feature cache in ``data_path``.

    Returns:
        ``(train, test)`` arrays. In each array the last column is the class
        label (CSV column ``target``); all preceding columns are features.
    """
    # os.path.join works with or without a trailing separator on data_path.
    train_csv = os.path.join(data_path, train_name)
    test_csv = os.path.join(data_path, test_name)

    # See if the data has already been processed. isfile() (unlike listdir())
    # simply returns False when data_path does not exist yet.
    if os.path.isfile(train_csv) and os.path.isfile(test_csv):
        print("Loading preprocessed data")
        train_df = pd.read_csv(train_csv)
        test_df = pd.read_csv(test_csv)
        return train_df.to_numpy(), test_df.to_numpy()

    os.makedirs(data_path, exist_ok=True)

    # Define the DataLoaders and transformations for the CIFAR-10 dataset
    transform = transforms.Compose([
        transforms.Resize(224),  # Resize images to 224x224 as required by ResNet18
        transforms.ToTensor(),  # Convert images to PyTorch tensors
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))  # Normalize with ImageNet mean and std
    ])
    train_dataset = torchvision.datasets.CIFAR10(root=data_path, train=True, download=True, transform=transform)
    test_dataset = torchvision.datasets.CIFAR10(root=data_path, train=False, download=True, transform=transform)
    # No shuffling: this is a one-off feature extraction, and a fixed order
    # makes the cached CSVs reproducible from run to run.
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=False, num_workers=2)
    test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=2)

    # Load the pre-trained ResNet18 model, remove classification head.
    # The weights are named explicitly; `weights=True` is the deprecated legacy form.
    model = resnet18(weights="IMAGENET1K_V1")
    model = torch.nn.Sequential(*list(model.children())[:-1])
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Extract features for the training and test datasets
    train_features, train_labels = extract_features(train_loader, model, device)
    test_features, test_labels = extract_features(test_loader, model, device)

    # Create a DataFrame to store the features and labels, save to CSV
    train_df = pd.DataFrame(train_features)
    train_df['target'] = train_labels
    test_df = pd.DataFrame(test_features)
    test_df['target'] = test_labels
    train_df.to_csv(train_csv, index=False)
    test_df.to_csv(test_csv, index=False)
    return train_df.to_numpy(), test_df.to_numpy()

In [3]:
train, test = load_cifar10()


def _features_and_labels(table):
    """Split an array whose last column is the class index into two tensors on `device`."""
    features = torch.from_numpy(table[:, :-1].astype(np.float32)).to(device)
    categories = torch.from_numpy(table[:, -1].astype(np.int64)).to(device)
    return features, categories


# Every column but the last is a feature; the last column is the integer class label.
X_train, y_train_cat = _features_and_labels(train)
X_test, y_test_cat = _features_and_labels(test)

# One-hot (float) versions of the train and test labels
y_train, y_test = (
    F.one_hot(categories, num_classes=10).float()
    for categories in (y_train_cat, y_test_cat)
)
print(f"Train data shape: {X_train.shape}, dtype: {X_train.dtype}")
print(f"Train labels shape: {y_train.shape}, dtype: {y_train.dtype}")
print(f"Test data shape: {X_test.shape}, dtype: {X_test.dtype}")
print(f"Test labels shape: {y_test.shape}, dtype: {y_test.dtype}")

Loading preprocessed data
Train data shape: torch.Size([50000, 512]), dtype: torch.float32
Train labels shape: torch.Size([50000, 10]), dtype: torch.float32
Test data shape: torch.Size([10000, 512]), dtype: torch.float32
Test labels shape: torch.Size([10000, 10]), dtype: torch.float32


# experiments

In [4]:
def run_allmodels_1dataset(
        generator: torch.Generator,
        X_train: Tensor,
        y_train: Tensor,
        X_test: Tensor,
        y_test: Tensor,
        ):
    """Fit every configured model on one dataset and record accuracy and timings.

    Args:
        generator: Random generator handed to the models that accept one.
        X_train: Training features; ``X_train.shape[1]`` is used as ``in_dim``.
        y_train: Training targets passed to ``model.fit``. Either one-hot /
            score rows (2-D) or integer class labels (1-D).
        X_test: Test features.
        y_test: Test targets, same layout as ``y_train``.

    Returns:
        ``(model_names, results)`` where ``results[i]`` is a NumPy array
        ``[acc_train, acc_test, t_fit, t_predict]`` for ``model_names[i]``.
        Times are in seconds; ``t_fit`` covers model construction plus
        ``fit``, ``t_predict`` covers the forward pass on ``X_test``.
    """
    D = X_train.shape[1]
    hidden_size = 512
    bottleneck_dim = 1*hidden_size
    num_epochs = 40
    batch_size = 512
    adam_lr = 0.01

    # Integer class labels are derived from the arguments rather than read
    # from notebook globals, so the function scores the data it was given.
    labels_train = y_train.argmax(dim=1) if y_train.dim() == 2 else y_train
    labels_test = y_test.argmax(dim=1) if y_test.dim() == 2 else y_test

    def _sync():
        # CUDA kernels run asynchronously; wait for them before reading the clock.
        if X_train.is_cuda:
            torch.cuda.synchronize(X_train.device)

    # (name, model, kwargs). kwargs separate to save memory
    model_list = [
        ["T=10 RandFeatureBoost", RandFeatBoost,
                {"generator": generator,
                 "in_dim": D,
                 "hidden_size": hidden_size,
                 "out_dim": 10,
                 "n_blocks": 9,
                 "activation": nn.Tanh(),
                 "loss_fn": nn.CrossEntropyLoss(),
                 "adam_lr": adam_lr,
                 "boost_lr": 1.0,
                 "epochs": num_epochs,
                 "batch_size": batch_size,
                 "upscale_type": "SWIM",  # "dense", "identity"
                 }],

        ["Tabular RidgeClassifier", RidgeClassifierCVModule, {}],

        ["Logistic Regression", LogisticRegressionModule,
                {"generator": generator,
                 "num_epochs": num_epochs,
                 "batch_size": batch_size,
                 "lr": adam_lr,
                 }],

        ["T=1 Dense", ResNet,
                {"generator": generator,
                 "in_dim": D,
                 "hidden_size": hidden_size,
                 "bottleneck_dim": None,
                 "n_blocks": 0,
                 "upsample_layer": "dense",
                 "output_layer": "logistic regression",
                 }],

        ["T=1 SWIM Grad", ResNet,
                {"generator": generator,
                 "in_dim": D,
                 "hidden_size": hidden_size,
                 "bottleneck_dim": None,
                 "n_blocks": 0,
                 "upsample_layer": "SWIM",
                 "output_layer": "logistic regression",
                 }],
    ]

    for n_blocks in [3]:
        model_list += [
            [f"T={n_blocks+1} End2End", E2EResNet,
                    {"generator": generator,
                     "in_dim": D,
                     "hidden_size": hidden_size,
                     "bottleneck_dim": bottleneck_dim,
                     "out_dim": 10,
                     "n_blocks": n_blocks,
                     "activation": nn.Tanh(),
                     "loss": nn.CrossEntropyLoss(),
                     "lr": adam_lr,
                     "epochs": num_epochs,
                     "batch_size": batch_size,
                     }],

            [f"T={n_blocks+1} ResSWIM Grad-dense", ResNet,
                    {"generator": generator,
                     "in_dim": D,
                     "hidden_size": hidden_size,
                     "bottleneck_dim": bottleneck_dim,
                     "n_blocks": n_blocks,
                     "upsample_layer": "SWIM",
                     "res_layer1": "SWIM",
                     "res_layer2": "dense",
                     "output_layer": "logistic regression",
                     }],

            [f"T={n_blocks+1} ResSWIM Grad-id", ResNet,
                    {"generator": generator,
                     "in_dim": D,
                     "hidden_size": hidden_size,
                     "bottleneck_dim": hidden_size,
                     "n_blocks": n_blocks,
                     "upsample_layer": "SWIM",
                     "res_layer1": "SWIM",
                     "res_layer2": "identity",
                     "output_layer": "logistic regression",
                     }],

            [f"T={n_blocks+1} ResDense", ResNet,
                    {"generator": generator,
                     "in_dim": D,
                     "hidden_size": hidden_size,
                     "bottleneck_dim": hidden_size,
                     "n_blocks": n_blocks,
                     "upsample_layer": "dense",
                     "res_layer1": "dense",
                     "res_layer2": "identity",
                     "output_layer": "logistic regression",
                     }],
        ]

    results = []
    model_names = []
    for name, model_cls, model_args in model_list:
        print(name)
        t0 = time.perf_counter()
        model = model_cls(**model_args).to(X_train.device)
        pred_train, _ = model.fit(X_train, y_train)
        _sync()
        t1 = time.perf_counter()
        # Inference only: no autograd graph is needed for the test predictions.
        with torch.no_grad():
            pred_test = model(X_test)
        _sync()
        t2 = time.perf_counter()

        # Convert per-class scores to class predictions; 1-D outputs are
        # treated as class labels already.
        if len(pred_train.shape) == 2:
            pred_train = torch.argmax(pred_train, dim=1)
            pred_test = torch.argmax(pred_test, dim=1)
        acc_train = (pred_train == labels_train).float().mean().item()
        acc_test = (pred_test == labels_test).float().mean().item()

        result = np.array( [acc_train, acc_test, t1-t0, t2-t1] )
        results.append( result )
        model_names.append( name )

    return model_names, results



def run_all_experiments(
        name_save: str = "PLACEHOLDER",
        ):
    """Run every model on the CIFAR-10 features and save the results table.

    Uses the notebook-level tensors ``X_train``, ``y_train``, ``X_test`` and
    ``y_test`` together with a generator seeded with 999 on ``device``.

    Args:
        name_save: Suffix of the output file; the table is pickled to
            ``cifar10_{name_save}.pkl`` in the current working directory.

    Returns:
        DataFrame with one row per dataset and two-level columns
        ``(attribute, model_name)``, where attribute is one of ``acc_train``,
        ``acc_test``, ``t_fit`` and ``t_feat``. Columns are sorted.
    """
    # Run all models on the (single) dataset handled by this notebook.
    experiments = {}
    generator = torch.Generator(device=device).manual_seed(999)
    results = run_allmodels_1dataset(
        generator, X_train, y_train, X_test, y_test, 
        )
    # The data are CIFAR-10 features; the row was previously mislabelled "MNIST".
    experiments["CIFAR-10"] = results

    # experiments maps dataset name -> (model_names, results); results[i] is
    # ordered like `attributes` below.
    attributes = ["acc_train", "acc_test", "t_fit", "t_feat"]
    data_list = []
    for dataset_name, (model_names, results) in experiments.items():
        dataset_data = {
            (attribute, model_name): results[model_idx][attr_idx]
            for attr_idx, attribute in enumerate(attributes)
            for model_idx, model_name in enumerate(model_names)
        }
        data_list.append(pd.DataFrame(dataset_data, index=[dataset_name]))

    # Combine all datasets into a single DataFrame
    df = pd.concat(data_list)
    df = df.sort_index(axis=1)
    print(df)
    df.to_pickle(f"cifar10_{name_save}.pkl")
    return df

In [5]:
# Fit and evaluate every configured model; the results table is printed, pickled
# under the default name, and returned (so the cell also displays it).
run_all_experiments()

T=10 RandFeatureBoost


100%|██████████| 40/40 [00:24<00:00,  1.62it/s]
100%|██████████| 40/40 [00:23<00:00,  1.68it/s]
100%|██████████| 40/40 [00:23<00:00,  1.70it/s]
100%|██████████| 40/40 [00:22<00:00,  1.80it/s]
100%|██████████| 40/40 [00:22<00:00,  1.80it/s]
100%|██████████| 40/40 [00:22<00:00,  1.77it/s]
100%|██████████| 40/40 [00:22<00:00,  1.75it/s]
100%|██████████| 40/40 [00:22<00:00,  1.75it/s]
100%|██████████| 40/40 [00:22<00:00,  1.76it/s]


Tabular RidgeClassifier
Logistic Regression


100%|██████████| 40/40 [00:20<00:00,  1.96it/s]


T=1 Dense


100%|██████████| 30/30 [00:15<00:00,  1.99it/s]


T=1 SWIM Grad


100%|██████████| 30/30 [00:15<00:00,  1.91it/s]


T=4 End2End


100%|██████████| 40/40 [00:30<00:00,  1.32it/s]


T=4 ResSWIM Grad-dense


100%|██████████| 30/30 [00:15<00:00,  1.96it/s]


T=4 ResSWIM Grad-id


100%|██████████| 30/30 [00:15<00:00,  1.98it/s]


T=4 ResDense


100%|██████████| 30/30 [00:15<00:00,  1.92it/s]

                 acc_test                                                \
      Logistic Regression T=1 Dense T=1 SWIM Grad T=10 RandFeatureBoost   
MNIST               0.854    0.8506         0.855                0.8578   

                                                                           \
      T=4 End2End T=4 ResDense T=4 ResSWIM Grad-dense T=4 ResSWIM Grad-id   
MNIST        0.87       0.8486                  0.835              0.8297   

                                        acc_train  ...  \
      Tabular RidgeClassifier Logistic Regression  ...   
MNIST                  0.8608             0.88682  ...   

                       t_feat               t_fit                           \
      Tabular RidgeClassifier Logistic Regression  T=1 Dense T=1 SWIM Grad   
MNIST                 0.03654           20.387619  15.102281     15.790536   

                                                                             \
      T=10 RandFeatureBoost T=4 End2End T=4 ResDense 




Unnamed: 0_level_0,acc_test,acc_test,acc_test,acc_test,acc_test,acc_test,acc_test,acc_test,acc_test,acc_train,...,t_feat,t_fit,t_fit,t_fit,t_fit,t_fit,t_fit,t_fit,t_fit,t_fit
Unnamed: 0_level_1,Logistic Regression,T=1 Dense,T=1 SWIM Grad,T=10 RandFeatureBoost,T=4 End2End,T=4 ResDense,T=4 ResSWIM Grad-dense,T=4 ResSWIM Grad-id,Tabular RidgeClassifier,Logistic Regression,...,Tabular RidgeClassifier,Logistic Regression,T=1 Dense,T=1 SWIM Grad,T=10 RandFeatureBoost,T=4 End2End,T=4 ResDense,T=4 ResSWIM Grad-dense,T=4 ResSWIM Grad-id,Tabular RidgeClassifier
MNIST,0.854,0.8506,0.855,0.8578,0.87,0.8486,0.835,0.8297,0.8608,0.88682,...,0.03654,20.387619,15.102281,15.790536,208.186397,30.303527,15.640562,15.60223,15.41781,8.700585


In [6]:
# Reload the results table written by run_all_experiments() with its default name_save,
# then rank the models by test accuracy (mean over the table's rows, highest first).
df = pd.read_pickle("cifar10_PLACEHOLDER.pkl")
df["acc_test"].mean().sort_values(ascending=False)

T=4 End2End                0.8700
Tabular RidgeClassifier    0.8608
T=10 RandFeatureBoost      0.8578
T=1 SWIM Grad              0.8550
Logistic Regression        0.8540
T=1 Dense                  0.8506
T=4 ResDense               0.8486
T=4 ResSWIM Grad-dense     0.8350
T=4 ResSWIM Grad-id        0.8297
dtype: float64

In [7]:
# Same ranking for training accuracy; relies on `df` loaded in the previous cell.
df["acc_train"].mean().sort_values(ascending=False)

T=4 End2End                0.99192
Logistic Regression        0.88682
T=10 RandFeatureBoost      0.87678
T=1 Dense                  0.87458
T=1 SWIM Grad              0.87114
T=4 ResDense               0.87020
Tabular RidgeClassifier    0.86964
T=4 ResSWIM Grad-dense     0.84318
T=4 ResSWIM Grad-id        0.83656
dtype: float64

In [None]:
# Time to experiment with RandFeatureBoost
# (single run with upscale_type="SWIM"; leaves `model`, `acc_train` and `acc_test`
# in the notebook namespace for the cells below)

generator = torch.Generator(device=device).manual_seed(999)
D = X_train.shape[1]
hidden_size = 512
num_epochs = 50
batch_size = 512
n_blocks = 20
adam_lr=0.01
boost_lr=1.0

model = RandFeatBoost(
    generator=generator,
    in_dim=D,
    hidden_size=hidden_size,
    out_dim=10,
    n_blocks=n_blocks,
    activation=nn.Tanh(),
    loss_fn=nn.CrossEntropyLoss(),
    adam_lr=adam_lr,
    boost_lr=boost_lr,
    epochs=num_epochs,
    batch_size=batch_size,
    upscale_type="SWIM",
    ).to(device)

pred_train, _ = model.fit(X_train, y_train)
# Inference only: skip building an autograd graph for the whole test set.
with torch.no_grad():
    pred_test = model(X_test)
# Class prediction = index of the largest score in each row.
pred_train = torch.argmax(pred_train, dim=1)
pred_test = torch.argmax(pred_test, dim=1)
acc_train = (pred_train == y_train_cat).float().mean().item()
acc_test = (pred_test == y_test_cat).float().mean().item()

In [None]:
# Train / test accuracy computed by the most recently run experiment cell.
print(acc_train)
print(acc_test)

In [None]:
# Inspect the fitted model's `deltas`: the per-block DELTA factors that scale each
# residual update in print_all_accuracies.
model.deltas

In [None]:
# Ask PyTorch to release the GPU memory it has cached but is not currently using.
torch.cuda.empty_cache()

In [None]:
# def batch_forward(model, X, batch_size=512):
#     model.eval()
#     with torch.no_grad():
#         pred = []
#         for i in range(0, X.shape[0], batch_size):
#             pred.append(model(X[i:i+batch_size]))
#         pred = torch.cat(pred, dim=0)
#     return pred

def print_all_accuracies(X_train, X_test):
    """Print train and test accuracy after each boosting block of the global `model`.

    The inputs are first passed through ``model.upscale``. For every
    ``(layer, DELTA, classifier)`` triple taken from ``model.layers``,
    ``model.deltas`` and ``model.classifiers``, the features are updated as
    ``X + boost_lr * DELTA * (layer(X) - X)`` and scored with that block's
    classifier against the global ``y_train_cat`` / ``y_test_cat`` labels.

    Args:
        X_train: Training inputs accepted by ``model.upscale``.
        X_test: Test inputs accepted by ``model.upscale``.
    """
    with torch.no_grad():
        feats_train = model.upscale(X_train)
        feats_test = model.upscale(X_test)

        blocks = zip(model.layers, model.deltas, model.classifiers)
        for t, (layer, DELTA, classifier) in enumerate(blocks):
            # Step size for this block's residual update.
            step = model.boost_lr * DELTA
            feats_train = feats_train + step * (layer(feats_train) - feats_train)
            feats_test = feats_test + step * (layer(feats_test) - feats_test)

            scores_train = classifier(feats_train)
            scores_test = classifier(feats_test)
            hits_train = torch.argmax(scores_train, dim=1) == y_train_cat
            hits_test = torch.argmax(scores_test, dim=1) == y_test_cat
            acc_train = hits_train.float().mean().item()
            acc_test = hits_test.float().mean().item()

            print(f"Block {t}: Train acc: {acc_train}, Test acc: {acc_test}")


# Report per-block accuracies for the currently fitted global `model`.
print_all_accuracies(X_train, X_test)

In [None]:

#lr = 0.05?
# Block 0: Train acc: 0.8704400062561035, Test acc: 0.8562999963760376
# Block 1: Train acc: 0.8750399947166443, Test acc: 0.8580999970436096
# Block 2: Train acc: 0.8756399750709534, Test acc: 0.8606999516487122
# Block 3: Train acc: 0.8766399621963501, Test acc: 0.8583999872207642
# Block 4: Train acc: 0.8761399984359741, Test acc: 0.859499990940094
# Block 5: Train acc: 0.8769800066947937, Test acc: 0.8606999516487122
# Block 6: Train acc: 0.8761999607086182, Test acc: 0.8598999977111816
# Block 7: Train acc: 0.877020001411438, Test acc: 0.8601999878883362
# Block 8: Train acc: 0.8753599524497986, Test acc: 0.8592999577522278
# Block 9: Train acc: 0.8775799870491028, Test acc: 0.85999995470047
# Block 10: Train acc: 0.8776399493217468, Test acc: 0.8608999848365784
# Block 11: Train acc: 0.8754799962043762, Test acc: 0.8572999835014343
# Block 12: Train acc: 0.8762399554252625, Test acc: 0.8598999977111816
# Block 13: Train acc: 0.8745200037956238, Test acc: 0.8574000000953674
# Block 14: Train acc: 0.8747199773788452, Test acc: 0.8578000068664551
# Block 15: Train acc: 0.8781999945640564, Test acc: 0.8606999516487122
# Block 16: Train acc: 0.8761000037193298, Test acc: 0.859499990940094
# Block 17: Train acc: 0.8755199909210205, Test acc: 0.8597999811172485
# Block 18: Train acc: 0.8772199749946594, Test acc: 0.8621000051498413
# Block 19: Train acc: 0.8749600052833557, Test acc: 0.85999995470047

In [None]:
# TODO: compare SWIM-ID vs DENSE-ID vs SWIM-DENSE
# TODO: implement 'finding gradient direction' gradient boosting

# TODO (important): test whether this is actually better than non-boost with the same hidden size

In [8]:
# experiment with DENSE
# (single run with upscale_type="dense" and second_in_resblock="identity"; leaves
# `model`, `acc_train` and `acc_test` in the notebook namespace for the cells below)

generator = torch.Generator(device=device).manual_seed(999)
D = X_train.shape[1]
hidden_size = 512
num_epochs = 50
batch_size = 512
n_blocks = 20
adam_lr=0.01
boost_lr=1.0

model = RandFeatBoost(
    generator=generator,
    in_dim=D,
    hidden_size=hidden_size,
    out_dim=10,
    n_blocks=n_blocks,
    activation=nn.Tanh(),
    loss_fn=nn.CrossEntropyLoss(),
    adam_lr=adam_lr,
    boost_lr=boost_lr,
    epochs=num_epochs,
    batch_size=batch_size,
    upscale_type="dense",
    second_in_resblock="identity",
    ).to(device)

pred_train, _ = model.fit(X_train, y_train)
# Inference only: skip building an autograd graph for the whole test set.
with torch.no_grad():
    pred_test = model(X_test)
# Class prediction = index of the largest score in each row.
pred_train = torch.argmax(pred_train, dim=1)
pred_test = torch.argmax(pred_test, dim=1)
acc_train = (pred_train == y_train_cat).float().mean().item()
acc_test = (pred_test == y_test_cat).float().mean().item()

100%|██████████| 50/50 [00:27<00:00,  1.79it/s]
100%|██████████| 50/50 [00:28<00:00,  1.75it/s]
100%|██████████| 50/50 [00:27<00:00,  1.83it/s]
100%|██████████| 50/50 [00:28<00:00,  1.73it/s]
100%|██████████| 50/50 [00:28<00:00,  1.78it/s]
100%|██████████| 50/50 [00:28<00:00,  1.72it/s]
100%|██████████| 50/50 [00:28<00:00,  1.77it/s]
100%|██████████| 50/50 [00:28<00:00,  1.74it/s]
100%|██████████| 50/50 [00:27<00:00,  1.84it/s]
100%|██████████| 50/50 [00:27<00:00,  1.84it/s]
100%|██████████| 50/50 [00:27<00:00,  1.82it/s]
100%|██████████| 50/50 [00:27<00:00,  1.84it/s]
100%|██████████| 50/50 [00:26<00:00,  1.85it/s]
100%|██████████| 50/50 [00:26<00:00,  1.86it/s]
100%|██████████| 50/50 [00:27<00:00,  1.83it/s]
100%|██████████| 50/50 [00:27<00:00,  1.85it/s]
100%|██████████| 50/50 [00:27<00:00,  1.81it/s]
100%|██████████| 50/50 [00:27<00:00,  1.85it/s]
100%|██████████| 50/50 [00:27<00:00,  1.83it/s]
100%|██████████| 50/50 [00:27<00:00,  1.84it/s]


In [9]:
def print_all_accuracies(X_train, X_test):
    """Print train and test accuracy after each boosting block of the global `model`.

    The inputs are first passed through ``model.upscale``. For every
    ``(layer, DELTA, classifier)`` triple taken from ``model.layers``,
    ``model.deltas`` and ``model.classifiers``, the features are updated as
    ``X + boost_lr * DELTA * (layer(X) - X)`` and scored with that block's
    classifier against the global ``y_train_cat`` / ``y_test_cat`` labels.

    Args:
        X_train: Training inputs accepted by ``model.upscale``.
        X_test: Test inputs accepted by ``model.upscale``.
    """
    # Evaluation only: without no_grad() the autograd graph of every block
    # would be kept alive for the full train and test sets.
    with torch.no_grad():
        X_train = model.upscale(X_train)
        X_test  = model.upscale(X_test)

        for t, (layer, DELTA, classifier) in enumerate(zip(model.layers, model.deltas, model.classifiers)):
            X_train = X_train + model.boost_lr * DELTA * (layer(X_train) - X_train)
            X_test = X_test + model.boost_lr * DELTA * (layer(X_test) - X_test)

            pred_train = classifier(X_train)
            pred_test = classifier(X_test)
            # Class prediction = index of the largest score in each row.
            pred_train = torch.argmax(pred_train, dim=1)
            pred_test = torch.argmax(pred_test, dim=1)
            acc_train = (pred_train == y_train_cat).float().mean().item()
            acc_test = (pred_test == y_test_cat).float().mean().item()

            print(f"Block {t}: Train acc: {acc_train}, Test acc: {acc_test}")

# Report per-block accuracies for the currently fitted global `model`.
print_all_accuracies(X_train, X_test)

Block 0: Train acc: 0.8825199604034424, Test acc: 0.8538999557495117
Block 1: Train acc: 0.8873800039291382, Test acc: 0.8561999797821045
Block 2: Train acc: 0.8834799528121948, Test acc: 0.849299967288971
Block 3: Train acc: 0.8905400037765503, Test acc: 0.8593999743461609
Block 4: Train acc: 0.884119987487793, Test acc: 0.8543999791145325
Block 5: Train acc: 0.891819953918457, Test acc: 0.8586999773979187
Block 6: Train acc: 0.8880800008773804, Test acc: 0.8535999655723572
Block 7: Train acc: 0.8888799548149109, Test acc: 0.8560999631881714
Block 8: Train acc: 0.8908199667930603, Test acc: 0.85589998960495
Block 9: Train acc: 0.8820199966430664, Test acc: 0.851099967956543
Block 10: Train acc: 0.876800000667572, Test acc: 0.8470999598503113
Block 11: Train acc: 0.884939968585968, Test acc: 0.85589998960495
Block 12: Train acc: 0.8805199861526489, Test acc: 0.8496999740600586
Block 13: Train acc: 0.8872399926185608, Test acc: 0.8531000018119812
Block 14: Train acc: 0.88673996925354, T