In [1]:
import os
import shutil
import zipfile
import urllib.request


def download_repo(url, save_to):
    """Download a zip archive and unpack it so that ``save_to`` exists.

    The archive is fetched from ``url`` and stored as ``save_to + '.zip'``.
    Any existing ``save_to`` directory is removed first, then the archive is
    unpacked into the parent directory of ``save_to`` (the current directory
    when ``save_to`` has no directory component).

    Args:
        url: Location of the zip archive; handed to
            ``urllib.request.urlretrieve``.
        save_to: Path at which the archive's content is expected to appear
            after extraction.

    Raises:
        AssertionError: If ``save_to`` does not exist after extraction, for
            example because the archive's top-level folder has another name.
    """
    zip_filename = save_to + '.zip'
    urllib.request.urlretrieve(url, zip_filename)

    # Remove a previous copy so stale files cannot survive a re-run.
    if os.path.exists(save_to):
        shutil.rmtree(save_to)

    # Unpack next to the archive instead of always into the working directory;
    # for a bare folder name this is still '.'.
    extract_dir = os.path.dirname(save_to) or '.'
    with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)

    assert os.path.exists(save_to), f"{save_to!r} not found after extracting {zip_filename!r}"

In [2]:
# Folder the downloaded archive is expected to unpack to.
REPO_PATH = 'LinearizedNNs-master'

download_repo(
    save_to=REPO_PATH,
    url='https://github.com/maxkvant/LinearizedNNs/archive/master.zip',
)

In [3]:
import sys

# Make the downloaded repository's `src` folder importable. The membership
# check keeps the cell idempotent: re-running it no longer appends a
# duplicate entry to sys.path each time.
if f"{REPO_PATH}/src" not in sys.path:
    sys.path.append(f"{REPO_PATH}/src")

In [4]:
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torchvision import transforms, datasets
from torchvision.datasets import FashionMNIST

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import RidgeClassifier
from sklearn.decomposition import PCA

from pytorch_impl.nns import ResNet, FCN, CNN
from pytorch_impl.nns import warm_up_batch_norm
from pytorch_impl.estimators import LinearizedSgdEstimator, SgdEstimator, MatrixExpEstimator, GradientBoostingEstimator
from pytorch_impl import ClassifierTraining
from pytorch_impl.matrix_exp import matrix_exp, compute_exp_term
from pytorch_impl.nns.utils import to_one_hot

In [5]:
# Use the first CUDA device when one is available, otherwise the CPU.
num_classes = 10
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

In [6]:
# compute M^-1 * (exp(M) - E)
def compute_exp_term(M, device, n_iter=3):
    """Compute ``M^-1 (exp(M) - E)`` for a square matrix ``M`` (``E`` = identity).

    The matrix is halved until its Frobenius norm is at most 1e-9, a truncated
    power series is evaluated on the scaled matrix, and the scaling is then
    undone by repeated doubling.

    Note: this definition shadows the ``compute_exp_term`` imported from
    ``pytorch_impl.matrix_exp`` earlier in the notebook.

    Args:
        M: Square 2-D tensor. It is never modified, even when it is already
            float64 and already lives on ``device``.
        device: Torch device on which the computation runs.
        n_iter: The series uses ``n_iter - 1`` terms
            (``E + M / 2 + M^2 / 6 + ...``).

    Returns:
        A float64 tensor of shape ``(n, n)`` on ``device``.
    """
    with torch.no_grad():
        M = M.double().to(device)
        n = M.size()[0]

        # Count the halvings on a plain Python float: this avoids one device
        # synchronisation and one full-matrix division per step.
        norm = torch.sqrt((M ** 2).sum()).item()
        steps = 0
        while norm > 1e-9:
            norm /= 2.
            steps += 1

        # Out-of-place on purpose: `.double().to(device)` returns the caller's
        # own tensor when no conversion is needed, so an in-place `/=` here
        # would silently rescale the argument.
        M = M / (2. ** steps)

        identity = torch.eye(n).double().to(device)
        series_sum = torch.zeros([n, n]).double().to(device)
        prod = identity

        # series_sum: E + M / 2 + M^2 / 6 + ...
        for i in range(1, n_iter):
            series_sum = series_sum + prod
            prod = torch.matmul(prod, M) / (i + 1)

        # Undo the scaling one doubling at a time, using the block identity
        # (exp 0) (exp 0) = (exp^2           0)
        # (sum E) (sum E) = (sum * exp + sum E)
        # i.e. term(2A) = (term(A) * exp(A) + term(A)) / 2 and exp(2A) = exp(A)^2.
        exp = torch.matmul(M, series_sum) + identity
        for _ in range(steps):
            series_sum = (torch.matmul(series_sum, exp) + series_sum) / 2.
            exp = torch.matmul(exp, exp)

        return series_sum

In [7]:
# Pull every array out inside the `with` block so the .npz archive handle is
# closed as soon as the data is in memory instead of staying open.
with np.load('../data/kernels_12k.npz') as kernels_12k:
    train_kernel = kernels_12k['train_kernel']
    test_kernel  = kernels_12k['test_kernel']
    labels_train = kernels_12k['labels_train']
    labels_test  = kernels_12k['labels_test']

train_kernel.shape, test_kernel.shape, labels_train.shape, labels_test.shape

((12800, 12800), (10000, 12800), (12800,), (10000,))

In [8]:
# Kernels become float32 tensors on `device`; labels are moved over with the
# dtype they were stored in.
train_kernel = torch.from_numpy(train_kernel).to(torch.float32).to(device)
test_kernel  = torch.from_numpy(test_kernel).to(torch.float32).to(device)

labels_train = torch.from_numpy(labels_train).to(device)
labels_test  = torch.from_numpy(labels_test).to(device)

In [9]:
# Training targets: to_one_hot(labels_train, num_classes), placed on `device`.
# Presumably an (n_samples, num_classes) indicator matrix — confirm against
# pytorch_impl.nns.utils.to_one_hot.
y_train = to_one_hot(labels_train, num_classes).to(device)

In [10]:
lr = 1e4
n = train_kernel.shape[0]

# Regulariser added to the kernel; its coefficient is currently zero.
reg = 0e-4 * torch.eye(n).to(device)

# exp_term = -lr * term(-lr * (K + reg)), where term(M) = M^-1 (exp(M) - E).
exp_term = compute_exp_term(- lr * (train_kernel + reg), device).float() * (- lr)

y_pred = test_kernel @ (exp_term @ (- y_train))

# The (n, n) term is no longer needed once the predictions exist.
del exp_term

# Test accuracy of the arg-max prediction.
(y_pred.argmax(dim=1) == labels_test).float().mean()

tensor(0.8012, device='cuda:0')

In [11]:
# Release the references to both kernel tensors.
del train_kernel, test_kernel

In [12]:
def matmul_via_torch(numpy_matrix, torch_matrix, step=2048):
    """Multiply a NumPy matrix by a torch matrix, feeding it in row chunks.

    Only ``step`` rows of ``numpy_matrix`` are converted and moved to the
    device at a time. Each chunk product is computed in float64 and then
    written into a result buffer created by ``torch.zeros`` without an
    explicit dtype (i.e. torch's default dtype), so it is cast on assignment.

    Args:
        numpy_matrix: 2-D ``numpy.ndarray`` of shape ``(n, m)``.
        torch_matrix: 2-D tensor of shape ``(m, k)``. The product is computed
            and returned on this tensor's device.
        step: Number of rows per chunk.

    Returns:
        Tensor of shape ``(n, k)`` on ``torch_matrix.device``.

    Raises:
        AssertionError: If the inner dimensions do not match.
    """
    with torch.no_grad():
        n,  m = numpy_matrix.shape
        m2, k = torch_matrix.size()
        assert m2 == m, f"inner dimensions differ: {m} vs {m2}"

        # Take the device from the operand rather than from a notebook-level
        # global, so the function works on its own.
        target_device = torch_matrix.device

        # Loop-invariant: convert the right-hand side once, not per chunk.
        rhs = torch_matrix.double()

        result = torch.zeros([n, k]).to(target_device)
        for start in range(0, n, step):
            stop = min(start + step, n)
            chunk = torch.from_numpy(numpy_matrix[start:stop]).double().to(target_device)
            result[start:stop] = torch.matmul(chunk, rhs)
        return result

In [30]:
def _score_predictions(train_kernel, test_kernel, right_vector, labels_train, labels_test, targets):
    """Evaluate coefficients; return (train_acc, test_acc, train_mse, train residual)."""
    y_pred_train = matmul_via_torch(train_kernel, right_vector)
    y_pred_test  = matmul_via_torch(test_kernel, right_vector)
    train_acc = (y_pred_train.argmax(dim=1) == labels_train).float().mean().item()
    test_acc  = (y_pred_test.argmax(dim=1)  == labels_test).float().mean().item()

    y_residual = y_pred_train - targets
    train_mse = (y_residual ** 2).sum(dim=1).mean().item()
    return train_acc, test_acc, train_mse, y_residual


def boosting(train_kernel, y_train, labels_train, test_kernel, labels_test, beta=1.4, n_iter=24, lr=1e5, flips=False, block_size=1280 * 2):
    """Fit kernel coefficients block by block and print metrics every iteration.

    Each iteration shuffles the sample indices, evaluates the current
    coefficients, and then, for every block of ``block_size`` indices, turns
    the train residual into a coefficient update via
    ``-lr * compute_exp_term(-lr * K_block)``. Block updates are divided by
    the number of blocks and the summed update is added with weight ``beta``.

    Relies on the notebook-level names ``device``, ``num_classes``,
    ``matmul_via_torch``, ``compute_exp_term`` and ``to_one_hot``.

    NOTE(review): the fitted coefficients are not returned; the function only
    prints metrics.

    Args:
        train_kernel: Square NumPy array of shape ``(n, n)``.
        y_train: Tensor subtracted from the train predictions to form the
            residual, so it must broadcast against ``(n, num_classes)``.
        labels_train: Tensor of train labels compared with the arg-max of the
            train predictions.
        test_kernel: NumPy array of shape ``(n_test, n)``.
        labels_test: Tensor of test labels compared with the arg-max of the
            test predictions.
        beta: Weight applied to each iteration's summed update.
        n_iter: Number of iterations.
        lr: Scale passed into the exponential term.
        flips: When True only ``n // 2`` base indices are shuffled and each is
            randomly shifted by ``n // 2`` (presumably selecting a flipped
            copy of the sample — confirm against how the kernel was built).
        block_size: Number of samples per block.
    """
    with torch.no_grad():
        n = len(train_kernel)

        right_vector = torch.zeros([n, num_classes]).double().to(device)

        n_actual = (n // 2) if (flips) else n

        # Blocks cover roughly two thirds of the (shuffled) samples per iteration.
        n_blocks = (2 * n_actual) // (3 * block_size) + 1
        print(n_blocks)

        for iter_num in range(n_iter):
            index = torch.randperm(n_actual).to(device)

            if flips:
                # The original called `rand_bool`, which is not defined in this
                # notebook; draw the 0/1 mask directly instead.
                # NOTE(review): assumes rand_bool meant a uniform 0/1 draw per sample.
                index += n_actual * torch.randint(0, 2, (n_actual,), device=index.device)

            train_acc, test_acc, train_mse, y_residual = _score_predictions(
                train_kernel, test_kernel, right_vector, labels_train, labels_test, y_train)

            print(f"iteration {iter_num} train_acc {train_acc} test_acc {test_acc} train_mse {train_mse}")

            d_right_vector = torch.zeros([n, num_classes]).double().to(device)

            for i in range(n_blocks):
                batch_index = index[i * block_size: (i + 1) * block_size]
                batch_index_np = batch_index.cpu().numpy()

                # np.ix_ selects the square sub-kernel in one step, without
                # first copying `block_size` full rows of the kernel.
                K = train_kernel[np.ix_(batch_index_np, batch_index_np)]
                K = torch.from_numpy(K).double().to(device)

                # Small diagonal shift on the block kernel.
                K = K + 1e-4 * torch.eye(len(K)).double().to(device)

                exp_term = - lr * compute_exp_term(- lr * K, device)
                d_right_vector[batch_index] = torch.matmul(exp_term, y_residual[batch_index].double()) / n_blocks

            # (A full train_kernel @ d_right_vector product used to be computed
            # here and thrown away; it has been removed.)
            right_vector += d_right_vector * beta

            print(f"batches {0}-{n_blocks - 1} done")
            print(f"beta = {beta}")
            print()

        final_targets = to_one_hot(labels_train, num_classes).double().to(device)
        train_acc, test_acc, train_mse, _ = _score_predictions(
            train_kernel, test_kernel, right_vector, labels_train, labels_test, final_targets)

        print(f"iteration {n_iter} train_acc {train_acc:.4f} test_acc {test_acc:.4f} train_mse {train_mse}")

In [19]:
%%time

# Pull every array out inside the `with` block so the .npz archive handle is
# closed as soon as the data is in memory instead of staying open.
with np.load('../data/kernels_50k.npz') as kernels_50k:
    train_kernel = kernels_50k['train_kernel']
    test_kernel  = kernels_50k['test_kernel']
    labels_train = kernels_50k['labels_train']
    labels_test  = kernels_50k['labels_test']

# Kernels stay as NumPy arrays; only the labels and targets go to `device`.
labels_train = torch.from_numpy(labels_train).to(device)
labels_test  = torch.from_numpy(labels_test).to(device)
y_train = to_one_hot(labels_train, num_classes).to(device)

CPU times: user 32.6 s, sys: 37.5 s, total: 1min 10s
Wall time: 15min 46s


In [22]:
# Rebuilds y_train from labels_train; this is the same expression that ends
# the kernel-loading cell directly above, so it only matters if labels_train
# has changed since then.
y_train = to_one_hot(labels_train, num_classes).to(device)

In [24]:
%time

boosting(train_kernel, y_train, labels_train, test_kernel, labels_test, beta=1.4, lr=1e5, n_iter=100)

CPU times: user 0 ns, sys: 4 µs, total: 4 µs
Wall time: 7.39 µs
14
iteration 0 train_acc 0.09999999403953552 test_acc 0.09999999403953552 train_mse 10.0
batches 0-13 done
beta = 1.4

iteration 1 train_acc 0.8350799679756165 test_acc 0.7719999551773071 train_mse 2.7111761569976807
batches 0-13 done
beta = 1.4

iteration 2 train_acc 0.9254599809646606 test_acc 0.8180999755859375 train_mse 1.4347137212753296
batches 0-13 done
beta = 1.4

iteration 3 train_acc 0.944599986076355 test_acc 0.8287000060081482 train_mse 1.1048145294189453
batches 0-13 done
beta = 1.4

iteration 4 train_acc 0.9664999842643738 test_acc 0.838699996471405 train_mse 0.9468210935592651
batches 0-13 done
beta = 1.4

iteration 5 train_acc 0.9775599837303162 test_acc 0.842799961566925 train_mse 0.8372856974601746
batches 0-13 done
beta = 1.4

iteration 6 train_acc 0.9856599569320679 test_acc 0.8481000065803528 train_mse 0.7488650679588318
batches 0-13 done
beta = 1.4

iteration 7 train_acc 0.9906999468803406 test_acc 0.

In [26]:
%%time

# Pull every array out inside the `with` block so the .npz archive handle is
# closed as soon as the data is in memory instead of staying open.
with np.load('../data/myrtle10_kernels.npz') as kernels_myrtle10:
    train_kernel = kernels_myrtle10['train_kernel']
    test_kernel  = kernels_myrtle10['test_kernel']
    labels_train = kernels_myrtle10['labels_train']
    labels_test  = kernels_myrtle10['labels_test']

# Kernels stay as NumPy arrays; only the labels and targets go to `device`.
labels_train = torch.from_numpy(labels_train).to(device)
labels_test  = torch.from_numpy(labels_test).to(device)
y_train = to_one_hot(labels_train, num_classes).to(device)

CPU times: user 28.4 s, sys: 42.4 s, total: 1min 10s
Wall time: 14min 17s


In [27]:
# Show the top-left 4x4 corner of the training kernel.
train_kernel[:4, :4]

array([[0.99999999, 0.99621984, 0.99376137, 0.99705303],
       [0.99621984, 0.99999993, 0.99273713, 0.99698744],
       [0.99376137, 0.99273713, 0.99999995, 0.9928886 ],
       [0.99705303, 0.99698744, 0.9928886 , 0.99999996]])

In [38]:
%%time

boosting(
    train_kernel, y_train, labels_train, test_kernel, labels_test,
    beta=1,
    n_iter=128,
    lr=1e5,
    block_size=2 * 1280,
)

14
iteration 0 train_acc 0.09999999403953552 test_acc 0.09650000184774399 train_mse 10.0
batches 0-13 done
beta = 1

iteration 1 train_acc 0.8500399589538574 test_acc 0.7765000462532043 train_mse 1.6379437446594238
batches 0-13 done
beta = 1

iteration 2 train_acc 0.9042399525642395 test_acc 0.8055000305175781 train_mse 1.3548200130462646
batches 0-13 done
beta = 1

iteration 3 train_acc 0.9343799948692322 test_acc 0.8195000290870667 train_mse 1.1855255365371704
batches 0-13 done
beta = 1

iteration 4 train_acc 0.9528999924659729 test_acc 0.8255000114440918 train_mse 1.061790108680725
batches 0-13 done
beta = 1

iteration 5 train_acc 0.9666599631309509 test_acc 0.8335000276565552 train_mse 0.9627645015716553
batches 0-13 done
beta = 1

iteration 6 train_acc 0.9761599898338318 test_acc 0.8350000381469727 train_mse 0.880385160446167
batches 0-13 done
beta = 1

iteration 7 train_acc 0.9825399518013 test_acc 0.8390000462532043 train_mse 0.8100934028625488
batches 0-13 done
beta = 1

iterat

KeyboardInterrupt: 

In [42]:
%%time

# Pull every array out inside the `with` block so the .npz archive handle is
# closed as soon as the data is in memory instead of staying open.
with np.load('../data/myrtle7_kernels.npz') as kernels_myrtle7:
    train_kernel = kernels_myrtle7['train_kernel']
    test_kernel  = kernels_myrtle7['test_kernel']
    labels_train = kernels_myrtle7['labels_train']
    labels_test  = kernels_myrtle7['labels_test']

# Kernels stay as NumPy arrays; only the labels and targets go to `device`.
labels_train = torch.from_numpy(labels_train).to(device)
labels_test  = torch.from_numpy(labels_test).to(device)
y_train = to_one_hot(labels_train, num_classes).to(device)

CPU times: user 31.1 s, sys: 21 s, total: 52 s
Wall time: 15min 20s


In [43]:
# Show the top-left 4x4 corner of the training kernel.
train_kernel[:4, :4]

array([[0.99999996, 0.99566936, 0.99371328, 0.99647623],
       [0.99566936, 1.        , 0.99317682, 0.99668258],
       [0.99371328, 0.99317682, 0.99999998, 0.99330106],
       [0.99647623, 0.99668258, 0.99330106, 0.99999994]])

In [44]:
%%time

boosting(
    train_kernel, y_train, labels_train, test_kernel, labels_test,
    beta=1,
    n_iter=128,
    lr=1e5,
    block_size=2 * 1280,
)

14
iteration 0 train_acc 0.09999999403953552 test_acc 0.09650000184774399 train_mse 10.0
batches 0-13 done
beta = 1

iteration 1 train_acc 0.8427799940109253 test_acc 0.7780000567436218 train_mse 1.6839958429336548
batches 0-13 done
beta = 1

iteration 2 train_acc 0.8982399702072144 test_acc 0.796500027179718 train_mse 1.4002586603164673
batches 0-13 done
beta = 1

iteration 3 train_acc 0.9285399913787842 test_acc 0.8105000257492065 train_mse 1.2289040088653564
batches 0-13 done
beta = 1

iteration 4 train_acc 0.9489599466323853 test_acc 0.8230000138282776 train_mse 1.1028614044189453
batches 0-13 done
beta = 1

iteration 5 train_acc 0.9634999632835388 test_acc 0.8255000114440918 train_mse 1.0025246143341064
batches 0-13 done
beta = 1

iteration 6 train_acc 0.9734799861907959 test_acc 0.8225000500679016 train_mse 0.9193209409713745
batches 0-13 done
beta = 1

iteration 7 train_acc 0.9807599782943726 test_acc 0.8315000534057617 train_mse 0.8478538990020752
batches 0-13 done
beta = 1

it