In [2]:
%load_ext autoreload
%autoreload 2

from optimizers.perturbed_gd import PerturbedGD
from optimizers.perturbed_agd import PerturbedAGD
from optimizers.cubic_reg import StochasticCubicRegularizedNewton
from models.nn import Net, SimpleNet

import torch
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST
from torchvision.transforms import Compose, ToTensor, Normalize
from torch.nn import MSELoss
import torch.nn.functional as F
import torch.autograd as autograd
from copy import deepcopy
import numpy as np

from matplotlib import pyplot as plt


## Load the dataset

In [12]:
grad_batch_size = 1
hess_batch_size = 1

dataset_train = MNIST('data/',
                      train=True,
                      download=True,
                      transform=Compose([
                            ToTensor(),
                            Normalize((0.1307,), (0.3081,))
                            ]))
dataset_test = MNIST('data/',
          train=False,
          download=True,
          transform=Compose([ToTensor(),
                             Normalize((0.1307,), (0.3081,))
                             ]))

batch_size_test = len(dataset_test)

print("Train dataset size:",len(dataset_train))
print("Test dataset size:",len(dataset_test))
train_loader_grad = torch.utils.data.DataLoader(
    dataset_train,
    batch_size=grad_batch_size, shuffle=True)

train_loader_hess = torch.utils.data.DataLoader(
    dataset_train,
    batch_size=hess_batch_size, shuffle=True)

test_loader = torch.utils.data.DataLoader(
    dataset_test,
  batch_size=batch_size_test, shuffle=True)

Train dataset size: 60000
Test dataset size: 10000


## Perturbed Gradient Descent

In [13]:

model_pgd = Net()
loss_fn_pgd = F.nll_loss

d = np.sum([p.numel() for p in Net().parameters()])

optimizer_pgd = PerturbedGD(model_pgd.parameters(),
                        l=10000,
                        rho=100,
                        epsilon=1e-5,
                        c=100,
                        delta=.1,
                        delta_f=10,
                        d=d)
iter_grad = iter(train_loader_grad)
test_data, test_labels = next(iter(test_loader))
batch_idx = 0
losses_pgd_test = []
losses_pgd_train = []
while True:
    if batch_idx == 40:
        break
        
    nextloss = loss_fn_pgd(model_pgd(test_data), test_labels)
    losses_pgd_test.append(nextloss)
    if optimizer_pgd._is_done:
        print("Optimizer has hit early stopping condition")
        break
    data_for_grad = next(iter_grad, None)
    if data_for_grad is None: 
        print("Exhausted training data -- finished optimization")
        break
    features_g_pgd, labels_g_pgd = data_for_grad
    def closure():
        optimizer_pgd.zero_grad()
        loss_g_pgd = loss_fn_pgd(model_pgd(features_g_pgd), labels_g_pgd)
        loss_g_pgd.backward()
        return loss_g_pgd
    
    train_loss = optimizer_pgd.step(closure)
    losses_pgd_train.append(train_loss)
    
    print("At batch:",batch_idx, "Test loss:",nextloss, "Train loss:",train_loss)
    batch_idx += 1


Using defaults
{   'chi': 126.68306919532642,
    'd': 21840,
    'eta': 0.01,
    'f_thresh': 1.5554077913827313e-13,
    'g_thresh': 6.2310731073563155e-09,
    'r': 6.231073107356315e-13,
    't_thresh': 4006.070396379457}
At batch: 0 Test loss: tensor(2.3129, grad_fn=<NllLossBackward>) Train loss: tensor(2.5996, grad_fn=<NllLossBackward>)
At batch: 1 Test loss: tensor(2.3076, grad_fn=<NllLossBackward>) Train loss: tensor(2.1930, grad_fn=<NllLossBackward>)
At batch: 2 Test loss: tensor(2.3093, grad_fn=<NllLossBackward>) Train loss: tensor(2.1429, grad_fn=<NllLossBackward>)
At batch: 3 Test loss: tensor(2.3092, grad_fn=<NllLossBackward>) Train loss: tensor(2.3448, grad_fn=<NllLossBackward>)
At batch: 4 Test loss: tensor(2.3086, grad_fn=<NllLossBackward>) Train loss: tensor(2.4384, grad_fn=<NllLossBackward>)
At batch: 5 Test loss: tensor(2.3085, grad_fn=<NllLossBackward>) Train loss: tensor(2.3086, grad_fn=<NllLossBackward>)
At batch: 6 Test loss: tensor(2.3072, grad_fn=<NllLossBackwa

## Perturbed Accelerated Gradient Descent

In [14]:

model_apgd = Net()
loss_fn_apgd = F.nll_loss

d = np.sum([p.numel() for p in Net().parameters()])

optimizer_apgd = PerturbedAGD(model_apgd.parameters(),
                        eta=0.01,
                        theta=0.1,
                        gamma=0.1,
                        s=1e5,
                        r=1e-5,
                        T=4000,
                        epsilon=1e-5,
                        add_noise=True,
                        neg_curv_explore=True)
iter_grad = iter(train_loader_grad)
test_data, test_labels = next(iter(test_loader))
batch_idx = 0
losses_apgd_test = []
losses_apgd_train = []
while True:
    if batch_idx == 40:
        break
        
    testloss = loss_fn_apgd(model_apgd(test_data), test_labels)
    losses_apgd_test.append(testloss)
    if optimizer_apgd._is_done:
        print("Optimizer has hit early stopping condition")
        break
    data_for_grad = next(iter_grad, None)
    if data_for_grad is None: 
        print("Exhausted training data -- finished optimization")
        break
    features_g_apgd, labels_g_apgd = data_for_grad
    def closure():
        optimizer_apgd.zero_grad()
        loss_g_apgd = loss_fn_apgd(model_apgd(features_g_apgd), labels_g_apgd)
        loss_g_apgd.backward()
        return loss_g_apgd
    
    train_loss = optimizer_apgd.step(closure)
    losses_apgd_train.append(train_loss)
    
    print("At batch:",batch_idx, "test loss:", testloss, "train loss:", train_loss)
    batch_idx += 1

Using defaults
{   'T': 4000,
    'add_noise': True,
    'epsilon': 1e-05,
    'eta': 0.01,
    'gamma': 0.1,
    'neg_curv_explore': True,
    'r': 1e-05,
    's': 100000.0,
    'theta': 0.1}
Exploring neg curv
At batch: 0 test loss: tensor(2.3121, grad_fn=<NllLossBackward>) train loss: tensor(2.2442, grad_fn=<NllLossBackward>)
Exploring neg curv


TypeError: 'NoneType' object is not iterable

## Stochastic Cubic Regularized Newton's method

In [None]:

model_scrn = Net()
loss_fn_scrn = F.nll_loss

optimizer = StochasticCubicRegularizedNewton(model_scrn.parameters(),
                                             l=100,
                                             rho=100,
                                             epsilon=1e-4,
                                             c_prime=1)


iter_grad = iter(train_loader_grad)
iter_hess = iter(train_loader_hess)
test_data, test_labels = next(iter(test_loader))
batch_idx = 0
losses_scrn_test = []
losses_scrn_train = []
while True:
    testloss = loss_fn_scrn(model_scrn(test_data), test_labels)
    losses_scrn_test.append(testloss)
    if optimizer._is_done:
        break
    optimizer.zero_grad()
    
    data_for_grad = next(iter_grad, None)
    data_for_hess = next(iter_hess, None)
    
    if data_for_grad is None or data_for_hess is None:
        print("Exhausted training data -- finished optimization")
        break
    features_g, labels_g = data_for_grad
    features_h, labels_h = data_for_hess
    
    loss_h = loss_fn_scrn(model_scrn(features_h), labels_h)
    
    flattened_grad_h = []
    for p in model_scrn.parameters():
        flattened_grad_h.append(autograd.grad(loss_h, p, create_graph=True)[0].view(-1))
    flattened_grad_h = torch.cat(flattened_grad_h)
    
    loss_g = loss_fn_scrn(model_scrn(features_g), labels_g)
    loss_g.backward()
    
    optimizer.step(flattened_grad_h)
    losses_scrn_train.append(loss_g)
    
    print("At batch:",batch_idx,"Test loss:",testloss, "Train loss:",loss_g)
    batch_idx += 1
    

In [None]:

# plt.plot(range(len(losses_scrn_test)), losses_scrn_test, 'ks-', label='StochasticCubic')
plt.plot(range(len(losses_pgd_test)), losses_pgd_test, 'b^-', label='PertGD')
plt.plot(range(len(losses_apgd_test)), losses_apgd_test, 'ro-', label='PertAGD')
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.xlabel("Outer loop iteration number")
plt.ylabel("Test Loss")
plt.title("Batch size for gradient=%d and for hessian=%d"%(grad_batch_size, hess_batch_size))


In [None]:

# plt.plot(range(len(losses_scrn_train)), losses_scrn_train, 'ks-', label='StochasticCubic')
plt.plot(range(len(losses_pgd_train)), losses_pgd_train, 'b^-', label='PertGD')
plt.plot(range(len(losses_apgd_train)), losses_apgd_train, 'ro-', label='PertAGD')
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.xlabel("Outer loop iteration number")
plt.ylabel("Train Loss")
plt.title("Batch size for gradient=%d and for hessian=%d"%(grad_batch_size, hess_batch_size))
