In [1]:
import numpy as np
import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets
import torchvision.transforms as transforms

import matplotlib.pyplot as plt
%matplotlib inline

# SECURITY: replaces the default HTTPS context factory with the *unverified* one, so
# certificate checks are skipped for code in this process that relies on that default.
# NOTE(review): presumably a workaround for certificate errors during the dataset
# download below; confirm, and prefer fixing the local CA store over shipping this.
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

In [2]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## 1. Implement a PyTorch model with Adam on FashionMnist

There are many optimizers that employ adaptive learning rates to account for the different learning-rate needs at different phases of training. Your job is to first implement such an optimizer and evaluate its performance. For now, we are using Adam.

### a. Download Data FashionMnist

In [3]:
%%capture
transform_method = transforms.Compose([
    transforms.ToTensor()
])

train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_method)
test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_method)

In [4]:
batch_size = 128

# Shuffle only the training stream; the test set is iterated in a fixed order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

# Disabled preview code, parked in a bare string literal so it never executes.
# NOTE(review): if re-enabled as written it would iterate over every training batch
# and draw only channel 0 of each batch's first image; stop after one batch
# (e.g. `break`) before turning it back on.
"""
for test_images, test_labels in train_loader:  
    sample_image = test_images[0]
    sample_label = test_labels[0]
    plt.imshow(sample_image[0], cmap='gray')
"""
""

''

### b. Implement a Logistic Regression Model and Fit the Data with Adam

In [5]:
def _model_device(model):
    """Return the device of the model's first parameter, or CPU if it has none."""
    first_param = next(model.parameters(), None)
    return torch.device("cpu") if first_param is None else first_param.device


def evaluate(model, test_loader, criterion):
    """Measure the mean loss and the accuracy of ``model`` over ``test_loader``.

    Args:
        model: ``torch.nn.Module`` mapping a batch of inputs to class scores of
            shape ``(batch, num_classes)``.
        test_loader: Iterable yielding ``(inputs, labels)`` batches.
        criterion: Callable ``criterion(outputs, labels)`` returning a scalar
            loss; assumed to average over the batch (PyTorch's default
            ``reduction='mean'``).

    Returns:
        ``(mean_loss, accuracy)`` where ``accuracy`` is a percentage in
        ``[0, 100]``. Both are ``0.0`` when the loader yields no samples.

    The model is left in evaluation mode.
    """
    model.eval()
    target_device = _model_device(model)

    loss_sum, num_correct, num_seen = 0.0, 0, 0
    # No gradients are needed for evaluation; skipping them saves memory and time.
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(target_device), labels.to(target_device)
            outputs = model(inputs)
            batch_len = labels.size(0)
            # Weight each batch mean by its size so a short final batch is not over-counted.
            loss_sum += criterion(outputs, labels).item() * batch_len
            num_correct += (outputs.argmax(dim=1) == labels).sum().item()
            num_seen += batch_len

    if num_seen == 0:
        return 0.0, 0.0
    return loss_sum / num_seen, 100.0 * num_correct / num_seen


def train(model, train_loader, test_loader, optimizer, criterion, epochs):
    """Fit ``model`` for ``epochs`` passes and evaluate it after every pass.

    Args:
        model: ``torch.nn.Module`` to optimise in place.
        train_loader: Iterable yielding ``(inputs, labels)`` training batches.
        test_loader: Iterable yielding ``(inputs, labels)`` evaluation batches.
        optimizer: Optimizer already bound to ``model``'s parameters.
        criterion: Callable ``criterion(outputs, labels)`` returning a scalar
            loss; assumed to average over the batch.
        epochs: Number of passes over ``train_loader``.

    Returns:
        Dict with the keys ``'train_loss'``, ``'train_acc'``, ``'test_loss'``
        and ``'test_acc'``, each a list holding one float per epoch. The
        training figures are running values gathered while the weights are
        still changing within the epoch.
    """
    model.train()
    target_device = _model_device(model)
    history = {"train_loss": [], "train_acc": [], "test_loss": [], "test_acc": []}

    for epoch in range(epochs):
        # evaluate() switches to eval mode, so re-enter train mode every epoch.
        model.train()
        loss_sum, num_correct, num_seen = 0.0, 0, 0

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(target_device), labels.to(target_device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            batch_len = labels.size(0)
            loss_sum += loss.item() * batch_len
            num_correct += (outputs.argmax(dim=1) == labels).sum().item()
            num_seen += batch_len

        train_loss = loss_sum / num_seen if num_seen else 0.0
        train_acc = 100.0 * num_correct / num_seen if num_seen else 0.0
        test_loss, test_acc = evaluate(model, test_loader, criterion)

        history["train_loss"].append(train_loss)
        history["train_acc"].append(train_acc)
        history["test_loss"].append(test_loss)
        history["test_acc"].append(test_acc)

        print('epoch {}: train_loss {:.4f}, train_acc {:.2f}, test_loss {:.4f}, test_acc {:.2f}'.format(
            epoch, train_loss, train_acc, test_loss, test_acc))

    return history

## 2. Implement Adam with HD, and compare it with vanilla Adam

## 3. Compare MARTHE on Adam to AdamHD
Implement MARTHE. Use it as the learning-rate schedule that tunes the learning rate of Adam, and compare the results to Adam with HD.

In [6]:
# Disabled sys.path setup, kept as a bare string literal so it never executes.
# NOTE(review): the directory it would append is hard-coded with Windows path
# separators; the `adatune` imports below presumably rely on the package being
# importable some other way — confirm before deleting or re-enabling this.
"""
import sys  
import os
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    # Change the line below to include the directory you are currently in
    sys.path.append(module_path+"\\project\\problem set")
print(sys.path)"""

from adatune.data_loader import *
from adatune.mu_adam import MuAdam
from adatune.mu_sgd import MuSGD
from adatune.network import *
from adatune.utils import *

In [7]:
def train_rtho(network_name, dataset, num_epoch, batch_size, optim_name, lr, momentum, wd, hyper_lr, alpha,
               grad_clipping, first_order, seed, mu=1.0):
    """Train a network while a hyper-optimizer adjusts the base optimizer online.

    For every mini-batch the function computes the training loss, takes its
    gradient with ``create_graph=True``, hands that gradient to
    ``hyper_optim.compute_hg``, performs a regular optimizer step and finally
    calls ``hyper_optim.hyper_step`` with a gradient obtained from
    ``ValidationGradient``. After each epoch it prints the training accuracy,
    the validation accuracy, the best validation accuracy so far and the
    learning rate currently stored in the optimizer.

    Args:
        network_name: Architecture identifier forwarded to ``network`` and
            ``data_loader``.
        dataset: Dataset identifier forwarded to ``network`` and ``data_loader``.
        num_epoch: Number of passes over the training data.
        batch_size: Mini-batch size forwarded to ``data_loader``.
        optim_name: ``'adam'`` selects ``optim.Adam`` with ``MuAdam``; any other
            value selects ``optim.SGD`` with ``MuSGD``.
        lr: Initial learning rate of the base optimizer.
        momentum: Momentum of the SGD optimizer (not used on the Adam path).
        wd: Weight decay of the base optimizer.
        hyper_lr: Forwarded to the hyper-optimizer.
        alpha: Forwarded to the hyper-optimizer.
        grad_clipping: Forwarded to the hyper-optimizer.
        first_order: Forwarded to the hyper-optimizer.
        seed: Seed given to ``torch.manual_seed``.
        mu: Forwarded to the hyper-optimizer.

    Returns:
        None. Progress is reported through ``print``.
    """
    torch.manual_seed(seed)

    # Prefer CUDA (ResNet-sized models are impractically slow on CPU), but fall
    # back to the CPU instead of crashing on machines without a GPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    net = network(network_name, dataset)
    net.to(device).apply(init_weights)

    criterion = nn.CrossEntropyLoss().to(device)
    best_val_accuracy = 0.0

    # Pass the architecture *name*, mirroring the `network(network_name, dataset)` call
    # above; the earlier code passed the `network` factory function itself.
    # NOTE(review): confirm against `adatune.data_loader.data_loader`'s signature.
    train_data, test_data = data_loader(network_name, dataset, batch_size)

    if optim_name == 'adam':
        optimizer = optim.Adam(net.parameters(), lr=lr, weight_decay=wd, eps=1e-4)
        hyper_optim = MuAdam(optimizer, hyper_lr, grad_clipping, first_order, mu, alpha, device)
    else:
        optimizer = optim.SGD(net.parameters(), lr=lr, momentum=momentum, weight_decay=wd)
        hyper_optim = MuSGD(optimizer, hyper_lr, grad_clipping, first_order, mu, alpha, device)

    # NOTE(review): the hyper-gradient signal comes from `test_data`, the same split that
    # is reported as validation accuracy below, so that accuracy is not a held-out estimate.
    vg = ValidationGradient(test_data, nn.CrossEntropyLoss(), device)
    for epoch in range(num_epoch):
        train_correct = 0
        train_loss = 0

        for inputs, labels in train_data:
            net.train()

            inputs, labels = inputs.to(device), labels.to(device)
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            train_loss += loss.item()

            train_pred = outputs.argmax(1)
            train_correct += train_pred.eq(labels).sum().item()

            # create_graph=True keeps the gradient differentiable so the hyper-optimizer
            # can differentiate through it.
            first_grad = ag.grad(loss, net.parameters(), create_graph=True, retain_graph=True)

            hyper_optim.compute_hg(net, first_grad)

            # Install the gradients by hand: they were produced by ag.grad, not backward().
            for params, gradients in zip(net.parameters(), first_grad):
                params.grad = gradients

            optimizer.step()
            hyper_optim.hyper_step(vg.val_grad(net))
            clear_grad(net)

        train_acc = 100.0 * (train_correct / len(train_data.dataset))
        val_loss, val_acc = compute_loss_accuracy(net, test_data, criterion, device)

        if val_acc > best_val_accuracy:
            best_val_accuracy = val_acc

        print('train_accuracy at epoch :{} is : {}'.format(epoch, train_acc))
        print('val_accuracy at epoch :{} is : {}'.format(epoch, val_acc))
        print('best val_accuracy is : {}'.format(best_val_accuracy))

        # Report the learning rate of the last parameter group (0.0 if there are none).
        cur_lr = 0.0
        for param_group in optimizer.param_groups:
            cur_lr = param_group['lr']
        print('learning_rate after epoch :{} is : {}'.format(epoch, cur_lr))

In [8]:
#train_with_marthe(model3, 0.001, train_loader, test_loader, 10)
# Launch the ResNet / CIFAR-10 run; arguments are passed by keyword so each value is self-describing.
train_rtho(
    network_name="resnet",
    dataset="cifar_10",
    num_epoch=10,
    batch_size=16,
    optim_name="adam",
    lr=0.0001,
    momentum=0.9,
    wd=0,
    hyper_lr=0.0001,
    alpha=1e-6,
    grad_clipping=100.0,
    first_order=False,
    seed=42,
)

Files already downloaded and verified


RuntimeError: CUDA out of memory. Tried to allocate 2.00 MiB (GPU 0; 4.00 GiB total capacity; 914.10 MiB already allocated; 0 bytes free; 958.00 MiB reserved in total by PyTorch)

## 4. Task for Graduate Students
### a. Repeat tasks 1 and 2 on VGG. Compare the performance gain of AdamHD over Adam on this non-convex optimization problem.
Comment on your findings and explain.