### Train optimizer
Notebook for training a model with different optimizers so that their results can be compared.

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits to the project's .py files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pickle
import os
import sys
import time
import torch
import torch.optim as optim
import torch.nn as nn
from tqdm.notebook import trange, tqdm

In [3]:
# Google Colab only: mount Google Drive and make the project sources importable.
# On any other runtime the `google.colab` package is missing, so this cell turns
# into a no-op instead of raising ImportError (keeps "Restart & Run All" working).
COLAB_PROJECT_DIR = '/content/drive/MyDrive/Colab_Notebooks/OptiML/OptiForML2022'

try:
    from google.colab import drive
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True
    drive.mount('/content/drive')
    # Avoid stacking duplicate entries when the cell is executed more than once.
    if COLAB_PROJECT_DIR not in sys.path:
        sys.path.append(COLAB_PROJECT_DIR)

Mounted at /content/drive


In [4]:
from optimizer.AdaHessian import AdaHessian
from optimizer.Atmo import Atmo, MASScheduler
from optimizer.AdaSGD import AdaSGD, AdaSGDscheduler
from dataset import ImagesDataset
from model import ResNet18
from path import TRAIN_HISTORY_DIR, TRAIN_MODEL_DIR

In [5]:
# ---- dataset selection ----
# Uncomment exactly one `data_name`; `full` and `tiny` are forwarded unchanged
# to ImagesDataset when the data is loaded.
# data_name = "MNIST"
data_name = "cifar"
full, tiny = True, False

# ---- optimizer selection ----
# Other accepted values: "adam", "sgd", "atmo", "dynamicAtom", "adaHessian"
optimizer_name = "adaSGD"

# ---- learning-rate scheduler selection ----
# Other accepted values: "cosineAnnealinglr", "steplr", "multiSteplr"
scheduler_name = None

# ---- number of training epochs ----
epochs = 100

In [6]:
# Build the train / test datasets and wrap them in mini-batch loaders.
use_cifar = (data_name == "cifar")
train_dataset = ImagesDataset(full=full, tiny=tiny, cifar=use_cifar)
test_dataset = ImagesDataset(full=full, tiny=tiny, cifar=use_cifar, test=True)

batch_size = 256
# Training batches are reshuffled every epoch.
trainDataLoader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Evaluation does not benefit from shuffling; a fixed order keeps the
# per-batch-averaged test metrics deterministic for a given set of weights.
testDataLoader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

* Using CIFAR
Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar10/cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]

Extracting ./data/cifar10/cifar-10-python.tar.gz to ./data/cifar10/
** Use 50000 train samples
* Using CIFAR
Files already downloaded and verified
** Use 10000 test samples


In [7]:
# ---- model ----
# Single input channel for MNIST, three channels for every other dataset.
model = ResNet18(in_channel=1 if data_name == "MNIST" else 3)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model.train()
model.to(device)
# To resume from a checkpoint, uncomment:
#model.load_state_dict(torch.load(os.path.join(TRAIN_MODEL_DIR, f"{optimizer_name}_{scheduler_name}_{epochs}epochs_{data_name}.pt")))

# Optional per-epoch scheduler attached to the custom optimizers; it is stepped
# once per epoch by the training loop when it is not None.
dynamic = None

# ---- optimizer ----
# An exclusive chain with an explicit failure: a misspelled name must not leave
# `optimizer` undefined or silently reuse one left over from an earlier run.
if optimizer_name == "adam":
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
elif optimizer_name == "sgd":
    optimizer = optim.SGD(model.parameters(), lr=1e-3)
elif optimizer_name == "atmo":
    optimizer = Atmo(model.parameters())
elif optimizer_name == "dynamicAtom":
    optimizer = Atmo(model.parameters(), adam_w=1, sgd_w=0)
    dynamic = MASScheduler(optimizer, epochs=epochs)
elif optimizer_name == "adaHessian":
    optimizer = AdaHessian(model.parameters(), lr=0.15)
elif optimizer_name == "adaSGD":
    optimizer = AdaSGD(model.parameters(), lr=0.1, ada_w=1, sgd_w=0)
    # NOTE(review): the previous comment here said "100 epochs with both optim
    # then only sgd", but the value passed is 60 - confirm the intended length.
    dynamic = AdaSGDscheduler(optimizer, epochs=60)
else:
    raise ValueError(f"Unknown optimizer_name: {optimizer_name!r}")

# ---- learning-rate scheduler ----
# Stays None when no scheduler is selected; the training loop only steps it
# when `scheduler_name` is not None.
scheduler = None
if scheduler_name is None:
    pass
elif scheduler_name == "cosineAnnealinglr":
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, epochs, verbose=True)
elif scheduler_name == "steplr":
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=150, verbose=True)
elif scheduler_name == "multiSteplr":
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[80, 120], gamma=0.1, verbose=True)
else:
    raise ValueError(f"Unknown scheduler_name: {scheduler_name!r}")

# ---- loss ----
criterion = nn.CrossEntropyLoss()
# To resume with a previously saved history, uncomment:
#with open(os.path.join(TRAIN_HISTORY_DIR, f"log-{optimizer_name}_{scheduler_name}_{epochs}epochs_{data_name}.pickle"), 'rb') as f:
#    history = pickle.load(f)

cuda


In [8]:
# Per-epoch metric history. When resuming an interrupted run, initialise these
# from the reloaded `history` dict (see the trailing comments) instead.
train_loss = []  # history['train_loss']
train_acc = []   # history['train_acc']
test_loss = []   # history['test_loss']
test_acc = []    # history['test_acc']
time_epoch = []  # history['time_epoch']
epochs_done = 1  # len(train_loss)

# backward() is asked to build a differentiable graph only for these two
# optimizers (presumably because they differentiate through the gradients -
# confirm against the optimizer implementations).
needs_grad_graph = optimizer_name in ("adaHessian", "adaSGD")

with trange(epochs_done, epochs + 1, desc='Training', unit='epoch') as t:
    for epoch in t:
        start_time = time.time()

        # ---------------- training pass ----------------
        # Re-enter training mode: the evaluation pass below switches to eval().
        model.train()
        losses = []
        acc = []
        with tqdm(trainDataLoader, desc=f'Train epoch {epoch}',
                  unit='batch', leave=False) as t1:
            for x_train, y_train in t1:
                x_train = x_train.to(device)
                y_train = y_train.to(device)

                optimizer.zero_grad()
                output = model(x_train)
                loss = criterion(output, y_train)
                loss.backward(create_graph=needs_grad_graph)
                optimizer.step()

                # Keep only the value: storing `loss` itself would keep every
                # batch's autograd graph alive for as long as the history lives.
                losses.append(loss.detach())
                pred = torch.argmax(output, dim=1)
                acc.append((pred == y_train).sum().item() / pred.shape[0])

        # The optimizer-specific scheduler advances once per epoch.
        if dynamic is not None:
            dynamic.step()

        # Unweighted mean over batches (a smaller last batch counts as much as a full one).
        train_loss.append(sum(losses) / len(losses))
        train_acc.append(sum(acc) / len(acc))

        # ---------------- evaluation pass ----------------
        # Evaluation mode, so that mode-dependent layers (e.g. BatchNorm or
        # Dropout, if the model defined elsewhere contains any) use their
        # inference behaviour and are not updated by test data.
        model.eval()
        losses = []
        acc = []
        with torch.no_grad():
            with tqdm(testDataLoader, desc=f'Test epoch {epoch}',
                      unit='batch', leave=False) as t1:
                for x_test, y_test in t1:
                    x_test = x_test.to(device)
                    y_test = y_test.to(device)

                    output = model(x_test)
                    loss = criterion(output, y_test)
                    losses.append(loss)

                    pred = torch.argmax(output, dim=1)
                    acc.append((pred == y_test).sum().item() / pred.shape[0])

        test_loss.append(sum(losses) / len(losses))
        test_acc.append(sum(acc) / len(acc))

        # Wall-clock time of the whole epoch, evaluation included.
        end_time = time.time()
        time_epoch.append(end_time - start_time)

        if scheduler_name is not None:
            scheduler.step()

        # Rebuild the summary every epoch so it stays usable after an interrupt.
        history = dict()
        history["train_loss"] = train_loss
        history["train_acc"] = train_acc
        history["test_loss"] = test_loss
        history["test_acc"] = test_acc
        history["time_epoch"] = time_epoch
        history["data"] = data_name
        history["model"] = "resnet18"
        history["epochs"] = epochs
        history["optimizer"] = optimizer_name
        # Optional per-epoch checkpointing of history and weights:
        #with open(os.path.join(TRAIN_HISTORY_DIR, f"log-{optimizer_name}_{scheduler_name}_{epochs}epochs_{data_name}.pickle"), 'wb') as f:
        #  pickle.dump(history, f)
        #weights = model.state_dict()
        #torch.save(weights,os.path.join(TRAIN_MODEL_DIR, f"{optimizer_name}_{scheduler_name}_{epochs}epochs_{data_name}.pt"))

# Mean time per epoch. This rebinds `time_epoch` from a list to a float and is
# only reached when the loop finishes without being interrupted.
time_epoch = sum(time_epoch) / len(time_epoch)

Training:   0%|          | 0/100 [00:00<?, ?epoch/s]

Train epoch 1:   0%|          | 0/196 [00:00<?, ?batch/s]

  allow_unreachable=True, accumulate_grad=True)  # Calls into the C++ engine to run the backward pass


Test epoch 1:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 2:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 2:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 3:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 3:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 4:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 4:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 5:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 5:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 6:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 6:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 7:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 7:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 8:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 8:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 9:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 9:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 10:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 10:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 11:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 11:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 12:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 12:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 13:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 13:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 14:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 14:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 15:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 15:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 16:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 16:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 17:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 17:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 18:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 18:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 19:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 19:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 20:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 20:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 21:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 21:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 22:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 22:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 23:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 23:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 24:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 24:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 25:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 25:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 26:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 26:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 27:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 27:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 28:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 28:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 29:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 29:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 30:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 30:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 31:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 31:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 32:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 32:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 33:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 33:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 34:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 34:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 35:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 35:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 36:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 36:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 37:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 37:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 38:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 38:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 39:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 39:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 40:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 40:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 41:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 41:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 42:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 42:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 43:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 43:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 44:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 44:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 45:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 45:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 46:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 46:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 47:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 47:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 48:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 48:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 49:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 49:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 50:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 50:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 51:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 51:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 52:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 52:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 53:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 53:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 54:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 54:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 55:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 55:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 56:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 56:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 57:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 57:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 58:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 58:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 59:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 59:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 60:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 60:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 61:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 61:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 62:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 62:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 63:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 63:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 64:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 64:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 65:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 65:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 66:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 66:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 67:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 67:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 68:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 68:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 69:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 69:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 70:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 70:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 71:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 71:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 72:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 72:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 73:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 73:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 74:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 74:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 75:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 75:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 76:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 76:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 77:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 77:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 78:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 78:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 79:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 79:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 80:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 80:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 81:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 81:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 82:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 82:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 83:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 83:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 84:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 84:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 85:   0%|          | 0/196 [00:00<?, ?batch/s]

Test epoch 85:   0%|          | 0/40 [00:00<?, ?batch/s]

Train epoch 86:   0%|          | 0/196 [00:00<?, ?batch/s]

KeyboardInterrupt: ignored

In [9]:
# Best and worst test accuracy over the epochs recorded in `history`.
recorded_test_acc = history['test_acc']
print(max(recorded_test_acc), min(recorded_test_acc))

0.86611328125 0.57626953125


In [None]:
# Run this cell to save the training history.
# The summary is rebuilt from the current metric variables and pickled into
# TRAIN_HISTORY_DIR under a name derived from optimizer, epoch budget and dataset.
history = {
    "train_loss": train_loss,
    "train_acc": train_acc,
    "test_loss": test_loss,
    "test_acc": test_acc,
    "time_epoch": time_epoch,
    "data": data_name,
    "model": "resnet18",
    "epochs": epochs,
    "optimizer": optimizer_name,
}
history_path = os.path.join(TRAIN_HISTORY_DIR, f"log-{optimizer_name}_{epochs}epochs_{data_name}.pickle")
with open(history_path, 'wb') as f:
    pickle.dump(history, f)

In [None]:
# Run this cell to save the model weights (state_dict only) into TRAIN_MODEL_DIR.
model_path = os.path.join(TRAIN_MODEL_DIR, f"{optimizer_name}_{epochs}epochs_{data_name}.pt")
weights = model.state_dict()
torch.save(weights, model_path)