In [1]:
""" This notebook contains code for computing variance of gradients for RMSprop for 3 different learning rates.
Results are averaged over 3 random seeds to minimize the variation"""

# Environment setup for Google Colab: verify a GPU is attached, mount Google
# Drive, and move into the notebook folder on Drive.

# Colab-specific line magic that selects the TensorFlow 2.x runtime.
%tensorflow_version 2.x
import tensorflow as tf
# In the visible cells TensorFlow is used only for this GPU check.
device = tf.test.gpu_device_name()
# Fail fast when the expected first GPU is not reported.
if device != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device))

# Mount Google Drive at /content/drive so files stored there become reachable.
from google.colab import drive
drive.mount('/content/drive')


# Change the working directory to the Colab Notebooks folder on Drive.
# NOTE(review): the later `from model/utils/measures import *` presumably
# resolve against this directory -- confirm the modules live here.
%cd /content/drive/My Drive/Colab Notebooks

In [0]:
# Importing Libraries
from __future__ import print_function

import argparse, copy

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from torchvision import datasets, transforms

from model import *
from utils import *
from measures import *

In [0]:
# Train one fresh model per (learning rate, seed) pair with RMSprop and record,
# for every epoch, the train loss, the test loss, their difference and the
# gradient-variance score. Results are collected per learning rate as a list
# holding one per-epoch curve per seed.

# Sweep configuration.
lrs = [3e-6, 9e-6, 3e-5]
seeds = [12345, 1234, 123]

# Model hyperparameters (identical for every run, so they are set once here
# instead of being re-assigned on every loop iteration).
batch_size = 64
epochs = 14
gamma = 0.7      # multiplicative LR decay; StepLR applies it on every scheduler.step()
momentum = 0.9

use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

avg_train_losses_lrs, avg_test_losses_lrs, avg_difference_test_train_lrs, avg_var_grad_lrs  = [], [], [], []

for lr in lrs:
    print ("learning rate::", lr)

    # Per-seed curves for the current learning rate.
    avg_train_losses, avg_test_losses, avg_difference_test_train, avg_var_grad = [], [], [], []

    for seed in seeds:
        print ("seed:::", seed)
        # Seed before building loaders and model so each run starts from the same RNG state.
        torch.manual_seed(seed)

        train_loader, val_loader, test_loader, train_size, val_size = dataloaders(batch_size, use_cuda, seed)
        print (train_size, val_size, len(test_loader.dataset))

        model = Net().to(device)

        train_losses, test_losses = [], []
        var_grad_list = []

        optimizer = optim.RMSprop(model.parameters(), lr=lr, momentum=momentum)
        scheduler = StepLR(optimizer, step_size=1, gamma=gamma)

        for epoch in range(1, epochs + 1):
            train_loss = train(model, device, train_loader, optimizer, epoch, batch_size)
            train_losses.append(train_loss)

            # The loss returned by this call is not recorded; only the gradient
            # statistics are used below.
            # NOTE(review): train_get_grad is defined outside this notebook and
            # receives the optimizer -- confirm whether it also updates the model.
            grad_pass_loss, grad_norms, grad_avg = train_get_grad(model, device, train_loader, optimizer, epoch, batch_size)

            test_loss = test(model, device, test_loader, batch_size)
            test_losses.append(test_loss)
            scheduler.step()

            # computing var_score over one epoch
            variance_score = compute_grad_variance(grad_norms, grad_avg)
            var_grad_list.append(variance_score)

        avg_train_losses.append(train_losses)
        avg_test_losses.append(test_losses)
        avg_var_grad.append(var_grad_list)

        # Per-epoch difference between test and train loss for this seed.
        difference_test_train = np.array(test_losses)  - np.array(train_losses)
        avg_difference_test_train.append(difference_test_train)

    avg_train_losses_lrs.append(avg_train_losses)
    avg_test_losses_lrs.append(avg_test_losses)
    avg_difference_test_train_lrs.append(avg_difference_test_train)
    avg_var_grad_lrs.append(avg_var_grad)



In [0]:
# For each learning rate, average the per-seed curves over the random seeds.
#
# Every entry of the four result lists is replaced in place by its mean along
# axis 0 (the seed axis), so later cells keep reading avg_*_lrs[i].
# The loop covers however many learning rates were collected instead of
# hard-coding indices 0..2.
#
# np.atleast_2d makes the cell safe to re-run when each per-seed curve is a
# 1-D sequence of per-epoch scalars: an already averaged curve is viewed as a
# single row and its mean along axis 0 is the curve itself, rather than being
# collapsed to one scalar.
for lr_idx in range(len(avg_train_losses_lrs)):
    for per_lr_results in (
        avg_train_losses_lrs,
        avg_test_losses_lrs,
        avg_difference_test_train_lrs,
        avg_var_grad_lrs,
    ):
        per_lr_results[lr_idx] = np.mean(np.atleast_2d(per_lr_results[lr_idx]), axis=0)





In [0]:
# Print the seed-averaged curves, one section per learning rate, in the same
# order as the learning rates were trained.
section_headers = (
    "for learning rate - 3e-6::\n",
    "for learning rate - 9e-6::\n",
    "for learning rate - 3e-5::\n",
)

for lr_idx, header in enumerate(section_headers):
    print(header)
    # Caption / result-list pairs, printed in a fixed order for every section.
    report_rows = (
        ("avg_train_losses::\n", avg_train_losses_lrs),
        ("avg_test_losses::\n", avg_test_losses_lrs),
        ("avg_difference_test_train_lrs::\n", avg_difference_test_train_lrs),
        ("avg_var_grad::\n", avg_var_grad_lrs),
    )
    for caption, per_lr_results in report_rows:
        print(caption, per_lr_results[lr_idx])

In [0]:
# Plotting: seed-averaged variance of gradients per epoch, one curve per
# learning rate.
import numpy as np

# NOTE(review): `plt` is not imported in the visible cells; presumably one of
# the star imports exposes matplotlib.pyplot -- confirm.

# The training loop numbers epochs 1..epochs, so the x-axis starts at 1 rather
# than 0 to keep the "Epochs" axis aligned with that numbering.
epoch_axis = np.arange(1, epochs + 1)

# (colour, legend label) for each learning rate, in training order.
curve_styles = [
    ("g", "lr = 3e-6"),
    ("b", "lr = 9e-6"),
    ("r", "lr = 3e-5"),
]
for var_curve, (color, label) in zip(avg_var_grad_lrs, curve_styles):
    plt.plot(epoch_axis, var_curve, marker='o', color=color, label=label)

plt.grid(True, linestyle='-.')
plt.title("Variance of Gradients - RMSPROP")
plt.ylabel("Variance of gradients")
plt.xlabel("Epochs")
plt.legend(loc='upper right')
plt.show()