In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
from keras.utils import np_utils

# Apply the "fivethirtyeight" matplotlib style sheet to every figure drawn below.
plt.style.use("fivethirtyeight")
# IPython line magic selecting the inline matplotlib backend for this notebook.
%matplotlib inline

In [None]:
# Load the MNIST splits from MNIST_data/ with one-hot encoded labels.
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)

# Model architecture, read off the training arrays:
# number of features per image and number of label columns.
n_dimensions = mnist.train.images.shape[1]
n_classes = mnist.train.labels.shape[1]

# Adam hyperparameters
alpha = 0.01    # step size
beta_1 = 0.9    # decay rate of the 1st-moment moving average
# Decay rate of the 2nd-moment moving average. beta_2 > beta_1, so v_t averages
# over a much longer history while m_t reacts faster to changes in gradient direction.
beta_2 = 0.999
epsilon = 1e-8  # small constant so the update never divides by zero

In [None]:
# Training-loop settings: number of loop iterations and examples per minibatch.
n_epochs, batch_size = 1000, 32
# Step size multiplied into the parameter update.
lr = 0.01

In [None]:
tf.reset_default_graph()

# Placeholders for one minibatch of inputs and their label rows.
with tf.variable_scope("inputs"):
    X_ = tf.placeholder(tf.float32, [None, n_dimensions], name="X")
    y_ = tf.placeholder(tf.float32, [None, n_classes], name="y")

# A single dense layer mapping the inputs straight to n_classes scores.
# NOTE(review): ReLU is applied to the scores that feed the softmax below — confirm this is intended.
with tf.variable_scope("model"):
    fc1 = tf.layers.dense(inputs=X_, units=n_classes, name='fc1', activation=tf.nn.relu)

# Re-open the layer's scope to get a handle on its weight matrix. Only this
# kernel is updated by the hand-written Adam step defined below.
with tf.variable_scope('model/fc1', reuse=True):
    w1 = tf.get_variable('kernel')

probs = tf.nn.softmax(fc1)
loss = tf.losses.log_loss(labels=y_, predictions=probs)
grad = tf.gradients(loss, w1)[0]  # raw gradients of loss function wrt params

# initialize moments of gradients to zero; they have the same shape as the kernel
moment_shape = [n_dimensions, n_classes]
m_t = tf.Variable(tf.zeros(moment_shape), name="first_moment")
v_t = tf.Variable(tf.zeros(moment_shape), name="second_moment")

# Adam timestep t. It is incremented together with every moment update so the
# bias correction below always uses the number of updates applied so far.
adam_t = tf.Variable(0.0, trainable=False, name="timestep")
adam_t_update = adam_t.assign_add(1.0)

with tf.variable_scope("first_moment"), tf.control_dependencies([adam_t_update]):
    # calculate moving average of gradient's 1st moment - this acts as a momentum term
    # if the gradient points in the same direction, a momentum effect enlarges gradient
    m_t_update = m_t.assign(beta_1 * m_t + (1 - beta_1) * grad)

    # adjust for bias, which was introduced when m_t was initialized with zeros:
    # m_hat = m_t / (1 - beta_1^t). The correction factor shrinks towards 1 as t grows.
    m_t_adjusted = m_t_update / (1.0 - tf.pow(beta_1, adam_t_update))

with tf.variable_scope("second_moment"), tf.control_dependencies([adam_t_update]):
    # calculate moving average of gradient's 2nd moment
    g2 = grad * grad  # element wise multiplication of gradient
    v_t_update = v_t.assign(beta_2 * v_t + (1 - beta_2) * (g2))

    # adjust for bias, which was introduced when v_t was initialized with zeros:
    # v_hat = v_t / (1 - beta_2^t)
    v_t_adjusted = v_t_update / (1.0 - tf.pow(beta_2, adam_t_update))

# when gradients from successive minibatches point in different directions, m_t will approach zero
adam_update = m_t_adjusted / (tf.sqrt(v_t_adjusted) + epsilon)  # this is the parameter update vector
# Fetching adam_step runs the whole chain: timestep increment, both moment
# updates, bias correction and finally the kernel assignment.
adam_step = w1.assign(w1 - lr * adam_update)

init = tf.global_variables_initializer()

In [None]:
ls_adam_loss_ = []  # loss recorded once per minibatch step
ls_first_ = []  # track first moment (m_t) after each step

with tf.Session() as sess:
    sess.run(init)

    # Each iteration consumes a single minibatch, so n_epochs counts
    # minibatch steps rather than full passes over the training set.
    for step in range(n_epochs):
        batch_xs, batch_ys = mnist.train.next_batch(batch_size)

        # adam_step is built on top of m_t_update and v_t_update, so fetching it
        # already refreshes both moments exactly once for this minibatch. A
        # separate run of the moment updates beforehand would apply them twice.
        # m_t_update is fetched so the tracked value is the first moment itself.
        _, loss_, m_t_ = sess.run(
            [adam_step, loss, m_t_update],
            feed_dict={X_: batch_xs, y_: batch_ys},
        )
        ls_adam_loss_.append(loss_)
        ls_first_.append(m_t_)
        
# pretty plot: smooth the per-step loss with a 50-step rolling mean
# (the first 49 points are NaN and therefore not drawn)
rolling_plot = pd.Series(ls_adam_loss_).rolling(window=50).mean()

# Draw on an explicit figure/axes instead of the implicit current axes.
fig, ax = plt.subplots()
ax.plot(rolling_plot)
# One loss value is appended per minibatch iteration, so the x axis counts steps.
ax.set_xlabel("training step")
ax.set_ylabel("loss")
ax.set_title("adam loss")

In [None]:
# inspect magnitude of momentum vector: one norm per recorded training step
ls_magnitude = [np.linalg.norm(x) for x in ls_first_]

# Smooth with the same 50-step rolling mean used for the loss curve.
rolling_plot = pd.Series(ls_magnitude).rolling(window=50).mean()

# Draw on an explicit figure/axes instead of the implicit current axes.
fig, ax = plt.subplots()
ax.plot(rolling_plot)
# One entry is appended to ls_first_ per minibatch iteration, so the x axis counts steps.
ax.set_xlabel("training step")
ax.set_ylabel("magnitude of first moment of gradient")
ax.set_title("magnitude of first moment")