In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
from keras.utils import np_utils

# Apply matplotlib's "ggplot" style sheet to every figure drawn in this notebook.
plt.style.use("ggplot")
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Load MNIST from the local "MNIST_data/" directory with one-hot encoded labels.
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)

# Layer widths are read off the training arrays instead of being typed in by hand.
n_dimensions = mnist.train.images.shape[1]  # features per example
n_classes = mnist.train.labels.shape[1]     # width of a one-hot label row

# Optimiser settings.
lr = 1e-3        # step size multiplying every RMSprop update
beta = 0.9       # weight on the previous running average of squared gradients
epsilon = 1e-8   # added under the square root so we never divide by zero

# Training-loop settings.
batch_size = 32  # examples drawn per mini-batch
n_epochs = 1000  # loop iterations; each one consumes a single mini-batch

In [None]:
# Build the whole computation graph for a hand-rolled RMSprop step on a single
# dense layer. Start from an empty default graph so that re-running this cell
# does not pile duplicate ops/variables on top of the previous ones.
tf.reset_default_graph()

with tf.variable_scope("inputs"):
    # Mini-batch of inputs and their one-hot labels; batch dimension is left open.
    X_ = tf.placeholder(tf.float32, [None, n_dimensions], name="X")
    y_ = tf.placeholder(tf.float32, [None, n_classes], name="y")

with tf.variable_scope("model"):
    # NOTE(review): ReLU is applied to the output layer before softmax, which
    # clamps the logits to be non-negative - confirm this is intentional.
    fc1 = tf.layers.dense(inputs=X_, units=n_classes, name='fc1', activation=tf.nn.relu)

# Re-enter the layer's scope to get a handle on its weight matrix.
with tf.variable_scope('model/fc1', reuse=True):
    w = tf.get_variable('kernel')

probs = tf.nn.softmax(fc1)
# NOTE(review): log_loss is applied element-wise to the softmax outputs -
# confirm this is preferred over a categorical cross-entropy.
loss = tf.losses.log_loss(labels=y_, predictions=probs)
grad = tf.gradients(loss, w)[0]  # raw gradient of the loss w.r.t. the kernel

with tf.variable_scope("rms"):
    # Running average of squared gradients. Its shape is derived from the same
    # size variables used for the placeholders and the layer, so it keeps
    # matching the kernel if the data dimensions change (was a literal [784, 10]).
    v = tf.Variable(tf.zeros([n_dimensions, n_classes]), name="squared_gradients")
    g2 = grad * grad  # element-wise square
    update_v = v.assign(beta * v + (1 - beta) * g2)  # blend old average with new squares

# Current gradient scaled by the root of the running average.
rms_update = grad / tf.sqrt(v + epsilon)
# Only the kernel is stepped here; the layer's bias is never assigned to.
update_w = w.assign(w - lr * rms_update)

init = tf.global_variables_initializer()

In [None]:
# Run the hand-written RMSprop updates and record per-step diagnostics.
ls_loss_ = []     # loss value fetched on every step
ls_rms_term = []  # RMSprop update term fetched on every step (was never initialised -> NameError)

with tf.Session() as sess:
    sess.run(init)

    # Despite the name, n_epochs counts loop iterations: one mini-batch per pass.
    for epoch in tqdm(range(n_epochs)):
        batch_xs, batch_ys = mnist.train.next_batch(batch_size)
        feed = {X_: batch_xs, y_: batch_ys}  # same batch feeds both runs below

        # First, refresh the running average of squared gradients. This is kept
        # as its own run so that v is already updated when the step is computed.
        sess.run(update_v, feed_dict=feed)

        # Then take the weight step and fetch the loss and the update term.
        _, loss_, step_ = sess.run([update_w, loss, rms_update], feed_dict=feed)
        ls_loss_.append(loss_)
        ls_rms_term.append(step_)

In [None]:
# Smooth the recorded losses with a 10-point moving average, then plot the curve.
rolling_plot = pd.Series(ls_loss_).rolling(window=10).mean()

fig, ax = plt.subplots()
ax.plot(rolling_plot)
ax.set_xlabel("epochs")
ax.set_ylabel("loss")
ax.set_title("rmsprop loss")