In [1]:
import tensorflow as tf
import numpy as np
from numpy import linalg as LA
import numpy as np

In [2]:
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('MNIST_data', one_hot=True)

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


In [3]:
# Inputs and outputs

x = tf.placeholder(tf.float32, shape=[784, None])
y_true = tf.placeholder(tf.float32, shape=[10, None])

In [4]:
# Extracting Useful features

# Weight and bias definition
def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)

# Convolution and max pooling definitions
def conv2d(x, W):
    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')

def max_pool_2x2(x):
    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

# Re-shape image in the required format # 28 x 28 x 1
x_image = tf.reshape(x, [-1, 28, 28, 1])

# First Conv + Max-pool layer # 28 x 28 x 32, # 14 x 14 x 32
W_conv1 = weight_variable([5, 5, 1, 32])
b_conv1 = bias_variable([32])
h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
h_pool1 = max_pool_2x2(h_conv1)

# Second Conv + Max-pool layer # 14 x 14 x 64, # 7 x 7 x 64
W_conv2 = weight_variable([5, 5, 32, 64])
b_conv2 = bias_variable([64])
h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
h_pool2 = max_pool_2x2(h_conv2)

# Final fully connected layer to get good features # input: 1 x (7x7x64), output: 1 x 1024
W_fc1 = weight_variable([7 * 7 * 64, 1024])
b_fc1 = bias_variable([1024])
h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)

# Drop-out for the final layer
keep_prob = tf.placeholder(tf.float32)
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

In [5]:
# Final Read-out layer and prediction
W = weight_variable([10, 1024])
b = bias_variable([10,1])
y_pred = tf.matmul(W, tf.transpose(h_fc1_drop)) + b
y_pred.get_shape()

TensorShape([Dimension(10), Dimension(None)])

In [6]:
def tf_frobenius_norm(M):
    return tf.reduce_sum(M ** 2) ** 0.5

# to implement nuclear norm
def tf_nuclear_norm(M):
    st, ut, vt = tf.svd(M,  full_matrices = False)
    #st2 = tf.diag(st)
    #st_r = tf.matmul(ut, tf.matmul(st2, tf.transpose(vt)))
    #print('vish', ut.shape, st2.shape, tf.transpose(vt).shape, st_r.shape)
    
    uk = tf.reshape(ut[:, 0], [10, 1])
    vk = tf.reshape(vt[:, 0], [1, 784])
    sk = tf.matmul(uk, vk)
    #sk = st[0] * sk
    #print(st.shape, ut.shape, vt.shape)
    #print('before', type(sk), sk.shape)
    return sk, _, _, _

# def tf_nuclear_norm_pm(M):
#     un = np.zeros([10, 1])
#     un[0][0] = 1
#     u = tf.constant(un)
#     vn = np.zeros([1, 784])
#     vn[0][0] = 1
#     v = tf.constant(vn)
#     for _ in range(20): 
#         u = tf.matmul(M, x)
#         v = tf.

def Sgdnm(grad, wt):
    return (grad / tf_frobenius_norm(grad))

def Cgd_Fn(grad, wt):
    return ((1 - alpha ) / alpha) * (wt + lam1 * grad / tf_frobenius_norm(grad))

def Cgd_Nn(grad, wt):
    nn, st, st_r, M = tf_nuclear_norm(grad)
    return ((1 - alpha ) / alpha) * (wt - lam2 * nn), st, st_r, M


In [21]:
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_true, logits=y_pred))

# hyper-parameters
# alpha = tf.placeholder_with_default(tf.constant(0.5), tf.constant(0.001).shape)
global_step = tf.Variable(0, trainable=False)
start_train = 0.00001 # Requires very high lambda for Cgd_Fn
# k=1, start_train = 1, decay_rate = 1 ---> 1/t learning rate
k = 1
alpha = tf.train.inverse_time_decay(start_train, global_step, k, 0)
lam1 = tf.placeholder_with_default(tf.constant(4.0), tf.constant(10000.0).shape)
lam2 = tf.placeholder_with_default(tf.constant(4.0), tf.constant(100.0).shape)

# Gradient Descent optimizer
opt = tf.train.GradientDescentOptimizer(learning_rate = alpha)

# Compute the gradients for a list of variables.
grads_and_vars = opt.compute_gradients(loss)

# SGD update
gv_sgd = [(gv[0], gv[1]) for gv in grads_and_vars]
optimizer_gv_sgd = opt.apply_gradients(gv_sgd, global_step=global_step)
g0_sgd = grads_and_vars[0][0]
w0_sgd = grads_and_vars[0][1]
s0_sgd = w0_sgd
w1_sgd = w0_sgd - alpha * s0_sgd


# Normalized SGD update
gv_nsgd = [(Sgdnm(gv[0], gv[1]), gv[1]) for gv in grads_and_vars]
optimizer_gv_nsgd = opt.apply_gradients(gv_nsgd, global_step=global_step)
g0_nsgd = grads_and_vars[0][0]
w0_nsgd = grads_and_vars[0][1]
s0_nsgd = Sgdnm(g0_nsgd, w0_nsgd)
w1_nsgd = w0_nsgd - alpha * s0_nsgd


# CGD with FN
gv_cgd_fn = [(Cgd_Fn(gv[0], gv[1]), gv[1]) for gv in grads_and_vars]
optimizer_gv_cgd_fn = opt.apply_gradients(gv_cgd_fn, global_step=global_step)
g0_cgd_fn = grads_and_vars[0][0]
w0_cgd_fn = grads_and_vars[0][1]
s0_cgd_fn = Cgd_Fn(g0_cgd_fn, w0_cgd_fn)
w1_cgd_fn = w0_cgd_fn - alpha * s0_cgd_fn

# CGD with NN
# g0_cgd_nn = grads_and_vars[0][0]
# w0_cgd_nn = grads_and_vars[0][1]
# s0_cgd_nn, st, st_r, M = Cgd_Nn(g0_cgd_nn, w0_cgd_nn)
# gv_cgd_nn = [(s0_cgd_nn, gv[1]) for gv in grads_and_vars]
# optimizer_gv_cgd_nn = opt.apply_gradients(gv_cgd_nn, global_step=global_step)
# w1_cgd_nn = w0_cgd_nn - alpha * s0_cgd_nn

correct_prediction = tf.equal(tf.argmax(y_pred, 0), tf.argmax(y_true, 0))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

# Normalized gradient descent

In [23]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(10):
        batch = mnist.train.next_batch(100)
        feed_dict_keepall = {x: np.transpose(batch[0]), y_true: np.transpose(batch[1]), keep_prob: 1}
        feed_dict_keepsome = {x: np.transpose(batch[0]), y_true: np.transpose(batch[1]), keep_prob: 0.5}
        train_accuracy, loss_, gv_, W_ = sess.run([accuracy, loss, gv_nsgd, W], feed_dict_keepall)
        w1_nsgd_, w0_nsgd_, s0_nsgd_= sess.run([w1_nsgd, w0_nsgd, s0_nsgd], feed_dict_keepall)
        alpha_ = sess.run([alpha], feed_dict)
            
        if i % 1 == 0:
            print('train_accuracy=', train_accuracy, 'loss=', loss_)
            print('NSgd iterates: w(t+1) =', LA.norm(w1_nsgd_), 'w(t) =', LA.norm(w0_nsgd_), 's(t) =', LA.norm(s0_nsgd_))
            print('alpha', alpha_)
        sess.run(optimizer_gv_nsgd, feed_dict_keepsome)
    
    feed_dict={x: np.transpose(mnist.test.images), y_true: np.transpose(mnist.test.labels)}
    test_accuracy = sess.run(accuracy, feed_dict_keepall)
    print('test_accuracy=', test_accuracy)

train_accuracy= 0.14 loss= 64.6559
NSgd iterates: w(t+1) = 2.45905 w(t) = 2.45905 s(t) = 1.0
alpha [9.9999997e-06]
train_accuracy= 0.12 loss= 67.7933
NSgd iterates: w(t+1) = 2.45905 w(t) = 2.45905 s(t) = 1.0
alpha [9.9999997e-06]
train_accuracy= 0.08 loss= 61.91
NSgd iterates: w(t+1) = 2.45905 w(t) = 2.45905 s(t) = 1.0
alpha [9.9999997e-06]
train_accuracy= 0.12 loss= 63.503
NSgd iterates: w(t+1) = 2.45905 w(t) = 2.45905 s(t) = 1.0
alpha [9.9999997e-06]
train_accuracy= 0.1 loss= 68.4093
NSgd iterates: w(t+1) = 2.45905 w(t) = 2.45905 s(t) = 1.0
alpha [9.9999997e-06]
train_accuracy= 0.06 loss= 64.8341
NSgd iterates: w(t+1) = 2.45905 w(t) = 2.45905 s(t) = 1.0
alpha [9.9999997e-06]
train_accuracy= 0.14 loss= 67.0881
NSgd iterates: w(t+1) = 2.45905 w(t) = 2.45905 s(t) = 1.0
alpha [9.9999997e-06]
train_accuracy= 0.09 loss= 67.2228
NSgd iterates: w(t+1) = 2.45905 w(t) = 2.45905 s(t) = 1.0
alpha [9.9999997e-06]
train_accuracy= 0.1 loss= 67.4117
NSgd iterates: w(t+1) = 2.45905 w(t) = 2.45905 s(t

# Frobenius norm

In [None]:
# with tf.Session() as sess:
#     sess.run(tf.global_variables_initializer())
#     for i in range(1000):
#         batch = mnist.train.next_batch(100)
#         batch1 = batch[:]
#         feed_dict = {x: np.transpose(batch[0]), y_true: np.transpose(batch[1])}
#         train_accuracy, loss_, gv_, W1_ = sess.run([accuracy, loss, gv_cgd_fn, W], feed_dict)
#         w1_cgd_fn_, w0_cgd_fn_, s0_cgd_fn_ = sess.run([w1_cgd_fn, w0_cgd_fn, s0_cgd_fn], feed_dict)
#         alpha_ = sess.run([alpha], feed_dict)
            
#         if i % 100 == 0:
#             print('train_accuracy=', train_accuracy, 'loss value =',loss_)
#             print('frob_nrom of iterates: w(t+1) =', LA.norm(w1_cgd_fn_), 'w(t) =', LA.norm(w0_cgd_fn_), 's(t) =', LA.norm(s0_cgd_fn_))
#             print('alpha', alpha_)
#         sess.run(optimizer_gv_cgd_fn, feed_dict)
    
#     feed_dict={x: np.transpose(mnist.test.images), y_true: np.transpose(mnist.test.labels)}
#     test_accuracy = sess.run(accuracy, feed_dict)
#     print('test_accuracy', test_accuracy)

# Nuclear Norm

In [None]:
# with tf.Session() as sess:
#     sess.run(tf.global_variables_initializer())
#     for i in range(100):
#         batch = mnist.train.next_batch(100)
#         batch2 = batch[:]
#         feed_dict = {x: np.transpose(batch[0]), y_true: np.transpose(batch[1])}
#         train_accuracy, loss_, gv_, W2_ = sess.run([accuracy, loss, gv_cgd_nn, W], feed_dict)
#         w1_cgd_nn_, w0_cgd_nn_, s0_cgd_nn_, st_, st_r_, M_ = sess.run([w1_cgd_nn, w0_cgd_nn, s0_cgd_nn, st, st_r, M], feed_dict)
#         alpha_ = sess.run([alpha], feed_dict)
            
#         if i % 1 == 0:
#             print('train_accuracy=', train_accuracy, 'loss value =',loss_)
#             print('nuclear_norm of iterates: w(t+1) =', LA.norm(w1_cgd_nn_), 'w(t) =', LA.norm(w0_cgd_nn_), 's(t) =', LA.norm(s0_cgd_nn_))
#             print('alpha', alpha_)
#         sess.run(optimizer_gv_cgd_nn, feed_dict)
    
#     feed_dict={x: np.transpose(mnist.test.images), y_true: np.transpose(mnist.test.labels)}
#     test_accuracy = sess.run(accuracy, feed_dict)
#     print('test_accuracy', test_accuracy)

In [None]:
LA.norm(st_r_)