In [1]:
import numpy as np
import tensorflow as tf
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tensorflow.examples.tutorials.mnist import input_data

rng = np.random.RandomState(1234)
random_state = 42

In [6]:
tf.reset_default_graph()

x = tf.placeholder(tf.float32, name='x')
t = tf.placeholder(tf.float32, name='t')

w = tf.Variable(0.0, name='w')
b = tf.Variable(0.0, name='b')

y = w * x + b

cost = tf.reduce_mean((y - t)**2)

gw, gb = tf.gradients(cost, [w, b])
updates = [
    w.assign(w - 0.1 * gw),
    b.assign(b - 0.1 * gb)
]

train = tf.group(*updates)

data_X = np.array([0, 1, 2, 3, 4])
data_y = np.array([3, 5, 7, 9, 11])

sess = tf.Session()
sess.run(tf.global_variables_initializer())
for i in range(100):
    _cost, _ = sess.run([cost, train], feed_dict={x: data_X, t: data_y})
    if (i + 1) % 10 == 0:
        print('step:: %d, cost:: %.3f' % (i + 1, _cost))
        
print('pred_y:', sess.run(y, feed_dict={x:[5]}))

sess.close()

step:: 10, cost:: 0.464
step:: 20, cost:: 0.135
step:: 30, cost:: 0.040
step:: 40, cost:: 0.012
step:: 50, cost:: 0.003
step:: 60, cost:: 0.001
step:: 70, cost:: 0.000
step:: 80, cost:: 0.000
step:: 90, cost:: 0.000
step:: 100, cost:: 0.000
pred_y: [ 13.00327778]


In [253]:
def homework(train_X, train_y, test_X):
    import time
    start_time = time.time()

    IMAGE_SIZE = 784
    CATEGORY_NUM = 10
    ETA = 0.1
    EPSILON = 1e-5
    EPOCHS = 200
    BATCH_SIZE = 30
    LAYER1_UNITS = 80
    LAYER2_UNITS = 50
    LAYER3_UNITS = CATEGORY_NUM

    DROPOUT_LAYER1 = 0.5
    DROPOUT_LAYER2 = 0.8

    TRAIN_DATA_SIZE = len(train_X)

    tf.reset_default_graph()

    with tf.variable_scope('Placeholders'):
        images = tf.placeholder(tf.float32, [None, IMAGE_SIZE], name='image_data')
        labels = tf.placeholder(tf.int64, None, name='label')
        _labels = tf.one_hot(labels, depth=CATEGORY_NUM, on_value=1.0, off_value=0.0, dtype=tf.float32)
        apply_dropout = tf.placeholder(tf.bool, name='apply_dropout')

    with tf.variable_scope('NetworkParams'):
        W1 = tf.Variable(tf.truncated_normal([IMAGE_SIZE, LAYER1_UNITS]), name='W1')
        # b1 = tf.Variable(tf.zeros(LAYER1_UNITS), name='b1')
        W2 = tf.Variable(tf.truncated_normal([LAYER1_UNITS, LAYER2_UNITS]), name='W2')
        # b2 = tf.Variable(tf.zeros(LAYER2_UNITS), name='b2')
        W3 = tf.Variable(tf.truncated_normal([LAYER2_UNITS, LAYER3_UNITS]), name='W2')
        # b3 = tf.Variable(tf.zeros(LAYER3_UNITS), name='b2')
        params = [W1, W2, W3]

    def dropout_apply(val, keep_prob, apply):
        return tf.cond(apply, lambda: tf.nn.dropout(val, keep_prob), lambda: val)

    def batch_normalization(X):
        eps = EPSILON
        output_dim = int(X.get_shape()[-1])
        gamma = tf.Variable(tf.truncated_normal([output_dim], stddev=0.1), name='gamma')
        beta = tf.Variable(tf.zeros([output_dim]), name='beta')
        mean_X, var_X = tf.nn.moments(X, [0])
        return gamma * (X - mean_X) / tf.sqrt(var_X + eps) + beta

    u1 = batch_normalization(tf.matmul(images, W1))
    z1 = tf.nn.relu(u1)
    u2 = batch_normalization(tf.matmul(z1, W2))
    z2 = tf.nn.relu(u2)
    u3 = batch_normalization(tf.matmul(z2, W3))
    y = u3
    # y: [BATCH_SIZE x category_size]

    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=_labels))
    predicted = tf.argmax(y, axis=1)
    accuracy = tf.reduce_mean(tf.cast(tf.equal(predicted, labels), tf.float32))

    # Update network params
    # gW1, gb1, gW2, gb2, gW3, gb3 = tf.gradients(cost, params)
    gW1, gW2, gW3 = tf.gradients(cost, params)
    updates = [
        W1.assign_add(- ETA * gW1),
        # b1.assign_add(- ETA * gb1),
        W2.assign_add(- ETA * gW2),
        # b2.assign_add(- ETA * gb2),
        W3.assign_add(- ETA * gW3),
        # b3.assign_add(- ETA * gb3),
    ]
    train = tf.group(*updates)

    # Training
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        for epoch in range(EPOCHS):
            epoch_loss = 0
            epoch_accuracy = 0

            # Train
            step = 0
            while step * BATCH_SIZE < TRAIN_DATA_SIZE:
                start_idx = step * BATCH_SIZE
                end_idx = start_idx + BATCH_SIZE
                if not end_idx < TRAIN_DATA_SIZE:
                    end_idx = TRAIN_DATA_SIZE

                batch_X, batch_y = train_X[start_idx:end_idx, :], train_y[start_idx:end_idx]
                _, c, a = sess.run([train, cost, accuracy],
                                               feed_dict={images: batch_X, labels: batch_y,
                                               apply_dropout: True})
                epoch_loss += c
                epoch_accuracy += a
                step += 1

            if epoch % 100 == 0:
                format_str = "Epoch %d out of %d --- LOSS: %f --- ACCURACY: %f"
                print(format_str % (epoch, EPOCHS, epoch_loss, epoch_accuracy / step))

        # Evaluation
        pred_y = sess.run(predicted, feed_dict={images: test_X, apply_dropout: False})
        
        elapsed_time = (time.time() - start_time) / 60  # min
        print("Elapsed time: %f min" % elapsed_time)
        
        return pred_y


In [248]:
validate_homework()
# score_homework()

Epoch 0 out of 500 --- LOSS: 386.349399 --- ACCURACY: 0.211178
Epoch 100 out of 500 --- LOSS: 114.604620 --- ACCURACY: 0.903493
Epoch 200 out of 500 --- LOSS: 102.879164 --- ACCURACY: 0.933733
Epoch 300 out of 500 --- LOSS: 96.836139 --- ACCURACY: 0.950100
Epoch 400 out of 500 --- LOSS: 92.898814 --- ACCURACY: 0.961876
[1 5 8 9 0 6 6 3 9 5 8 2 0 4 9 0 8 0 1 3 2 0 2 5 7 7 1 1 9 0 2 7 7 1 3 3 2
 5 8 7 9 8 6 8 6 2 4 6 3 4 7 5 1 0 7 2 1 0 0 4 5 3 7 2 1 1 5 8 6 4 4 0 9 8
 4 4 0 9 8 2 9 2 3 5 7 4 0 2 6 2 2 9 7 1 2 7 7 2 3 0]
0.904548107267


In [254]:
validate_homework()

Epoch 0 out of 200 --- LOSS: 385.925522 --- ACCURACY: 0.095110
Epoch 100 out of 200 --- LOSS: 358.551058 --- ACCURACY: 0.519461
Elapsed time: 0.990315 min
[1 2 9 9 8 6 6 1 9 8 8 2 6 6 9 7 9 0 1 6 2 0 2 8 7 2 1 1 9 0 0 7 7 1 9 8 2
 8 9 7 9 9 6 7 6 2 6 6 7 4 7 5 1 8 7 4 1 2 0 9 8 8 7 2 1 1 8 9 6 8 9 6 9 9
 6 9 6 9 8 2 9 2 7 8 7 9 0 2 6 2 2 6 7 1 2 7 7 2 8 6]
0.495843674899


  'precision', 'predicted', average, warn_for)


In [41]:
from sklearn.utils import shuffle
from sklearn.metrics import f1_score
from sklearn.datasets import fetch_mldata
from sklearn.model_selection import train_test_split

import numpy as np
import tensorflow as tf

# del [
#     tf.app,
#     tf.compat,
#     tf.contrib,
#     tf.errors,
#     tf.gfile,
#     tf.graph_util,
#     tf.image,
#     tf.layers,
#     tf.logging,
#     tf.losses,
#     tf.metrics,
#     tf.python_io,
#     tf.resource_loader,
#     tf.saved_model,
#     tf.sdca,
#     tf.sets,
#     tf.summary,
#     tf.sysconfig,
#     tf.test,
#     tf.train
# ]

def load_mnist():
    mnist = fetch_mldata('MNIST original')
    mnist_X, mnist_y = shuffle(mnist.data.astype('float32'),
                               mnist.target.astype('int32'), random_state=42)

    mnist_X = mnist_X / 255.0

    return train_test_split(mnist_X, mnist_y,
                test_size=0.2,
                random_state=42)

def validate_homework():
    train_X, test_X, train_y, test_y = load_mnist()

    # validate for small dataset
#     train_X_mini = train_X[:100]
#     train_y_mini = train_y[:100]
    train_X_mini = train_X[:2000]
    train_y_mini = train_y[:2000]
    test_X_mini = test_X[:100]
    test_y_mini = test_y[:100]

    pred_y = homework(train_X_mini, train_y_mini, test_X_mini)
    print(test_y_mini)
    print(pred_y)
    print(f1_score(test_y_mini, pred_y, average='macro'))

def score_homework():
    train_X, test_X, train_y, test_y = load_mnist()
    pred_y = homework(train_X, train_y, test_X)
    print(f1_score(test_y, pred_y, average='macro'))

In [232]:
validate_homework()
# score_homework()

Epoch 0 out of 500 --- LOSS: 400.118928 --- ACCURACY: 0.175050
Epoch 100 out of 500 --- LOSS: 117.645703 --- ACCURACY: 0.892016
Epoch 200 out of 500 --- LOSS: 105.000771 --- ACCURACY: 0.923653
Epoch 300 out of 500 --- LOSS: 98.541256 --- ACCURACY: 0.945210
Epoch 400 out of 500 --- LOSS: 94.329387 --- ACCURACY: 0.959880
[1 5 8 9 0 6 6 3 9 5 3 2 0 6 9 0 8 0 1 3 2 0 2 3 7 2 1 1 9 0 0 7 7 1 3 3 2
 5 8 7 9 8 6 7 6 2 4 6 3 4 7 5 1 0 9 4 1 2 0 4 5 3 7 2 1 1 5 8 6 4 9 0 9 8
 4 4 0 9 8 2 9 2 3 5 7 4 0 2 6 2 2 7 7 1 2 7 7 2 3 0]
0.905332252733


In [3]:
train_X, test_X, train_y, test_y = load_mnist()
print(train_X.shape, train_y.shape)    

(56000, 784) (56000,)


In [127]:
train_X[0]

array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.  

In [26]:
def homework(train_X, train_y, test_X):
    import time
    start_time = time.time()

    IMAGE_SIZE = 784
    CATEGORY_NUM = 10
    ETA = 0.15
    EPSILON = 1e-5
    DECAY = 0.5
#     EPOCHS = 500
    EPOCHS = 200
    BATCH_SIZE = 30
    LAYER1_UNITS = 100
    LAYER2_UNITS = 100
    LAYER3_UNITS = CATEGORY_NUM

    TRAIN_DATA_SIZE = len(train_X)

    tf.reset_default_graph()

    # todo(matthew): Add operation to calculate data ave and var before test evaluation

    with tf.variable_scope('Placeholders'):
        images = tf.placeholder(tf.float32, [None, IMAGE_SIZE], name='image_data')
        labels = tf.placeholder(tf.int64, None, name='label')
        _labels = tf.one_hot(labels, depth=CATEGORY_NUM, on_value=1.0, off_value=0.0, dtype=tf.float32)
        is_train = tf.placeholder(tf.bool, name='is_train')

    with tf.variable_scope('NetworkParams'):
        W1 = tf.Variable(tf.truncated_normal([IMAGE_SIZE, LAYER1_UNITS]), name='W1')
        W2 = tf.Variable(tf.truncated_normal([LAYER1_UNITS, LAYER2_UNITS]), name='W2')
        W3 = tf.Variable(tf.truncated_normal([LAYER2_UNITS, LAYER3_UNITS]), name='W3')
        b3 = tf.Variable(tf.zeros(LAYER3_UNITS), name='b3')
        # params = [W1, W2, W3, b3]

    def batch_normalization(X, is_train):
        eps = EPSILON
        output_dim = int(X.shape[-1])

        gamma = tf.Variable(tf.truncated_normal([output_dim], stddev=0.1, mean=1.0), name='gamma')
        beta = tf.Variable(tf.zeros([output_dim]), name='beta')
        mean_X, var_X = tf.nn.moments(X, [0])

        pop_mean = tf.Variable(tf.zeros([output_dim]), trainable=False)
        pop_var = tf.Variable(tf.ones([output_dim]), trainable=False)

        ema = tf.train.ExponentialMovingAverage(decay=0.5)

        def sample():
            ema_apply_op = ema.apply([mean_X, var_X])
            with tf.control_dependencies([ema_apply_op]):
                batch_size = tf.to_float(tf.shape(X)[0])
                unbiased_var_X = batch_size / (batch_size - 1) * var_X
                return tf.nn.batch_normalization(X, mean_X, var_X, beta, gamma, eps)
            # Save exponential average
            # pop_mean.assign(DECAY * pop_mean + (1 - DECAY) * mean_X)
            # pop_var.assign(DECAY * pop_var + (1 - DECAY) * var_X)

        def population():
            train_mean = pop_mean.assign(ema.average(mean_X))
            train_var = pop_mean.assign(ema.average(var_X))
            return tf.nn.batch_normalization(X, train_mean, train_var, beta, gamma, eps)

        return tf.cond(is_train, lambda: sample(), lambda: population())

    u1 = batch_normalization(tf.matmul(images, W1), is_train)
    z1 = tf.nn.relu(u1)
    u2 = batch_normalization(tf.matmul(z1, W2), is_train)
    z2 = tf.nn.relu(u2)
    u3 = tf.matmul(z2, W3) + b3
    y = u3
    # y: [BATCH_SIZE x category_size]

    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=_labels))
    predicted = tf.argmax(y, axis=1)
    accuracy = tf.reduce_mean(tf.cast(tf.equal(predicted, labels), tf.float32))

    # Update network params
    params = tf.trainable_variables()
    grads = tf.gradients(cost, params)
    updates = [v.assign_add(- ETA * gv) for v, gv in zip(params, grads)]
    # updates = [
    #     W1.assign_add(- ETA * gW1),
    #     W2.assign_add(- ETA * gW2),
    #     W3.assign_add(- ETA * gW3),
    #     b3.assign_add(- ETA * gb3),
    # ]
    train = tf.group(*updates)

    # Training
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        for epoch in range(EPOCHS):
            epoch_loss = 0
            epoch_accuracy = 0

            # Train
            step = 0
            while step * BATCH_SIZE < TRAIN_DATA_SIZE:
                start_idx = step * BATCH_SIZE
                end_idx = start_idx + BATCH_SIZE
                if not end_idx < TRAIN_DATA_SIZE:
                    end_idx = TRAIN_DATA_SIZE

                batch_X, batch_y = train_X[start_idx:end_idx, :], train_y[start_idx:end_idx]
                _, c, a = sess.run([train, cost, accuracy],
                                   feed_dict={images: batch_X, labels: batch_y,
                                              is_train: True})
                epoch_loss += c
                epoch_accuracy += a
                step += 1

            if epoch % 100 == 0:
                format_str = "Epoch %d out of %d --- LOSS: %f --- ACCURACY: %f"
                print(format_str % (epoch, EPOCHS, epoch_loss, epoch_accuracy / step))

        # Evaluation
        pred_y = sess.run(predicted, feed_dict={images: test_X, is_train: False})
        elapsed_time = (time.time() - start_time) / 60  # min
        print("Elapsed time: %f min" % elapsed_time)
        return pred_y


In [27]:
# Unbiased
validate_homework()

Epoch 0 out of 200 --- LOSS: 318.482763 --- ACCURACY: 0.548303
Epoch 100 out of 200 --- LOSS: 0.365093 --- ACCURACY: 1.000000
Elapsed time: 1.688748 min
[7 5 8 9 0 6 6 3 9 5 8 2 0 6 9 0 8 0 1 3 2 0 2 5 7 2 1 1 9 0 5 7 7 1 3 3 2
 5 8 7 9 8 6 9 6 2 4 6 3 4 7 5 1 0 9 2 1 2 0 4 5 3 7 2 1 1 5 8 6 4 4 0 9 8
 4 4 0 9 8 2 9 2 3 8 7 4 0 2 6 2 2 7 7 1 2 7 2 2 3 0]
[9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9
 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9
 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9]
0.0181818181818


  'precision', 'predicted', average, warn_for)


In [20]:
validate_homework()

NameError: name 'mean_X' is not defined

In [324]:
validate_homework()

Epoch 0 out of 500 --- LOSS: 247.373713 --- ACCURACY: 0.168000
Epoch 100 out of 500 --- LOSS: 66.540016 --- ACCURACY: 0.928667
Epoch 200 out of 500 --- LOSS: 57.991087 --- ACCURACY: 0.965333
Epoch 300 out of 500 --- LOSS: 53.752415 --- ACCURACY: 0.983333
Epoch 400 out of 500 --- LOSS: 51.137289 --- ACCURACY: 0.989667
Elapsed time: 2.209841 min
[1 5 1 1 0 6 6 3 1 5 1 5 0 6 4 0 1 0 1 3 2 0 2 1 7 7 1 1 1 0 0 7 7 1 5 3 2
 5 4 7 9 1 6 7 6 1 4 6 5 4 7 5 1 0 5 5 1 2 0 4 5 1 7 2 1 1 1 6 6 4 1 0 1 1
 4 4 0 9 1 4 9 2 3 5 1 9 0 1 6 2 1 7 1 1 2 7 7 2 3 0]
0.622444144091


  'precision', 'predicted', average, warn_for)


In [13]:
validate_homework()

Epoch 0 out of 300 --- LOSS: 444.877840 --- ACCURACY: 0.524751
Epoch 100 out of 300 --- LOSS: 1.008190 --- ACCURACY: 1.000000
Epoch 200 out of 300 --- LOSS: 0.407033 --- ACCURACY: 1.000000
Elapsed time: 1.880607 min
[7 5 8 9 0 6 6 3 9 5 8 2 0 6 9 0 8 0 1 3 2 0 2 5 7 2 1 1 9 0 5 7 7 1 3 3 2
 5 8 7 9 8 6 9 6 2 4 6 3 4 7 5 1 0 9 2 1 2 0 4 5 3 7 2 1 1 5 8 6 4 4 0 9 8
 4 4 0 9 8 2 9 2 3 8 7 4 0 2 6 2 2 7 7 1 2 7 2 2 3 0]
[2 8 2 9 0 6 8 3 9 5 8 2 0 6 9 0 8 0 2 2 2 0 2 8 7 2 2 2 9 0 1 2 7 2 3 3 2
 5 2 7 9 8 6 2 6 2 7 6 8 8 7 5 2 0 7 2 2 2 0 9 5 3 7 2 2 2 8 8 6 4 2 0 8 2
 8 4 0 9 8 2 9 2 3 8 7 9 0 2 6 2 2 7 7 2 2 7 2 2 2 0]
0.649983997696


In [30]:
def homework(train_X, train_y, test_X):
    import time
    start_time = time.time()

    IMAGE_SIZE = 784
    CATEGORY_NUM = 10
    ETA = 0.15
    EPSILON = 1e-5
    DECAY = 0.99
    EPOCHS = 600
#     EPOCHS = 201
    BATCH_SIZE = 30
    LAYER1_UNITS = 100
    LAYER2_UNITS = 100
    LAYER3_UNITS = CATEGORY_NUM

    TRAIN_DATA_SIZE = len(train_X)

    tf.reset_default_graph()

    # todo(matthew): Add operation to calculate data ave and var before test evaluation

    with tf.variable_scope('Placeholders'):
        images = tf.placeholder(tf.float32, [None, IMAGE_SIZE], name='image_data')
        labels = tf.placeholder(tf.int64, None, name='label')
        _labels = tf.one_hot(labels, depth=CATEGORY_NUM, on_value=1.0, off_value=0.0, dtype=tf.float32)
        is_train = tf.placeholder(tf.bool, name='is_train')

    with tf.variable_scope('NetworkParams'):
        W1 = tf.Variable(tf.truncated_normal([IMAGE_SIZE, LAYER1_UNITS]), name='W1')
        W2 = tf.Variable(tf.truncated_normal([LAYER1_UNITS, LAYER2_UNITS]), name='W2')
        W3 = tf.Variable(tf.truncated_normal([LAYER2_UNITS, LAYER3_UNITS]), name='W3')
        b3 = tf.Variable(tf.zeros(LAYER3_UNITS), name='b3')
        # params = [W1, W2, W3, b3]

    def batch_normalization(X, is_train):
        eps = EPSILON
        output_dim = int(X.shape[-1])

        gamma = tf.Variable(tf.truncated_normal([output_dim], stddev=0.1, mean=1.0), name='gamma')
        beta = tf.Variable(tf.zeros([output_dim]), name='beta')

        pop_mean = tf.Variable(tf.zeros([output_dim]), trainable=False)
        pop_var = tf.Variable(tf.ones([output_dim]), trainable=False)

        def sample():
            mean_X, var_X = tf.nn.moments(X, [0])
            # Save exponential average
            moving_avg_op = tf.group(
                pop_mean.assign(DECAY * pop_mean + (1 - DECAY) * mean_X),
                pop_var.assign(DECAY * pop_var + (1 - DECAY) * var_X),
            )

            with tf.control_dependencies([moving_avg_op]):
                batch_size = tf.to_float(tf.shape(X)[0])
                unbiased_var_X = batch_size / (batch_size - 1) * var_X
                return tf.nn.batch_normalization(X, mean_X, var_X, beta, gamma, eps)

        def population():
            return tf.nn.batch_normalization(X, pop_mean, pop_var, beta, gamma, eps)

        return tf.cond(is_train, lambda: sample(), lambda: population())

    def adam_optimizer():
        alpha = 0.001
        beta1 = 0.9
        beta2 = 0.999
        eps = 1e-8

        delta = 1.0

        return delta

    u1 = batch_normalization(tf.matmul(images, W1), is_train)
    z1 = tf.nn.relu(u1)
    u2 = batch_normalization(tf.matmul(z1, W2), is_train)
    z2 = tf.nn.relu(u2)
    u3 = tf.matmul(z2, W3) + b3
    y = u3
    # y: [BATCH_SIZE x category_size]

    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=_labels))
    predicted = tf.argmax(y, axis=1)
    accuracy = tf.reduce_mean(tf.cast(tf.equal(predicted, labels), tf.float32))

    # Update network params
    params = tf.trainable_variables()
    grads = tf.gradients(cost, params)
    updates = [v.assign_add(- ETA * gv) for v, gv in zip(params, grads)]
    train = tf.group(*updates)

    # Training
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        for epoch in range(EPOCHS):
            epoch_loss = 0
            epoch_accuracy = 0

            # Train
            step = 0
            while step * BATCH_SIZE < TRAIN_DATA_SIZE:
                start_idx = step * BATCH_SIZE
                end_idx = start_idx + BATCH_SIZE
                if not end_idx < TRAIN_DATA_SIZE:
                    end_idx = TRAIN_DATA_SIZE

                batch_X, batch_y = train_X[start_idx:end_idx, :], train_y[start_idx:end_idx]
                _, c, a = sess.run([train, cost, accuracy],
                                   feed_dict={images: batch_X, labels: batch_y,
                                              is_train: True})
                epoch_loss += c
                epoch_accuracy += a
                step += 1

            if epoch % 100 == 0:
                format_str = "Epoch %d out of %d --- LOSS: %f --- ACCURACY: %f"
                print(format_str % (epoch, EPOCHS, epoch_loss, epoch_accuracy / step))

        # Evaluation
        pred_y = sess.run(predicted, feed_dict={images: test_X, is_train: False})
        elapsed_time = (time.time() - start_time) / 60  # min
        print("Elapsed time: %f min" % elapsed_time)
        return pred_y


In [31]:
validate_homework()

Epoch 0 out of 201 --- LOSS: 331.460501 --- ACCURACY: 0.545110
Epoch 100 out of 201 --- LOSS: 0.426021 --- ACCURACY: 1.000000
Epoch 200 out of 201 --- LOSS: 0.137046 --- ACCURACY: 1.000000
Elapsed time: 1.545775 min
[7 5 8 9 0 6 6 3 9 5 8 2 0 6 9 0 8 0 1 3 2 0 2 5 7 2 1 1 9 0 5 7 7 1 3 3 2
 5 8 7 9 8 6 9 6 2 4 6 3 4 7 5 1 0 9 2 1 2 0 4 5 3 7 2 1 1 5 8 6 4 4 0 9 8
 4 4 0 9 8 2 9 2 3 8 7 4 0 2 6 2 2 7 7 1 2 7 2 2 3 0]
[1 5 8 9 0 6 6 3 9 5 3 2 0 6 9 0 8 0 1 3 2 0 2 5 7 2 1 1 9 0 3 7 7 1 7 3 2
 5 8 7 9 8 6 7 6 2 4 6 8 4 7 5 1 0 7 2 1 2 0 4 3 3 7 2 1 1 5 4 6 4 9 0 9 8
 4 4 9 9 8 2 9 2 3 8 7 4 0 2 6 2 2 7 7 1 2 7 7 2 3 0]
0.873514175173


In [7]:
import tensorflow as tf
from tensorflow.python.framework import ops
from tensorflow.python.ops import variable_scope
from tensorflow.python.util import nest


def _var_key(var):
    return (var.op.graph, var.op.name)


def _valid_dtypes():
    # Valid types for loss, variables, and gradients
    return set([tf.float16, tf.float32, tf.float64])


def _assert_valid_dtypes(self, tensors):
    valid_dtypes = self._valid_dtypes()
    for t in tensors:
        dtype = t.dtype.base_dtype
        if dtype not in valid_dtypes:
            raise ValueError("Invalid type %r for %s, expected: %s"
                             % (dtype, t.name, [v for v in valid_dtypes]))


class EveOptimizer:
    def __init__(self, learning_rate=.001, beta1=0.1, beta2=0.999, beta3=0.999, epsilon=1e-8,
                 k=.1, K=10, name="Eve"):
        self._lr = learning_rate
        self._beta1 = beta1
        self._beta2 = beta2
        self._beta3 = beta3
        self._epsilon = epsilon
        self._k = k
        self._K = K

        self._delte = None
        self._Delte = None
        self._beta1_power = None
        self._beta2_power = None
        # self._f_1_hat = None
        self._f_2_hat = None
        self._step = None

        # {slot1: {var1: xxx, var2: xxx, ...}, slot2: {var1: xxx, var2: xxx, ...}, ...}
        self._slots = {}

    def _get_beta_accumulators(self):
        return self._beta1_power, self._beta2_power

    def _accumulate_beta(self):
        op1 = tf.assign(self._beta1_power, self._beta1_power * self._beta1)
        op2 = tf.assign(self._beta1_power, self._beta1_power * self._beta1)
        return tf.groupt([op1, op2])

    def _create_algo_params(self, var_list):
        if (self._beta1_power is None or self._beta1_power.graph is not var_list[0].graph):
            with ops.colocate_with(var_list[0]):
                self._beta1_power = tf.Variable(0., name='beta1_power',
                                                            trainable=False)
                self._beta2_power = tf.Variable(0., name='beta2_power',
                                                            trainable=False)
                self._delta = tf.Variable(self._k + 1., name='delta', trainable=False)
                self._Delta = tf.Variable(self._K + 1., name='Delta', trainable=False)
                # self._f_1_hat = variable_scope.variable(0, name='f_1_hat', trainable=False)
                self._f_2_hat = tf.Variable(0., name='f_2_hat', trainable=False)
                self._step = tf.Variable(0, name='step', trainable=False)

            for v in var_list:
                self._create_slot(v, 0., "m")
                self._create_slot(v, 0., "v")
                self._create_slot(v, 1., "d")

    def _create_slot(self, var, initializer, slot_name):
        if slot_name not in self._slots.keys():
            self._slots[slot_name] = {}
        named_slot = self._slots[slot_name]

        with tf.variable_scope(slot_name):
            self.named_slot[_var_key(var)] = tf.Variable(initializer, name=var.op.name, trainable=False)

    def _get_slot(self, var, slot_name):
        named_slot = self._slots[slot_name]
        return named_slot[_var_key(var)]

    def _compute_gradients(self, cost, var_list=None):
        _assert_valid_dtypes([loss])
        if var_list is None:
            var_list = tf.trainable_variables()
        else:
            nest.flatten(var_list)
        var_list += tf.get_collection(tf.GraphKeys._STREAMING_MODEL_PORTS)
        grads = tf.gradients(cost, var_list)
        grads_and_vars = list(zip(grads, var_list))
        return grads_and_vars

    def minimize(self, cost, var_list=None):
        self._create_algo_params(var_list)

        # [(g1, v1), (g2, v2), ...]
        grads_and_vars = self._compute_gradients(cost, var_list)

        beta_acc_op = self._accumulate_beta()
        updates = []

        with tf.control_dependencies([bata_acc_op]):
            beta1_pow, beta2_pow = self._get_beta_accumulators()
            for g, var in grads_and_vars:
                m = self._get_slot(v, "m")
                op_mt = tf.assign(m, self._beta1 * m + (1 - self._beta1) * g)
                with tf.control_dependencies([op_mt]):
                    m_t_hat = m / (1 - beta1_pow)

                v = self._get_slot(v, "v")
                op_vt = tf.assign(v, self._beta2 * v + (1 - self._beta2) * g * g)
                with tf.control_dependencies([op_vt]):
                    v_t_hat = v / (1 - beta2_pow)

                d = self._get_slot(v, "d")

                def first_step():
                    op1 = tf.assign(self._f_2_hat, cost)
                    op2 = tf.assign(d, 1.)
                    return tf.group([op1, op2])

                def other_step():
                    delte, Delte = tf.cond(cost > self._f_2_hat, lambda: self.k + 1, self.K + 1,
                                           lambda: 1. / (self.K + 1), 1. / (self.k + 1))
                    c_t = tf.minimum(tf.maximum(delte, cost / self._f_2_hat), Delte)
                    f_1_hat = c_t * self._f_2_hat
                    r_t = tf.abs(f_1_hat - self._f_2_hat) / tf.minimum(f_1_hat, self._f_2_hat)

                    op1 = tf.assign(d, self._beta3 * d + (1 - self._beta3) * r_t)
                    op2 = tf.assign(self._f_2_hat, f_1_hat)
                    return tf.group([op1, op2])

                update_algo_params = tf.cond(self._step > 0, other_step, first_step)
                with control_dependencies([update_algo_params]):
                    update_param = var.assign_add(
                        - self._lr * m_t_hat / (d * tf.sqrt(v_t_hat) + self._epsilon))

                updates.append(update_param)
                updates.append(tf.assign(m, m_t))
                updates.append(tf.assign(v, v_t))

            update_step = self._step.assign_add(1)
            updates.append(update_step)
            return tf.group(updates)


In [58]:
from sklearn.utils import shuffle
from sklearn.metrics import f1_score
from sklearn.datasets import fetch_mldata
from sklearn.model_selection import train_test_split

import numpy as np
import tensorflow as tf

# del [
#     tf.app,
#     tf.compat,
#     tf.contrib,
#     tf.errors,
#     tf.gfile,
#     tf.graph_util,
#     tf.image,
#     tf.layers,
#     tf.logging,
#     tf.losses,
#     tf.metrics,
#     tf.python_io,
#     tf.resource_loader,
#     tf.saved_model,
#     tf.sdca,
#     tf.sets,
#     tf.summary,
#     tf.sysconfig,
#     tf.test,
#     tf.train
# ]

def load_mnist():
    mnist = fetch_mldata('MNIST original')
    mnist_X, mnist_y = shuffle(mnist.data.astype('float32'),
                               mnist.target.astype('int32'), random_state=42)

    mnist_X = mnist_X / 255.0

    return train_test_split(mnist_X, mnist_y,
                test_size=0.2,
                random_state=42)

def validate_homework():
    train_X, test_X, train_y, test_y = load_mnist()

    # validate for small dataset
#     train_X_mini = train_X[:100]
#     train_y_mini = train_y[:100]
    train_X_mini = train_X[:2000]
    train_y_mini = train_y[:2000]
    test_X_mini = test_X[:100]
    test_y_mini = test_y[:100]

    pred_y = homework(train_X_mini, train_y_mini, test_X_mini)
    print(test_y_mini)
    print(pred_y)
    print(f1_score(test_y_mini, pred_y, average='macro'))

def score_homework():
    train_X, test_X, train_y, test_y = load_mnist()
    pred_y = homework(train_X, train_y, test_X)
    print(f1_score(test_y, pred_y, average='macro'))

In [59]:
def homework(train_X, train_y, test_X):
    import time
    start_time = time.time()

    IMAGE_SIZE = 784
    CATEGORY_NUM = 10
    ETA = 0.15
    EPSILON = 1e-5
    DECAY = 0.99
    EPOCHS = 600
#     EPOCHS = 201
    BATCH_SIZE = 30
    LAYER1_UNITS = 100
    LAYER2_UNITS = 100
    LAYER3_UNITS = CATEGORY_NUM

    TRAIN_DATA_SIZE = len(train_X)

    tf.reset_default_graph()

    # todo(matthew): Add operation to calculate data ave and var before test evaluation

    with tf.variable_scope('Placeholders'):
        images = tf.placeholder(tf.float32, [None, IMAGE_SIZE], name='image_data')
        labels = tf.placeholder(tf.int64, None, name='label')
        _labels = tf.one_hot(labels, depth=CATEGORY_NUM, on_value=1.0, off_value=0.0, dtype=tf.float32)
        is_train = tf.placeholder(tf.bool, name='is_train')

    with tf.variable_scope('NetworkParams'):
        W1 = tf.Variable(tf.truncated_normal([IMAGE_SIZE, LAYER1_UNITS]), name='W1')
        W2 = tf.Variable(tf.truncated_normal([LAYER1_UNITS, LAYER2_UNITS]), name='W2')
        W3 = tf.Variable(tf.truncated_normal([LAYER2_UNITS, LAYER3_UNITS]), name='W3')
        b3 = tf.Variable(tf.zeros(LAYER3_UNITS), name='b3')
        # params = [W1, W2, W3, b3]

    def batch_normalization(X, is_train):
        eps = EPSILON
        output_dim = int(X.shape[-1])

        gamma = tf.Variable(tf.truncated_normal([output_dim], stddev=0.1, mean=1.0), name='gamma')
        beta = tf.Variable(tf.zeros([output_dim]), name='beta')

        pop_mean = tf.Variable(tf.zeros([output_dim]), trainable=False)
        pop_var = tf.Variable(tf.ones([output_dim]), trainable=False)

        def sample():
            mean_X, var_X = tf.nn.moments(X, [0])
            # Save exponential average
            moving_avg_op = tf.group(
                pop_mean.assign(DECAY * pop_mean + (1 - DECAY) * mean_X),
                pop_var.assign(DECAY * pop_var + (1 - DECAY) * var_X),
            )

            with tf.control_dependencies([moving_avg_op]):
                batch_size = tf.to_float(tf.shape(X)[0])
                unbiased_var_X = batch_size / (batch_size - 1) * var_X
                return tf.nn.batch_normalization(X, mean_X, var_X, beta, gamma, eps)

        def population():
            return tf.nn.batch_normalization(X, pop_mean, pop_var, beta, gamma, eps)

        return tf.cond(is_train, lambda: sample(), lambda: population())


    u1 = batch_normalization(tf.matmul(images, W1), is_train)
    z1 = tf.nn.relu(u1)
    u2 = batch_normalization(tf.matmul(z1, W2), is_train)
    z2 = tf.nn.relu(u2)
    u3 = tf.matmul(z2, W3) + b3
    y = u3
    # y: [BATCH_SIZE x category_size]

    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=_labels))
    predicted = tf.argmax(y, axis=1)
    accuracy = tf.reduce_mean(tf.cast(tf.equal(predicted, labels), tf.float32))

    # Update network params
    params = tf.trainable_variables()
#     grads = tf.gradients(cost, params)
#     updates = [v.assign_add(- ETA * gv) for v, gv in zip(params, grads)]
#     train = tf.group(*updates)
    train = EveOptimizer().minimize(cost, params)

    # Training
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        for epoch in range(EPOCHS):
            epoch_loss = 0
            epoch_accuracy = 0

            # Train
            step = 0
            while step * BATCH_SIZE < TRAIN_DATA_SIZE:
                start_idx = step * BATCH_SIZE
                end_idx = start_idx + BATCH_SIZE
                if not end_idx < TRAIN_DATA_SIZE:
                    end_idx = TRAIN_DATA_SIZE

                batch_X, batch_y = train_X[start_idx:end_idx, :], train_y[start_idx:end_idx]
                _, c, a = sess.run([train, cost, accuracy],
                                   feed_dict={images: batch_X, labels: batch_y,
                                              is_train: True})
                epoch_loss += c
                epoch_accuracy += a
                step += 1

            if epoch % 100 == 0:
                format_str = "Epoch %d out of %d --- LOSS: %f --- ACCURACY: %f"
                print(format_str % (epoch, EPOCHS, epoch_loss, epoch_accuracy / step))

        # Evaluation
        pred_y = sess.run(predicted, feed_dict={images: test_X, is_train: False})
        elapsed_time = (time.time() - start_time) / 60  # min
        print("Elapsed time: %f min" % elapsed_time)
        return pred_y


In [9]:
validate_homework()

AttributeError: 'EveOptimizer' object has no attribute 'named_slot'

In [3]:
from sklearn.utils import shuffle
from sklearn.metrics import f1_score
from sklearn.datasets import fetch_mldata
from sklearn.model_selection import train_test_split

import numpy as np
import tensorflow as tf

# del [
#     tf.app,
#     tf.compat,
#     tf.contrib,
#     tf.errors,
#     tf.gfile,
#     tf.graph_util,
#     tf.image,
#     tf.layers,
#     tf.logging,
#     tf.losses,
#     tf.metrics,
#     tf.python_io,
#     tf.resource_loader,
#     tf.saved_model,
#     tf.sdca,
#     tf.sets,
#     tf.summary,
#     tf.sysconfig,
#     tf.test,
#     tf.train
# ]

def load_mnist():
    mnist = fetch_mldata('MNIST original')
    mnist_X, mnist_y = shuffle(mnist.data.astype('float32'),
                               mnist.target.astype('int32'), random_state=42)

    mnist_X = mnist_X / 255.0

    return train_test_split(mnist_X, mnist_y,
                test_size=0.2,
                random_state=42)

def validate_homework():
    train_X, test_X, train_y, test_y = load_mnist()

    # validate for small dataset
#     train_X_mini = train_X[:100]
#     train_y_mini = train_y[:100]
    train_X_mini = train_X[:5000]
    train_y_mini = train_y[:5000]
    test_X_mini = test_X[:100]
    test_y_mini = test_y[:100]

    pred_y = homework(train_X_mini, train_y_mini, test_X_mini)
    print(test_y_mini)
    print(pred_y)
    print(f1_score(test_y_mini, pred_y, average='macro'))

def score_homework():
    train_X, test_X, train_y, test_y = load_mnist()
    pred_y = homework(train_X, train_y, test_X)
    print(f1_score(test_y, pred_y, average='macro'))

In [30]:
def homework(train_X, train_y, test_X):
    import time
    start_time = time.time()

    IMAGE_SIZE = 784
    CATEGORY_NUM = 10
    ETA = 0.15
    EPSILON = 1e-5
    DECAY = 0.99

    LAYER1_UNITS = 100
    LAYER2_UNITS = 100
    LAYER3_UNITS = CATEGORY_NUM

    TRAIN_DATA_SIZE = len(train_X)

    tf.reset_default_graph()

    # todo(matthew): Add operation to calculate data ave and var before test evaluation

    with tf.variable_scope('Placeholders'):
        images = tf.placeholder(tf.float32, [None, IMAGE_SIZE], name='image_data')
        labels = tf.placeholder(tf.int64, None, name='label')
        _labels = tf.one_hot(labels, depth=CATEGORY_NUM, on_value=1.0, off_value=0.0, dtype=tf.float32)
        is_train = tf.placeholder(tf.bool, name='is_train')

    with tf.variable_scope('NetworkParams'):
        W1 = tf.Variable(tf.truncated_normal([IMAGE_SIZE, LAYER1_UNITS]), name='W1')
        W2 = tf.Variable(tf.truncated_normal([LAYER1_UNITS, LAYER2_UNITS]), name='W2')
        W3 = tf.Variable(tf.truncated_normal([LAYER2_UNITS, LAYER3_UNITS]), name='W3')
        b3 = tf.Variable(tf.zeros(LAYER3_UNITS), name='b3')
        # params = [W1, W2, W3, b3]

    def batch_normalization(X, is_train):
        eps = EPSILON
        output_dim = int(X.shape[-1])

        gamma = tf.Variable(tf.truncated_normal([output_dim], stddev=0.1, mean=1.0), name='gamma')
        beta = tf.Variable(tf.zeros([output_dim]), name='beta')

        pop_mean = tf.Variable(tf.zeros([output_dim]), trainable=False)
        pop_var = tf.Variable(tf.ones([output_dim]), trainable=False)

        def sample():
            mean_X, var_X = tf.nn.moments(X, [0])
            # Save exponential average
            moving_avg_op = tf.group(
                pop_mean.assign(DECAY * pop_mean + (1 - DECAY) * mean_X),
                pop_var.assign(DECAY * pop_var + (1 - DECAY) * var_X),
            )

            with tf.control_dependencies([moving_avg_op]):
                batch_size = tf.to_float(tf.shape(X)[0])
                unbiased_var_X = batch_size / (batch_size - 1) * var_X
                return tf.nn.batch_normalization(X, mean_X, var_X, beta, gamma, eps)

        def population():
            return tf.nn.batch_normalization(X, pop_mean, pop_var, beta, gamma, eps)

        return tf.cond(is_train, lambda: sample(), lambda: population())


    u1 = batch_normalization(tf.matmul(images, W1), is_train)
    z1 = tf.nn.relu(u1)
    u2 = batch_normalization(tf.matmul(z1, W2), is_train)
    z2 = tf.nn.relu(u2)
    u3 = tf.matmul(z2, W3) + b3
    y = u3
    # y: [BATCH_SIZE x category_size]

    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=_labels))
    predicted = tf.argmax(y, axis=1)
    accuracy = tf.reduce_mean(tf.cast(tf.equal(predicted, labels), tf.float32))

    # Update network params
    params = tf.trainable_variables()
#     grads = tf.gradients(cost, params)
#     updates = [v.assign_add(- ETA * gv) for v, gv in zip(params, grads)]
#     train = tf.group(*updates)
    train = EveOptimizer().minimize(cost, params)
#     train = tf.train.AdamOptimizer().minimize(cost)

    # Prepare data
    t_X, v_X, t_y, v_y = train_test_split(train_X, train_y)
    
    # Training
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        
        train_summary_writer = tf.summary.FileWriter("./optimizer/train", sess.graph)

        EPOCHS = 200
        BATCH_SIZE = 30
        STEPS_IN_EPOCHS = len(t_X) // BATCH_SIZE

        for epoch in range(EPOCHS):
            # Train
            epoch_loss = 0
            epoch_accuracy = 0
            for step in range(STEPS_IN_EPOCHS):
                start_idx = step * BATCH_SIZE
                end_idx = start_idx + BATCH_SIZE

                batch_X, batch_y = train_X[start_idx:end_idx], train_y[start_idx:end_idx]
                _, c, a = sess.run([train, cost, accuracy],
                                   feed_dict={images: t_X[start_idx:end_idx],
                                              labels: t_y[start_idx:end_idx],
                                              is_train: True})
                epoch_loss += c
                epoch_accuracy += a

            c, p = sess.run([cost, predicted],
                            feed_dict={images: v_X, labels: v_y, is_train: False})
            if epoch % 10 == 0:
                format_str = "Epoch %d   --- Train loss: %f   ---Train accuracy: %f   ---Test loss: %f   --- F: %f"
                print(format_str % (epoch, epoch_loss, epoch_accuracy / step, c, f1_score(v_y, p, average='macro')))
                
        train_summary_writer.close()

        # Evaluation
        pred_y = sess.run(predicted, feed_dict={images: test_X, is_train: False})
        elapsed_time = (time.time() - start_time) / 60  # min
        print("Elapsed time: %f min" % elapsed_time)
        return pred_y


In [69]:
import tensorflow as tf
from tensorflow.python.framework import ops
from tensorflow.python.ops import variable_scope
from tensorflow.python.util import nest


def _var_key(var):
    return (var.op.graph, var.op.name)


def _valid_dtypes():
    # Valid types for loss, variables, and gradients
    return set([tf.float16, tf.float32, tf.float64])


def _assert_valid_dtypes(tensors):
    valid_dtypes = _valid_dtypes()
    for t in tensors:
        dtype = t.dtype.base_dtype
        if dtype not in valid_dtypes:
            raise ValueError("Invalid type %r for %s, expected: %s"
                             % (dtype, t.name, [v for v in valid_dtypes]))


class EveOptimizer:
    def __init__(self, learning_rate=.001, beta1=0.1, beta2=0.999, beta3=0.999, epsilon=1e-8,
                 k=.1, K=10, name="Eve"):
        self._lr = learning_rate
        self._beta1 = beta1
        self._beta2 = beta2
        self._beta3 = beta3
        self._epsilon = epsilon
        self._k = k
        self._K = K

        self._delte = None
        self._Delte = None
        self._beta1_power = None
        self._beta2_power = None
        # self._f_1_hat = None
        self._f_2_hat = None
        self._step = None

        # {slot1: {var1: xxx, var2: xxx, ...}, slot2: {var1: xxx, var2: xxx, ...}, ...}
        self._slots = {}

    def _get_beta_accumulators(self):
        return self._beta1_power, self._beta2_power

    def _accumulate_beta(self):
        op1 = tf.assign(self._beta1_power, self._beta1_power * self._beta1)
        op2 = tf.assign(self._beta1_power, self._beta1_power * self._beta1)
        return tf.group(op1, op2)

    def _create_algo_params(self, var_list):
        if (self._beta1_power is None or self._beta1_power.graph is not var_list[0].graph):
            with ops.colocate_with(var_list[0]):
                self._beta1_power = tf.Variable(0., name='beta1_power',
                                                trainable=False)
                self._beta2_power = tf.Variable(0., name='beta2_power',
                                                trainable=False)
                self._delta = tf.Variable(self._k + 1., name='delta', trainable=False)
                self._Delta = tf.Variable(self._K + 1., name='Delta', trainable=False)
                # self._f_1_hat = variable_scope.variable(0, name='f_1_hat', trainable=False)
                self._f_2_hat = tf.Variable(0., name='f_2_hat', trainable=False)
                self._step = tf.Variable(0, name='step', trainable=False)

            for v in var_list:
                self._create_slot(v, tf.zeros(v.shape), "m")
                self._create_slot(v, tf.zeros(v.shape), "v")
                self._create_slot(v, tf.ones(v.shape), "d")

    def _create_slot(self, var, initializer, slot_name):
        if slot_name not in self._slots.keys():
            self._slots[slot_name] = {}
        named_slot = self._slots[slot_name]

        with tf.variable_scope(slot_name):
            named_slot[_var_key(var)] = tf.Variable(initializer, name=var.op.name, trainable=False)

    def _get_slot(self, var, slot_name):
        named_slot = self._slots[slot_name]
        return named_slot[_var_key(var)]

    def _compute_gradients(self, cost, var_list=None):
        _assert_valid_dtypes([cost])
        if var_list is None:
            var_list = tf.trainable_variables()
        else:
            nest.flatten(var_list)
#         var_list += tf.get_collection(tf.GraphKeys._STREAMING_MODEL_PORTS)
        grads = tf.gradients(cost, var_list)
        grads_and_vars = list(zip(grads, var_list))
        return grads_and_vars

    def minimize(self, cost, var_list=None):
        self._create_algo_params(var_list)

        # [(g1, v1), (g2, v2), ...]
        grads_and_vars = self._compute_gradients(cost, var_list)

        beta_acc_op = self._accumulate_beta()
        updates = []

        with tf.control_dependencies([beta_acc_op]):
            beta1_pow, beta2_pow = self._get_beta_accumulators()
            for g, var in grads_and_vars:
                m = self._get_slot(var, "m")
                op_mt = tf.assign(m, self._beta1 * m + (1 - self._beta1) * g)
                with tf.control_dependencies([op_mt]):
                    m_t_hat = m / (1 - beta1_pow)

                v = self._get_slot(var, "v")
                op_vt = tf.assign(v, self._beta2 * v + (1 - self._beta2) * g * g)
                with tf.control_dependencies([op_vt]):
                    v_t_hat = v / (1 - beta2_pow)

                d = self._get_slot(var, "d")

                def first_step():
                    op1 = tf.assign(self._f_2_hat, cost)
                    op2 = tf.assign(d, tf.ones(d.shape))
                    return tf.group(op1, op2)

                def other_step():
                    delte, Delte = tf.cond(
                        cost > self._f_2_hat,
                        lambda: (self._k + tf.constant(1.), self._K + tf.constant(1.)),
                        lambda: (tf.constant(1.) / (self._K + tf.constant(1.)), tf.constant(1.) / (self._k + tf.constant(1.))))
                    c_t = tf.minimum(tf.maximum(delte, cost / self._f_2_hat), Delte)
                    f_1_hat = c_t * self._f_2_hat
                    r_t = tf.abs(f_1_hat - self._f_2_hat) / tf.minimum(f_1_hat, self._f_2_hat)

                    op1 = tf.assign(d, self._beta3 * d + (1 - self._beta3) * r_t)
                    op2 = tf.assign(self._f_2_hat, f_1_hat)
                    return tf.group(op1, op2)

                update_algo_params = tf.cond(
                    self._step > tf.constant(0, dtype=tf.int32),
                    other_step, first_step)
                with tf.control_dependencies([update_algo_params]):
                    update_param = var.assign_add(
                        - self._lr * m_t_hat / (d * tf.sqrt(v_t_hat) + self._epsilon))

                updates.append(update_param)

            update_step = self._step.assign_add(1)
            updates.append(update_step)
            return tf.group(*updates)


In [82]:
# With Eve
validate_homework()

Epoch 0   --- Train loss: 182.643246   ---Train accuracy: 0.694086   ---Test loss: 1.102165   --- F: 0.804541
Epoch 10   --- Train loss: 4.772926   ---Train accuracy: 1.005645   ---Test loss: 0.541614   --- F: 0.873027
Epoch 20   --- Train loss: 1.073665   ---Train accuracy: 1.008065   ---Test loss: 0.550403   --- F: 0.882928
Epoch 30   --- Train loss: 0.390026   ---Train accuracy: 1.008065   ---Test loss: 0.564469   --- F: 0.888620
Epoch 40   --- Train loss: 0.161135   ---Train accuracy: 1.008065   ---Test loss: 0.580563   --- F: 0.886828
Epoch 50   --- Train loss: 0.071664   ---Train accuracy: 1.008065   ---Test loss: 0.596380   --- F: 0.890871
Epoch 60   --- Train loss: 0.033050   ---Train accuracy: 1.008065   ---Test loss: 0.615038   --- F: 0.896486
Epoch 70   --- Train loss: 0.015921   ---Train accuracy: 1.008065   ---Test loss: 0.635010   --- F: 0.899031
Epoch 80   --- Train loss: 0.007977   ---Train accuracy: 1.008065   ---Test loss: 0.655579   --- F: 0.899711
Epoch 90   --- Tra

In [84]:
# With Adam
validate_homework()

Epoch 0   --- Train loss: 697.326056   ---Train accuracy: 0.252688   ---Test loss: 4.859257   --- F: 0.389220
Epoch 10   --- Train loss: 29.610290   ---Train accuracy: 0.948387   ---Test loss: 0.715975   --- F: 0.827468
Epoch 20   --- Train loss: 6.256111   ---Train accuracy: 1.003763   ---Test loss: 0.621250   --- F: 0.856757
Epoch 30   --- Train loss: 1.585718   ---Train accuracy: 1.008065   ---Test loss: 0.612522   --- F: 0.867729
Epoch 40   --- Train loss: 0.612943   ---Train accuracy: 1.008065   ---Test loss: 0.618257   --- F: 0.870161
Epoch 50   --- Train loss: 0.270264   ---Train accuracy: 1.008065   ---Test loss: 0.631642   --- F: 0.875146
Epoch 60   --- Train loss: 0.125036   ---Train accuracy: 1.008065   ---Test loss: 0.648230   --- F: 0.879929
Epoch 70   --- Train loss: 0.059453   ---Train accuracy: 1.008065   ---Test loss: 0.667984   --- F: 0.883164
Epoch 80   --- Train loss: 0.028579   ---Train accuracy: 1.008065   ---Test loss: 0.689095   --- F: 0.886307
Epoch 90   --- Tr

In [3]:
from sklearn.utils import shuffle
from sklearn.metrics import f1_score
from sklearn.datasets import fetch_mldata
from sklearn.model_selection import train_test_split

import numpy as np
import tensorflow as tf

# del [
#     tf.app,
#     tf.compat,
#     tf.contrib,
#     tf.errors,
#     tf.gfile,
#     tf.graph_util,
#     tf.image,
#     tf.layers,
#     tf.logging,
#     tf.losses,
#     tf.metrics,
#     tf.python_io,
#     tf.resource_loader,
#     tf.saved_model,
#     tf.sdca,
#     tf.sets,
#     tf.summary,
#     tf.sysconfig,
#     tf.test,
#     tf.train
# ]

def load_mnist():
    mnist = fetch_mldata('MNIST original')
    mnist_X, mnist_y = shuffle(mnist.data.astype('float32'),
                               mnist.target.astype('int32'), random_state=42)

    mnist_X = mnist_X / 255.0

    return train_test_split(mnist_X, mnist_y,
                test_size=0.2,
                random_state=42)

def validate_homework():
    train_X, test_X, train_y, test_y = load_mnist()

    # validate for small dataset
#     train_X_mini = train_X[:100]
#     train_y_mini = train_y[:100]
    train_X_mini = train_X[:5000]
    train_y_mini = train_y[:5000]
    test_X_mini = test_X[:100]
    test_y_mini = test_y[:100]

    pred_y = homework(train_X_mini, train_y_mini, test_X_mini)
    print(test_y_mini)
    print(pred_y)
    print(f1_score(test_y_mini, pred_y, average='macro'))

def score_homework():
    train_X, test_X, train_y, test_y = load_mnist()
    pred_y = homework(train_X, train_y, test_X)
    print(f1_score(test_y, pred_y, average='macro'))

In [55]:
import tensorflow as tf
from tensorflow.python.framework import ops
from tensorflow.python.ops import variable_scope
from tensorflow.python.util import nest


def _var_key(var):
    return (var.op.graph, var.op.name)


def _valid_dtypes():
    # Valid types for loss, variables, and gradients
    return set([tf.float16, tf.float32, tf.float64])


def _assert_valid_dtypes(tensors):
    valid_dtypes = _valid_dtypes()
    for t in tensors:
        dtype = t.dtype.base_dtype
        if dtype not in valid_dtypes:
            raise ValueError("Invalid type %r for %s, expected: %s"
                             % (dtype, t.name, [v for v in valid_dtypes]))


class EveOptimizer:
    def __init__(self, learning_rate=.001, beta1=0.9, beta2=0.999, beta3=0.999, epsilon=1e-8,
                 k=.1, K=10, name="Eve"):
        self._lr = learning_rate
        self._beta1 = beta1
        self._beta2 = beta2
        self._beta3 = beta3
        self._epsilon = epsilon
        self._k = k
        self._K = K

        self._delte = None
        self._Delte = None
        self._beta1_power = None
        self._beta2_power = None
        # self._f_1_hat = None
        self._f_2_hat = None
        self._step = None

        # {slot1: {var1: xxx, var2: xxx, ...}, slot2: {var1: xxx, var2: xxx, ...}, ...}
        self._slots = {}

    def _get_beta_accumulators(self):
        return self._beta1_power, self._beta2_power

    def _accumulate_beta(self):
        op1 = tf.assign(self._beta1_power, self._beta1_power * self._beta1)
        op2 = tf.assign(self._beta2_power, self._beta2_power * self._beta2)
        return tf.group(op1, op2)

    def _create_algo_params(self, var_list):
        if (self._beta1_power is None or self._beta1_power.graph is not var_list[0].graph):
            with ops.colocate_with(var_list[0]):
                self._beta1_power = tf.Variable(1., name='beta1_power',
                                                trainable=False)
                self._beta2_power = tf.Variable(1., name='beta2_power',
                                                trainable=False)
                self._f_2_hat = tf.Variable(0., name='f_2_hat', trainable=False)
                self._step = tf.Variable(1, name='step', trainable=False)

            for v in var_list:
                self._create_slot(v, tf.zeros(v.shape), "m")
                self._create_slot(v, tf.zeros(v.shape), "v")
                self._create_slot(v, tf.ones(v.shape), "d")

    def _create_slot(self, var, initializer, slot_name):
        if slot_name not in self._slots.keys():
            self._slots[slot_name] = {}
        named_slot = self._slots[slot_name]

        with tf.variable_scope(slot_name):
            named_slot[_var_key(var)] = tf.Variable(initializer, name=var.op.name, trainable=False)

    def _get_slot(self, var, slot_name):
        named_slot = self._slots[slot_name]
        return named_slot[_var_key(var)]

    def _compute_gradients(self, cost, var_list=None):
        _assert_valid_dtypes([cost])
        if var_list is None:
            var_list = tf.trainable_variables()
        else:
            nest.flatten(var_list)
        grads = tf.gradients(cost, var_list)
        grads_and_vars = list(zip(grads, var_list))
        return grads_and_vars

    def minimize(self, cost, var_list=None):
        self._create_algo_params(var_list)

        # [(g1, v1), (g2, v2), ...]
        grads_and_vars = self._compute_gradients(cost, var_list)

        beta_acc_op = self._accumulate_beta()
        updates = []

        # u1 = self._get_slot(var_list[1], "m")
        # u2 = self._get_slot(var_list[2], "m")
        # u3 = self._get_slot(var_list[3], "m")
        # pr_out = tf.Print(u1, [u1, u2, u3], "outside")
        # updates.append(pr_out)

        with tf.control_dependencies([beta_acc_op]):
            beta1_pow, beta2_pow = self._get_beta_accumulators()
            # updates.append(tf.assert_less_equal(self._step, 7))
            for g, var in grads_and_vars:
                m = self._get_slot(var, "m")

                # pr_before = tf.Print(m, [self._step, beta1_pow, var.op.name, m, self._beta1 * m + (1 - self._beta1) * g], "m before update: ")
                # updates.append(pr_before)

                op_mt = tf.assign(m, self._beta1 * m + (1 - self._beta1) * g)
                with tf.control_dependencies([op_mt]):
                    updates.append(tf.assert_greater(1 - beta1_pow, 0.))
                    updates.append(tf.Print(beta1_pow, [self._step, beta1_pow], "after beta1_pow"))
                    m_t_hat = m / (1 - beta1_pow)
                    # pr_m = tf.Print(m_t_hat, [self._step, beta1_pow, var.op.name, m, m_t_hat], "m after update: ")
                    # updates.append(pr_m)

                v = self._get_slot(var, "v")
                op_vt = tf.assign(v, self._beta2 * v + (1 - self._beta2) * g * g)
                with tf.control_dependencies([op_vt]):
                    updates.append(tf.assert_greater(1 - beta2_pow, 0.))
                    updates.append(tf.Print(beta2_pow, [self._step, beta2_pow], "after beta2_pow"))
                    v_t_hat = v / (1 - beta2_pow)

                d = self._get_slot(var, "d")

                def first_step():
                    op1 = tf.assign(self._f_2_hat, cost)
                    op2 = tf.assign(d, tf.ones(d.shape))
                    return tf.group(op1, op2)

                def other_step():
                    delte, Delte = tf.cond(
                        tf.greater_equal(cost, self._f_2_hat),
                        lambda: (tf.constant(self._k + 1.), tf.constant(self._K + 1.)),
                        lambda: (tf.constant(1. / (self._K + 1.)), tf.constant(1. / (self._k + 1))))
                    c_t = tf.minimum(tf.maximum(delte, cost / self._f_2_hat), Delte)
                    f_1_hat = c_t * self._f_2_hat
                    r_t = tf.abs(f_1_hat - self._f_2_hat) / tf.minimum(f_1_hat, self._f_2_hat)

                    op1 = tf.assign(d, self._beta3 * d + (1 - self._beta3) * r_t)
                    op2 = tf.assign(self._f_2_hat, f_1_hat)
                    return tf.group(op1, op2)

                update_algo_params = tf.cond(
                    tf.greater(self._step, tf.constant(1, dtype=tf.int32)), other_step, first_step)
                with tf.control_dependencies([update_algo_params]):
                    update_param = var.assign_add(
                        - self._lr * m_t_hat / (d * tf.sqrt(v_t_hat) + self._epsilon))

                updates.append(update_param)

            update_step = self._step.assign_add(1)
            pr_step = tf.Print(update_step, [self._step], "pritn step at the last")
            updates.append(update_step)
            updates.append(pr_step)
            return tf.group(*updates)


In [56]:
# With Eve
validate_homework()

Epoch 0   --- Train loss: 729.305761   ---Train accuracy: 0.262097   ---Test loss: 4.289254   --- F: 0.408397
Epoch 10   --- Train loss: 13.529293   ---Train accuracy: 0.989785   ---Test loss: 0.646005   --- F: 0.837990
Epoch 20   --- Train loss: 1.040855   ---Train accuracy: 1.008065   ---Test loss: 0.545028   --- F: 0.862334
Epoch 30   --- Train loss: 0.190975   ---Train accuracy: 1.008065   ---Test loss: 0.520026   --- F: 0.877986
Epoch 40   --- Train loss: 0.039365   ---Train accuracy: 1.008065   ---Test loss: 0.507946   --- F: 0.882167
Epoch 50   --- Train loss: 0.008091   ---Train accuracy: 1.008065   ---Test loss: 0.500289   --- F: 0.885496
Epoch 60   --- Train loss: 0.002499   ---Train accuracy: 1.008065   ---Test loss: 0.507166   --- F: 0.891848
Epoch 70   --- Train loss: 0.000824   ---Train accuracy: 1.008065   ---Test loss: 0.518447   --- F: 0.891984
Epoch 80   --- Train loss: 0.000429   ---Train accuracy: 1.008065   ---Test loss: 0.530042   --- F: 0.895657
Epoch 90   --- Tr

In [2]:
def homework(train_X, train_y, test_X):
    import time
    start_time = time.time()

    IMAGE_SIZE = 784
    CATEGORY_NUM = 10
    ETA = 0.15
    EPSILON = 1e-5
    DECAY = 0.99

    LAYER1_UNITS = 100
    LAYER2_UNITS = 100
    LAYER3_UNITS = CATEGORY_NUM

    TRAIN_DATA_SIZE = len(train_X)

    tf.reset_default_graph()

    # todo(matthew): Add operation to calculate data ave and var before test evaluation

    with tf.variable_scope('Placeholders'):
        images = tf.placeholder(tf.float32, [None, IMAGE_SIZE], name='image_data')
        labels = tf.placeholder(tf.int64, None, name='label')
        _labels = tf.one_hot(labels, depth=CATEGORY_NUM, on_value=1.0, off_value=0.0, dtype=tf.float32)
        is_train = tf.placeholder(tf.bool, name='is_train')

    with tf.variable_scope('NetworkParams'):
        W1 = tf.Variable(tf.truncated_normal([IMAGE_SIZE, LAYER1_UNITS]), name='W1')
        W2 = tf.Variable(tf.truncated_normal([LAYER1_UNITS, LAYER2_UNITS]), name='W2')
        W3 = tf.Variable(tf.truncated_normal([LAYER2_UNITS, LAYER3_UNITS]), name='W3')
        b3 = tf.Variable(tf.zeros(LAYER3_UNITS), name='b3')
        # params = [W1, W2, W3, b3]

    def batch_normalization(X, is_train):
        eps = EPSILON
        output_dim = int(X.shape[-1])

        gamma = tf.Variable(tf.truncated_normal([output_dim], stddev=0.1, mean=1.0), name='gamma')
        beta = tf.Variable(tf.zeros([output_dim]), name='beta')

        pop_mean = tf.Variable(tf.zeros([output_dim]), trainable=False)
        pop_var = tf.Variable(tf.ones([output_dim]), trainable=False)

        def sample():
            mean_X, var_X = tf.nn.moments(X, [0])
            # Save exponential average
            moving_avg_op = tf.group(
                pop_mean.assign(DECAY * pop_mean + (1 - DECAY) * mean_X),
                pop_var.assign(DECAY * pop_var + (1 - DECAY) * var_X),
            )

            with tf.control_dependencies([moving_avg_op]):
                batch_size = tf.to_float(tf.shape(X)[0])
                unbiased_var_X = batch_size / (batch_size - 1) * var_X
                return tf.nn.batch_normalization(X, mean_X, var_X, beta, gamma, eps)

        def population():
            return tf.nn.batch_normalization(X, pop_mean, pop_var, beta, gamma, eps)

        return tf.cond(is_train, lambda: sample(), lambda: population())


    u1 = batch_normalization(tf.matmul(images, W1), is_train)
    z1 = tf.nn.relu(u1)
    u2 = batch_normalization(tf.matmul(z1, W2), is_train)
    z2 = tf.nn.relu(u2)
    u3 = tf.matmul(z2, W3) + b3
    y = u3
    # y: [BATCH_SIZE x category_size]

    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=_labels))
    predicted = tf.argmax(y, axis=1)
    accuracy = tf.reduce_mean(tf.cast(tf.equal(predicted, labels), tf.float32))

    # Update network params
    params = tf.trainable_variables()
#     grads = tf.gradients(cost, params)
#     updates = [v.assign_add(- ETA * gv) for v, gv in zip(params, grads)]
#     train = tf.group(*updates)
    train = EveOptimizer().minimize(cost, params)
#     train = tf.train.AdamOptimizer().minimize(cost)

    # Prepare data
    t_X, v_X, t_y, v_y = train_test_split(train_X, train_y)
    
    # Training
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        
        train_summary_writer = tf.summary.FileWriter("./optimizer/train", sess.graph)

        EPOCHS = 200
        BATCH_SIZE = 30
        STEPS_IN_EPOCHS = len(t_X) // BATCH_SIZE

        for epoch in range(EPOCHS):
            # Train
            epoch_loss = 0
            epoch_accuracy = 0
            for step in range(STEPS_IN_EPOCHS):
                start_idx = step * BATCH_SIZE
                end_idx = start_idx + BATCH_SIZE

                batch_X, batch_y = train_X[start_idx:end_idx], train_y[start_idx:end_idx]
                _, c, a = sess.run([train, cost, accuracy],
                                   feed_dict={images: t_X[start_idx:end_idx],
                                              labels: t_y[start_idx:end_idx],
                                              is_train: True})
                epoch_loss += c
                epoch_accuracy += a

            c, p = sess.run([cost, predicted],
                            feed_dict={images: v_X, labels: v_y, is_train: False})
            if epoch % 10 == 0:
                format_str = "Epoch %d   --- Train loss: %f   ---Train accuracy: %f   ---Test loss: %f   --- F: %f"
                print(format_str % (epoch, epoch_loss, epoch_accuracy / step, c, f1_score(v_y, p, average='macro')))
                
        train_summary_writer.close()

        # Evaluation
        pred_y = sess.run(predicted, feed_dict={images: test_X, is_train: False})
        elapsed_time = (time.time() - start_time) / 60  # min
        print("Elapsed time: %f min" % elapsed_time)
        return pred_y


In [5]:
import tensorflow as tf
from tensorflow.python.framework import ops
from tensorflow.python.ops import variable_scope
from tensorflow.python.util import nest


def _var_key(var):
    return (var.op.graph, var.op.name)


def _valid_dtypes():
    # Valid types for loss, variables, and gradients
    return set([tf.float16, tf.float32, tf.float64])


def _assert_valid_dtypes(tensors):
    valid_dtypes = _valid_dtypes()
    for t in tensors:
        dtype = t.dtype.base_dtype
        if dtype not in valid_dtypes:
            raise ValueError("Invalid type %r for %s, expected: %s"
                             % (dtype, t.name, [v for v in valid_dtypes]))


class EveOptimizer:
    def __init__(self, learning_rate=.001, beta1=0.9, beta2=0.999, beta3=0.999, epsilon=1e-8,
                 k=.1, K=10, name="Eve"):
        self._lr = tf.constant(learning_rate, name='learning_rate')
        self._beta1 = tf.constant(beta1, name='beta1')
        self._beta2 = tf.constant(beta2, name='beta2')
        self._beta3 = tf.constant(beta3, name='beta3')
        self._epsilon = tf.constant(epsilon, name='epsilon')
        self._k = k
        self._K = K

        self._beta1_power = None
        self._beta2_power = None
        self._f1_hat = None
        self._step = None

        # {slot1: {var1: xxx, var2: xxx, ...}, slot2: {var1: xxx, var2: xxx, ...}, ...}
        self._slots = {}

    def _get_beta_accumulators(self):
        return self._beta1_power, self._beta2_power

    def _accumulate_beta(self):
        op1 = tf.assign(self._beta1_power, self._beta1_power * self._beta1)
        op2 = tf.assign(self._beta2_power, self._beta2_power * self._beta2)
        return tf.group(op1, op2)

    def _create_algo_params(self, var_list):
        if self._beta1_power is None or self._beta1_power.graph is not var_list[0].graph:
            with ops.colocate_with(var_list[0]):
                with tf.name_scope('Eve'):
                    self._beta1_power = tf.Variable(1., name='beta1_power',
                                                    trainable=False)
                    self._beta2_power = tf.Variable(1., name='beta2_power',
                                                    trainable=False)
                    self._f1_hat = tf.Variable(0., name='f1_hat', trainable=False)
                    self._step = tf.Variable(1, name='step', trainable=False)

        for v in var_list:
            with tf.name_scope('Eve'):
                self._create_slot(v, tf.zeros(v.shape), "m")
                self._create_slot(v, tf.zeros(v.shape), "v")
                self._create_slot(v, tf.ones(v.shape), "d")

    def _create_slot(self, var, initializer, slot_name):
        """
        Create slots only in var's slot hasn't been registered.
        """
        if slot_name not in self._slots.keys():
            self._slots[slot_name] = {}
        named_slot = self._slots[slot_name]

        if _var_key(var) not in named_slot:
            with tf.variable_scope(slot_name):
                named_slot[_var_key(var)] = tf.Variable(initializer, name=var.op.name, trainable=False)

    def _get_slot(self, var, slot_name):
        named_slot = self._slots[slot_name]
        return named_slot[_var_key(var)]

    def _compute_gradients(self, cost, var_list=None):
        _assert_valid_dtypes([cost])
        if var_list is None:
            var_list = tf.trainable_variables()
        else:
            nest.flatten(var_list)
        grads = tf.gradients(cost, var_list)
        grads_and_vars = list(zip(grads, var_list))
        return grads_and_vars

    def _apply_gradients(self, grads_and_vars, cost):
        """
        grads_and_vars: [(g1, v1), (g2, v2), ...]
        """
        var_list = [var for g, var in grads_and_vars]
        self._create_algo_params(var_list)

        beta_acc_op = self._accumulate_beta()
        updates = []

        with tf.control_dependencies([beta_acc_op]):
            beta1_pow, beta2_pow = self._get_beta_accumulators()
            for g, var in grads_and_vars:
                m = self._get_slot(var, "m")

                update_m = tf.assign(m, self._beta1 * m + (1 - self._beta1) * g)
                with tf.control_dependencies([update_m]):
                    updates.append(tf.assert_greater(1 - beta1_pow, 0.))
                    m_t_hat = m / (1 - beta1_pow)

                v = self._get_slot(var, "v")
                update_v = tf.assign(v, self._beta2 * v + (1 - self._beta2) * g * g)
                with tf.control_dependencies([update_v]):
                    updates.append(tf.assert_greater(1 - beta2_pow, 0.))
                    v_t_hat = v / (1 - beta2_pow)

                def first_step():
                    op1 = tf.assign(self._f1_hat, cost)
                    op2 = tf.assign(d, tf.ones(d.shape))
                    return tf.group(op1, op2)

                def other_step():
                    delte, Delte = tf.cond(
                        tf.greater_equal(cost, self._f1_hat),
                        lambda: (tf.constant(self._k + 1.), tf.constant(self._K + 1.)),
                        lambda: (tf.constant(1. / (self._K + 1.)), tf.constant(1. / (self._k + 1))))
                    c_t = tf.minimum(tf.maximum(delte, cost / self._f1_hat), Delte)
                    f_1_hat = c_t * self._f1_hat
                    r_t = tf.abs(f_1_hat - self._f1_hat) / tf.minimum(f_1_hat, self._f1_hat)

                    op1 = tf.assign(d, self._beta3 * d + (1 - self._beta3) * r_t)
                    op2 = tf.assign(self._f1_hat, f_1_hat)
                    return tf.group(op1, op2)

                d = self._get_slot(var, "d")
                update_d = tf.cond(tf.greater(self._step, tf.constant(1, dtype=tf.int32)),
                                   other_step, first_step)
                with tf.control_dependencies([update_d]):
                    update_param = var.assign_add(
                        - self._lr * m_t_hat / (d * tf.sqrt(v_t_hat) + self._epsilon))
                updates.append(update_param)

            # Update self._step
            update_step = self._step.assign_add(1)
            updates.append(update_step)

        return tf.group(*updates)

    def minimize(self, cost, var_list=None):
        grads_and_vars = self._compute_gradients(cost, var_list)
        updates = self._apply_gradients(grads_and_vars, cost)
        return updates


In [6]:
validate_homework()

Epoch 0   --- Train loss: 688.653606   ---Train accuracy: 0.276882   ---Test loss: 4.359731   --- F: 0.416045
Epoch 10   --- Train loss: 16.627619   ---Train accuracy: 0.981452   ---Test loss: 0.568213   --- F: 0.861912
Epoch 20   --- Train loss: 1.548400   ---Train accuracy: 1.008065   ---Test loss: 0.530950   --- F: 0.882425
Epoch 30   --- Train loss: 0.361680   ---Train accuracy: 1.008065   ---Test loss: 0.541090   --- F: 0.891300


KeyboardInterrupt: 