#### Assignment by Ninad Sharma

### Fail 1 : Decreasing the number of layers increases the accuracy.

In [2]:
import tensorflow as tf
import numpy as np


# Load MNIST; Keras hands back a (train, test) pair of (images, labels) tuples.
(train_images, train_labels), (test_images, test_labels) = (
    tf.keras.datasets.mnist.load_data()
)


def preprocess_images(images):
    """Flatten images into rows of 784 values and divide them by 255.

    Args:
        images: NumPy array whose total size is a multiple of 784.

    Returns:
        A float32 array of shape (-1, 784).
    """
    flat = images.reshape(-1, 784)
    return flat.astype(np.float32) / 255


def preprocess_labels(labels):
    """Flatten the labels to one dimension and cast them to int32.

    Args:
        labels: NumPy array of class indices, any shape.

    Returns:
        A 1-D int32 array with the same values in the same order.
    """
    flat = labels.reshape(-1)
    return flat.astype(np.int32)


# Bring both splits into the form the network consumes.
train_images, train_labels = preprocess_images(train_images), preprocess_labels(train_labels)
test_images, test_labels = preprocess_images(test_images), preprocess_labels(test_labels)

# Training pipeline: shuffle (buffer of 60000), batch by 128, repeat forever;
# the training loop is responsible for stopping.
train_data = tf.data.Dataset.from_tensor_slices((train_images, train_labels))
train_data = train_data.shuffle(60000).batch(128).repeat()
#test_data = tf.data.Dataset.from_tensor_slices((test_images, test_labels)).batch(128)


# ---- model definition, from input to output ----

# the "super deep" configuration: 8 hidden layers of 100 units
n_units = 100
n_layers = 8
w_range = 0.4    # weights are drawn uniformly from [-w_range, w_range)

# `layers` is the whole model: one [weights, bias] pair per layer, in forward order

# input (784 values) -> first hidden layer
w_input = tf.Variable(
    tf.random.uniform([784, n_units], minval=-w_range, maxval=w_range),
    name="w0")
b_input = tf.Variable(tf.zeros(n_units), name="b0")
layers = [[w_input, b_input]]

# the remaining n_layers - 1 hidden layers map n_units -> n_units
for layer in range(n_layers - 1):
    layer_id = str(layer + 1)
    w = tf.Variable(
        tf.random.uniform([n_units, n_units], minval=-w_range, maxval=w_range),
        name="w" + layer_id)
    b = tf.Variable(tf.zeros(n_units), name="b" + layer_id)
    layers += [[w, b]]

# output layer: n_units -> 10 logits
w_out = tf.Variable(
    tf.random.uniform([n_units, 10], minval=-w_range, maxval=w_range),
    name="wout")
b_out = tf.Variable(tf.zeros(10), name="bout")
layers += [[w_out, b_out]]

# all trainable variables in one flat list: w0, b0, w1, b1, ..., wout, bout
all_variables = [variable for pair in layers for variable in pair]


def model_forward(inputs):
    """Run `inputs` through the network stored in the global `layers` list.

    Every layer but the last computes ``relu(x @ w + b)``; the last layer is
    affine only, so the result is raw logits (no softmax is applied here).

    Args:
        inputs: batch to classify; must be matmul-compatible with the first
            weight matrix in `layers`.

    Returns:
        The logits produced by the final layer.
    """
    hidden_layers, output_layer = layers[:-1], layers[-1]
    activations = inputs
    for weights, bias in hidden_layers:
        activations = tf.nn.relu(tf.matmul(activations, weights) + bias)
    out_weights, out_bias = output_layer
    return tf.matmul(activations, out_weights) + out_bias


# Plain gradient descent settings.
lr = 0.1
train_steps = 2000

# `train_data` repeats forever, so the step counter ends the loop. Steps are
# 0-based and the check is `>`, hence train_steps + 1 updates are performed.
for step, (img_batch, lbl_batch) in enumerate(train_data):
    if step > train_steps:
        break

    # Record the forward pass so the loss can be differentiated.
    with tf.GradientTape() as tape:
        logits = model_forward(img_batch)
        per_example_xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=lbl_batch, logits=logits)
        xent = tf.reduce_mean(per_example_xent)

    # Manual SGD update of every weight and bias.
    grads = tape.gradient(xent, all_variables)
    for var, grad in zip(all_variables, grads):
        var.assign_sub(lr * grad)

    # Every 100th step: loss/accuracy of the current batch (pre-update logits).
    if step % 100 == 0:
        preds = tf.argmax(logits, axis=1, output_type=tf.int32)
        hits = tf.cast(tf.equal(preds, lbl_batch), tf.float32)
        acc = tf.reduce_mean(hits)
        print("Loss: {} Accuracy: {}".format(xent, acc))


# Single evaluation pass over the whole test set.
test_logits = model_forward(test_images)
test_preds = tf.argmax(test_logits, axis=1, output_type=tf.int32)
test_hits = tf.cast(tf.equal(test_preds, test_labels), tf.float32)
acc = tf.reduce_mean(test_hits)
print("Final test accuracy: {}".format(acc))


Loss: 163.30819702148438 Accuracy: 0.15625
Loss: nan Accuracy: 0.125
Loss: nan Accuracy: 0.0859375
Loss: nan Accuracy: 0.078125
Loss: nan Accuracy: 0.125
Loss: nan Accuracy: 0.0625
Loss: nan Accuracy: 0.0859375
Loss: nan Accuracy: 0.1328125
Loss: nan Accuracy: 0.09375
Loss: nan Accuracy: 0.0703125
Loss: nan Accuracy: 0.109375
Loss: nan Accuracy: 0.1171875
Loss: nan Accuracy: 0.078125
Loss: nan Accuracy: 0.0859375
Loss: nan Accuracy: 0.0546875
Loss: nan Accuracy: 0.0859375
Loss: nan Accuracy: 0.1171875
Loss: nan Accuracy: 0.09375
Loss: nan Accuracy: 0.0859375
Loss: nan Accuracy: 0.1171875
Loss: nan Accuracy: 0.09375
Final test accuracy: 0.09799999743700027


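### Solution : Fail 1 - Decreasing the number of hidden layers from 8 to 3 makes training work. Note that the 8-layer run did not underfit: its loss blew up to NaN after the first logged step, i.e. training diverged with this weight range.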

In [7]:
import tensorflow as tf
import numpy as np


# Load MNIST; Keras hands back a (train, test) pair of (images, labels) tuples.
(train_images, train_labels), (test_images, test_labels) = (
    tf.keras.datasets.mnist.load_data()
)


def preprocess_images(images):
    """Flatten images into rows of 784 values and divide them by 255.

    Args:
        images: NumPy array whose total size is a multiple of 784.

    Returns:
        A float32 array of shape (-1, 784).
    """
    flat = images.reshape(-1, 784)
    return flat.astype(np.float32) / 255


def preprocess_labels(labels):
    """Flatten the labels to one dimension and cast them to int32.

    Args:
        labels: NumPy array of class indices, any shape.

    Returns:
        A 1-D int32 array with the same values in the same order.
    """
    flat = labels.reshape(-1)
    return flat.astype(np.int32)


# Bring both splits into the form the network consumes.
train_images, train_labels = preprocess_images(train_images), preprocess_labels(train_labels)
test_images, test_labels = preprocess_images(test_images), preprocess_labels(test_labels)

# Training pipeline: shuffle (buffer of 60000), batch by 128, repeat forever;
# the training loop is responsible for stopping.
train_data = tf.data.Dataset.from_tensor_slices((train_images, train_labels))
train_data = train_data.shuffle(60000).batch(128).repeat()
#test_data = tf.data.Dataset.from_tensor_slices((test_images, test_labels)).batch(128)


# ---- model definition, from input to output ----

n_units = 100

# The fix for this experiment: 3 hidden layers instead of 8. With 8 layers and
# the same w_range the recorded loss turned into NaN right after the first step.
n_layers = 3
w_range = 0.4    # weights are drawn uniformly from [-w_range, w_range)

# `layers` is the whole model: one [weights, bias] pair per layer, in forward order

# input (784 values) -> first hidden layer
w_input = tf.Variable(
    tf.random.uniform([784, n_units], minval=-w_range, maxval=w_range),
    name="w0")
b_input = tf.Variable(tf.zeros(n_units), name="b0")
layers = [[w_input, b_input]]

# the remaining n_layers - 1 hidden layers map n_units -> n_units
for layer in range(n_layers - 1):
    layer_id = str(layer + 1)
    w = tf.Variable(
        tf.random.uniform([n_units, n_units], minval=-w_range, maxval=w_range),
        name="w" + layer_id)
    b = tf.Variable(tf.zeros(n_units), name="b" + layer_id)
    layers += [[w, b]]

# output layer: n_units -> 10 logits
w_out = tf.Variable(
    tf.random.uniform([n_units, 10], minval=-w_range, maxval=w_range),
    name="wout")
b_out = tf.Variable(tf.zeros(10), name="bout")
layers += [[w_out, b_out]]

# all trainable variables in one flat list: w0, b0, w1, b1, ..., wout, bout
all_variables = [variable for pair in layers for variable in pair]


def model_forward(inputs):
    """Run `inputs` through the network stored in the global `layers` list.

    Every layer but the last computes ``relu(x @ w + b)``; the last layer is
    affine only, so the result is raw logits (no softmax is applied here).

    Args:
        inputs: batch to classify; must be matmul-compatible with the first
            weight matrix in `layers`.

    Returns:
        The logits produced by the final layer.
    """
    hidden_layers, output_layer = layers[:-1], layers[-1]
    activations = inputs
    for weights, bias in hidden_layers:
        activations = tf.nn.relu(tf.matmul(activations, weights) + bias)
    out_weights, out_bias = output_layer
    return tf.matmul(activations, out_weights) + out_bias


# Plain gradient descent settings.
lr = 0.1
train_steps = 2000

# `train_data` repeats forever, so the step counter ends the loop. Steps are
# 0-based and the check is `>`, hence train_steps + 1 updates are performed.
for step, (img_batch, lbl_batch) in enumerate(train_data):
    if step > train_steps:
        break

    # Record the forward pass so the loss can be differentiated.
    with tf.GradientTape() as tape:
        logits = model_forward(img_batch)
        per_example_xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=lbl_batch, logits=logits)
        xent = tf.reduce_mean(per_example_xent)

    # Manual SGD update of every weight and bias.
    grads = tape.gradient(xent, all_variables)
    for var, grad in zip(all_variables, grads):
        var.assign_sub(lr * grad)

    # Every 100th step: loss/accuracy of the current batch (pre-update logits).
    if step % 100 == 0:
        preds = tf.argmax(logits, axis=1, output_type=tf.int32)
        hits = tf.cast(tf.equal(preds, lbl_batch), tf.float32)
        acc = tf.reduce_mean(hits)
        print("Loss: {} Accuracy: {}".format(xent, acc))


# Single evaluation pass over the whole test set.
test_logits = model_forward(test_images)
test_preds = tf.argmax(test_logits, axis=1, output_type=tf.int32)
test_hits = tf.cast(tf.equal(test_preds, test_labels), tf.float32)
acc = tf.reduce_mean(test_hits)
print("Final test accuracy: {}".format(acc))


Loss: 12.787841796875 Accuracy: 0.0859375
Loss: 0.3636283278465271 Accuracy: 0.8671875
Loss: 0.45800936222076416 Accuracy: 0.8515625
Loss: 0.4662801921367645 Accuracy: 0.8515625
Loss: 0.33751097321510315 Accuracy: 0.8984375
Loss: 0.3221946060657501 Accuracy: 0.890625
Loss: 0.21151258051395416 Accuracy: 0.9140625
Loss: 0.14299458265304565 Accuracy: 0.9609375
Loss: 0.22439751029014587 Accuracy: 0.9609375
Loss: 0.19175398349761963 Accuracy: 0.9375
Loss: 0.14937138557434082 Accuracy: 0.9609375
Loss: 0.18935813009738922 Accuracy: 0.953125
Loss: 0.1747705191373825 Accuracy: 0.9453125
Loss: 0.09077250957489014 Accuracy: 0.9609375
Loss: 0.16529391705989838 Accuracy: 0.9453125
Loss: 0.23267945647239685 Accuracy: 0.90625
Loss: 0.13545885682106018 Accuracy: 0.9375
Loss: 0.10813656449317932 Accuracy: 0.9609375
Loss: 0.1585591435432434 Accuracy: 0.9375
Loss: 0.08751757442951202 Accuracy: 0.9609375
Loss: 0.18257638812065125 Accuracy: 0.9609375
Final test accuracy: 0.9478999972343445


### Fail 2

In [8]:
import tensorflow as tf
import numpy as np


# Load MNIST; Keras hands back a (train, test) pair of (images, labels) tuples.
(train_images, train_labels), (test_images, test_labels) = (
    tf.keras.datasets.mnist.load_data()
)


def preprocess_images(images):
    """Flatten images into rows of 784 values and divide them by 255.

    Args:
        images: NumPy array whose total size is a multiple of 784.

    Returns:
        A float32 array of shape (-1, 784).
    """
    flat = images.reshape(-1, 784)
    return flat.astype(np.float32) / 255


def preprocess_labels(labels):
    """Flatten the labels to one dimension and cast them to int32.

    Args:
        labels: NumPy array of class indices, any shape.

    Returns:
        A 1-D int32 array with the same values in the same order.
    """
    flat = labels.reshape(-1)
    return flat.astype(np.int32)


# Bring both splits into the form the network consumes.
train_images, train_labels = preprocess_images(train_images), preprocess_labels(train_labels)
test_images, test_labels = preprocess_images(test_images), preprocess_labels(test_labels)

# Training pipeline: shuffle (buffer of 60000), batch by 128, repeat forever;
# the training loop is responsible for stopping.
train_data = tf.data.Dataset.from_tensor_slices((train_images, train_labels))
train_data = train_data.shuffle(60000).batch(128).repeat()
#test_data = tf.data.Dataset.from_tensor_slices((test_images, test_labels)).batch(128)


# ---- model definition, from input to output ----

# the "super deep" configuration: 8 hidden layers of 100 units
n_units = 100
n_layers = 8
w_range = 0.1    # weights are drawn uniformly from [-w_range, w_range)

# `layers` is the whole model: one [weights, bias] pair per layer, in forward order

# input (784 values) -> first hidden layer
w_input = tf.Variable(
    tf.random.uniform([784, n_units], minval=-w_range, maxval=w_range),
    name="w0")
b_input = tf.Variable(tf.zeros(n_units), name="b0")
layers = [[w_input, b_input]]

# the remaining n_layers - 1 hidden layers map n_units -> n_units
for layer in range(n_layers - 1):
    layer_id = str(layer + 1)
    w = tf.Variable(
        tf.random.uniform([n_units, n_units], minval=-w_range, maxval=w_range),
        name="w" + layer_id)
    b = tf.Variable(tf.zeros(n_units), name="b" + layer_id)
    layers += [[w, b]]

# output layer: n_units -> 10 logits
w_out = tf.Variable(
    tf.random.uniform([n_units, 10], minval=-w_range, maxval=w_range),
    name="wout")
b_out = tf.Variable(tf.zeros(10), name="bout")
layers += [[w_out, b_out]]

# all trainable variables in one flat list: w0, b0, w1, b1, ..., wout, bout
all_variables = [variable for pair in layers for variable in pair]


def model_forward(inputs):
    """Run `inputs` through the network stored in the global `layers` list.

    Every layer but the last computes ``sigmoid(x @ w + b)``; the last layer
    is affine only, so the result is raw logits (no softmax is applied here).

    Args:
        inputs: batch to classify; must be matmul-compatible with the first
            weight matrix in `layers`.

    Returns:
        The logits produced by the final layer.
    """
    hidden_layers, output_layer = layers[:-1], layers[-1]
    activations = inputs
    for weights, bias in hidden_layers:
        activations = tf.nn.sigmoid(tf.matmul(activations, weights) + bias)
    out_weights, out_bias = output_layer
    return tf.matmul(activations, out_weights) + out_bias


# Plain gradient descent settings.
lr = 0.1
train_steps = 2000

# `train_data` repeats forever, so the step counter ends the loop. Steps are
# 0-based and the check is `>`, hence train_steps + 1 updates are performed.
for step, (img_batch, lbl_batch) in enumerate(train_data):
    if step > train_steps:
        break

    # Record the forward pass so the loss can be differentiated.
    with tf.GradientTape() as tape:
        logits = model_forward(img_batch)
        per_example_xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=lbl_batch, logits=logits)
        xent = tf.reduce_mean(per_example_xent)

    # Manual SGD update of every weight and bias.
    grads = tape.gradient(xent, all_variables)
    for var, grad in zip(all_variables, grads):
        var.assign_sub(lr * grad)

    # Every 100th step: loss/accuracy of the current batch (pre-update logits).
    if step % 100 == 0:
        preds = tf.argmax(logits, axis=1, output_type=tf.int32)
        hits = tf.cast(tf.equal(preds, lbl_batch), tf.float32)
        acc = tf.reduce_mean(hits)
        print("Loss: {} Accuracy: {}".format(xent, acc))


# Single evaluation pass over the whole test set.
test_logits = model_forward(test_images)
test_preds = tf.argmax(test_logits, axis=1, output_type=tf.int32)
test_hits = tf.cast(tf.equal(test_preds, test_labels), tf.float32)
acc = tf.reduce_mean(test_hits)
print("Final test accuracy: {}".format(acc))


Loss: 2.3402082920074463 Accuracy: 0.0859375
Loss: 2.3009932041168213 Accuracy: 0.0390625
Loss: 2.304021120071411 Accuracy: 0.0859375
Loss: 2.308529853820801 Accuracy: 0.125
Loss: 2.297663688659668 Accuracy: 0.0703125
Loss: 2.3047728538513184 Accuracy: 0.0546875
Loss: 2.3064308166503906 Accuracy: 0.09375
Loss: 2.304222583770752 Accuracy: 0.125
Loss: 2.3109912872314453 Accuracy: 0.078125
Loss: 2.299231767654419 Accuracy: 0.09375
Loss: 2.304865598678589 Accuracy: 0.0703125
Loss: 2.2999489307403564 Accuracy: 0.125
Loss: 2.309210777282715 Accuracy: 0.0546875
Loss: 2.3028573989868164 Accuracy: 0.1171875
Loss: 2.3189520835876465 Accuracy: 0.0625
Loss: 2.3034276962280273 Accuracy: 0.1015625
Loss: 2.2902181148529053 Accuracy: 0.1328125
Loss: 2.3022537231445312 Accuracy: 0.046875
Loss: 2.307250499725342 Accuracy: 0.1171875
Loss: 2.307234287261963 Accuracy: 0.09375
Loss: 2.2998483180999756 Accuracy: 0.109375
Final test accuracy: 0.11349999904632568


### Solution : Fail 2 - Changing the activation function from sigmoid to ReLU increases the accuracy. With sigmoid, the 8-layer network is probably suffering from the vanishing gradient problem; ReLU reduces the likelihood of vanishing gradients.

In [9]:
import tensorflow as tf
import numpy as np


# Load MNIST; Keras hands back a (train, test) pair of (images, labels) tuples.
(train_images, train_labels), (test_images, test_labels) = (
    tf.keras.datasets.mnist.load_data()
)


def preprocess_images(images):
    """Flatten images into rows of 784 values and divide them by 255.

    Args:
        images: NumPy array whose total size is a multiple of 784.

    Returns:
        A float32 array of shape (-1, 784).
    """
    flat = images.reshape(-1, 784)
    return flat.astype(np.float32) / 255


def preprocess_labels(labels):
    """Flatten the labels to one dimension and cast them to int32.

    Args:
        labels: NumPy array of class indices, any shape.

    Returns:
        A 1-D int32 array with the same values in the same order.
    """
    flat = labels.reshape(-1)
    return flat.astype(np.int32)


# Bring both splits into the form the network consumes.
train_images, train_labels = preprocess_images(train_images), preprocess_labels(train_labels)
test_images, test_labels = preprocess_images(test_images), preprocess_labels(test_labels)

# Training pipeline: shuffle (buffer of 60000), batch by 128, repeat forever;
# the training loop is responsible for stopping.
train_data = tf.data.Dataset.from_tensor_slices((train_images, train_labels))
train_data = train_data.shuffle(60000).batch(128).repeat()
#test_data = tf.data.Dataset.from_tensor_slices((test_images, test_labels)).batch(128)


# ---- model definition, from input to output ----

# the "super deep" configuration: 8 hidden layers of 100 units
n_units = 100
n_layers = 8
w_range = 0.1    # weights are drawn uniformly from [-w_range, w_range)

# `layers` is the whole model: one [weights, bias] pair per layer, in forward order

# input (784 values) -> first hidden layer
w_input = tf.Variable(
    tf.random.uniform([784, n_units], minval=-w_range, maxval=w_range),
    name="w0")
b_input = tf.Variable(tf.zeros(n_units), name="b0")
layers = [[w_input, b_input]]

# the remaining n_layers - 1 hidden layers map n_units -> n_units
for layer in range(n_layers - 1):
    layer_id = str(layer + 1)
    w = tf.Variable(
        tf.random.uniform([n_units, n_units], minval=-w_range, maxval=w_range),
        name="w" + layer_id)
    b = tf.Variable(tf.zeros(n_units), name="b" + layer_id)
    layers += [[w, b]]

# output layer: n_units -> 10 logits
w_out = tf.Variable(
    tf.random.uniform([n_units, 10], minval=-w_range, maxval=w_range),
    name="wout")
b_out = tf.Variable(tf.zeros(10), name="bout")
layers += [[w_out, b_out]]

# all trainable variables in one flat list: w0, b0, w1, b1, ..., wout, bout
all_variables = [variable for pair in layers for variable in pair]


def model_forward(inputs):
    """Run `inputs` through the network stored in the global `layers` list.

    Every layer but the last computes ``relu(x @ w + b)``; the last layer is
    affine only, so the result is raw logits (no softmax is applied here).

    Args:
        inputs: batch to classify; must be matmul-compatible with the first
            weight matrix in `layers`.

    Returns:
        The logits produced by the final layer.
    """
    hidden_layers, output_layer = layers[:-1], layers[-1]
    activations = inputs
    for weights, bias in hidden_layers:
        # the fix for this experiment: ReLU is the hidden activation here
        # (the failing variant used sigmoid)
        activations = tf.nn.relu(tf.matmul(activations, weights) + bias)
    out_weights, out_bias = output_layer
    return tf.matmul(activations, out_weights) + out_bias


# Plain gradient descent settings.
lr = 0.1
train_steps = 2000

# `train_data` repeats forever, so the step counter ends the loop. Steps are
# 0-based and the check is `>`, hence train_steps + 1 updates are performed.
for step, (img_batch, lbl_batch) in enumerate(train_data):
    if step > train_steps:
        break

    # Record the forward pass so the loss can be differentiated.
    with tf.GradientTape() as tape:
        logits = model_forward(img_batch)
        per_example_xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=lbl_batch, logits=logits)
        xent = tf.reduce_mean(per_example_xent)

    # Manual SGD update of every weight and bias.
    grads = tape.gradient(xent, all_variables)
    for var, grad in zip(all_variables, grads):
        var.assign_sub(lr * grad)

    # Every 100th step: loss/accuracy of the current batch (pre-update logits).
    if step % 100 == 0:
        preds = tf.argmax(logits, axis=1, output_type=tf.int32)
        hits = tf.cast(tf.equal(preds, lbl_batch), tf.float32)
        acc = tf.reduce_mean(hits)
        print("Loss: {} Accuracy: {}".format(xent, acc))


# Single evaluation pass over the whole test set.
test_logits = model_forward(test_images)
test_preds = tf.argmax(test_logits, axis=1, output_type=tf.int32)
test_hits = tf.cast(tf.equal(test_preds, test_labels), tf.float32)
acc = tf.reduce_mean(test_hits)
print("Final test accuracy: {}".format(acc))


Loss: 2.3025951385498047 Accuracy: 0.1015625
Loss: 2.296915292739868 Accuracy: 0.140625
Loss: 2.2992255687713623 Accuracy: 0.15625
Loss: 2.2976016998291016 Accuracy: 0.15625
Loss: 2.30326771736145 Accuracy: 0.0859375
Loss: 2.299046516418457 Accuracy: 0.140625
Loss: 2.2967658042907715 Accuracy: 0.1171875
Loss: 2.2984213829040527 Accuracy: 0.1015625
Loss: 2.292041063308716 Accuracy: 0.109375
Loss: 2.2762982845306396 Accuracy: 0.1796875
Loss: 2.0958619117736816 Accuracy: 0.21875
Loss: 2.116971969604492 Accuracy: 0.1953125
Loss: 1.4607012271881104 Accuracy: 0.4140625
Loss: 0.9960353374481201 Accuracy: 0.515625
Loss: 1.1854119300842285 Accuracy: 0.578125
Loss: 0.889000654220581 Accuracy: 0.703125
Loss: 0.44809359312057495 Accuracy: 0.890625
Loss: 0.3198297321796417 Accuracy: 0.8671875
Loss: 0.5213542580604553 Accuracy: 0.90625
Loss: 0.21296259760856628 Accuracy: 0.9453125
Loss: 0.34347230195999146 Accuracy: 0.9296875
Final test accuracy: 0.9279000163078308


### Fail 3

In [10]:
import tensorflow as tf
import numpy as np


# Load MNIST; Keras hands back a (train, test) pair of (images, labels) tuples.
(train_images, train_labels), (test_images, test_labels) = (
    tf.keras.datasets.mnist.load_data()
)


def preprocess_images(images):
    """Flatten images into rows of 784 values and divide them by 255.

    Args:
        images: NumPy array whose total size is a multiple of 784.

    Returns:
        A float32 array of shape (-1, 784).
    """
    flat = images.reshape(-1, 784)
    return flat.astype(np.float32) / 255


def preprocess_labels(labels):
    """Flatten the labels to one dimension and cast them to int32.

    Args:
        labels: NumPy array of class indices, any shape.

    Returns:
        A 1-D int32 array with the same values in the same order.
    """
    flat = labels.reshape(-1)
    return flat.astype(np.int32)


# Bring both splits into the form the network consumes.
train_images, train_labels = preprocess_images(train_images), preprocess_labels(train_labels)
test_images, test_labels = preprocess_images(test_images), preprocess_labels(test_labels)

# Training pipeline: shuffle (buffer of 60000), batch by 128, repeat forever;
# the training loop is responsible for stopping.
train_data = tf.data.Dataset.from_tensor_slices((train_images, train_labels))
train_data = train_data.shuffle(60000).batch(128).repeat()
#test_data = tf.data.Dataset.from_tensor_slices((test_images, test_labels)).batch(128)


# ---- model definition, from input to output ----

# a shallow network this time: 2 hidden layers of 100 units
n_units = 100
n_layers = 2
w_range = 0.1    # weights are drawn uniformly from [-w_range, 0): none are positive

# `layers` is the whole model: one [weights, bias] pair per layer, in forward order

# input (784 values) -> first hidden layer
w_input = tf.Variable(
    tf.random.uniform([784, n_units], minval=-w_range, maxval=0.),
    name="w0")
b_input = tf.Variable(tf.zeros(n_units), name="b0")
layers = [[w_input, b_input]]

# the remaining n_layers - 1 hidden layers map n_units -> n_units
for layer in range(n_layers - 1):
    layer_id = str(layer + 1)
    w = tf.Variable(
        tf.random.uniform([n_units, n_units], minval=-w_range, maxval=0.),
        name="w" + layer_id)
    b = tf.Variable(tf.zeros(n_units), name="b" + layer_id)
    layers += [[w, b]]

# output layer: n_units -> 10 logits
w_out = tf.Variable(
    tf.random.uniform([n_units, 10], minval=-w_range, maxval=0.),
    name="wout")
b_out = tf.Variable(tf.zeros(10), name="bout")
layers += [[w_out, b_out]]

# all trainable variables in one flat list: w0, b0, w1, b1, ..., wout, bout
all_variables = [variable for pair in layers for variable in pair]


def model_forward(inputs):
    """Run `inputs` through the network stored in the global `layers` list.

    Every layer but the last computes ``relu(x @ w + b)``; the last layer is
    affine only, so the result is raw logits (no softmax is applied here).

    Args:
        inputs: batch to classify; must be matmul-compatible with the first
            weight matrix in `layers`.

    Returns:
        The logits produced by the final layer.
    """
    hidden_layers, output_layer = layers[:-1], layers[-1]
    activations = inputs
    for weights, bias in hidden_layers:
        activations = tf.nn.relu(tf.matmul(activations, weights) + bias)
    out_weights, out_bias = output_layer
    return tf.matmul(activations, out_weights) + out_bias


# Plain gradient descent settings.
lr = 0.1
train_steps = 2000

# `train_data` repeats forever, so the step counter ends the loop. Steps are
# 0-based and the check is `>`, hence train_steps + 1 updates are performed.
for step, (img_batch, lbl_batch) in enumerate(train_data):
    if step > train_steps:
        break

    # Record the forward pass so the loss can be differentiated.
    with tf.GradientTape() as tape:
        logits = model_forward(img_batch)
        per_example_xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=lbl_batch, logits=logits)
        xent = tf.reduce_mean(per_example_xent)

    # Manual SGD update of every weight and bias.
    grads = tape.gradient(xent, all_variables)
    for var, grad in zip(all_variables, grads):
        var.assign_sub(lr * grad)

    # Every 100th step: loss/accuracy of the current batch (pre-update logits).
    if step % 100 == 0:
        preds = tf.argmax(logits, axis=1, output_type=tf.int32)
        hits = tf.cast(tf.equal(preds, lbl_batch), tf.float32)
        acc = tf.reduce_mean(hits)
        print("Loss: {} Accuracy: {}".format(xent, acc))


# Single evaluation pass over the whole test set.
test_logits = model_forward(test_images)
test_preds = tf.argmax(test_logits, axis=1, output_type=tf.int32)
test_hits = tf.cast(tf.equal(test_preds, test_labels), tf.float32)
acc = tf.reduce_mean(test_hits)
print("Final test accuracy: {}".format(acc))


Loss: 2.3025853633880615 Accuracy: 0.078125
Loss: 2.3031349182128906 Accuracy: 0.0859375
Loss: 2.296309232711792 Accuracy: 0.1171875
Loss: 2.306762456893921 Accuracy: 0.1015625
Loss: 2.294363021850586 Accuracy: 0.1640625
Loss: 2.3034300804138184 Accuracy: 0.109375
Loss: 2.296173095703125 Accuracy: 0.140625
Loss: 2.3006398677825928 Accuracy: 0.140625
Loss: 2.30497670173645 Accuracy: 0.0703125
Loss: 2.296168327331543 Accuracy: 0.140625
Loss: 2.294894218444824 Accuracy: 0.1640625
Loss: 2.305692195892334 Accuracy: 0.0625
Loss: 2.305344343185425 Accuracy: 0.109375
Loss: 2.2920262813568115 Accuracy: 0.125
Loss: 2.303922176361084 Accuracy: 0.09375
Loss: 2.3135147094726562 Accuracy: 0.09375
Loss: 2.2952752113342285 Accuracy: 0.140625
Loss: 2.3022637367248535 Accuracy: 0.1015625
Loss: 2.303586006164551 Accuracy: 0.0703125
Loss: 2.3005101680755615 Accuracy: 0.15625
Loss: 2.304555654525757 Accuracy: 0.1328125
Final test accuracy: 0.11349999904632568


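### Solution : Fail 3 - Changed the weight range from [-0.1, 0) to a range that is symmetric around 0 ([-0.01, 0.01)). Initializing every weight between -0.1 and 0 does not make sense with ReLU: the inputs are non-negative, so every pre-activation is at most 0 and all units in a layer produce the identical output, zero.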

In [38]:
import tensorflow as tf
import numpy as np


# Load MNIST; Keras hands back a (train, test) pair of (images, labels) tuples.
(train_images, train_labels), (test_images, test_labels) = (
    tf.keras.datasets.mnist.load_data()
)


def preprocess_images(images):
    """Flatten images into rows of 784 values and divide them by 255.

    Args:
        images: NumPy array whose total size is a multiple of 784.

    Returns:
        A float32 array of shape (-1, 784).
    """
    flat = images.reshape(-1, 784)
    return flat.astype(np.float32) / 255


def preprocess_labels(labels):
    """Flatten the labels to one dimension and cast them to int32.

    Args:
        labels: NumPy array of class indices, any shape.

    Returns:
        A 1-D int32 array with the same values in the same order.
    """
    flat = labels.reshape(-1)
    return flat.astype(np.int32)


# Bring both splits into the form the network consumes.
train_images, train_labels = preprocess_images(train_images), preprocess_labels(train_labels)
test_images, test_labels = preprocess_images(test_images), preprocess_labels(test_labels)

# Training pipeline: shuffle (buffer of 60000), batch by 128, repeat forever;
# the training loop is responsible for stopping.
train_data = tf.data.Dataset.from_tensor_slices((train_images, train_labels))
train_data = train_data.shuffle(60000).batch(128).repeat()
#test_data = tf.data.Dataset.from_tensor_slices((test_images, test_labels)).batch(128)


# ---- model definition, from input to output ----

# a shallow network: 2 hidden layers of 100 units
n_units = 100
n_layers = 2
# The fix for this experiment: weights come from the symmetric range
# [-w_range, w_range) rather than from a range with only non-positive values.
w_range = 0.01

# `layers` is the whole model: one [weights, bias] pair per layer, in forward order

# input (784 values) -> first hidden layer
w_input = tf.Variable(
    tf.random.uniform([784, n_units], minval=-w_range, maxval=w_range),
    name="w0")
b_input = tf.Variable(tf.zeros(n_units), name="b0")
layers = [[w_input, b_input]]

# the remaining n_layers - 1 hidden layers map n_units -> n_units
for layer in range(n_layers - 1):
    layer_id = str(layer + 1)
    w = tf.Variable(
        tf.random.uniform([n_units, n_units], minval=-w_range, maxval=w_range),
        name="w" + layer_id)
    b = tf.Variable(tf.zeros(n_units), name="b" + layer_id)
    layers += [[w, b]]

# output layer: n_units -> 10 logits
w_out = tf.Variable(
    tf.random.uniform([n_units, 10], minval=-w_range, maxval=w_range),
    name="wout")
b_out = tf.Variable(tf.zeros(10), name="bout")
layers += [[w_out, b_out]]

# all trainable variables in one flat list: w0, b0, w1, b1, ..., wout, bout
all_variables = [variable for pair in layers for variable in pair]


def model_forward(inputs):
    """Run `inputs` through the network stored in the global `layers` list.

    Every layer but the last computes ``relu(x @ w + b)``; the last layer is
    affine only, so the result is raw logits (no softmax is applied here).

    Args:
        inputs: batch to classify; must be matmul-compatible with the first
            weight matrix in `layers`.

    Returns:
        The logits produced by the final layer.
    """
    hidden_layers, output_layer = layers[:-1], layers[-1]
    activations = inputs
    for weights, bias in hidden_layers:
        activations = tf.nn.relu(tf.matmul(activations, weights) + bias)
    out_weights, out_bias = output_layer
    return tf.matmul(activations, out_weights) + out_bias


# Plain gradient descent settings.
lr = 0.1
train_steps = 2000

# `train_data` repeats forever, so the step counter ends the loop. Steps are
# 0-based and the check is `>`, hence train_steps + 1 updates are performed.
for step, (img_batch, lbl_batch) in enumerate(train_data):
    if step > train_steps:
        break

    # Record the forward pass so the loss can be differentiated.
    with tf.GradientTape() as tape:
        logits = model_forward(img_batch)
        per_example_xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=lbl_batch, logits=logits)
        xent = tf.reduce_mean(per_example_xent)

    # Manual SGD update of every weight and bias.
    grads = tape.gradient(xent, all_variables)
    for var, grad in zip(all_variables, grads):
        var.assign_sub(lr * grad)

    # Every 100th step: loss/accuracy of the current batch (pre-update logits).
    if step % 100 == 0:
        preds = tf.argmax(logits, axis=1, output_type=tf.int32)
        hits = tf.cast(tf.equal(preds, lbl_batch), tf.float32)
        acc = tf.reduce_mean(hits)
        print("Loss: {} Accuracy: {}".format(xent, acc))


# Single evaluation pass over the whole test set.
test_logits = model_forward(test_images)
test_preds = tf.argmax(test_logits, axis=1, output_type=tf.int32)
test_hits = tf.cast(tf.equal(test_preds, test_labels), tf.float32)
acc = tf.reduce_mean(test_hits)
print("Final test accuracy: {}".format(acc))


Loss: 2.3025810718536377 Accuracy: 0.0546875
Loss: 2.299225330352783 Accuracy: 0.1015625
Loss: 2.3023781776428223 Accuracy: 0.0859375
Loss: 2.293426036834717 Accuracy: 0.171875
Loss: 2.299144744873047 Accuracy: 0.1171875
Loss: 2.288865089416504 Accuracy: 0.140625
Loss: 1.9467549324035645 Accuracy: 0.359375
Loss: 1.095376968383789 Accuracy: 0.53125
Loss: 0.8453824520111084 Accuracy: 0.7265625
Loss: 0.6327615976333618 Accuracy: 0.8046875
Loss: 0.4469195008277893 Accuracy: 0.8671875
Loss: 0.5518369078636169 Accuracy: 0.8515625
Loss: 0.3905937075614929 Accuracy: 0.8828125
Loss: 0.500653088092804 Accuracy: 0.828125
Loss: 0.33236977458000183 Accuracy: 0.890625
Loss: 0.32455775141716003 Accuracy: 0.8984375
Loss: 0.19593912363052368 Accuracy: 0.9609375
Loss: 0.2830863296985626 Accuracy: 0.9140625
Loss: 0.17137017846107483 Accuracy: 0.9609375
Loss: 0.1530275046825409 Accuracy: 0.9609375
Loss: 0.40282297134399414 Accuracy: 0.8671875
Final test accuracy: 0.9207000136375427


In [14]:
%reload_ext tensorboard

### Fail 4

In [22]:
import tensorflow as tf
import numpy as np


# Load MNIST; Keras hands back a (train, test) pair of (images, labels) tuples.
(train_images, train_labels), (test_images, test_labels) = (
    tf.keras.datasets.mnist.load_data()
)


def preprocess_images(images):
    """Flatten images into rows of 784 values and divide them by 255.

    Args:
        images: NumPy array whose total size is a multiple of 784.

    Returns:
        A float32 array of shape (-1, 784).
    """
    flat = images.reshape(-1, 784)
    return flat.astype(np.float32) / 255


def preprocess_labels(labels):
    """Flatten the labels to one dimension and cast them to int32.

    Args:
        labels: NumPy array of class indices, any shape.

    Returns:
        A 1-D int32 array with the same values in the same order.
    """
    flat = labels.reshape(-1)
    return flat.astype(np.int32)


# Bring both splits into the form the network consumes.
train_images, train_labels = preprocess_images(train_images), preprocess_labels(train_labels)
test_images, test_labels = preprocess_images(test_images), preprocess_labels(test_labels)

# Training pipeline: shuffle (buffer of 60000), batch by 128, repeat forever;
# the training loop is responsible for stopping.
train_data = tf.data.Dataset.from_tensor_slices((train_images, train_labels))
train_data = train_data.shuffle(60000).batch(128).repeat()
#test_data = tf.data.Dataset.from_tensor_slices((test_images, test_labels)).batch(128)


# ---- model definition, from input to output ----

# again 2 hidden layers of 100 units
n_units = 100
n_layers = 2
w_range = 0.1    # weights are drawn uniformly from [-w_range, w_range)

# `layers` is the whole model: one [weights, bias] pair per layer, in forward order

# input (784 values) -> first hidden layer
w_input = tf.Variable(
    tf.random.uniform([784, n_units], minval=-w_range, maxval=w_range),
    name="w0")
b_input = tf.Variable(tf.zeros(n_units), name="b0")
layers = [[w_input, b_input]]

# the remaining n_layers - 1 hidden layers map n_units -> n_units
for layer in range(n_layers - 1):
    layer_id = str(layer + 1)
    w = tf.Variable(
        tf.random.uniform([n_units, n_units], minval=-w_range, maxval=w_range),
        name="w" + layer_id)
    b = tf.Variable(tf.zeros(n_units), name="b" + layer_id)
    layers += [[w, b]]

# output layer: n_units -> 10 logits
w_out = tf.Variable(
    tf.random.uniform([n_units, 10], minval=-w_range, maxval=w_range),
    name="wout")
b_out = tf.Variable(tf.zeros(10), name="bout")
layers += [[w_out, b_out]]

# all trainable variables in one flat list: w0, b0, w1, b1, ..., wout, bout
all_variables = [variable for pair in layers for variable in pair]


def model_forward(inputs):
    """Run `inputs` through the network stored in the global `layers` list.

    Every layer but the last computes ``relu(x @ w + b)``; the last layer is
    affine only, so the result is raw logits (no softmax is applied here).

    Args:
        inputs: batch to classify; must be matmul-compatible with the first
            weight matrix in `layers`.

    Returns:
        The logits produced by the final layer.
    """
    hidden_layers, output_layer = layers[:-1], layers[-1]
    activations = inputs
    for weights, bias in hidden_layers:
        activations = tf.nn.relu(tf.matmul(activations, weights) + bias)
    out_weights, out_bias = output_layer
    return tf.matmul(activations, out_weights) + out_bias


# Plain gradient descent settings.
lr = 0.1
train_steps = 2000

# `train_data` repeats forever, so the step counter ends the loop. Steps are
# 0-based and the check is `>`, hence train_steps + 1 updates are performed.
for step, (img_batch, lbl_batch) in enumerate(train_data):
    if step > train_steps:
        break

    # Input-noise "regularization": Gaussian noise with stddev 4 is added to
    # every training batch. The pixel values were scaled by 1/255 beforehand,
    # so this noise is several times larger than the signal.
    noise = tf.random.normal(tf.shape(img_batch), stddev=4.)
    img_batch = img_batch + noise

    # Record the forward pass so the loss can be differentiated.
    with tf.GradientTape() as tape:
        logits = model_forward(img_batch)
        per_example_xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=lbl_batch, logits=logits)
        xent = tf.reduce_mean(per_example_xent)

    # Manual SGD update of every weight and bias.
    grads = tape.gradient(xent, all_variables)
    for var, grad in zip(all_variables, grads):
        var.assign_sub(lr * grad)

    # Every 100th step: loss/accuracy of the current (noisy) batch.
    if step % 100 == 0:
        preds = tf.argmax(logits, axis=1, output_type=tf.int32)
        hits = tf.cast(tf.equal(preds, lbl_batch), tf.float32)
        acc = tf.reduce_mean(hits)
        print("Loss: {} Accuracy: {}".format(xent, acc))


# Single evaluation pass over the whole (noise-free) test set.
test_logits = model_forward(test_images)
test_preds = tf.argmax(test_logits, axis=1, output_type=tf.int32)
test_hits = tf.cast(tf.equal(test_preds, test_labels), tf.float32)
acc = tf.reduce_mean(test_hits)
print("Final test accuracy: {}".format(acc))


Loss: 2.749685764312744 Accuracy: 0.09375
Loss: 2.2981228828430176 Accuracy: 0.15625
Loss: 2.3113040924072266 Accuracy: 0.15625
Loss: 2.2835793495178223 Accuracy: 0.109375
Loss: 2.2026827335357666 Accuracy: 0.1875
Loss: 2.276700496673584 Accuracy: 0.140625
Loss: 2.1818060874938965 Accuracy: 0.1640625
Loss: 2.1555542945861816 Accuracy: 0.2578125
Loss: 2.1897084712982178 Accuracy: 0.15625
Loss: 2.1078147888183594 Accuracy: 0.2421875
Loss: 2.1070377826690674 Accuracy: 0.2578125
Loss: 2.035621166229248 Accuracy: 0.3046875
Loss: 2.184082269668579 Accuracy: 0.1640625
Loss: 2.2011094093322754 Accuracy: 0.21875
Loss: 2.188337802886963 Accuracy: 0.234375
Loss: 2.1732165813446045 Accuracy: 0.140625
Loss: 2.1198885440826416 Accuracy: 0.2265625
Loss: 2.10296368598938 Accuracy: 0.203125
Loss: 2.13723087310791 Accuracy: 0.1796875
Loss: 2.081315040588379 Accuracy: 0.28125
Loss: 2.168562889099121 Accuracy: 0.25
Final test accuracy: 0.6108999848365784


### Solution : Fail 4 - Reduce the noise stddev from 4 to 0.5. With too much noise the inputs are drowned out, which lowers the training accuracy.

In [23]:
import tensorflow as tf
import numpy as np


# Load MNIST; Keras hands back a (train, test) pair of (images, labels) tuples.
(train_images, train_labels), (test_images, test_labels) = (
    tf.keras.datasets.mnist.load_data()
)


def preprocess_images(images):
    """Flatten images into rows of 784 values and divide them by 255.

    Args:
        images: NumPy array whose total size is a multiple of 784.

    Returns:
        A float32 array of shape (-1, 784).
    """
    flat = images.reshape(-1, 784)
    return flat.astype(np.float32) / 255


def preprocess_labels(labels):
    """Flatten the labels to one dimension and cast them to int32.

    Args:
        labels: NumPy array of class indices, any shape.

    Returns:
        A 1-D int32 array with the same values in the same order.
    """
    flat = labels.reshape(-1)
    return flat.astype(np.int32)


# Bring both splits into the form the network consumes.
train_images, train_labels = preprocess_images(train_images), preprocess_labels(train_labels)
test_images, test_labels = preprocess_images(test_images), preprocess_labels(test_labels)

# Training pipeline: shuffle (buffer of 60000), batch by 128, repeat forever;
# the training loop is responsible for stopping.
train_data = tf.data.Dataset.from_tensor_slices((train_images, train_labels))
train_data = train_data.shuffle(60000).batch(128).repeat()
#test_data = tf.data.Dataset.from_tensor_slices((test_images, test_labels)).batch(128)


# ---- model definition, from input to output ----

# again 2 hidden layers of 100 units
n_units = 100
n_layers = 2
w_range = 0.1    # weights are drawn uniformly from [-w_range, w_range)

# `layers` is the whole model: one [weights, bias] pair per layer, in forward order

# input (784 values) -> first hidden layer
w_input = tf.Variable(
    tf.random.uniform([784, n_units], minval=-w_range, maxval=w_range),
    name="w0")
b_input = tf.Variable(tf.zeros(n_units), name="b0")
layers = [[w_input, b_input]]

# the remaining n_layers - 1 hidden layers map n_units -> n_units
for layer in range(n_layers - 1):
    layer_id = str(layer + 1)
    w = tf.Variable(
        tf.random.uniform([n_units, n_units], minval=-w_range, maxval=w_range),
        name="w" + layer_id)
    b = tf.Variable(tf.zeros(n_units), name="b" + layer_id)
    layers += [[w, b]]

# output layer: n_units -> 10 logits
w_out = tf.Variable(
    tf.random.uniform([n_units, 10], minval=-w_range, maxval=w_range),
    name="wout")
b_out = tf.Variable(tf.zeros(10), name="bout")
layers += [[w_out, b_out]]

# all trainable variables in one flat list: w0, b0, w1, b1, ..., wout, bout
all_variables = [variable for pair in layers for variable in pair]


def model_forward(inputs):
    """Run `inputs` through the network stored in the global `layers` list.

    Every layer but the last computes ``relu(x @ w + b)``; the last layer is
    affine only, so the result is raw logits (no softmax is applied here).

    Args:
        inputs: batch to classify; must be matmul-compatible with the first
            weight matrix in `layers`.

    Returns:
        The logits produced by the final layer.
    """
    hidden_layers, output_layer = layers[:-1], layers[-1]
    activations = inputs
    for weights, bias in hidden_layers:
        activations = tf.nn.relu(tf.matmul(activations, weights) + bias)
    out_weights, out_bias = output_layer
    return tf.matmul(activations, out_weights) + out_bias


# Plain gradient descent settings.
lr = 0.1
train_steps = 2000

# `train_data` repeats forever, so the step counter ends the loop. Steps are
# 0-based and the check is `>`, hence train_steps + 1 updates are performed.
for step, (img_batch, lbl_batch) in enumerate(train_data):
    if step > train_steps:
        break

    # The fix for this experiment: the input noise uses stddev 0.5. With a
    # stddev of 4 the images were so noisy that training accuracy stayed low.
    noise = tf.random.normal(tf.shape(img_batch), stddev=0.5)
    img_batch = img_batch + noise

    # Record the forward pass so the loss can be differentiated.
    with tf.GradientTape() as tape:
        logits = model_forward(img_batch)
        per_example_xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=lbl_batch, logits=logits)
        xent = tf.reduce_mean(per_example_xent)

    # Manual SGD update of every weight and bias.
    grads = tape.gradient(xent, all_variables)
    for var, grad in zip(all_variables, grads):
        var.assign_sub(lr * grad)

    # Every 100th step: loss/accuracy of the current (noisy) batch.
    if step % 100 == 0:
        preds = tf.argmax(logits, axis=1, output_type=tf.int32)
        hits = tf.cast(tf.equal(preds, lbl_batch), tf.float32)
        acc = tf.reduce_mean(hits)
        print("Loss: {} Accuracy: {}".format(xent, acc))


# Single evaluation pass over the whole (noise-free) test set.
test_logits = model_forward(test_images)
test_preds = tf.argmax(test_logits, axis=1, output_type=tf.int32)
test_hits = tf.cast(tf.equal(test_preds, test_labels), tf.float32)
acc = tf.reduce_mean(test_hits)
print("Final test accuracy: {}".format(acc))


Loss: 2.3356387615203857 Accuracy: 0.1015625
Loss: 1.121335506439209 Accuracy: 0.609375
Loss: 0.6422402858734131 Accuracy: 0.78125
Loss: 0.5979484915733337 Accuracy: 0.78125
Loss: 0.5210495591163635 Accuracy: 0.8671875
Loss: 0.5863873362541199 Accuracy: 0.7890625
Loss: 0.5431021451950073 Accuracy: 0.8359375
Loss: 0.572018027305603 Accuracy: 0.8125
Loss: 0.48237600922584534 Accuracy: 0.8515625
Loss: 0.3483855426311493 Accuracy: 0.890625
Loss: 0.3470909595489502 Accuracy: 0.90625
Loss: 0.4486326575279236 Accuracy: 0.8828125
Loss: 0.3521977663040161 Accuracy: 0.8984375
Loss: 0.3858810365200043 Accuracy: 0.859375
Loss: 0.3736230134963989 Accuracy: 0.859375
Loss: 0.297371506690979 Accuracy: 0.8671875
Loss: 0.332017183303833 Accuracy: 0.8984375
Loss: 0.26867228746414185 Accuracy: 0.90625
Loss: 0.28250762820243835 Accuracy: 0.90625
Loss: 0.34364911913871765 Accuracy: 0.8828125
Loss: 0.3764675259590149 Accuracy: 0.8984375
Final test accuracy: 0.946399986743927


### Fail 5 

In [24]:
import tensorflow as tf
import numpy as np


# get the data: MNIST via tf.keras, returned as (images, labels) pairs for the
# training split and the test split
(train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.mnist.load_data()


def preprocess_images(images):
    """Flatten images into rows of 784 values and scale them by 1/255.

    Args:
        images: array whose total size is a multiple of 784
            (e.g. a stack of 28x28 images).

    Returns:
        float32 array of shape (-1, 784) containing ``images / 255``.
    """
    flat = images.reshape(-1, 784)
    as_float = flat.astype(np.float32)
    return as_float / 255


def preprocess_labels(labels):
    """Flatten labels into a 1-D vector and cast them to int32.

    Args:
        labels: array of class labels of any shape.

    Returns:
        1-D int32 array with the same elements in the same order.
    """
    flat = labels.reshape(-1)
    return flat.astype(np.int32)


# Replace the raw arrays with their preprocessed versions: images become
# float32 rows of 784 values scaled by 1/255, labels become flat int32 vectors.
train_images = preprocess_images(train_images)
test_images = preprocess_images(test_images)
train_labels = preprocess_labels(train_labels)
test_labels = preprocess_labels(test_labels)

# Training pipeline: shuffle with a 60000-element buffer, batch by 128 and
# repeat without end, so the training loop has to stop on its own.
train_data = tf.data.Dataset.from_tensor_slices((train_images, train_labels)).shuffle(60000).batch(128).repeat()
# Batched test pipeline is left disabled; the evaluation at the end of this
# cell feeds test_images to the model directly.
#test_data = tf.data.Dataset.from_tensor_slices((test_images, test_labels)).batch(128)


# define the model first, from input to output

# 2 layers again
n_units = 100   # width of every hidden layer
n_layers = 2    # number of hidden layers (the output layer is added on top)
w_range = 0.1   # weights are drawn uniformly from [-w_range, w_range)

# just set up a "chain" of hidden layers
# model is represented by a list where each element is a layer,
# and each layer is in turn a list of the layer variables (w, b)

# first layer goes from n_input to n_hidden
# (784 inputs -> n_units); biases start at zero
w_input = tf.Variable(tf.random.uniform([784, n_units], -w_range, w_range),
                      name="w0")
b_input = tf.Variable(tf.zeros(n_units), name="b0")
layers = [[w_input, b_input]]

# all other hidden layers go from n_hidden to n_hidden
# (n_layers - 1 of them, named w1/b1, w2/b2, ...)
for layer in range(n_layers - 1):
    w = tf.Variable(tf.random.uniform([n_units, n_units], -w_range, w_range),
                    name="w" + str(layer+1))
    b = tf.Variable(tf.zeros(n_units), name="b" + str(layer+1))
    layers.append([w, b])

# finally add the output layer
# (n_units -> 10 outputs, one per class)
w_out = tf.Variable(tf.random.uniform([n_units, 10], -w_range, w_range),
                    name="wout")
b_out = tf.Variable(tf.zeros(10), name="bout")
layers.append([w_out, b_out])

# flatten the layers to get a list of variables
# (order: w0, b0, w1, b1, ..., wout, bout); this is the list the training
# loop differentiates with respect to and updates
all_variables = [variable for layer in layers for variable in layer]


def model_forward(inputs):
    """Run `inputs` through every layer in the module-level `layers` list.

    All layers except the last apply ReLU to ``x @ w + b``. The last layer
    applies softmax to its affine output.

    NOTE: the training loop in this cell passes the returned value to
    tf.nn.sparse_softmax_cross_entropy_with_logits, which applies softmax
    itself, so softmax ends up being applied twice. This is the failure the
    "Fail 5" cell demonstrates; it is kept as-is on purpose.

    Args:
        inputs: batch of flattened images, one row per example.

    Returns:
        Softmax outputs of the final layer (named `logits` by the callers).
    """
    *hidden_layers, (out_w, out_b) = layers
    activations = inputs
    for weights, bias in hidden_layers:
        activations = tf.nn.relu(tf.matmul(activations, weights) + bias)
    # output layer: affine transform followed by softmax (see NOTE above)
    return tf.nn.softmax(tf.matmul(activations, out_w) + out_b)


# Plain SGD: learning rate and number of training steps.
lr = 0.1
train_steps = 2000
for step, (img_batch, lbl_batch) in enumerate(train_data):
    # train_data repeats forever, so stop explicitly. Because the test is `>`,
    # steps 0..train_steps inclusive are executed (train_steps + 1 updates).
    if step > train_steps:
        break

    with tf.GradientTape() as tape:
        # here we just run all the layers in sequence via a for-loop
        # NOTE: in this cell model_forward already ends in a softmax, while
        # sparse_softmax_cross_entropy_with_logits applies softmax itself,
        # so the loss is computed on doubly-normalized outputs.
        logits = model_forward(img_batch)
        xent = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=lbl_batch))

    # Gradient of the mean batch loss w.r.t. every weight and bias, followed
    # by an in-place gradient-descent update: var <- var - lr * grad.
    grads = tape.gradient(xent, all_variables)
    for grad, var in zip(grads, all_variables):
        var.assign_sub(lr*grad)

    # Every 100 steps report loss and accuracy on the current training batch;
    # both come from the forward pass made before this step's update.
    if not step % 100:
        preds = tf.argmax(logits, axis=1, output_type=tf.int32)
        acc = tf.reduce_mean(tf.cast(tf.equal(preds, lbl_batch), tf.float32))
        print("Loss: {} Accuracy: {}".format(xent, acc))


# Final evaluation: one forward pass over the whole test set, argmax per row
# as the predicted class, accuracy as the fraction of matches with test_labels.
test_preds = model_forward(test_images)
test_preds = tf.argmax(test_preds, axis=1, output_type=tf.int32)
acc = tf.reduce_mean(tf.cast(tf.equal(test_preds, test_labels), tf.float32))
print("Final test accuracy: {}".format(acc))


Loss: 2.302830457687378 Accuracy: 0.1171875
Loss: 2.2961483001708984 Accuracy: 0.203125
Loss: 2.2819950580596924 Accuracy: 0.296875
Loss: 2.2243363857269287 Accuracy: 0.25
Loss: 2.172560691833496 Accuracy: 0.3046875
Loss: 2.0000391006469727 Accuracy: 0.4609375
Loss: 1.8660151958465576 Accuracy: 0.6484375
Loss: 1.7871018648147583 Accuracy: 0.75
Loss: 1.664069414138794 Accuracy: 0.84375
Loss: 1.710860252380371 Accuracy: 0.7734375
Loss: 1.640804409980774 Accuracy: 0.828125
Loss: 1.6324231624603271 Accuracy: 0.8515625
Loss: 1.7087624073028564 Accuracy: 0.7734375
Loss: 1.7050268650054932 Accuracy: 0.78125
Loss: 1.634212851524353 Accuracy: 0.8359375
Loss: 1.6498050689697266 Accuracy: 0.8359375
Loss: 1.6286778450012207 Accuracy: 0.8359375
Loss: 1.6198831796646118 Accuracy: 0.859375
Loss: 1.683353066444397 Accuracy: 0.7890625
Loss: 1.6279175281524658 Accuracy: 0.8203125
Loss: 1.6380990743637085 Accuracy: 0.8203125
Final test accuracy: 0.847599983215332


### Solution : Fail 5 - Remove the softmax activation from the output layer: it should output only raw, unnormalized scores (i.e. logits), because the loss function applies softmax itself

In [25]:
import tensorflow as tf
import numpy as np


# get the data: MNIST via tf.keras, returned as (images, labels) pairs for the
# training split and the test split
(train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.mnist.load_data()


def preprocess_images(images):
    """Flatten images into rows of 784 values and scale them by 1/255.

    Args:
        images: array whose total size is a multiple of 784
            (e.g. a stack of 28x28 images).

    Returns:
        float32 array of shape (-1, 784) containing ``images / 255``.
    """
    rows = images.reshape(-1, 784)
    rows_f32 = rows.astype(np.float32)
    return rows_f32 / 255


def preprocess_labels(labels):
    """Flatten labels into a 1-D vector and cast them to int32.

    Args:
        labels: array of class labels of any shape.

    Returns:
        1-D int32 array with the same elements in the same order.
    """
    as_vector = labels.reshape(-1)
    return as_vector.astype(np.int32)


# Replace the raw arrays with their preprocessed versions: images become
# float32 rows of 784 values scaled by 1/255, labels become flat int32 vectors.
train_images = preprocess_images(train_images)
test_images = preprocess_images(test_images)
train_labels = preprocess_labels(train_labels)
test_labels = preprocess_labels(test_labels)

# Training pipeline: shuffle with a 60000-element buffer, batch by 128 and
# repeat without end, so the training loop has to stop on its own.
train_data = tf.data.Dataset.from_tensor_slices((train_images, train_labels)).shuffle(60000).batch(128).repeat()
# Batched test pipeline is left disabled; the evaluation at the end of this
# cell feeds test_images to the model directly.
#test_data = tf.data.Dataset.from_tensor_slices((test_images, test_labels)).batch(128)


# define the model first, from input to output

# 2 layers again
n_units = 100   # width of every hidden layer
n_layers = 2    # number of hidden layers (the output layer is added on top)
w_range = 0.1   # weights are drawn uniformly from [-w_range, w_range)

# just set up a "chain" of hidden layers
# model is represented by a list where each element is a layer,
# and each layer is in turn a list of the layer variables (w, b)

# first layer goes from n_input to n_hidden
# (784 inputs -> n_units); biases start at zero
w_input = tf.Variable(tf.random.uniform([784, n_units], -w_range, w_range),
                      name="w0")
b_input = tf.Variable(tf.zeros(n_units), name="b0")
layers = [[w_input, b_input]]

# all other hidden layers go from n_hidden to n_hidden
# (n_layers - 1 of them, named w1/b1, w2/b2, ...)
for layer in range(n_layers - 1):
    w = tf.Variable(tf.random.uniform([n_units, n_units], -w_range, w_range),
                    name="w" + str(layer+1))
    b = tf.Variable(tf.zeros(n_units), name="b" + str(layer+1))
    layers.append([w, b])

# finally add the output layer
# (n_units -> 10 outputs, one per class)
w_out = tf.Variable(tf.random.uniform([n_units, 10], -w_range, w_range),
                    name="wout")
b_out = tf.Variable(tf.zeros(10), name="bout")
layers.append([w_out, b_out])

# flatten the layers to get a list of variables
# (order: w0, b0, w1, b1, ..., wout, bout); this is the list the training
# loop differentiates with respect to and updates
all_variables = [variable for layer in layers for variable in layer]


def model_forward(inputs):
    """Run `inputs` through every layer in the module-level `layers` list.

    All layers except the last apply ReLU to ``x @ w + b``. The last layer is
    a plain affine transform with no activation, so the result is raw logits.

    Args:
        inputs: batch of flattened images, one row per example.

    Returns:
        Unnormalized logits of the final layer, one row per example.
    """
    *hidden_layers, (out_w, out_b) = layers
    activations = inputs
    for weights, bias in hidden_layers:
        activations = tf.nn.relu(tf.matmul(activations, weights) + bias)
    # No activation on the output layer: the loss used in the training loop
    # (sparse_softmax_cross_entropy_with_logits) applies softmax itself.
    return tf.matmul(activations, out_w) + out_b


# Plain SGD: learning rate and number of training steps.
lr = 0.1
train_steps = 2000
for step, (img_batch, lbl_batch) in enumerate(train_data):
    # train_data repeats forever, so stop explicitly. Because the test is `>`,
    # steps 0..train_steps inclusive are executed (train_steps + 1 updates).
    if step > train_steps:
        break

    with tf.GradientTape() as tape:
        # here we just run all the layers in sequence via a for-loop
        # model_forward returns raw logits here, which is what
        # sparse_softmax_cross_entropy_with_logits expects (it applies the
        # softmax internally).
        logits = model_forward(img_batch)
        xent = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=lbl_batch))

    # Gradient of the mean batch loss w.r.t. every weight and bias, followed
    # by an in-place gradient-descent update: var <- var - lr * grad.
    grads = tape.gradient(xent, all_variables)
    for grad, var in zip(grads, all_variables):
        var.assign_sub(lr*grad)

    # Every 100 steps report loss and accuracy on the current training batch;
    # both come from the forward pass made before this step's update.
    if not step % 100:
        preds = tf.argmax(logits, axis=1, output_type=tf.int32)
        acc = tf.reduce_mean(tf.cast(tf.equal(preds, lbl_batch), tf.float32))
        print("Loss: {} Accuracy: {}".format(xent, acc))


# Final evaluation: one forward pass over the whole test set, argmax per row
# as the predicted class, accuracy as the fraction of matches with test_labels.
test_preds = model_forward(test_images)
test_preds = tf.argmax(test_preds, axis=1, output_type=tf.int32)
acc = tf.reduce_mean(tf.cast(tf.equal(test_preds, test_labels), tf.float32))
print("Final test accuracy: {}".format(acc))


Loss: 2.313460111618042 Accuracy: 0.078125
Loss: 0.6820845603942871 Accuracy: 0.8203125
Loss: 0.4834917187690735 Accuracy: 0.84375
Loss: 0.285474956035614 Accuracy: 0.90625
Loss: 0.34293776750564575 Accuracy: 0.8984375
Loss: 0.30504709482192993 Accuracy: 0.8828125
Loss: 0.2745201587677002 Accuracy: 0.90625
Loss: 0.3351789712905884 Accuracy: 0.8984375
Loss: 0.29596611857414246 Accuracy: 0.921875
Loss: 0.2680651843547821 Accuracy: 0.921875
Loss: 0.15021759271621704 Accuracy: 0.9453125
Loss: 0.16780433058738708 Accuracy: 0.9453125
Loss: 0.2772422432899475 Accuracy: 0.8984375
Loss: 0.12987565994262695 Accuracy: 0.9609375
Loss: 0.2159978747367859 Accuracy: 0.9453125
Loss: 0.1550396829843521 Accuracy: 0.9453125
Loss: 0.1839185655117035 Accuracy: 0.921875
Loss: 0.1274910718202591 Accuracy: 0.953125
Loss: 0.14891618490219116 Accuracy: 0.9453125
Loss: 0.13094991445541382 Accuracy: 0.96875
Loss: 0.2012675404548645 Accuracy: 0.953125
Final test accuracy: 0.9593999981880188
