In [None]:
import tensorflow as tf

# There is a dataset with training and validation split prepared
# The dataset consists of images of hand-written digits
from tensorflow.examples.tutorials.mnist import input_data

# The read_data_sets call downloads the dataset into the given folder.
# one_hot=True requests one-hot encoded labels: each label is a vector with
# one entry per class that is 0 everywhere except for a 1 at the index of the
# true class. For example, with 10 classes and true label 3 the encoding is
#  0  1  2  3  4  5  6  7  8  9
# [0, 0, 0, 1, 0, 0, 0, 0, 0, 0]
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)

In [None]:
# The reset operation is always useful, so that the cell can be executed several times.
# Otherwise an error is thrown because the variables already exist within the graph.
tf.reset_default_graph()

# number of examples used per training step, also called the batch size
batch_size = 256

# Placeholders are the entry points of the graph: they hold no data themselves
# and are fed with a fresh batch at every training step. The first dimension
# is None so that a batch of any size can be fed.
# the input images are of dimensions 28x28 = 784
image_input = tf.placeholder(tf.float32, (None, 784), name="image_input")
# the labels are one-hot vectors over the 10 digit classes
label_input = tf.placeholder(tf.float32, (None, 10), name="label_input")

# each image has 784 values, so the first layer has 784 connections to each neuron
# here the number of neurons is set to 100 for the first layer
weights1 = tf.get_variable("weights1", (784, 100))
# batch x 100 = batch x 784 @ 784 x 100
layer1 = tf.nn.relu(image_input @ weights1)

# the first layer has 100 neurons, so there are 100 connections to each neuron
# of the second layer. here the number of neurons is set to 100 for the second layer
weights2 = tf.get_variable("weights2", (100, 100))
# batch x 100 = batch x 100 @ 100 x 100
layer2 = tf.nn.relu(layer1 @ weights2)

# the second layer has 100 neurons, so there are 100 connections to each neuron
# of the third layer. the third layer has 10 neurons, one per digit class
weights3 = tf.get_variable("weights3", (100, 10))
# here no activation is applied, because the loss function is applying softmax
# batch x 10 = batch x 100 @ 100 x 10
outputs = layer2 @ weights3

# the loss function wants only the logits (linear outputs) of the last layer
loss = tf.losses.softmax_cross_entropy(label_input, outputs)

# the optimizer handles all the work for training
train_op = tf.train.AdamOptimizer().minimize(loss)

# we want to run 100 steps of training (gradient updates for our weights)
total_steps = 100
# above we only constructed the graph
# the computational graph has to be executed in a session
with tf.Session() as sess:
    # the variables always must be initialized
    # no initializer was passed to get_variable, so its default is used
    # (a Glorot-uniform random initializer); random start values are also
    # important for symmetry breaking
    sess.run(tf.global_variables_initializer())
    # perform the actual training steps
    for step in range(total_steps):
        # next_batch(N) returns N examples as a tuple of (images, labels)
        # per default the examples are shuffled.
        # here the training split mnist.train is referenced
        # a new batch is drawn for every step; reusing one single batch
        # would only fit the network to those N examples
        images, labels = mnist.train.next_batch(batch_size)
        sess.run(train_op, feed_dict={image_input: images, label_input: labels})