## We will train a model to classify the MNIST (handwritten digits) dataset

In [1]:
import tensorflow as tf

In [5]:
# Load MNIST. The x_* arrays hold the images (the features) and the y_* arrays
# hold the matching digit labels.
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Scale every pixel by 1/255. MNIST pixels are documented as 0-255 intensities,
# so this should bring each feature into the [0, 1] range.
x_train = x_train / 255.0
x_test = x_test / 255.0

In [11]:
# Build the classifier: flatten each 28x28 image, pass it through one hidden
# layer, and emit one raw score (logit) per digit class.

# Fix TensorFlow's global seed so the randomly initialised weights are the same
# every time this cell runs; without it each run starts from a different model.
SEED = 42
tf.random.set_seed(SEED)

# Make float64 the Keras default float type (this is a global setting).
# NOTE(review): presumably needed because x_train / 255.0 yields float64
# arrays; casting the inputs to float32 instead would avoid touching the
# global default -- confirm which error this was working around.
tf.keras.backend.set_floatx('float64') #this prevents errors later

model = tf.keras.models.Sequential([
  tf.keras.layers.Flatten(input_shape=(28, 28)),  # 28x28 image -> 784-long vector
  # Hidden layer. 128 units and ReLU are tunable hyperparameters, not values
  # derived from the data -- TODO: experiment with other sizes/activations.
  tf.keras.layers.Dense(128, activation='relu'),
  # Randomly zeroes 20% of the hidden activations during training only, which
  # helps reduce overfitting.
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(10)  # one output per class, no activation -> logits
])

In [12]:
# The model has not been trained yet, so the scores it produces for the first
# training image are effectively random.
first_image = x_train[:1]
predictions = model(first_image).numpy()

# These are raw scores (logits / log-odds); they still need softmax to become
# a probability distribution.
predictions

array([[-0.36294173, -0.0435469 ,  0.44666042,  0.52221191,  0.12482971,
        -0.37519708, -0.13138243,  0.35013782, -0.15467523,  0.42845263]])

In [13]:
# Softmax converts the logits into one probability per digit. Because the model
# is still untrained, no digit is strongly preferred and each of the 10
# probabilities ends up roughly near 1/10.
class_probabilities = tf.nn.softmax(predictions)
class_probabilities.numpy()

array([[0.06096814, 0.08391013, 0.13699626, 0.14774756, 0.09929775,
        0.06022552, 0.07685425, 0.12439115, 0.07508479, 0.13452444]])

In [14]:
# Loss that training will minimise. from_logits=True tells it the model outputs
# raw scores rather than probabilities, so softmax is applied inside the loss.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
)

In [15]:
# Configure training: Adam optimizer, the loss function defined earlier, and
# accuracy as the reported metric.
# TODO: open question -- how do we choose a good loss function in general?
model.compile(
    optimizer='adam',
    loss=loss_fn,
    metrics=['accuracy'],
)

In [16]:
# Training step: the optimizer adjusts the model parameters to minimise the
# loss over 5 passes (epochs) through the training data.
model.fit(x=x_train, y=y_train, epochs=5)

Train on 60000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x229ee74ec48>

In [17]:
# No separate validation step is used in this notebook, so we go straight to
# measuring loss and accuracy on the held-out test set. In the recorded run the
# accuracy is close to 98%, i.e. the model is pretty good.
model.evaluate(x=x_test, y=y_test, verbose=2)

10000/10000 - 1s - loss: 0.0722 - accuracy: 0.9775


[0.07216692353106384, 0.9775]

In [18]:
# Wrap the trained model with a Softmax layer so that calling the wrapper
# returns class probabilities instead of raw logits.
probability_model = tf.keras.Sequential()
probability_model.add(model)
probability_model.add(tf.keras.layers.Softmax())

In [19]:
# Class probabilities for the first five test images: one row per image, one
# column per digit class.
first_five_images = x_test[:5]
probability_model(first_five_images)

<tf.Tensor: shape=(5, 10), dtype=float64, numpy=
array([[8.93567303e-08, 2.68015582e-11, 1.30612728e-06, 9.35149404e-05,
        9.74078301e-12, 2.92919887e-06, 1.74252916e-13, 9.99897070e-01,
        2.68654307e-07, 4.82152628e-06],
       [7.32084134e-08, 5.11496860e-05, 9.99884982e-01, 6.31584905e-05,
        1.08465926e-15, 2.18668958e-08, 2.84809085e-09, 5.77169659e-11,
        6.11856344e-07, 1.87867403e-12],
       [1.18913064e-07, 9.97652463e-01, 2.29064344e-05, 7.49025582e-06,
        1.24060558e-05, 1.18153912e-06, 1.77678089e-05, 2.16934475e-03,
        1.13405562e-04, 2.91612071e-06],
       [9.99882110e-01, 7.33200234e-10, 2.19375454e-05, 1.76583274e-07,
        2.00471479e-09, 1.32350811e-06, 9.63259743e-06, 3.72278052e-05,
        1.89843062e-08, 4.75701574e-05],
       [2.08076411e-05, 1.98545971e-09, 1.20092929e-05, 2.00912800e-07,
        9.87442661e-01, 2.12319115e-07, 5.95662026e-06, 1.58254578e-04,
        8.35135331e-07, 1.23590601e-02]])>

In [20]:
# Sanity check on the first test image: in the probabilities shown above for
# x_test[0], class "7" has the highest probability, and the true label stored
# in y_test[0] is indeed 7.
first_label = y_test[0]
first_label

7