# Solving the MNIST Classification Problem using the TensorFlow Keras API
Learning material: [CS50 AI Lecture 5](https://cs50.harvard.edu/ai/2020/notes/5/) (I had previously taken this course).

In [49]:
from tensorflow import keras
import sklearn
import numpy as np

In [50]:
# Load the MNIST black & white handwritten-digit dataset that ships with Keras.
# Capital letters indicate a vectorized version of the whole data set
# (like the X in logistic regression).
NUM_CLASSES = 10  # one class per digit, 0-9

mnist = keras.datasets.mnist
(X_train, Y_train), (X_test, Y_test) = mnist.load_data()

# Normalize the pixel values to be in [0, 1].
X_train = X_train / 255
X_test = X_test / 255

# One-hot encode the integer labels (e.g. 3 -> [0, 0, 0, 1, 0, 0, 0, 0, 0, 0]),
# which is the target format used with a categorical cross-entropy loss.
# num_classes is given explicitly: otherwise the width is inferred from the
# largest label present, so train and test could end up with different widths
# (or a width that does not match the 10-unit output layer).
Y_train = keras.utils.to_categorical(Y_train, num_classes=NUM_CLASSES)
Y_test = keras.utils.to_categorical(Y_test, num_classes=NUM_CLASSES)

# Append a trailing channel axis: recast the 60000x28x28 array X_train into a
# 60000x28x28x1 array (and likewise X_test), the layout the Conv2D layer takes.
X_train = np.expand_dims(X_train, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)
print("done loading")

done loading


In [54]:
# Store data about our model and how it performed with various hyperparameter values (logging is currently disabled).
# f = open("mnist.txt", mode='a')
# f.write('\n') # write writes to the end of a file

In [53]:
# Build, compile and train the convolutional network.
# `keras` here is the module bound by `from tensorflow import keras` in the
# imports cell; no extra import is needed (a local `import keras.layers` would
# rebind the name `keras` to the standalone package for every later cell).

# Now let us create the NN model
model = keras.models.Sequential([
    # The convolution layer: 10 filters of size 5x5 are learnt. The input is one
    # of the samples in X_train, i.e. a 28x28 image with a single channel.
    keras.layers.Conv2D(filters=10, kernel_size=(5, 5), activation="relu", input_shape=(28, 28, 1)),
    # Pool the convolved image.
    keras.layers.MaxPooling2D(pool_size=(2, 2)),
    # Another round of convolve and pool. Only the first layer needs
    # input_shape; later layers take their input shape from the layer before.
    keras.layers.Conv2D(filters=24, kernel_size=(4, 4), activation="relu"),
    keras.layers.MaxPooling2D(pool_size=(2, 2)),
    # Flatten the current image. The resulting vector is the input to the
    # fully connected part of the network.
    keras.layers.Flatten(),
    # The hidden NN layer
    keras.layers.Dense(units=400, activation="relu"),
    # Add some dropout to prevent over-reliance on a few nodes.
    keras.layers.Dropout(0.4),
    # Add an output layer - a neuron for each of the 10 digits.
    keras.layers.Dense(units=10, activation="softmax")
])

model.summary()
# f.close()
# "compile" model: Adam optimizer, categorical cross-entropy loss (matches the
# one-hot encoded labels), and accuracy reported as a metric.
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# train the model
model.fit(x=X_train, y=Y_train, epochs=5)

# Note: in the model summary the hidden Dense layer has 400 * 385 = 154,000
# parameters. The flattened input has 384 values, so each node has a weight
# vector w of size 384 plus one bias b, giving 385 parameters per node. Because
# of the large number of nodes, this Dense layer holds by far the most
# parameters in the whole architecture.
# Note that the stride is (1, 1) in the Conv layers (the Conv2D default).

Model: "sequential_13"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_26 (Conv2D)          (None, 24, 24, 10)        260       
                                                                 
 max_pooling2d_26 (MaxPoolin  (None, 12, 12, 10)       0         
 g2D)                                                            
                                                                 
 conv2d_27 (Conv2D)          (None, 9, 9, 24)          3864      
                                                                 
 max_pooling2d_27 (MaxPoolin  (None, 4, 4, 24)         0         
 g2D)                                                            
                                                                 
 flatten_13 (Flatten)        (None, 384)               0         
                                                                 
 dense_26 (Dense)            (None, 400)             

<keras.callbacks.History at 0x2b959c2e0>

In [None]:
# Score the trained model on the held-out test split; the displayed result is
# [loss, accuracy], matching the loss and metric given to model.compile.
model.evaluate(X_test, Y_test, verbose=2)

313/313 - 1s - loss: 0.0246 - accuracy: 0.9920 - 507ms/epoch - 2ms/step


[0.02460998296737671, 0.9919999837875366]