In [2]:
# import the packages

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.datasets import mnist

In [3]:
# Load MNIST, which Keras ships already split into two sets:
#   training set: 60,000 images
#   test set:     10,000 images
# Each split is an (images, labels) pair.

train_split, test_split = mnist.load_data()
x_train, y_train = train_split
x_test, y_test = test_split

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[1m11490434/11490434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step


In [4]:
x_train.shape
# 60,000 samples; the dimension of each image is 28 x 28

(60000, 28, 28)

In [6]:
x_train[0]
# The raw pixel data of the first training image (before normalization).

# Each pixel is stored as one byte indicating brightness:
# 0 = black
# 255 = white
# values in between = shades of gray

In [7]:
# Normalize pixel values (from 0–255 to 0–1)
#
# The dtype guard makes this cell idempotent: the raw images are integer-typed,
# so once they have been scaled to floats, re-running the cell is a no-op
# instead of silently dividing by 255 a second time.
if np.issubdtype(x_train.dtype, np.integer):
    x_train = x_train / 255.0
if np.issubdtype(x_test.dtype, np.integer):
    x_test = x_test / 255.0

# Sanity check: every pixel must now lie in [0, 1]
assert 0.0 <= x_train.min() and x_train.max() <= 1.0
assert 0.0 <= x_test.min() and x_test.max() <= 1.0

In [8]:
# Inspect the first image again: after dividing by 255.0 the values are floats in [0, 1]
x_train[0]

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.    

In [9]:
# Set model / data parameters
# Per-sample input shape (the batch dimension is excluded). The loaded images
# are 28 x 28 arrays with no channel axis, so the declared shape matches the
# data directly instead of relying on Keras to add a trailing axis implicitly.
input_shape = (28, 28)
num_classes = 10  # one class per digit, 0-9

In [10]:
# Build the Model Pipeline

# Fix the Python, NumPy and backend random seeds so that weight initialization
# (and the shuffling done later during training) is repeatable on
# Restart & Run All, as far as the backend allows.
keras.utils.set_random_seed(42)

# keras.Sequential: layers are listed from top to bottom, and data flows through them in the same order.

# Flatten: converts a multi-dimensional tensor (e.g., a 28×28 image) into a 1-D vector.

# Dense: a fully connected layer that connects every input neuron to every output neuron, with weights and biases.
# It is the most commonly used layer.

model = keras.Sequential([
    keras.Input(shape=input_shape),  # Input layer
    layers.Flatten(),                # Flatten 28×28 image → 784 vector
    layers.Dense(128, activation='relu'),  # 1st hidden layer
    layers.Dense(64, activation='relu'),   # 2nd hidden layer
    layers.Dense(num_classes, activation='softmax')  # Output layer, softmax: the probabilities of the 10 classes
])


In [19]:
# Compile the Model
#
# The optimizer decides how the weights are updated:
#   - SGD (Stochastic Gradient Descent): the simple, classical method
#   - Adam: the most popular optimizer
#
# The learning rate α controls how big a step the optimizer takes on each update:
#   w = w − α⋅gradient

learning_rate = 0.001
optimizer = keras.optimizers.SGD(learning_rate=learning_rate)

model.compile(
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
    optimizer=optimizer,
)

In [13]:
# Train the Model
#
#   batch_size: how many samples are processed per weight update
#   epochs: how many complete passes are made over the training dataset
#   validation_split: fraction of the training data held out for validation

history = model.fit(
    x=x_train,
    y=y_train,
    epochs=5,
    batch_size=128,
    validation_split=0.1,
)

Epoch 1/5
[1m422/422[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.8939 - loss: 0.3826 - val_accuracy: 0.9192 - val_loss: 0.2898
Epoch 2/5
[1m422/422[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.8997 - loss: 0.3479 - val_accuracy: 0.9255 - val_loss: 0.2657
Epoch 3/5
[1m422/422[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.9085 - loss: 0.3189 - val_accuracy: 0.9278 - val_loss: 0.2496
Epoch 4/5
[1m422/422[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9139 - loss: 0.2991 - val_accuracy: 0.9328 - val_loss: 0.2355
Epoch 5/5
[1m422/422[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.9181 - loss: 0.2813 - val_accuracy: 0.9367 - val_loss: 0.2237


In [14]:
# Evaluate the Model on the held-out test set
test_metrics = model.evaluate(x=x_test, y=y_test, verbose=2)
test_loss, test_acc = test_metrics
print("Test accuracy:", test_acc)

313/313 - 1s - 2ms/step - accuracy: 0.9259 - loss: 0.2607
Test accuracy: 0.9258999824523926


In [15]:
# Predict on test dataset: one row of class probabilities per image
predictions = model.predict(x_test)

# The predicted label is the index of the highest probability in each row
predicted_labels = np.argmax(predictions, axis=1)

# show 20 samples
n_preview = 20
print("Predicted labels:", predicted_labels[:n_preview])
print("True labels:", y_test[:n_preview])

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Predicted labels: [7 2 1 0 4 1 4 9 6 9 0 6 9 0 1 5 9 7 3 4]
True labels: [7 2 1 0 4 1 4 9 5 9 0 6 9 0 1 5 9 7 3 4]


In [16]:
# Print the layer-by-layer architecture with output shapes and parameter counts
model.summary()

In [None]:
# Flatten layer: 784 features, no trainable parameters
# 1st dense layer (hidden layer): 784 (inputs) * 128 (units) + 128 (biases) = 100,480 params
# 2nd dense layer (hidden layer): 128 (inputs) * 64 (units) + 64 (biases) = 8,256 params
# 3rd dense layer (output layer): 64 (inputs) * 10 (units) + 10 (biases) = 650 params
# Total trainable params: 100,480 + 8,256 + 650 = 109,386
