In [1]:
from __future__         import print_function
import numpy            as np
#import tensorflow       as tf
from keras.datasets     import mnist                                # image dataset of handwritten numbers
from keras.models       import Sequential                           # model type
from keras.layers       import Dense, Activation, Dropout, Input    #from keras.layers.core import Dense, Activation
from keras.optimizers   import SGD                                  # stochastic gradient descent optimizer
from keras.utils        import to_categorical                       # one-hot encoding of ground truth values
# NOTE: TensorFlow backend used by Keras

np.random.seed(1671)  # seeds NumPy's RNG for reproducibility
# NOTE(review): no TensorFlow seed is set in this cell, so weight init may still vary between runs - confirm

# network & training hyperparameters
VERBOSE = 1         # 0: no output, 1: progress bar, 2: one line per epoch
N_CLASSES = 10      # number of outputs = number of digits
# TODO: increase parameter values to improve accuracy
N_EPOCHS = 20       # number of passes over the training data
BATCH_SIZE = 128    # number of samples per gradient (weight) update
N_HIDDEN = 128      # number of neurons in each hidden layer
OPTIMIZER = SGD()   # optimizer  TODO: try Adam or RMSprop
VALIDATION_SPLIT = 0.2  # fraction of TRAIN that is reserved for VALIDATION
# TODO: DROPOUT = 0.3   # regularization to prevent overfitting

# data: shuffled & split between train and test sets
(X_train, y_train), (X_test, y_test) = mnist.load_data()  # returns training & test data as tuples
# X_train / X_test hold 28x28 images; each image is flattened into one row of N_INPUT_DIMS values
N_INPUT_DIMS = 784  # 28 * 28: flattened 2D array to 1D array
# -1 lets NumPy infer the number of rows instead of hard-coding 60000 / 10000
X_train = X_train.reshape(-1, N_INPUT_DIMS).astype('float32')   # n_train x 784, as float32
X_test = X_test.reshape(-1, N_INPUT_DIMS).astype('float32')     # n_test x 784, as float32
# normalize
X_train /= 255  # 0-255 to 0-1
X_test /= 255   # 0-255 to 0-1

print(X_train.shape[0], 'train samples')    # number of rows
print(X_test.shape[0], 'test samples')      # number of rows

# convert class vectors to binary class matrices (one-hot)
Y_train = to_categorical(y_train, N_CLASSES)  # 10 outputs
Y_test = to_categorical(y_test, N_CLASSES)    # 10 outputs

model = Sequential()    # model with linear stack of layers

# hidden layer #1 (also declares the input shape)
model.add(Dense(N_HIDDEN, input_shape=(N_INPUT_DIMS,))) # 784 input features connected to each of the 128 neurons
model.add(Activation('relu'))       # passes positive numbers unchanged and negative ones as 0
# TODO: model.add(Dropout(DROPOUT))

# hidden layer #2
model.add(Dense(N_HIDDEN))          # hidden layer with 128 neurons
model.add(Activation('relu'))       # passes positive numbers unchanged and negative ones as 0
# TODO: model.add(Dropout(DROPOUT))

# hidden layer #3
model.add(Dense(N_HIDDEN))          # hidden layer with 128 neurons
model.add(Activation('relu'))       # passes positive numbers unchanged and negative ones as 0
# TODO: model.add(Dropout(DROPOUT))

# TODO: add more hidden dense layers and dropouts

# output layer
model.add(Dense(N_CLASSES))         # output layer with 10 outputs (each possible digit)
model.add(Activation('softmax'))    # converts outputs into probabilities (confidences) for each class that sum to 1

''' optional model & layer creation
model = Sequential([
    Input(shape=(N_INPUT_DIMS,)),           # input layer with number of input features
    Dense(N_HIDDEN, activation='relu'),     # hidden layer with number of neurons & relu activation
    # TODO: add more hidden dense layers and dropouts here
    Dense(N_CLASSES, activation='softmax')  # output layer with number of outputs (each possible digit) & softmax activation
])
'''

# Summarize, compile, train, then evaluate the model
model.summary()     # prints a summary representation of the model

model.compile(loss='categorical_crossentropy',  # loss function (classification predictions)
              optimizer=OPTIMIZER,              # optimizer
              metrics=['accuracy'])             # metrics to be evaluated by the model during training and testing

history = model.fit(X_train, Y_train,                                   # training data
                    batch_size=BATCH_SIZE, epochs=N_EPOCHS,             # samples per update, passes over the data
                    verbose=VERBOSE, validation_split=VALIDATION_SPLIT) # how much TRAIN is reserved for VALIDATION

score = model.evaluate(X_test, Y_test, verbose=VERBOSE)     # test the model: [loss, accuracy]

# rounded to 6 decimal places so the printed values match the recorded RESULTS below
print("Training accuracy:", round(history.history['accuracy'][-1], 6))        # final training accuracy
print("Test accuracy:", round(score[1], 6))  # test accuracy
print("Test score:", round(score[0], 6))     # loss on test
print("Validation accuracy:", round(history.history['val_accuracy'][-1], 6))  # final validation accuracy

# Goal: high training and validation accuracy (which means good generalization)
# Overfitting: large gap between training and validation accuracy
#              (try L1 regularization to zero out ineffective nodes, or L2 for collinearity)
# Underfitting: low training and validation accuracy
#------------------------------------------------
# RESULTS (rounded to 6 decimal places)
#------------------------------------------------
# NOTE(review): the BASELINE text below says "2 hidden layers", but the code above stacks
# three Dense(N_HIDDEN) layers before the output layer - confirm which description is intended
'''
    BASELINE:           2 hidden layers, 128 neurons, 20 epochs, 128 batch size, 0.2 validation split,
                        SGD optimizer, relu activation, categorical_crossentropy loss, 784 input shape, 10 outputs
    Training accuracy:  0.954104
    Test accuracy:      0.951700
    Test score (loss):  0.162484  # lower loss means higher accuracy
    Validation accuracy:0.954833  # indicator of generalization accuracy
'''
pass    # omits outputting last unassigned string (in ''' block above)

Using TensorFlow backend.


Downloading data from https://s3.amazonaws.com/img-datasets/mnist.npz
60000 train samples
10000 test samples
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 128)               100480    
_________________________________________________________________
activation_1 (Activation)    (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               16512     
_________________________________________________________________
activation_2 (Activation)    (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               16512     
_________________________________________________________________
activation_3 (Activation)    (None, 128)               0         
___________

In [8]:
from __future__         import print_function
import numpy            as np
#import tensorflow       as tf
from keras.datasets     import mnist                                # image dataset of handwritten numbers
from keras.models       import Sequential                           # model type
from keras.layers       import Dense, Activation, Dropout, Input    #from keras.layers.core import Dense, Activation
from keras.optimizers   import SGD                                  # stochastic gradient descent optimizer
from keras.utils        import to_categorical                       # one-hot encoding of ground truth values
# NOTE: TensorFlow backend used by Keras

np.random.seed(1671)  # seeds NumPy's RNG for reproducibility
# NOTE(review): no TensorFlow seed is set in this cell, so weight init may still vary between runs - confirm

# network & training hyperparameters
VERBOSE = 1         # 0: no output, 1: progress bar, 2: one line per epoch
N_CLASSES = 10      # number of outputs = number of digits
# TODO: increase parameter values to improve accuracy
N_EPOCHS = 20       # number of passes over the training data
BATCH_SIZE = 256    # number of samples per gradient (weight) update
N_HIDDEN = 128      # number of neurons in each hidden layer
OPTIMIZER = SGD()   # optimizer  TODO: try Adam or RMSprop
VALIDATION_SPLIT = 0.2  # fraction of TRAIN that is reserved for VALIDATION
# TODO: DROPOUT = 0.3   # regularization to prevent overfitting

# data: shuffled & split between train and test sets
(X_train, y_train), (X_test, y_test) = mnist.load_data()  # returns training & test data as tuples
# X_train / X_test hold 28x28 images; each image is flattened into one row of N_INPUT_DIMS values
N_INPUT_DIMS = 784  # 28 * 28: flattened 2D array to 1D array
# -1 lets NumPy infer the number of rows instead of hard-coding 60000 / 10000
X_train = X_train.reshape(-1, N_INPUT_DIMS).astype('float32')   # n_train x 784, as float32
X_test = X_test.reshape(-1, N_INPUT_DIMS).astype('float32')     # n_test x 784, as float32
# normalize
X_train /= 255  # 0-255 to 0-1
X_test /= 255   # 0-255 to 0-1

print(X_train.shape[0], 'train samples')    # number of rows
print(X_test.shape[0], 'test samples')      # number of rows

# convert class vectors to binary class matrices (one-hot)
Y_train = to_categorical(y_train, N_CLASSES)  # 10 outputs
Y_test = to_categorical(y_test, N_CLASSES)    # 10 outputs

model = Sequential()    # model with linear stack of layers

# hidden layer #1 (also declares the input shape)
model.add(Dense(N_HIDDEN, input_shape=(N_INPUT_DIMS,))) # 784 input features connected to each of the 128 neurons
model.add(Activation('relu'))       # passes positive numbers unchanged and negative ones as 0
# TODO: model.add(Dropout(DROPOUT))

# hidden layer #2
model.add(Dense(N_HIDDEN))          # hidden layer with 128 neurons
model.add(Activation('relu'))       # passes positive numbers unchanged and negative ones as 0
# TODO: model.add(Dropout(DROPOUT))

# hidden layer #3
model.add(Dense(N_HIDDEN))          # hidden layer with 128 neurons
model.add(Activation('relu'))       # passes positive numbers unchanged and negative ones as 0
# TODO: model.add(Dropout(DROPOUT))

# TODO: add more hidden dense layers and dropouts

# output layer
model.add(Dense(N_CLASSES))         # output layer with 10 outputs (each possible digit)
model.add(Activation('softmax'))    # converts outputs into probabilities (confidences) for each class that sum to 1

''' optional model & layer creation
model = Sequential([
    Input(shape=(N_INPUT_DIMS,)),           # input layer with number of input features
    Dense(N_HIDDEN, activation='relu'),     # hidden layer with number of neurons & relu activation
    # TODO: add more hidden dense layers and dropouts here
    Dense(N_CLASSES, activation='softmax')  # output layer with number of outputs (each possible digit) & softmax activation
])
'''

# Summarize, compile, train, then evaluate the model
model.summary()     # prints a summary representation of the model

model.compile(loss='categorical_crossentropy',  # loss function
              optimizer=OPTIMIZER,              # optimizer
              metrics=['accuracy'])             # metrics to be evaluated by the model during training and testing

history = model.fit(X_train, Y_train,                                   # training data
                    batch_size=BATCH_SIZE, epochs=N_EPOCHS,             # samples per update, passes over the data
                    verbose=VERBOSE, validation_split=VALIDATION_SPLIT) # how much TRAIN is reserved for VALIDATION

score = model.evaluate(X_test, Y_test, verbose=VERBOSE)     # test the model: [loss, accuracy]

print("Training accuracy:", round(history.history['accuracy'][-1],6))        # final training accuracy
print("Test accuracy:", round(score[1],6))         # test accuracy
print("Test score (loss):", round(score[0],6))     # loss on test
print("Validation accuracy:", round(history.history['val_accuracy'][-1],6))  # final validation accuracy

# Goal: high training and validation accuracy (which means good generalization)
# Overfitting: large gap between training and validation accuracy
#              (try L1 regularization to zero out ineffective nodes, or L2 for collinearity)
# Underfitting: low training and validation accuracy
#------------------------------------------------
# RESULTS (rounded to 6 decimal places)
#------------------------------------------------
# NOTE(review): the BASELINE text below says "2 hidden layers", but the code above stacks
# three Dense(N_HIDDEN) layers before the output layer - confirm which description is intended
'''
    BASELINE:           2 hidden layers, 128 neurons, 20 epochs, 128 batch size, 0.2 validation split,
                        SGD optimizer, relu activation, categorical_crossentropy loss, 784 input shape, 10 outputs

    Training accuracy:  0.954104
    Test accuracy:      0.951700
    Test score (loss):  0.162484  # lower loss means higher accuracy
    Validation accuracy:0.954833  # indicator of generalization accuracy

    Experiments:        hyperparameter=BATCH_SIZE : original=128
    -------------------------------------------------------------
    EXP #1:             new=256
    Training accuracy:  0.932208
    Test accuracy:      0.9328
    Test score (loss):  0.229173
    Validation accuracy:0.935667
'''
pass    # omits outputting last unassigned string (in ''' block above)

60000 train samples
10000 test samples
Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_37 (Dense)             (None, 128)               100480    
_________________________________________________________________
activation_37 (Activation)   (None, 128)               0         
_________________________________________________________________
dense_38 (Dense)             (None, 128)               16512     
_________________________________________________________________
activation_38 (Activation)   (None, 128)               0         
_________________________________________________________________
dense_39 (Dense)             (None, 128)               16512     
_________________________________________________________________
activation_39 (Activation)   (None, 128)               0         
_________________________________________________________________
dense_40 (Dense

In [7]:
from __future__         import print_function
import numpy            as np
#import tensorflow       as tf
from keras.datasets     import mnist                                # image dataset of handwritten numbers
from keras.models       import Sequential                           # model type
from keras.layers       import Dense, Activation, Dropout, Input    #from keras.layers.core import Dense, Activation
from keras.optimizers   import SGD                                  # stochastic gradient descent optimizer
from keras.utils        import to_categorical                       # one-hot encoding of ground truth values
# NOTE: TensorFlow backend used by Keras

np.random.seed(1671)  # seeds NumPy's RNG for reproducibility
# NOTE(review): no TensorFlow seed is set in this cell, so weight init may still vary between runs - confirm

# network & training hyperparameters
VERBOSE = 1         # 0: no output, 1: progress bar, 2: one line per epoch
N_CLASSES = 10      # number of outputs = number of digits
# TODO: increase parameter values to improve accuracy
N_EPOCHS = 20       # number of passes over the training data
BATCH_SIZE = 8      # number of samples per gradient (weight) update
N_HIDDEN = 128      # number of neurons in each hidden layer
OPTIMIZER = SGD()   # optimizer  TODO: try Adam or RMSprop
VALIDATION_SPLIT = 0.2  # fraction of TRAIN that is reserved for VALIDATION
# TODO: DROPOUT = 0.3   # regularization to prevent overfitting

# data: shuffled & split between train and test sets
(X_train, y_train), (X_test, y_test) = mnist.load_data()  # returns training & test data as tuples
# X_train / X_test hold 28x28 images; each image is flattened into one row of N_INPUT_DIMS values
N_INPUT_DIMS = 784  # 28 * 28: flattened 2D array to 1D array
# -1 lets NumPy infer the number of rows instead of hard-coding 60000 / 10000
X_train = X_train.reshape(-1, N_INPUT_DIMS).astype('float32')   # n_train x 784, as float32
X_test = X_test.reshape(-1, N_INPUT_DIMS).astype('float32')     # n_test x 784, as float32
# normalize
X_train /= 255  # 0-255 to 0-1
X_test /= 255   # 0-255 to 0-1

print(X_train.shape[0], 'train samples')    # number of rows
print(X_test.shape[0], 'test samples')      # number of rows

# convert class vectors to binary class matrices (one-hot)
Y_train = to_categorical(y_train, N_CLASSES)  # 10 outputs
Y_test = to_categorical(y_test, N_CLASSES)    # 10 outputs

model = Sequential()    # model with linear stack of layers

# hidden layer #1 (also declares the input shape)
model.add(Dense(N_HIDDEN, input_shape=(N_INPUT_DIMS,))) # 784 input features connected to each of the 128 neurons
model.add(Activation('relu'))       # passes positive numbers unchanged and negative ones as 0
# TODO: model.add(Dropout(DROPOUT))

# hidden layer #2
model.add(Dense(N_HIDDEN))          # hidden layer with 128 neurons
model.add(Activation('relu'))       # passes positive numbers unchanged and negative ones as 0
# TODO: model.add(Dropout(DROPOUT))

# hidden layer #3
model.add(Dense(N_HIDDEN))          # hidden layer with 128 neurons
model.add(Activation('relu'))       # passes positive numbers unchanged and negative ones as 0
# TODO: model.add(Dropout(DROPOUT))

# TODO: add more hidden dense layers and dropouts

# output layer
model.add(Dense(N_CLASSES))         # output layer with 10 outputs (each possible digit)
model.add(Activation('softmax'))    # converts outputs into probabilities (confidences) for each class that sum to 1

''' optional model & layer creation
model = Sequential([
    Input(shape=(N_INPUT_DIMS,)),           # input layer with number of input features
    Dense(N_HIDDEN, activation='relu'),     # hidden layer with number of neurons & relu activation
    # TODO: add more hidden dense layers and dropouts here
    Dense(N_CLASSES, activation='softmax')  # output layer with number of outputs (each possible digit) & softmax activation
])
'''

# Summarize, compile, train, then evaluate the model
model.summary()     # prints a summary representation of the model

model.compile(loss='categorical_crossentropy',  # loss function
              optimizer=OPTIMIZER,              # optimizer
              metrics=['accuracy'])             # metrics to be evaluated by the model during training and testing

history = model.fit(X_train, Y_train,                                   # training data
                    batch_size=BATCH_SIZE, epochs=N_EPOCHS,             # samples per update, passes over the data
                    verbose=VERBOSE, validation_split=VALIDATION_SPLIT) # how much TRAIN is reserved for VALIDATION

score = model.evaluate(X_test, Y_test, verbose=VERBOSE)     # test the model: [loss, accuracy]

print("Training accuracy:", round(history.history['accuracy'][-1],6))        # final training accuracy
print("Test accuracy:", round(score[1],6))         # test accuracy
print("Test score (loss):", round(score[0],6))     # loss on test
print("Validation accuracy:", round(history.history['val_accuracy'][-1],6))  # final validation accuracy

# Goal: high training and validation accuracy (which means good generalization)
# Overfitting: large gap between training and validation accuracy
#              (try L1 regularization to zero out ineffective nodes, or L2 for collinearity)
# Underfitting: low training and validation accuracy
#------------------------------------------------
# RESULTS (rounded to 6 decimal places)
#------------------------------------------------
# NOTE(review): the BASELINE text below says "2 hidden layers", but the code above stacks
# three Dense(N_HIDDEN) layers before the output layer - confirm which description is intended
'''
    BASELINE:           2 hidden layers, 128 neurons, 20 epochs, 128 batch size, 0.2 validation split,
                        SGD optimizer, relu activation, categorical_crossentropy loss, 784 input shape, 10 outputs

    Training accuracy:  0.954104
    Test accuracy:      0.951700
    Test score (loss):  0.162484  # lower loss means higher accuracy
    Validation accuracy:0.954833  # indicator of generalization accuracy

    Experiments:        hyperparameter=BATCH_SIZE : original=128
    -------------------------------------------------------------
    EXP #1:             new=256
    Training accuracy:  0.932208
    Test accuracy:      0.9328
    Test score (loss):  0.229173
    Validation accuracy:0.935667

    EXP #2:             new=8
    Training accuracy:  0.999875
    Test accuracy:      0.9798
    Test score (loss):  0.086414
    Validation accuracy:0.97775
'''
pass    # omits outputting last unassigned string (in ''' block above)

60000 train samples
10000 test samples
Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_33 (Dense)             (None, 128)               100480    
_________________________________________________________________
activation_33 (Activation)   (None, 128)               0         
_________________________________________________________________
dense_34 (Dense)             (None, 128)               16512     
_________________________________________________________________
activation_34 (Activation)   (None, 128)               0         
_________________________________________________________________
dense_35 (Dense)             (None, 128)               16512     
_________________________________________________________________
activation_35 (Activation)   (None, 128)               0         
_________________________________________________________________
dense_36 (Dense

In [9]:
from __future__         import print_function
import numpy            as np
#import tensorflow       as tf
from keras.datasets     import mnist                                # image dataset of handwritten numbers
from keras.models       import Sequential                           # model type
from keras.layers       import Dense, Activation, Dropout, Input    #from keras.layers.core import Dense, Activation
from keras.optimizers   import SGD                                  # stochastic gradient descent optimizer
from keras.utils        import to_categorical                       # one-hot encoding of ground truth values
# NOTE: TensorFlow backend used by Keras

np.random.seed(1671)  # seeds NumPy's RNG for reproducibility
# NOTE(review): no TensorFlow seed is set in this cell, so weight init may still vary between runs - confirm

# network & training hyperparameters
VERBOSE = 1         # 0: no output, 1: progress bar, 2: one line per epoch
N_CLASSES = 10      # number of outputs = number of digits
# TODO: increase parameter values to improve accuracy
N_EPOCHS = 20       # number of passes over the training data
BATCH_SIZE = 1      # number of samples per gradient (weight) update
N_HIDDEN = 128      # number of neurons in each hidden layer
OPTIMIZER = SGD()   # optimizer  TODO: try Adam or RMSprop
VALIDATION_SPLIT = 0.2  # fraction of TRAIN that is reserved for VALIDATION
# TODO: DROPOUT = 0.3   # regularization to prevent overfitting

# data: shuffled & split between train and test sets
(X_train, y_train), (X_test, y_test) = mnist.load_data()  # returns training & test data as tuples
# X_train / X_test hold 28x28 images; each image is flattened into one row of N_INPUT_DIMS values
N_INPUT_DIMS = 784  # 28 * 28: flattened 2D array to 1D array
# -1 lets NumPy infer the number of rows instead of hard-coding 60000 / 10000
X_train = X_train.reshape(-1, N_INPUT_DIMS).astype('float32')   # n_train x 784, as float32
X_test = X_test.reshape(-1, N_INPUT_DIMS).astype('float32')     # n_test x 784, as float32
# normalize
X_train /= 255  # 0-255 to 0-1
X_test /= 255   # 0-255 to 0-1

print(X_train.shape[0], 'train samples')    # number of rows
print(X_test.shape[0], 'test samples')      # number of rows

# convert class vectors to binary class matrices (one-hot)
Y_train = to_categorical(y_train, N_CLASSES)  # 10 outputs
Y_test = to_categorical(y_test, N_CLASSES)    # 10 outputs

model = Sequential()    # model with linear stack of layers

# hidden layer #1 (also declares the input shape)
model.add(Dense(N_HIDDEN, input_shape=(N_INPUT_DIMS,))) # 784 input features connected to each of the 128 neurons
model.add(Activation('relu'))       # passes positive numbers unchanged and negative ones as 0
# TODO: model.add(Dropout(DROPOUT))

# hidden layer #2
model.add(Dense(N_HIDDEN))          # hidden layer with 128 neurons
model.add(Activation('relu'))       # passes positive numbers unchanged and negative ones as 0
# TODO: model.add(Dropout(DROPOUT))

# hidden layer #3
model.add(Dense(N_HIDDEN))          # hidden layer with 128 neurons
model.add(Activation('relu'))       # passes positive numbers unchanged and negative ones as 0
# TODO: model.add(Dropout(DROPOUT))

# TODO: add more hidden dense layers and dropouts

# output layer
model.add(Dense(N_CLASSES))         # output layer with 10 outputs (each possible digit)
model.add(Activation('softmax'))    # converts outputs into probabilities (confidences) for each class that sum to 1

''' optional model & layer creation
model = Sequential([
    Input(shape=(N_INPUT_DIMS,)),           # input layer with number of input features
    Dense(N_HIDDEN, activation='relu'),     # hidden layer with number of neurons & relu activation
    # TODO: add more hidden dense layers and dropouts here
    Dense(N_CLASSES, activation='softmax')  # output layer with number of outputs (each possible digit) & softmax activation
])
'''

# Summarize, compile, train, then evaluate the model
model.summary()     # prints a summary representation of the model

model.compile(loss='categorical_crossentropy',  # loss function
              optimizer=OPTIMIZER,              # optimizer
              metrics=['accuracy'])             # metrics to be evaluated by the model during training and testing

history = model.fit(X_train, Y_train,                                   # training data
                    batch_size=BATCH_SIZE, epochs=N_EPOCHS,             # samples per update, passes over the data
                    verbose=VERBOSE, validation_split=VALIDATION_SPLIT) # how much TRAIN is reserved for VALIDATION

score = model.evaluate(X_test, Y_test, verbose=VERBOSE)     # test the model: [loss, accuracy]

print("Training accuracy:", round(history.history['accuracy'][-1],6))        # final training accuracy
print("Test accuracy:", round(score[1],6))         # test accuracy
print("Test score (loss):", round(score[0],6))     # loss on test
print("Validation accuracy:", round(history.history['val_accuracy'][-1],6))  # final validation accuracy

# Goal: high training and validation accuracy (which means good generalization)
# Overfitting: large gap between training and validation accuracy
#              (try L1 regularization to zero out ineffective nodes, or L2 for collinearity)
# Underfitting: low training and validation accuracy
#------------------------------------------------
# RESULTS (rounded to 6 decimal places)
#------------------------------------------------
# NOTE(review): the BASELINE text below says "2 hidden layers", but the code above stacks
# three Dense(N_HIDDEN) layers before the output layer - confirm which description is intended
'''
    BASELINE:           2 hidden layers, 128 neurons, 20 epochs, 128 batch size, 0.2 validation split,
                        SGD optimizer, relu activation, categorical_crossentropy loss, 784 input shape, 10 outputs

    Training accuracy:  0.954104
    Test accuracy:      0.951700
    Test score (loss):  0.162484  # lower loss means higher accuracy
    Validation accuracy:0.954833  # indicator of generalization accuracy

    Experiments:        hyperparameter=BATCH_SIZE : original=128
    -------------------------------------------------------------
    EXP #1:             new=256 (WORST)
    Training accuracy:  0.932208
    Test accuracy:      0.9328
    Test score (loss):  0.229173
    Validation accuracy:0.935667

    EXP #2:             new=8 (BEST)
    Training accuracy:  0.999875
    Test accuracy:      0.9798
    Test score (loss):  0.086414
    Validation accuracy:0.97775

    EXP #3:             new=1 (SLOWEST)
    Training accuracy:  0.996354
    Test accuracy:      0.9749
    Test score (loss):  0.127938
    Validation accuracy:0.97725
'''
pass    # omits outputting last unassigned string (in ''' block above)

60000 train samples
10000 test samples
Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_41 (Dense)             (None, 128)               100480    
_________________________________________________________________
activation_41 (Activation)   (None, 128)               0         
_________________________________________________________________
dense_42 (Dense)             (None, 128)               16512     
_________________________________________________________________
activation_42 (Activation)   (None, 128)               0         
_________________________________________________________________
dense_43 (Dense)             (None, 128)               16512     
_________________________________________________________________
activation_43 (Activation)   (None, 128)               0         
_________________________________________________________________
dense_44 (Dense

#--------------------------------------------------#
# RESULTS (rounded to 6 decimal places)
#--------------------------------------------------#
    BASELINE:
        2 hidden layers, 128 neurons, 20 epochs, 128 batch size, 0.2 validation split,
        SGD optimizer, relu activation, categorical_crossentropy loss, 784 input shape, 10 outputs
   
        Training accuracy:  0.954104
        Test accuracy:      0.951700
        Test score (loss):  0.162484  # lower loss means higher accuracy
        Validation accuracy:0.954833  # indicator of generalization accuracy

    Experiments:        hyperparameter=BATCH_SIZE : original=128
    -------------------------------------------------------------
    EXP #1: new=256         (WORST)
        Training accuracy:  0.932208
        Test accuracy:      0.9328
        Test score (loss):  0.229173
        Validation accuracy:0.935667
    
    EXP #2: new=8           (BEST)
        Training accuracy:  0.999875
        Test accuracy:      0.9798
        Test score (loss):  0.086414
        Validation accuracy:0.97775
    
    EXP #3: new=1           (SLOWEST)
        Training accuracy:  0.996354
        Test accuracy:      0.9749
        Test score (loss):  0.127938
        Validation accuracy:0.97725
------------------------------------------------------------------
# CONCLUSION
------------------------------------------------------------------
After determining the baseline training & test results, I decided to experiment with the BATCH_SIZE hyperparameter, which is the number of samples processed per gradient update (i.e., per weight update). When raising this parameter to 256 (twice the original value), the accuracy went down and the loss went up. When lowering this value to 1, the training took a substantially longer amount of time to complete, but offered impressive results. The best results came from a BATCH_SIZE of 8, which, although still quite slow, offered the highest training, test, and validation accuracy, as well as the lowest test loss of 0.086414 - down from the original value of 0.162484! The resulting model also generalizes better than the baseline model, as can be seen in its validation accuracy of 0.97775 (as opposed to 0.954833). Overall, it makes sense that lowering the BATCH_SIZE value will increase the frequency of weight updates, and thus take a longer amount of time to complete. But since the weights are being adjusted so frequently, the model is able to better adjust to the training data within the same number of epochs, resulting in a higher overall accuracy. In conclusion, lowering the hyperparameter BATCH_SIZE resulted in a model with greater accuracy and generalization skills.