In [1]:
# SAVE AND RESTORE MODELS:
# Model progress (i.e. the weights) can be saved during and after training
# so that the model can resume where it left off

from __future__ import absolute_import, division, print_function, unicode_literals

# Library for running OS related tasks
import os

# Import TensorFlow and Keras
import tensorflow as tf
from tensorflow import keras

tf.__version__

W0830 15:11:47.808974 140736020308864 __init__.py:690] 

  TensorFlow's `tf-nightly` package will soon be updated to TensorFlow 2.0.

  Please upgrade your code to TensorFlow 2.0:
    * https://www.tensorflow.org/beta/guide/migration_guide

  Or install the latest stable TensorFlow 1.X release:
    * `pip install -U "tensorflow==1.*"`

  Otherwise your code may be broken by the change.

  


'1.15.0-dev20190821'

In [2]:
### Load the data
# MNIST serves as the example dataset for demonstrating weight saving

# load_data() returns the (inputs, labels) pairs of the training and test splits
mnist_train, mnist_test = tf.keras.datasets.mnist.load_data()
X_train, Y_train = mnist_train
X_test, Y_test = mnist_test

In [3]:
# Keep only the first N_SAMPLES (10,000) examples of the training/test sets
# to speed up learning. Slicing past the end of an array is safe, so a set
# holding fewer than N_SAMPLES examples is simply kept whole.
N_SAMPLES = 10000

X_train = X_train[:N_SAMPLES]
Y_train = Y_train[:N_SAMPLES]

X_test = X_test[:N_SAMPLES]
Y_test = Y_test[:N_SAMPLES]

# Report the shapes of the inputs first, then of the labels
print('Training input set shape : ', X_train.shape)
print('Test set input shape : ', X_test.shape)
print('-'*30)
print('Training output set shape : ', Y_train.shape)
print('Validation output set shape : ', Y_test.shape)

Training input set shape :  (10000, 28, 28)
Test set input shape :  (10000, 28, 28)
------------------------------
Training output set shape :  (10000,)
Validation output set shape :  (10000,)


In [4]:
# Flatten every 28x28 example into a one-dimensional vector of 784 values
# and divide the values by 255.
# NOTE: this cell overwrites X_train/X_test, so running it a second time
# would divide by 255 again -- re-run from the data-loading cell instead.

n_pixels = 28 * 28

X_train = X_train.reshape(-1, n_pixels) / 255.0
X_test = X_test.reshape(-1, n_pixels) / 255.0

print('Training input set shape : ', X_train.shape)
print('Test set input shape : ', X_test.shape)

Training input set shape :  (10000, 784)
Test set input shape :  (10000, 784)


In [5]:
# Let's define a simple model for illustration

def create_model():
    '''Build and compile a small, untrained fully connected classifier.

    Architecture:
        INPUT(784) -> DENSE(512) -> RELU -> DROPOUT(0.2) -> DENSE(10) -> SOFTMAX

    The model is compiled with the Adam optimizer, the sparse categorical
    cross-entropy loss (labels are integer class ids, not one-hot vectors)
    and accuracy as the reported metric.

    Returns:
        The compiled keras.Sequential model; no training has been performed.
    '''

    # Create the layers in network order, then stack them one by one
    hidden_layer = keras.layers.Dense(512, activation=tf.nn.relu, input_shape=(784,))
    dropout_layer = keras.layers.Dropout(0.2)
    output_layer = keras.layers.Dense(10, activation=tf.nn.softmax)

    model = keras.Sequential()
    for layer in (hidden_layer, dropout_layer, output_layer):
        model.add(layer)

    model.compile(
        optimizer='adam',
        loss=keras.losses.sparse_categorical_crossentropy,
        metrics=['accuracy'],
    )

    return model

In [6]:
# Instantiate a simple model and print its layer-by-layer summary

model = create_model()
model.summary()

W0830 15:11:48.583710 140736020308864 deprecation.py:506] From /Users/nicolas/anaconda/lib/python3.6/site-packages/tensorflow_core/python/ops/resource_variable_ops.py:1630: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512)               401920    
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                5130      
Total params: 407,050
Trainable params: 407,050
Non-trainable params: 0
_________________________________________________________________


In [10]:
### Save checkpoints during training
# So that you don't need to re-train the model later on

# Define the path and directory of the checkpoint (file to save)

checkpoint_path = 'training_1/cp.ckpt'
checkpoint_dir = os.path.dirname(checkpoint_path) # Return the directory name of checkpoint_path

# Make sure the target directory exists before the first checkpoint is written
os.makedirs(checkpoint_dir, exist_ok=True)

# Create checkpoint callback.
# The keyword must be spelled `save_weights_only` so that only the weights
# (and not the whole model) are written at each checkpoint.
ckpt_callback = tf.keras.callbacks.ModelCheckpoint(checkpoint_path,
                                                   save_weights_only = True,
                                                   verbose=1)

# Create a simple model
model = create_model()

# Fit the training set, including the checkpoint callback
model.fit(X_train,
          Y_train,
          epochs = 10,
          validation_data = (X_test, Y_test),
          callbacks = [ckpt_callback])

# The weights are stored at the end of each epoch

Train on 10000 samples, validate on 10000 samples
Epoch 1/10
Epoch 00001: saving model to training_1/cp.ckpt
Epoch 2/10
Epoch 00002: saving model to training_1/cp.ckpt
Epoch 3/10
Epoch 00003: saving model to training_1/cp.ckpt
Epoch 4/10
Epoch 00004: saving model to training_1/cp.ckpt
Epoch 5/10
Epoch 00005: saving model to training_1/cp.ckpt
Epoch 6/10
Epoch 00006: saving model to training_1/cp.ckpt
Epoch 7/10
Epoch 00007: saving model to training_1/cp.ckpt
Epoch 8/10
Epoch 00008: saving model to training_1/cp.ckpt
Epoch 9/10
Epoch 00009: saving model to training_1/cp.ckpt
Epoch 10/10
Epoch 00010: saving model to training_1/cp.ckpt


<tensorflow.python.keras.callbacks.History at 0x13ccc9e10>

In [18]:
# Compare a freshly created (untrained) model against that same model
# once the previously stored weights have been loaded into it

# Untrained model
model = create_model()

untrained_metrics = model.evaluate(X_test, Y_test)
loss_untrained, accuracy_untrained = untrained_metrics
print('Untrained model accuracy: ', accuracy_untrained)
# As expected, accuracy is close to chance level (about 10% for 10 classes)

# Restored model: same model object, now carrying the checkpointed weights
model.load_weights(checkpoint_path)
restored_metrics = model.evaluate(X_test, Y_test)
loss_restored, accuracy_restored = restored_metrics
print('Restored model accuracy: ', accuracy_restored)

Untrained model accuracy:  0.0717
Restored model accuracy:  0.9601


In [22]:
### Checkpoint callback options

# The {epoch:04d} placeholder puts the zero-padded epoch number in each file name
checkpoint_path = 'training_2/cp-{epoch:04d}.ckpt'
checkpoint_dir = os.path.dirname(checkpoint_path)

# Checkpoint callback that saves the weights once every 5 epochs
# (`period` emits a deprecation warning that recommends `save_freq`)
ckpt_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    save_weights_only=True,
    period=5,
)

# Create a new model
model = create_model()

# Store the weights of the still untrained model as "epoch 0"
model.save_weights(checkpoint_path.format(epoch=0))

# Train the model silently; only the checkpoint callback reports progress
model.fit(
    x=X_train,
    y=Y_train,
    epochs=50,
    verbose=0,
    validation_data=(X_test, Y_test),
    callbacks=[ckpt_callback],
)

W0830 15:43:37.161931 140736020308864 callbacks.py:864] `period` argument is deprecated. Please use `save_freq` to specify the frequency in number of samples seen.



Epoch 00005: saving model to training_2/cp-0005.ckpt

Epoch 00010: saving model to training_2/cp-0010.ckpt

Epoch 00015: saving model to training_2/cp-0015.ckpt

Epoch 00020: saving model to training_2/cp-0020.ckpt

Epoch 00025: saving model to training_2/cp-0025.ckpt

Epoch 00030: saving model to training_2/cp-0030.ckpt

Epoch 00035: saving model to training_2/cp-0035.ckpt

Epoch 00040: saving model to training_2/cp-0040.ckpt

Epoch 00045: saving model to training_2/cp-0045.ckpt

Epoch 00050: saving model to training_2/cp-0050.ckpt


<tensorflow.python.keras.callbacks.History at 0x12f20b860>

In [23]:
ls {checkpoint_dir}

checkpoint                        cp-0025.ckpt.data-00001-of-00002
cp-0000.ckpt.data-00000-of-00002  cp-0025.ckpt.index
cp-0000.ckpt.data-00001-of-00002  cp-0030.ckpt.data-00000-of-00002
cp-0000.ckpt.index                cp-0030.ckpt.data-00001-of-00002
cp-0005.ckpt.data-00000-of-00002  cp-0030.ckpt.index
cp-0005.ckpt.data-00001-of-00002  cp-0035.ckpt.data-00000-of-00002
cp-0005.ckpt.index                cp-0035.ckpt.data-00001-of-00002
cp-0010.ckpt.data-00000-of-00002  cp-0035.ckpt.index
cp-0010.ckpt.data-00001-of-00002  cp-0040.ckpt.data-00000-of-00002
cp-0010.ckpt.index                cp-0040.ckpt.data-00001-of-00002
cp-0015.ckpt.data-00000-of-00002  cp-0040.ckpt.index
cp-0015.ckpt.data-00001-of-00002  cp-0045.ckpt.data-00000-of-00002
cp-0015.ckpt.index                cp-0045.ckpt.data-00001-of-00002
cp-0020.ckpt.data-00000-of-00002  cp-0045.ckpt.index
cp-0020.ckpt.data-00001-of-00002  cp-0050.ckpt.data-00000-of-00002
cp-0020.ckpt.index                cp-0050.ckpt.dat

In [24]:
# Get the path of the latest checkpoint found in the checkpoint directory

latest_weights = tf.train.latest_checkpoint(checkpoint_dir=checkpoint_dir)
latest_weights

'training_2/cp-0050.ckpt'

In [26]:
# Build a fresh model and load the most recent checkpoint weights into it

model = create_model()
model.load_weights(latest_weights)

latest_metrics = model.evaluate(X_test, Y_test)
loss, accuracy = latest_metrics
print('Restored model accuracy: ', accuracy)

Restored model accuracy:  0.9627


In [27]:
### Weights can also be saved manually, without going through a callback

manual_checkpoint_path = './checkpoints/manual_checkpoint'

# Save the weights of the current model
model.save_weights(manual_checkpoint_path)

# Restore those weights into a brand new model
model = create_model()
model.load_weights(manual_checkpoint_path)

manual_metrics = model.evaluate(X_test, Y_test)
loss, accuracy = manual_metrics
print('Restored model accuracy: ', accuracy)

# As expected, same accuracy as the model in the above cell

Restored model accuracy:  0.9627


In [28]:
### Save the entire model (not just the weights)
# Allows us to restore a model without having access to the original code

# A model can be stored either in an HDF5 file or as a saved_model

In [29]:
## As an HDF5 file

model = create_model()

# Train for a few epochs so that the saved file holds a trained model
model.fit(x=X_train, y=Y_train, epochs=5)

# Save the model
model.save('my_model.h5')

Train on 10000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [32]:
# Rebuild a model directly from the saved HDF5 file

new_model = keras.models.load_model(filepath='my_model.h5')

# Check its architecture and accuracy
new_model.summary()
reloaded_metrics = new_model.evaluate(X_test, Y_test)
loss, accuracy = reloaded_metrics
print('Restored model accuracy: ', accuracy)

Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_26 (Dense)             (None, 512)               401920    
_________________________________________________________________
dropout_13 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_27 (Dense)             (None, 10)                5130      
Total params: 407,050
Trainable params: 407,050
Non-trainable params: 0
_________________________________________________________________
Restored model accuracy:  0.9524
