In [5]:
import numpy as np
import tensorflow as tf
import keras
import warnings

warnings.filterwarnings(action='ignore')

In [None]:
### Batch Normalization

In [9]:
from tensorflow.contrib.layers import batch_norm

<em> Instead of using the batch_norm() function as a regularizer parameter to the fully_connected() function, we now use batch_normalization() and we explicitly create a distinct layer

In [5]:
tf.reset_default_graph()

n_inputs = 28*28 
n_hidden1 = 300
n_hidden2 = 100
n_outputs = 10

learning_rate = 0.01

X = tf.placeholder(dtype=tf.float32, shape=(None, n_inputs), name='X')
y = tf.placeholder(dtype=tf.int32, shape=(None), name='y')
training = tf.placeholder_with_default(False, shape=(), name='training')

with tf.name_scope("dnn"):
    
    he_init = tf.variance_scaling_initializer()
    
    hidden1 = tf.layers.dense(X, n_hidden1, name='hidden1')
    bn1 = tf.layers.batch_normalization(hidden1, training=training, momentum=0.9)
    bn1_act = tf.nn.elu(bn1)

    hidden2 = tf.layers.dense(bn1_act, n_hidden2, name='hidden2')
    bn2 = tf.layers.batch_normalization(hidden2, training=training, momentum=0.9)
    bn2_act = tf.nn.elu(bn2)

    logits_before_bn = tf.layers.dense(bn2_act, n_outputs, name='outputs')
    logits = tf.layers.batch_normalization(logits_before_bn, training=training, momentum=0.9)
    
with tf.name_scope('loss'):
    
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")
    
with tf.name_scope('train'):
    
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate, name='GradientDescent')
    training_op = optimizer.minimize(loss)
    
with tf.name_scope('eval'):
    correct = tf.nn.in_top_k(logits, y, k=1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name='accuracy')
    
init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [56]:
# Writing log_directories for tensorboard 

from datetime import datetime

now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
root_logdir = "tf_logs"
logdir = "{}/run-{}/".format(root_logdir, now)

cost_summary = tf.summary.scalar('Cross_entropy_cost', tensor=loss)
fileWriter = tf.summary.FileWriter(logdir=logdir, graph=tf.get_default_graph())

In [27]:
# Prepairing the data

from scipy.io import loadmat

mnist_org = loadmat('mnist-original.mat')
data = mnist_org['data'].T
targets = mnist_org['label'].T
X_train, X_test, y_train, y_test = data[:60000], data[60000:], targets[:60000], targets[60000:]

shuffle_index = np.random.permutation(60000)
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]
y_train, y_test = y_train.ravel(), y_test.ravel()

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [26]:
# Another way to write the next_batch() function 

def shuffle_batch(X, y, batch_size):
    rnd_idx = np.random.permutation(len(X))
    n_batches = len(X) // batch_size
    
    for batch_idx in np.array_split(rnd_idx, n_batches): ## All the batches are here
        X_batch, y_batch = X[batch_idx], y[batch_idx]
        yield X_batch, y_batch ## Yield returns items one by one 

<em>we need to explicitly run the extra update operations needed by batch normalization 

In [65]:
n_epochs = 20
batch_size = 200

extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

with tf.Session() as sess:
    init.run()
    
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            sess.run([training_op, extra_update_ops],
                     feed_dict={training: True, X: X_batch, y: y_batch})
        accuracy_val = accuracy.eval(feed_dict={X: X_test, y: y_test})
        print(epoch, "Validation accuracy:", accuracy_val)
        
    save_path = saver.save(sess, "./my_model_final.ckpt")

0 Validation accuracy: 0.8741
1 Validation accuracy: 0.8932
2 Validation accuracy: 0.9045
3 Validation accuracy: 0.9123
4 Validation accuracy: 0.9202
5 Validation accuracy: 0.926
6 Validation accuracy: 0.9301
7 Validation accuracy: 0.9348
8 Validation accuracy: 0.9374
9 Validation accuracy: 0.9414
10 Validation accuracy: 0.9436
11 Validation accuracy: 0.9467
12 Validation accuracy: 0.9489
13 Validation accuracy: 0.9509
14 Validation accuracy: 0.953
15 Validation accuracy: 0.9531
16 Validation accuracy: 0.9555
17 Validation accuracy: 0.9571
18 Validation accuracy: 0.9579
19 Validation accuracy: 0.9588


<em>

What!? That's not a great accuracy for MNIST. Of course, if you train for longer it will get much better accuracy, but with such a shallow network, Batch Norm and ELU are unlikely to have very positive impact: they shine mostly for much deeper nets.


## Reusing Pretrained Layers

<em>
- If the original model was trained using TensorFlow, you can simply restore it and
train it on the new task:by saver.restore as usual

- First you need to load the graph's structure. The import_meta_graph() function does just that, loading the graph's operations into the default graph, and returning a Saver that you can then use to restore the model's state. Note that by default, a Saver saves the structure of the graph into a .meta file, so that's the file you should load:

In [49]:
tf.reset_default_graph()
saver = tf.train.import_meta_graph("./my_model_final.ckpt.meta")

In [42]:
# List all the ops 

for op in tf.get_default_graph().get_operations():
    print(op.name)

Once you know which operations you need, you can get a handle on them using the graph's get_operation_by_name() or get_tensor_by_name() methods:


In [45]:
graph = tf.get_default_graph()

X = graph.get_tensor_by_name('X:0')
y = graph.get_tensor_by_name('y:0')

accuracy = tf.get_default_graph().get_tensor_by_name("eval/accuracy:0")

In [60]:
## Restoring the whole model and running again 

tf.reset_default_graph()

In [66]:
with tf.Session() as sess:
    saver.restore(sess, "./my_model_final.ckpt")

    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        accuracy_val = accuracy.eval(feed_dict={X: X_test, y: y_test})
        print(epoch, "Validation accuracy:", accuracy_val)

INFO:tensorflow:Restoring parameters from ./my_model_final.ckpt
0 Validation accuracy: 0.9587
1 Validation accuracy: 0.961
2 Validation accuracy: 0.9614
3 Validation accuracy: 0.9638
4 Validation accuracy: 0.9633
5 Validation accuracy: 0.9651
6 Validation accuracy: 0.965
7 Validation accuracy: 0.9654
8 Validation accuracy: 0.9663
9 Validation accuracy: 0.966
10 Validation accuracy: 0.9661
11 Validation accuracy: 0.9672
12 Validation accuracy: 0.9667
13 Validation accuracy: 0.9676
14 Validation accuracy: 0.9676
15 Validation accuracy: 0.9679
16 Validation accuracy: 0.9679
17 Validation accuracy: 0.969
18 Validation accuracy: 0.9682
19 Validation accuracy: 0.9683


### Reusing only part of the grpah 

Create a new DNN with 3 hidden layers 

In [22]:
tf.reset_default_graph()

n_inputs = 28 * 28  
n_hidden1 = 300 
n_hidden2 = 50  
n_hidden3 = 50  
n_outputs = 10 

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int32, shape=(None), name="y")

with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu, name='hidden1')
    hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu, name="hidden2") 
    hidden3 = tf.layers.dense(hidden2, n_hidden3, activation=tf.nn.relu, name="hidden3") 
    logits = tf.layers.dense(hidden3, n_outputs, name="outputs")                         

with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy")

with tf.name_scope("train"):
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    training_op = optimizer.minimize(loss)
    
init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [23]:
n_epochs = 10
batch_size = 200


with tf.Session() as sess:
    init.run()
    
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        accuracy_val = accuracy.eval(feed_dict={X: X_test, y: y_test})
        print(epoch, "Validation accuracy:", accuracy_val)
        
    save_path = saver.save(sess, "./my_model_final.ckpt")

0 Validation accuracy: 0.9477
1 Validation accuracy: 0.9555
2 Validation accuracy: 0.9616
3 Validation accuracy: 0.9639
4 Validation accuracy: 0.9587
5 Validation accuracy: 0.9597
6 Validation accuracy: 0.9608
7 Validation accuracy: 0.9623
8 Validation accuracy: 0.9633
9 Validation accuracy: 0.9666


<br><em>

In general you will want to reuse only the lower layers. If you are using import_meta_graph() it will load the whole graph, but you can simply ignore the parts you do not need. In this example, we add a new 4th hidden layer on top of the pretrained 3rd layer (ignoring the old 4th hidden layer). We also build a new output layer, the loss for this new output, and a new optimizer to minimize it. We also need another saver to save the whole graph (containing both the entire old graph plus the new operations), and an initialization operation to initialize all the new variables:


In [99]:
tf.reset_default_graph()

n_hidden4 = 20 # new layer
n_outputs = 10 # new layer

saver = tf.train.import_meta_graph("./my_model_final.ckpt.meta")

graph = tf.get_default_graph()

X = graph.get_tensor_by_name('X:0')
y = graph.get_tensor_by_name('y:0')

hidden3 = graph.get_tensor_by_name('dnn/hidden3/Relu:0') # Only need to import up until hidden layer3 

new_hidden4 = tf.layers.dense(hidden3, n_hidden4, activation=tf.nn.relu, name='new_hidden4')
new_logits = tf.layers.dense(new_hidden4, n_outputs, name='outputs')

with tf.name_scope("new_loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=new_logits)
    loss = tf.reduce_mean(xentropy, name="loss")

with tf.name_scope("new_eval"):
    correct = tf.nn.in_top_k(new_logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy")

#### Freezing layers

It is generally a good idea to “freeze” lower level weights
when training the new DNN: if the lower-layer weights are fixed, then the higherlayer
weights will be easier to train.
To freeze the lower layers during training, the simplest solution is to give the optimizer
the list of variables to train, excluding the variables from the lower layers:

In [100]:
with tf.name_scope("new_train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="hidden[34]|outputs") # regular expression
    training_op = optimizer.minimize(loss, var_list=train_vars)

In [101]:
reuse_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="hidden[123]")
restore_saver = tf.train.Saver(reuse_variables) # to restore layers 1-3

init = tf.global_variables_initializer()
new_saver = tf.train.Saver()

In [102]:
with tf.Session() as sess:
    init.run()
    restore_saver.restore(sess, "./my_model_final.ckpt")
    
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        accuracy_val = accuracy.eval(feed_dict={X: X_test, y: y_test})
        print(epoch, "Validation accuracy:", accuracy_val)
        
    save_path = new_saver.save(sess, "./my_model_final.ckpt")

INFO:tensorflow:Restoring parameters from ./my_model_final.ckpt
0 Validation accuracy: 0.9007
1 Validation accuracy: 0.9205
2 Validation accuracy: 0.9273
3 Validation accuracy: 0.9331
4 Validation accuracy: 0.9357
5 Validation accuracy: 0.9369
6 Validation accuracy: 0.9382
7 Validation accuracy: 0.94
8 Validation accuracy: 0.941
9 Validation accuracy: 0.9421


### Learning Rate Scheduling

In [10]:
tf.reset_default_graph()

n_inputs = 28 * 28  # MNIST
n_hidden1 = 300
n_hidden2 = 50
n_outputs = 10

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name='X')
y = tf.placeholder(tf.int32, shape=(None), name='y')

with tf.name_scope('dnn'):
    hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu, name='hidden1')
    hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu, name='hidden2')
    logits = tf.layers.dense(hidden2, n_outputs, name='outputs')
    
with tf.name_scope('loss'):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name='loss')
    
with tf.name_scope('eval'):
    correct = tf.nn.in_top_k(predictions=logits, targets=y, k=1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name='accuracy')

#### Exponential scheduling

The distinction between trainable variables and non-trainable variables is used to let Optimizers know which variables they can act upon. When defining a tf.Variable(), setting trainable=True (the default) automatically adds the variable to the GraphKeys.TRAINABLE_VARIABLES collection. During training, an optimizer gets the content of that collection via tf.trainable_variables() and applies the training to all of them.

In [36]:
with tf.name_scope('train'):
    initial_learning_rate = 0.01 
    decay_step = 10000
    decay_rate = 1/10
    global_step = tf.Variable(0, trainable=False, name='global_step') #keep track of the current training iteration number.
    learning_rate = tf.train.exponential_decay(initial_learning_rate, global_step, decay_step, decay_rate)
    
    optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=0.9)
    training_op = optimizer.minimize(loss, global_step=global_step) # global_step variable, will kindly take care of incrementing

In [37]:
init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [38]:
n_epochs = 5
batch_size = 50

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        accuracy_val = accuracy.eval(feed_dict={X: X_test, y: y_test})
        print(epoch, "Validation accuracy:", accuracy_val)

    save_path = saver.save(sess, "./my_model_final.ckpt")

0 Validation accuracy: 0.9523
1 Validation accuracy: 0.966
2 Validation accuracy: 0.9685
3 Validation accuracy: 0.9708
4 Validation accuracy: 0.9726


<em>AdaGrad, RMSProp, and Adam optimization automatically reduce the learning
rate during training, it is not necessary to add an extra learning schedule. For other
optimization algorithms, using exponential decay or performance scheduling can
considerably speed up convergence.

## Avoiding Overfitting Through Regularization


#### $\ell_1$ and $\ell_2$ regularization

Adding l1 and l2 manually 

In [9]:
tf.reset_default_graph()

n_inputs = 28 * 28  # MNIST
n_hidden1 = 300
n_outputs = 10

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name='X')
y = tf.placeholder(tf.int32, shape=(None), name='y')

with tf.name_scope('dnn'):
    hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu, name='hidden1')
    logits = tf.layers.dense(hidden1, n_outputs, name='outputs')

<em>

Next, we get a handle on the layer weights, and we compute the total loss, which is equal to the sum of the usual cross entropy loss and the $\ell_1$ loss (i.e., the absolute values of the weights):


In [49]:
W1 = tf.get_default_graph().get_tensor_by_name("hidden1/kernel:0")
W2 = tf.get_default_graph().get_tensor_by_name("outputs/kernel:0")

scale = 0.001 # l1 regularization hyperparameter

with tf.name_scope('loss'):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    base_loss = tf.reduce_mean(xentropy, name='avg_xentropy')
    reg_losses = tf.reduce_sum(tf.abs(W1)) + tf.reduce_sum(tf.abs(W2))
    loss = tf.add(base_loss, scale * reg_losses, name='loss') 

In [50]:
with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy")

learning_rate = 0.01

with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [51]:
n_epochs = 20
batch_size = 200

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        accuracy_val = accuracy.eval(feed_dict={X: X_test, y: y_test})
        print(epoch, "Validation accuracy:", accuracy_val)

    save_path = saver.save(sess, "./my_model_final.ckpt")

0 Validation accuracy: 0.8841
1 Validation accuracy: 0.9078
2 Validation accuracy: 0.9174
3 Validation accuracy: 0.9241
4 Validation accuracy: 0.9271
5 Validation accuracy: 0.93
6 Validation accuracy: 0.9322
7 Validation accuracy: 0.934
8 Validation accuracy: 0.935
9 Validation accuracy: 0.9364
10 Validation accuracy: 0.9371
11 Validation accuracy: 0.9371
12 Validation accuracy: 0.9361
13 Validation accuracy: 0.9367
14 Validation accuracy: 0.9366
15 Validation accuracy: 0.9362
16 Validation accuracy: 0.9356
17 Validation accuracy: 0.9357
18 Validation accuracy: 0.9352
19 Validation accuracy: 0.9354


<em>

Alternatively, we can pass a regularization function to the tf.layers.dense() function, which will use it to create operations that will compute the regularization loss, and it adds these operations to the collection of regularization losses. The beginning is the same as above:


In [56]:
# use Python's partial() function to avoid repeating the same arguments over and over again

tf.reset_default_graph()

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name='X')
y = tf.placeholder(tf.int32, shape=(None), name='y')

from functools import partial

my_dense_layer = partial(tf.layers.dense, activation='relu', kernel_regularizer=tf.contrib.layers.l1_regularizer(scale))

with tf.name_scope("dnn"):
    hidden1 = my_dense_layer(X, n_hidden1, name="hidden1")
    hidden2 = my_dense_layer(hidden1, n_hidden2, name="hidden2")
    logits = my_dense_layer(hidden2, n_outputs, activation=None, name="outputs")

Next we must add the regularization losses to the base loss:

In [60]:
with tf.name_scope("loss"):                                   
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)      
    base_loss = tf.reduce_mean(xentropy, name="avg_xentropy")   
    reg_loss = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
    loss = tf.add_n([base_loss] + reg_loss, name='loss')

## Dropout

<em>To implement dropout using TensorFlow, you can simply apply the dropout() function
to the input layer and to the output of every hidden layer.

In [30]:
tf.reset_default_graph()

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int32, shape=(None), name="y")

training = tf.placeholder_with_default(input=False, shape=(), name='training')

dropout_rate = 0.5 # == 1 - keep_prob
X_drop = tf.layers.dropout(X, rate=dropout_rate, training=training)

with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(X_drop, n_hidden1, activation=tf.nn.relu, name="hidden1")
    hidden1_drop = tf.layers.dropout(hidden1, dropout_rate, training=training)
    hidden2 = tf.layers.dense(hidden1_drop, n_hidden2, activation=tf.nn.relu, name="hidden2")
    hidden2_drop = tf.layers.dropout(hidden2, dropout_rate, training=training)
    logits = tf.layers.dense(hidden2_drop, n_outputs, name='outputs')

In [32]:
with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")
    
learning_rate = 0.01

with tf.name_scope("train"):
    optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=0.9)
    training_op = optimizer.minimize(loss)    

with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    
init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [68]:
n_epochs = 20
batch_size = 50

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch, training: True}) ## Set training is true here. Deafult testing will be false 
        accuracy_val = accuracy.eval(feed_dict={X: X_test, y: y_test})
        print(epoch, "Validation accuracy:", accuracy_val)

    save_path = saver.save(sess, "./my_model_final.ckpt")

0 Validation accuracy: 0.9142
1 Validation accuracy: 0.9217
2 Validation accuracy: 0.9326
3 Validation accuracy: 0.9411
4 Validation accuracy: 0.9384
5 Validation accuracy: 0.9448
6 Validation accuracy: 0.9442
7 Validation accuracy: 0.9454
8 Validation accuracy: 0.9466
9 Validation accuracy: 0.9494
10 Validation accuracy: 0.949
11 Validation accuracy: 0.9516
12 Validation accuracy: 0.9497
13 Validation accuracy: 0.9517
14 Validation accuracy: 0.9466
15 Validation accuracy: 0.952
16 Validation accuracy: 0.9533
17 Validation accuracy: 0.9514
18 Validation accuracy: 0.952
19 Validation accuracy: 0.9563


<em>
- If you observe that the model is overfitting, you can increase the dropout rate (i.e.,
reduce the keep_prob hyperparameter). Conversely, you should try decreasing the
dropout rate (i.e., increasing keep_prob) if the model underfits the training set. It can
also help to increase the dropout rate for large layers, and reduce it for small ones.

- Dropout does tend to significantly slow down convergence, but it usually results in a
much better model when tuned properly. So, it is generally well worth the extra time
and effort

### Max Norm Regularization

<em>

Let's get a handle on the first hidden layer's weight and create an operation that will compute the clipped weights using the clip_by_norm() function. Then we create an assignment operation to assign the clipped weights to the weights variable:


In [20]:
threshold = 1.0 
graph = tf.get_default_graph()

weights = graph.get_tensor_by_name("hidden1/kernel:0")
clipped_weights = tf.clip_by_norm(weights, clip_norm=threshold, axes=1)
clip_weights = tf.assign(weights, clipped_weights)

In [22]:
# Doing this for the second hidden layer as well 

weights2 = graph.get_tensor_by_name("hidden2/kernel:0")
clipped_weights2 = tf.clip_by_norm(weights2, clip_norm=threshold, axes=1)
clip_weights2 = tf.assign(weights2, clipped_weights2)

In [24]:
init = tf.global_variables_initializer()
saver = tf.train.Saver()

n_epochs = 20
batch_size = 50


<em>
And now we can train the model. It's pretty much as usual, except that right after running the training_op, we run the clip_weights and clip_weights2 operations:


In [34]:
with tf.Session() as sess:
    init.run()
    
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            sess.run(training_op, feed_dict={X:X_batch, y:y_batch})
            clip_weights.eval()
            clip_weights2.eval()
        acc_valid = accuracy.eval(feed_dict={X: X_test, y: y_test})   
        print(epoch, "Validation accuracy:", acc_valid)                 