# Data

In [1]:
import time
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Import
# A fixed seed makes both splits -- and therefore every metric reported
# further down -- reproducible across kernel restarts.
SEED = 42
housing = fetch_california_housing()
x_train_full, x_test, y_train_full, y_test = train_test_split(
    housing.data, housing.target, test_size=0.3, random_state=SEED)
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train_full, y_train_full, test_size=0.3, random_state=SEED)

# PreProcessing
# The scaler is fitted on the training split only and then applied unchanged
# to the validation and test splits.
scaler = MinMaxScaler(feature_range=(-1, 1))
x_train = scaler.fit_transform(x_train).astype(np.float32)
x_valid = scaler.transform(x_valid).astype(np.float32)
x_test = scaler.transform(x_test).astype(np.float32)

# Turn the 1-D targets into (n, 1) column vectors so that expressions such as
# `y_test - pred` subtract element-wise instead of broadcasting (n,) against
# the (n, 1) predictions of a single-unit output layer.
y_train = np.expand_dims(y_train, axis=1)
y_valid = np.expand_dims(y_valid, axis=1)
y_test = np.expand_dims(y_test, axis=1)

# Summary
# One print per line: backslash continuations inside a string literal would
# embed the next line's indentation in the printed text.
print(f"Training Set: {len(x_train)}")
print(f"Validation Set: {len(x_valid)}")
print(f"Test Set: {len(x_test)}")

Downloading Cal. housing from https://ndownloader.figshare.com/files/5976036 to /root/scikit_learn_data


Training Set: 10113       
Validation Set: 4335       
Test Set: 6192


In [2]:
# Sanity check: the scaled features should be float32 after the astype cast in the data cell
x_train.dtype

dtype('float32')

# Keras in TensorFlow 2.0

In [3]:
%tensorflow_version 2.x
import tensorflow as tf
print("TensorFlow version: {}".format(tf.__version__))
print("Eager execution: {}".format(tf.executing_eagerly()))

TensorFlow 2.x selected.
TensorFlow version: 2.1.0
Eager execution: True


**Multi-Layer Perceptron using Keras Sequential API**

In [4]:
# Build the network layer by layer: 10 tanh hidden units feeding one linear output.
keras_mlp = tf.keras.Sequential()
keras_mlp.add(tf.keras.layers.Dense(units=10, activation="tanh"))
keras_mlp.add(tf.keras.layers.Dense(units=1))

# Plain SGD on mean squared error, both selected through their string aliases.
keras_mlp.compile(optimizer="SGD", loss="MSE")
keras_mlp.fit(x=x_train, y=y_train, epochs=50, batch_size=32,
              validation_data=(x_valid, y_valid))

Train on 10113 samples, validate on 4335 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f35701472b0>

In [5]:
# Mean squared error of the Sequential model on the held-out test split.
pred = keras_mlp.predict(x_test)
squared_errors = np.square(y_test - pred)
print(f"MSE: {squared_errors.mean()}")

MSE: 0.6043500663931235


**Using Keras Subclassing API**

In [0]:
class MLP(tf.keras.Model):
  """Two-layer perceptron (10 tanh units -> 1 linear unit) trained by a hand-written loop.

  The class owns its loss (`get_loss`, mean squared error) and its optimizer
  (`optimizer`, SGD with default settings) and replaces `fit` / `predict`
  with minimal implementations built on `tf.GradientTape`.
  """

  def __init__(self):
    super(MLP, self).__init__()
    self.dense1 = tf.keras.layers.Dense(10, activation="tanh")
    self.dense2 = tf.keras.layers.Dense(1)

    self.get_loss = tf.keras.losses.MeanSquaredError()
    self.optimizer = tf.keras.optimizers.SGD()

  def call(self, x):
    """Forward pass: hidden tanh layer followed by the linear output layer."""
    x = self.dense1(x)
    return self.dense2(x)

  def fit(self, x, y, batch_size=32, epochs=1, validation_data=None):
    """Train with mini-batch gradient descent and print one loss line per epoch.

    Args:
      x: training inputs, sliced along the first axis into mini-batches.
      y: training targets, sliced in step with `x`.
      batch_size: number of samples per gradient step; the last batch of an
        epoch may be smaller.
      epochs: number of full passes over `x`.
      validation_data: optional `(x_valid, y_valid)` pair. When given, the
        validation loss is evaluated after each epoch and printed next to the
        training loss; when `None`, only the training loss is printed.

    Batches are taken in the stored order of `x`; no shuffling is performed.
    """
    for epoch in range(epochs):
      print(f"Epoch {epoch+1}/{epochs}")
      # NOTE: the reported training loss is evaluated on the full training set
      # *before* this epoch's updates, whereas the validation loss below is
      # evaluated after them.
      training_loss = self.get_loss(y, self(x))

      for i_batch in range(0, len(x), batch_size):
        x_batch = x[i_batch:i_batch+batch_size]
        y_batch = y[i_batch:i_batch+batch_size]

        with tf.GradientTape() as tape:
          y_batch_pred = self(x_batch, training=True)
          loss = self.get_loss(y_batch, y_batch_pred)

        gradients = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

      # `validation_data` defaults to None, so it must not be unpacked
      # unconditionally.
      if validation_data is None:
        print(f"loss: {training_loss:.4f}")
      else:
        x_valid, y_valid = validation_data
        validation_loss = self.get_loss(y_valid, self(x_valid))
        print(f"loss: {training_loss:.4f} - val_loss: {validation_loss:.4f}")

  def predict(self, x):
    """Run one forward pass over all of `x` and return the result as a NumPy array."""
    return self(x).numpy()

In [7]:
# Train the subclassed model with the same batch size and epoch count as the Sequential one.
mlp = MLP()
validation_split = (x_valid, y_valid)
mlp.fit(x_train, y_train, epochs=50, batch_size=32,
        validation_data=validation_split)

Epoch 1/50
loss: 2.9277 - val_loss: 0.7578
Epoch 2/50
loss: 0.7980 - val_loss: 0.6760
Epoch 3/50
loss: 0.7157 - val_loss: 0.6394
Epoch 4/50
loss: 0.6782 - val_loss: 0.6111
Epoch 5/50
loss: 0.6491 - val_loss: 0.5908
Epoch 6/50
loss: 0.6283 - val_loss: 0.5758
Epoch 7/50
loss: 0.6131 - val_loss: 0.5645
Epoch 8/50
loss: 0.6016 - val_loss: 0.5556
Epoch 9/50
loss: 0.5927 - val_loss: 0.5484
Epoch 10/50
loss: 0.5856 - val_loss: 0.5425
Epoch 11/50
loss: 0.5798 - val_loss: 0.5375
Epoch 12/50
loss: 0.5749 - val_loss: 0.5333
Epoch 13/50
loss: 0.5707 - val_loss: 0.5296
Epoch 14/50
loss: 0.5671 - val_loss: 0.5263
Epoch 15/50
loss: 0.5639 - val_loss: 0.5233
Epoch 16/50
loss: 0.5610 - val_loss: 0.5207
Epoch 17/50
loss: 0.5583 - val_loss: 0.5182
Epoch 18/50
loss: 0.5559 - val_loss: 0.5160
Epoch 19/50
loss: 0.5536 - val_loss: 0.5139
Epoch 20/50
loss: 0.5516 - val_loss: 0.5120
Epoch 21/50
loss: 0.5496 - val_loss: 0.5102
Epoch 22/50
loss: 0.5478 - val_loss: 0.5085
Epoch 23/50
loss: 0.5460 - val_loss: 0.50

In [8]:
# Mean squared error of the subclassed model on the held-out test split.
pred = mlp.predict(x_test)
test_mse = np.square(y_test - pred).mean()
print(f"MSE: {test_mse}")

MSE: 0.5274821604295353


## Experimenting with Tensors - Calculating the Jacobian

In [9]:
# Baseline: differentiate the scalar batch loss with respect to every trainable variable

l = 5  # number of leading training samples used in the demonstration
true = y_train[:l]  # plain NumPy slice, so it does not need to be inside the tape

with tf.GradientTape() as tape:
  pred = mlp(x_train[:l])
  loss = mlp.get_loss(true, pred)

trainable_vars = mlp.trainable_variables
tape.gradient(loss, trainable_vars) # dLoss/dW

[<tf.Tensor: shape=(8, 10), dtype=float32, numpy=
 array([[ 0.01813006,  0.01315879, -0.05390307, -0.07105918, -0.05422998,
          0.02295678,  0.0037895 , -0.02438062, -0.00324793,  0.00589163],
        [ 0.01732049, -0.00313817, -0.014398  ,  0.02074726,  0.00193153,
          0.00289756,  0.0172822 ,  0.00069206,  0.01752797,  0.00883744],
        [ 0.03873325,  0.02318771, -0.11454947, -0.18653479, -0.11511849,
          0.05541396,  0.02424902, -0.05034576,  0.0042984 ,  0.01615881],
        [ 0.04070631,  0.02398363, -0.11995587, -0.19645564, -0.12033454,
          0.05828572,  0.02631934, -0.05254586,  0.00522891,  0.0171713 ],
        [ 0.04356614,  0.02382907, -0.12586296, -0.21002516, -0.12510887,
          0.06221614,  0.0317113 , -0.05432252,  0.00880596,  0.01919838],
        [ 0.04269958,  0.02497934, -0.1256443 , -0.20614247, -0.12589677,
          0.06113686,  0.02798224, -0.05491682,  0.00581591,  0.01809839],
        [-0.03252755, -0.00396332,  0.07560576,  0.16229

In [10]:
# Question: does GradientTape.gradient return the Jacobian when its target is the residual vector?

l = 5  # number of leading training samples used in the demonstration
true = y_train[:l]  # plain NumPy slice, so it does not need to be inside the tape

with tf.GradientTape() as tape:
  pred = mlp(x_train[:l])
  residuals = true - pred

trainable_vars = mlp.trainable_variables
# Answer: no. Each result has the variable's own shape, i.e. the per-residual
# gradients have been summed. The next cell reproduces this sum explicitly.
tape.gradient(residuals, trainable_vars)

[<tf.Tensor: shape=(8, 10), dtype=float32, numpy=
 array([[ 0.7399844 ,  0.18966728, -1.8542768 , -3.5953655 , -1.7115018 ,
          1.0168358 ,  0.96086246, -0.7149834 ,  0.52475536,  0.4118924 ],
        [-0.4002539 , -0.1440517 ,  1.161946  ,  3.021617  ,  1.2696704 ,
         -0.86525655, -0.6260009 ,  0.54300934, -0.28235042, -0.2787351 ],
        [ 1.3583237 ,  0.36420467, -3.6173606 , -7.2596197 , -3.3582501 ,
          1.8875772 ,  1.8708192 , -1.3863604 ,  0.9915237 ,  0.7402282 ],
        [ 1.3969249 ,  0.37599456, -3.729312  , -7.4880342 , -3.462803  ,
          1.9417635 ,  1.9260306 , -1.4287806 ,  1.0192653 ,  0.76032305],
        [ 1.322643  ,  0.36490512, -3.5605912 , -7.159792  , -3.3150172 ,
          1.8483253 ,  1.820721  , -1.3670158 ,  0.95566475,  0.7174972 ],
        [ 1.444142  ,  0.3900773 , -3.858585  , -7.7465506 , -3.5839286 ,
          2.0092795 ,  1.9896747 , -1.478468  ,  1.0519518 ,  0.7859582 ],
        [ 0.57238454,  0.04169988, -1.3295074 , -2.44905

In [11]:
# Iterative Reproduction of Previous Cell
# Differentiates every residual separately and adds the results, for any
# number of samples `l` and any number of trainable variables.

l = 5

with tf.GradientTape(persistent=True) as tape:
  pred = mlp(x_train[:l])
  true = y_train[:l]
  residuals = true - pred
  # Index inside the tape so each per-sample residual is a recorded tensor.
  per_sample_residuals = [residuals[i] for i in range(l)]

# One list of gradients (one entry per trainable variable) for each residual.
# The tape is persistent because gradient() is called more than once.
per_sample_grads = [tape.gradient(residual, mlp.trainable_variables)
                    for residual in per_sample_residuals]
del tape  # a persistent tape keeps its resources until it is released

# Start from a copy of the first gradient list so it is not modified in place,
# then add the remaining ones in sample order.
cumul = list(per_sample_grads[0])
for sample_grads in per_sample_grads[1:]:
  cumul = [tf.add(total, grad) for total, grad in zip(cumul, sample_grads)]

cumul # Derivative with respect to each element in residuals is summed!

[<tf.Tensor: shape=(8, 10), dtype=float32, numpy=
 array([[ 0.7399844 ,  0.18966728, -1.8542768 , -3.5953655 , -1.7115017 ,
          1.0168358 ,  0.96086246, -0.7149834 ,  0.52475536,  0.4118924 ],
        [-0.4002539 , -0.1440517 ,  1.161946  ,  3.021617  ,  1.2696704 ,
         -0.86525667, -0.6260009 ,  0.5430093 , -0.28235042, -0.2787351 ],
        [ 1.3583237 ,  0.36420467, -3.6173606 , -7.2596197 , -3.3582501 ,
          1.8875773 ,  1.8708192 , -1.3863604 ,  0.99152374,  0.7402282 ],
        [ 1.3969249 ,  0.37599456, -3.729312  , -7.4880342 , -3.462803  ,
          1.9417635 ,  1.9260306 , -1.4287806 ,  1.0192654 ,  0.76032317],
        [ 1.322643  ,  0.36490512, -3.5605912 , -7.159792  , -3.3150175 ,
          1.8483251 ,  1.820721  , -1.3670157 ,  0.9556648 ,  0.71749717],
        [ 1.444142  ,  0.3900773 , -3.858585  , -7.7465506 , -3.5839286 ,
          2.0092795 ,  1.9896748 , -1.478468  ,  1.0519518 ,  0.7859582 ],
        [ 0.57238454,  0.04169988, -1.3295074 , -2.44905

In [12]:
# Question: is GradientTape.jacobian the right call for per-residual derivatives?

l = 5  # number of leading training samples used in the demonstration
true = y_train[:l]  # plain NumPy slice, so it does not need to be inside the tape

with tf.GradientTape() as tape:
  pred = mlp(x_train[:l])
  residuals = true - pred

trainable_vars = mlp.trainable_variables
# Answer: yes. Each result is shaped like `residuals` followed by the
# variable's own shape, so nothing has been summed away.
tape.jacobian(residuals, trainable_vars)

[<tf.Tensor: shape=(5, 1, 8, 10), dtype=float32, numpy=
 array([[[[ 0.27391604,  0.02496682, -0.5824292 , -1.1448259 ,
           -0.49469948,  0.3496558 ,  0.40663522, -0.20211115,
            0.25831714,  0.16928168],
          [-0.24626158, -0.02244618,  0.5236274 ,  1.0292449 ,
            0.44475484, -0.31435472, -0.36558154,  0.18170612,
           -0.23223756, -0.15219107],
          [ 0.3488161 ,  0.03179379, -0.7416896 , -1.457869  ,
           -0.62997097,  0.4452663 ,  0.51782626, -0.2573768 ,
            0.32895184,  0.21557035],
          [ 0.35050938,  0.03194813, -0.74529004, -1.4649462 ,
           -0.63302904,  0.44742778,  0.52033997, -0.25862616,
            0.33054867,  0.2166168 ],
          [ 0.3011263 ,  0.02744698, -0.64028656, -1.2585508 ,
           -0.54384196,  0.38438994,  0.4470296 , -0.22218849,
            0.28397787,  0.18609779],
          [ 0.35991696,  0.03280561, -0.7652934 , -1.5042648 ,
           -0.6500194 ,  0.45943663,  0.53430575, -0.26556763