# Practical Example: `california_housing`

## Normal equation (from linear regression)

In [1]:
import tensorflow as tf
import numpy as np
from sklearn.datasets import fetch_california_housing

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
# Load the California housing dataset and prepend a column of ones so that
# the first component of theta acts as the bias (intercept) term.
housing = fetch_california_housing()
m, n = housing.data.shape  # m rows, n feature columns
bias_column = np.ones((m, 1))
housing_data_plus_bias = np.hstack([bias_column, housing.data])

In [2]:
# Solve the normal equation theta = inv(XT @ X) @ XT @ y with two
# algebraically equivalent evaluation orders, so that their float32 results
# can be compared afterwards.
X = tf.constant(housing_data_plus_bias, dtype=tf.float32, name="X")
y = tf.constant(housing.target.reshape(-1, 1), dtype=tf.float32, name="y")
XT = tf.transpose(X)

# Order A: inv(XT @ X) @ (XT @ y)
gram_inverse_a = tf.matrix_inverse(tf.matmul(XT, X))
theta_other = tf.matmul(gram_inverse_a, tf.matmul(XT, y))

# Order B: (inv(XT @ X) @ XT) @ y
gram_inverse_b = tf.matrix_inverse(tf.matmul(XT, X))
theta = tf.matmul(tf.matmul(gram_inverse_b, XT), y)

# Evaluate both expressions in a single run of the graph.
with tf.Session() as sess:
    theta_value, theta_other_value = sess.run([theta, theta_other])

In [4]:
# Solution computed as (inv(XT @ X) @ XT) @ y.
theta_value

array([[-3.62395554e+01],
       [ 4.38410223e-01],
       [ 9.56030935e-03],
       [-1.09072715e-01],
       [ 6.50818944e-01],
       [-3.61849015e-06],
       [-3.80502851e-03],
       [-4.14003015e-01],
       [-4.26401258e-01]], dtype=float32)

In [5]:
# Solution computed as inv(XT @ X) @ (XT @ y).
theta_other_value

array([[-3.62382812e+01],
       [ 4.38407898e-01],
       [ 9.55867767e-03],
       [-1.09069824e-01],
       [ 6.50848389e-01],
       [-3.61911952e-06],
       [-3.80516052e-03],
       [-4.14001465e-01],
       [-4.26452637e-01]], dtype=float32)

In [6]:
# Exact check: True only if both arrays have the same shape and elements.
np.array_equal(theta_other_value, theta_value)

False

In [7]:
# Element-wise difference between the two evaluation orders.
theta_other_value - theta_value

array([[ 1.2741089e-03],
       [-2.3245811e-06],
       [-1.6316772e-06],
       [ 2.8908253e-06],
       [ 2.9444695e-05],
       [-6.2937033e-10],
       [-1.3201497e-07],
       [ 1.5497208e-06],
       [-5.1379204e-05]], dtype=float32)

In [8]:
# Tolerance-based check with NumPy's defaults (rtol=1e-05, atol=1e-08).
np.allclose(theta_other_value, theta_value)

False

In [9]:
# NOTE(review): same call as the preceding cell (default tolerances); looks like an accidental duplicate.
np.allclose(theta_other_value, theta_value)

False

In [10]:
# Loosen both tolerances: passes when |a - b| <= atol + rtol * |b| element-wise.
np.allclose(theta_other_value, theta_value, rtol=1e-3, atol=1e-2)

True

In [11]:
# Same relative tolerance, with the absolute tolerance tightened to 1e-3.
np.allclose(theta_other_value, theta_value, rtol=1e-3, atol=1e-3)

True

In [37]:
# Absolute tolerance 1e-3 again, with the relative tolerance relaxed to 1e-2.
np.allclose(theta_other_value, theta_value, rtol=1e-2, atol=1e-3)

True

In [38]:
# Report whether TensorFlow can use a GPU in this environment.
tf.test.is_gpu_available()

True

### `np.array_equal`, `np.array_equiv` and `np.allclose`

In [39]:
# Like array_equal, but also accepts inputs where one array can be broadcast to the other's shape.
np.array_equiv(theta_other_value, theta_value)

False

In [40]:
# Print NumPy's documentation for array_equal.
help(np.array_equal)

Help on function array_equal in module numpy:

array_equal(a1, a2, equal_nan=False)
    True if two arrays have the same shape and elements, False otherwise.
    
    Parameters
    ----------
    a1, a2 : array_like
        Input arrays.
    equal_nan : bool
        Whether to compare NaN's as equal. If the dtype of a1 and a2 is
        complex, values will be considered equal if either the real or the
        imaginary component of a given value is ``nan``.
    
        .. versionadded:: 1.19.0
    
    Returns
    -------
    b : bool
        Returns True if the arrays are equal.
    
    See Also
    --------
    allclose: Returns True if two arrays are element-wise equal within a
              tolerance.
    array_equiv: Returns True if input arrays are shape consistent and all
                 elements equal.
    
    Examples
    --------
    >>> np.array_equal([1, 2], [1, 2])
    True
    >>> np.array_equal(np.array([1, 2]), np.array([1, 2]))
    True
    >>> np.array_equal([1,

In [41]:
# Print NumPy's documentation for array_equiv.
help(np.array_equiv)

Help on function array_equiv in module numpy:

array_equiv(a1, a2)
    Returns True if input arrays are shape consistent and all elements equal.
    
    Shape consistent means they are either the same shape, or one input array
    can be broadcasted to create the same shape as the other one.
    
    Parameters
    ----------
    a1, a2 : array_like
        Input arrays.
    
    Returns
    -------
    out : bool
        True if equivalent, False otherwise.
    
    Examples
    --------
    >>> np.array_equiv([1, 2], [1, 2])
    True
    >>> np.array_equiv([1, 2], [1, 3])
    False
    
    Showing the shape equivalence:
    
    >>> np.array_equiv([1, 2], [[1, 2], [1, 2]])
    True
    >>> np.array_equiv([1, 2], [[1, 2, 1, 2], [1, 2, 1, 2]])
    False
    
    >>> np.array_equiv([1, 2], [[1, 2], [1, 3]])
    False



In [42]:
# Print NumPy's documentation for allclose (tolerance semantics).
help(np.allclose)

Help on function allclose in module numpy:

allclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False)
    Returns True if two arrays are element-wise equal within a tolerance.
    
    The tolerance values are positive, typically very small numbers.  The
    relative difference (`rtol` * abs(`b`)) and the absolute difference
    `atol` are added together to compare against the absolute difference
    between `a` and `b`.
    
    NaNs are treated as equal if they are in the same place and if
    ``equal_nan=True``.  Infs are treated as equal if they are in the same
    place and of the same sign in both arrays.
    
    Parameters
    ----------
    a, b : array_like
        Input arrays to compare.
    rtol : float
        The relative tolerance parameter (see Notes).
    atol : float
        The absolute tolerance parameter (see Notes).
    equal_nan : bool
        Whether to compare NaN's as equal.  If True, NaN's in `a` will be
        considered equal to NaN's in `b` in the output a

## Gradient descent manually

### w/o normalizing the data
**(?1)** Do we need `tf.reset_default_graph()` to keep the graph we are about to build from being mixed with the one we have just built?<br>

In [43]:
# Batch gradient descent with a hand-derived MSE gradient, on the raw
# (unscaled) features. This cell does not reset the default graph, so its ops
# are appended to whatever the default graph already holds.
n_epochs, learning_rate = 1000, 0.01

X = tf.constant(housing_data_plus_bias, dtype=tf.float32, name="X")
y = tf.constant(housing.target.reshape(-1, 1), dtype=tf.float32, name="y")
initial_theta = tf.random_uniform([n + 1, 1], minval=-1.0, maxval=1.0)
theta = tf.Variable(initial_theta, name="theta")
y_pred = tf.matmul(X, theta, name="predictions")
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name="mse")
# Gradient of the MSE with respect to theta: (2/m) * XT @ (X @ theta - y).
gradients = (2 / m) * tf.matmul(tf.transpose(X), error)
training_op = tf.assign(theta, theta - learning_rate * gradients)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        sess.run(training_op)
        if epoch % 100 != 0:
            continue
        # Logged after the update, so "epoch 0" is the MSE after one step.
        print(f"(epoch {epoch:04d}) MSE = {mse.eval():.4f}")

    final_theta = sess.run(theta)

(epoch 0000) MSE = 2855737982189568.0000
(epoch 0100) MSE = nan
(epoch 0200) MSE = nan
(epoch 0300) MSE = nan
(epoch 0400) MSE = nan
(epoch 0500) MSE = nan
(epoch 0600) MSE = nan
(epoch 0700) MSE = nan
(epoch 0800) MSE = nan
(epoch 0900) MSE = nan


**(?2)** The MSE does not decrease; it blows up to `nan`. What happened?<br>
Is it because we should have called `tf.reset_default_graph()` first?

In [44]:
# Replace the default graph with an empty one so the next cell starts clean.
tf.reset_default_graph()

In [45]:
# Re-run of the raw-feature gradient descent experiment, with the same
# hyper-parameters and the same hand-derived gradient.
n_epochs, learning_rate = 1000, 0.01

X = tf.constant(housing_data_plus_bias, dtype=tf.float32, name="X")
y = tf.constant(housing.target.reshape(-1, 1), dtype=tf.float32, name="y")
initial_theta = tf.random_uniform([n + 1, 1], minval=-1.0, maxval=1.0)
theta = tf.Variable(initial_theta, name="theta")
y_pred = tf.matmul(X, theta, name="predictions")
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name="mse")
# Gradient of the MSE with respect to theta: (2/m) * XT @ (X @ theta - y).
gradients = (2 / m) * tf.matmul(tf.transpose(X), error)
training_op = tf.assign(theta, theta - learning_rate * gradients)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        sess.run(training_op)
        if epoch % 100 != 0:
            continue
        # Logged after the update, so "epoch 0" is the MSE after one step.
        print(f"(epoch {epoch:04d}) MSE = {mse.eval():.4f}")

    final_theta = sess.run(theta)


(epoch 0000) MSE = 4466536207089664.0000
(epoch 0100) MSE = nan
(epoch 0200) MSE = nan
(epoch 0300) MSE = nan
(epoch 0400) MSE = nan
(epoch 0500) MSE = nan
(epoch 0600) MSE = nan
(epoch 0700) MSE = nan
(epoch 0800) MSE = nan
(epoch 0900) MSE = nan


I've double-checked: there seems to be no typo in the code, or at least no significant difference from the code in the book. Is it due to the lack of normalization?

### w/ data normalized
- `sklearn`
- `tf`
- `np`

In [46]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature column, then prepend the bias column of ones.
scaler = StandardScaler().fit(housing.data)
scaled_housing_data = scaler.transform(housing.data)
scaled_housing_data_plus_bias = np.hstack([np.ones((m, 1)), scaled_housing_data])

In [47]:
# Batch gradient descent with the hand-derived MSE gradient, this time on the
# StandardScaler-normalized features.
n_epochs, learning_rate = 1000, 0.01

X = tf.constant(scaled_housing_data_plus_bias, dtype=tf.float32, name="X")
y = tf.constant(housing.target.reshape(-1, 1), dtype=tf.float32, name="y")
initial_theta = tf.random_uniform([n + 1, 1], minval=-1.0, maxval=1.0)
theta = tf.Variable(initial_theta, name="theta")
y_pred = tf.matmul(X, theta, name="predictions")
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name="mse")
# Gradient of the MSE with respect to theta: (2/m) * XT @ (X @ theta - y).
gradients = (2 / m) * tf.matmul(tf.transpose(X), error)
training_op = tf.assign(theta, theta - learning_rate * gradients)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        sess.run(training_op)
        if epoch % 100 != 0:
            continue
        # Logged after the update, so "epoch 0" is the MSE after one step.
        print(f"(epoch {epoch:04d}) MSE = {mse.eval():.4f}")

    final_theta = sess.run(theta)

(epoch 0000) MSE = 6.1480
(epoch 0100) MSE = 0.8191
(epoch 0200) MSE = 0.6455
(epoch 0300) MSE = 0.6074
(epoch 0400) MSE = 0.5841
(epoch 0500) MSE = 0.5675
(epoch 0600) MSE = 0.5555
(epoch 0700) MSE = 0.5469
(epoch 0800) MSE = 0.5406
(epoch 0900) MSE = 0.5361


**(R2)** Yes, it seems that normalizing the data in this case is of crucial importance.

**(?3)** The book (top of p. 237) says that training without first normalizing the data only slows the training down. Let's train on the unnormalized data with a larger `n_epochs`, say `10_000`, to see whether that is true.

In [48]:
# Raw-feature gradient descent again, on a fresh graph and with ten times as
# many epochs, to test whether the lack of scaling merely slows training down.
tf.reset_default_graph()

n_epochs, learning_rate = 10_000, 0.01

X = tf.constant(housing_data_plus_bias, dtype=tf.float32, name="X")
y = tf.constant(housing.target.reshape(-1, 1), dtype=tf.float32, name="y")
initial_theta = tf.random_uniform([n + 1, 1], minval=-1.0, maxval=1.0)
theta = tf.Variable(initial_theta, name="theta")
y_pred = tf.matmul(X, theta, name="predictions")
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name="mse")
# Gradient of the MSE with respect to theta: (2/m) * XT @ (X @ theta - y).
gradients = (2 / m) * tf.matmul(tf.transpose(X), error)
training_op = tf.assign(theta, theta - learning_rate * gradients)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        sess.run(training_op)
        if epoch % 100 != 0:
            continue
        # Logged after the update, so "epoch 0" is the MSE after one step.
        print(f"(epoch {epoch:04d}) MSE = {mse.eval():.4f}")

    final_theta = sess.run(theta)


(epoch 0000) MSE = 1150065271898112.0000
(epoch 0100) MSE = nan
(epoch 0200) MSE = nan
(epoch 0300) MSE = nan
(epoch 0400) MSE = nan
(epoch 0500) MSE = nan
(epoch 0600) MSE = nan
(epoch 0700) MSE = nan
(epoch 0800) MSE = nan
(epoch 0900) MSE = nan
(epoch 1000) MSE = nan
(epoch 1100) MSE = nan
(epoch 1200) MSE = nan
(epoch 1300) MSE = nan
(epoch 1400) MSE = nan
(epoch 1500) MSE = nan
(epoch 1600) MSE = nan
(epoch 1700) MSE = nan
(epoch 1800) MSE = nan
(epoch 1900) MSE = nan
(epoch 2000) MSE = nan
(epoch 2100) MSE = nan
(epoch 2200) MSE = nan
(epoch 2300) MSE = nan
(epoch 2400) MSE = nan
(epoch 2500) MSE = nan
(epoch 2600) MSE = nan
(epoch 2700) MSE = nan
(epoch 2800) MSE = nan
(epoch 2900) MSE = nan
(epoch 3000) MSE = nan
(epoch 3100) MSE = nan
(epoch 3200) MSE = nan
(epoch 3300) MSE = nan
(epoch 3400) MSE = nan
(epoch 3500) MSE = nan
(epoch 3600) MSE = nan
(epoch 3700) MSE = nan
(epoch 3800) MSE = nan
(epoch 3900) MSE = nan
(epoch 4000) MSE = nan
(epoch 4100) MSE = nan
(epoch 4200) MSE

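I even tried `n_epochs = 100_000`; the MSE is still `nan` from the second logged checkpoint (epoch 100) onward. **(R3)** It seems that skipping normalization does not merely slow down the training: with this learning rate the training diverges and never reaches a solution. The MSE is already around `1e15` after a single update, which suggests that `learning_rate = 0.01` is far too large for the unscaled features; a much smaller learning rate would presumably be needed for gradient descent to converge on the raw data.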

**(?4)** This is really intriguing. Had we used `MinMaxScaler` instead of `StandardScaler`, would convergence
at least take place?

In [12]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature column with MinMaxScaler's default settings, then
# prepend the bias column of ones.
minmax_scaler = MinMaxScaler().fit(housing.data)
minmax_scaled_housing_data = minmax_scaler.transform(housing.data)
minmax_scaled_housing_data_plus_bias = np.hstack(
    [np.ones((m, 1)), minmax_scaled_housing_data]
)

In [15]:
# Replace the default graph with an empty one so the next cell starts clean.
tf.reset_default_graph()

In [16]:
# Batch gradient descent with the hand-derived MSE gradient on the
# MinMaxScaler-rescaled features, using a longer run of 5000 epochs.
n_epochs, learning_rate = 5000, 0.01

X = tf.constant(minmax_scaled_housing_data_plus_bias, dtype=tf.float32, name="X")
y = tf.constant(housing.target.reshape(-1, 1), dtype=tf.float32, name="y")
initial_theta = tf.random_uniform([n + 1, 1], minval=-1.0, maxval=1.0)
theta = tf.Variable(initial_theta, name="theta")
y_pred = tf.matmul(X, theta, name="predictions")
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name="mse")
# Gradient of the MSE with respect to theta: (2/m) * XT @ (X @ theta - y).
gradients = (2 / m) * tf.matmul(tf.transpose(X), error)
training_op = tf.assign(theta, theta - learning_rate * gradients)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        sess.run(training_op)
        if epoch % 100 != 0:
            continue
        # Logged after the update, so "epoch 0" is the MSE after one step.
        print(f"(epoch {epoch:04d}) MSE = {mse.eval():.4f}")

    final_theta = sess.run(theta)

(epoch 0000) MSE = 6.8732
(epoch 0100) MSE = 1.2738
(epoch 0200) MSE = 1.2239
(epoch 0300) MSE = 1.1833
(epoch 0400) MSE = 1.1459
(epoch 0500) MSE = 1.1114
(epoch 0600) MSE = 1.0795
(epoch 0700) MSE = 1.0497
(epoch 0800) MSE = 1.0221
(epoch 0900) MSE = 0.9962
(epoch 1000) MSE = 0.9721
(epoch 1100) MSE = 0.9496
(epoch 1200) MSE = 0.9285
(epoch 1300) MSE = 0.9088
(epoch 1400) MSE = 0.8903
(epoch 1500) MSE = 0.8730
(epoch 1600) MSE = 0.8568
(epoch 1700) MSE = 0.8416
(epoch 1800) MSE = 0.8274
(epoch 1900) MSE = 0.8140
(epoch 2000) MSE = 0.8014
(epoch 2100) MSE = 0.7897
(epoch 2200) MSE = 0.7786
(epoch 2300) MSE = 0.7683
(epoch 2400) MSE = 0.7585
(epoch 2500) MSE = 0.7494
(epoch 2600) MSE = 0.7408
(epoch 2700) MSE = 0.7327
(epoch 2800) MSE = 0.7251
(epoch 2900) MSE = 0.7180
(epoch 3000) MSE = 0.7113
(epoch 3100) MSE = 0.7050
(epoch 3200) MSE = 0.6990
(epoch 3300) MSE = 0.6934
(epoch 3400) MSE = 0.6881
(epoch 3500) MSE = 0.6832
(epoch 3600) MSE = 0.6785
(epoch 3700) MSE = 0.6741
(epoch 3800)

**(R4)** It seems that `MinMaxScaler` also allows convergence, but the convergence is much slower, compared
to using `StandardScaler`.

Don't forget that we still need to verify the validity of the formula `gradients = 2/m * tf.matmul(tf.transpose(X), error)`

- `theta.shape` equals `(n+1, 1)`
  - For `gradients` to be correct, `gradients.shape` must at least be **the same**: `error.shape` is `(m, 1)` and `X.shape` is `(m, n+1)` $\implies$ `tf.matmul(tf.transpose(X), error)` has shape `(n+1, 1)`.
- This notebook is already quite lengthy, so let's write up the derivation of the formula in a separate PDF file named `grad_mse.pdf`.

### Autodiff
By modifying only one line of the code that computes `gradients` manually, we can use autodiff instead:

BTW, I just came up with a way to see the effect of `tf.reset_default_graph()`:

In [49]:
# True when X belongs to the graph that is currently the default.
X.graph is tf.get_default_graph()

True

In [50]:
# Names of the collections registered in the default graph.
tf.get_default_graph().get_all_collection_keys()

['trainable_variables', 'variables']

In [52]:
# List every operation held by the default graph.
tf.get_default_graph().get_operations()

[<tf.Operation 'X' type=Const>,
 <tf.Operation 'y' type=Const>,
 <tf.Operation 'random_uniform/shape' type=Const>,
 <tf.Operation 'random_uniform/min' type=Const>,
 <tf.Operation 'random_uniform/max' type=Const>,
 <tf.Operation 'random_uniform/RandomUniform' type=RandomUniform>,
 <tf.Operation 'random_uniform/sub' type=Sub>,
 <tf.Operation 'random_uniform/mul' type=Mul>,
 <tf.Operation 'random_uniform' type=Add>,
 <tf.Operation 'theta' type=VariableV2>,
 <tf.Operation 'theta/Assign' type=Assign>,
 <tf.Operation 'theta/read' type=Identity>,
 <tf.Operation 'predictions' type=MatMul>,
 <tf.Operation 'sub' type=Sub>,
 <tf.Operation 'Square' type=Square>,
 <tf.Operation 'Const' type=Const>,
 <tf.Operation 'mse' type=Mean>,
 <tf.Operation 'transpose/perm' type=Const>,
 <tf.Operation 'transpose' type=Transpose>,
 <tf.Operation 'MatMul' type=MatMul>,
 <tf.Operation 'mul/x' type=Const>,
 <tf.Operation 'mul' type=Mul>,
 <tf.Operation 'mul_1/x' type=Const>,
 <tf.Operation 'mul_1' type=Mul>,
 <tf.

In [54]:
# Contents of the graph's "variables" collection.
tf.get_default_graph().get_collection("variables")

[<tf.Variable 'theta:0' shape=(9, 1) dtype=float32_ref>]

In [55]:
# Contents of the graph's "trainable_variables" collection.
tf.get_default_graph().get_collection("trainable_variables")

[<tf.Variable 'theta:0' shape=(9, 1) dtype=float32_ref>]

Now we reset the default graph and see what's left:

In [56]:
# After the reset a new graph becomes the default, while X still belongs to
# the graph it was created in, so the identity check is expected to be False.
tf.reset_default_graph()
X.graph is tf.get_default_graph()

False

In [57]:
# Collections registered in the current default graph (none right after a reset).
tf.get_default_graph().get_all_collection_keys()

[]

In [58]:
# Operations in the current default graph (none right after a reset).
tf.get_default_graph().get_operations()

[]

OK, after that short detour into `tf.reset_default_graph()`, we return to autodiff.

In [60]:
# Gradient descent on the standardized features, with the gradient produced
# by TensorFlow's reverse-mode autodiff instead of the hand-derived formula.
tf.reset_default_graph()
n_epochs, learning_rate = 1000, 0.01

X = tf.constant(scaled_housing_data_plus_bias, dtype=tf.float32, name="X")
y = tf.constant(housing.target.reshape(-1, 1), dtype=tf.float32, name="y")
initial_theta = tf.random_uniform([n + 1, 1], minval=-1.0, maxval=1.0)
theta = tf.Variable(initial_theta, name="theta")
y_pred = tf.matmul(X, theta, name="predictions")
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name="mse")
# tf.gradients returns one gradient tensor per listed variable; only theta is
# listed, so unpack the single entry.
(gradients,) = tf.gradients(mse, [theta])
training_op = tf.assign(theta, theta - learning_rate * gradients)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        sess.run(training_op)
        if epoch % 100 != 0:
            continue
        # Logged after the update, so "epoch 0" is the MSE after one step.
        print(f"(epoch {epoch:04d}) MSE = {mse.eval():.4f}")

    final_theta = sess.run(theta)

(epoch 0000) MSE = 11.5359
(epoch 0100) MSE = 0.8343
(epoch 0200) MSE = 0.6230
(epoch 0300) MSE = 0.5904
(epoch 0400) MSE = 0.5718
(epoch 0500) MSE = 0.5586
(epoch 0600) MSE = 0.5491
(epoch 0700) MSE = 0.5423
(epoch 0800) MSE = 0.5373
(epoch 0900) MSE = 0.5337


The `gradients()` function takes an op (in this case `mse`) and a list of variables (in this case just `theta`), and it creates **a list of ops** (**one per variable**) to compute the gradients of the op with respect to each variable. Indeed, in the code above, the list of variables is just `[theta]`, a list of only one variable, so at the end of the line we take the 0-th element of the returned gradients `[0]`, which is the gradient of `mse` with respect to `theta`.

### Optimizers, an even more convenient alternative to `tf.gradients()`
Replace two lines:
- `gradients = ...`
- `training_op = ...`

In [68]:
# Gradient descent on the standardized features, delegating both the gradient
# computation and the update step to a built-in optimizer.
tf.reset_default_graph()
n_epochs, learning_rate = 1000, 0.01

X = tf.constant(scaled_housing_data_plus_bias, dtype=tf.float32, name="X")
y = tf.constant(housing.target.reshape(-1, 1), dtype=tf.float32, name="y")
initial_theta = tf.random_uniform([n + 1, 1], minval=-1.0, maxval=1.0)
theta = tf.Variable(initial_theta, name="theta")
y_pred = tf.matmul(X, theta, name="predictions")
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name="mse")
# minimize() replaces the explicit `gradients` and `tf.assign` lines used in
# the manual and autodiff versions.
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
training_op = optimizer.minimize(mse)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        sess.run(training_op)
        if epoch % 100 != 0:
            continue
        # Logged after the update, so "epoch 0" is the MSE after one step.
        print(f"(epoch {epoch:04d}) MSE = {mse.eval():.4f}")

    final_theta = sess.run(theta)

(epoch 0000) MSE = 7.1328
(epoch 0100) MSE = 0.6748
(epoch 0200) MSE = 0.5836
(epoch 0300) MSE = 0.5664
(epoch 0400) MSE = 0.5550
(epoch 0500) MSE = 0.5467
(epoch 0600) MSE = 0.5407
(epoch 0700) MSE = 0.5363
(epoch 0800) MSE = 0.5331
(epoch 0900) MSE = 0.5308


One thing also worth noticing is that cells like the one above **give a quite different numerical result at each execution**, because `theta` is initialized with `tf.random_uniform` and no random seed is set.

We have a number of different choices for the optimizer; e.g. we could have chosen `MomentumOptimizer` instead.

In [76]:
# Same training setup on the standardized features, with the plain gradient
# descent optimizer swapped for a momentum optimizer (momentum 0.9).
tf.reset_default_graph()
n_epochs, learning_rate = 1000, 0.01

X = tf.constant(scaled_housing_data_plus_bias, dtype=tf.float32, name="X")
y = tf.constant(housing.target.reshape(-1, 1), dtype=tf.float32, name="y")
initial_theta = tf.random_uniform([n + 1, 1], minval=-1.0, maxval=1.0)
theta = tf.Variable(initial_theta, name="theta")
y_pred = tf.matmul(X, theta, name="predictions")
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name="mse")
# Only the optimizer class differs from the GradientDescentOptimizer version.
optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=0.9)
training_op = optimizer.minimize(mse)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        sess.run(training_op)
        if epoch % 100 != 0:
            continue
        # Logged after the update, so "epoch 0" is the MSE after one step.
        print(f"(epoch {epoch:04d}) MSE = {mse.eval():.4f}")

    final_theta = sess.run(theta)

(epoch 0000) MSE = 6.3637
(epoch 0100) MSE = 0.5366
(epoch 0200) MSE = 0.5247
(epoch 0300) MSE = 0.5243
(epoch 0400) MSE = 0.5243
(epoch 0500) MSE = 0.5243
(epoch 0600) MSE = 0.5243
(epoch 0700) MSE = 0.5243
(epoch 0800) MSE = 0.5243
(epoch 0900) MSE = 0.5243


If you try executing the cell above (`MomentumOptimizer`) several times, you will probably notice that it is the only one so far that gives a stable converged minimum of `0.5243`, which seems to support the common claim that `MomentumOptimizer` converges faster.