In [3]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
    !pip install -U tqdm
except Exception:
    pass

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
assert tf.__version__ >= "2.0"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)
tf.random.set_seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "deep"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [239]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import Perceptron
from sklearn.metrics import mean_squared_error
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow import keras
from functools import partial

def get_run_logdir(prefix=""):
    import time
    run_id = time.strftime("run_%Y_%m_%d-%H_%M_%S" + prefix)
    return os.path.join("/home/infodba/Documents/study/hands_on_ml2/my_logs/", run_id)

test_x = np.arange(10)
test_y = np.zeros((10,1))

model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[1]),
    keras.layers.Dense(300, kernel_initializer="he_normal"),
    keras.layers.Dense(1)
])

model.compile(loss="sparse_categorical_crossentropy",
              optimizer=keras.optimizers.SGD(lr=1e-3),
              metrics=["accuracy"])

history = model.fit(test_x, test_y, epochs=1)

Train on 10 samples


# Using TensorFlow like NumPy

### Tensors and Operations

In [8]:
t1 = tf.constant([[1., 2., 3.], [4., 5., 6.]]) # matrix
t2 = tf.constant(42) # scalar
t1, t2

(<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
 array([[1., 2., 3.],
        [4., 5., 6.]], dtype=float32)>,
 <tf.Tensor: shape=(), dtype=int32, numpy=42>)

In [11]:
t1.shape, t1.dtype

(TensorShape([2, 3]), tf.float32)

In [13]:
t1[:, 1:]

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[2., 3.],
       [5., 6.]], dtype=float32)>

In [20]:
t1[:, 1, tf.newaxis]

<tf.Tensor: shape=(2, 1), dtype=float32, numpy=
array([[2.],
       [5.]], dtype=float32)>

In [49]:
t3 = t1+10 #equivalent tf.add(t1, 10)
t4 = tf.square(t1)
t5 = tf.transpose(t1)
t6 = t1 @ t5 #equivalent np.matmul()

In [55]:
tf.math.reduce_sum(t1), tf.reduce_sum(t1)

(<tf.Tensor: shape=(), dtype=float32, numpy=21.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=21.0>)

In [30]:
t3 = tf.transpose(t1)
t3

<tf.Tensor: shape=(3, 2), dtype=float32, numpy=
array([[1., 4.],
       [2., 5.],
       [3., 6.]], dtype=float32)>

In [34]:
t4 = t @ t3
t4

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[14., 32.],
       [32., 77.]], dtype=float32)>

In [38]:
t5 = tf.matmul(tf.transpose(t3), t3)

In [39]:
t5

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[14., 32.],
       [32., 77.]], dtype=float32)>

In [59]:
a = tf.constant([[1,2,3],[4,5,6]], tf.int32)
b = tf.constant([2,2], tf.int32)
tf.tile(a, b)

<tf.Tensor: shape=(4, 6), dtype=int32, numpy=
array([[1, 2, 3, 1, 2, 3],
       [4, 5, 6, 4, 5, 6],
       [1, 2, 3, 1, 2, 3],
       [4, 5, 6, 4, 5, 6]], dtype=int32)>

Пример вызова low level API из keras

In [60]:
from tensorflow import keras
K = keras.backend

K.square(K.transpose(t1)) + 10

<tf.Tensor: shape=(3, 2), dtype=float32, numpy=
array([[11., 26.],
       [14., 35.],
       [19., 46.]], dtype=float32)>

### Tensors and Numpy

In [75]:
tf.constant(np.arange(10, dtype=np.float16))

<tf.Tensor: shape=(10,), dtype=float16, numpy=array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.], dtype=float16)>

In [87]:
tf.range(10, dtype=tf.float64).numpy()

array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.])

In [83]:
tf.square(np.arange(10))

<tf.Tensor: shape=(10,), dtype=int64, numpy=array([ 0,  1,  4,  9, 16, 25, 36, 49, 64, 81])>

In [84]:
np.square(tf.range(10))

array([ 0,  1,  4,  9, 16, 25, 36, 49, 64, 81], dtype=int32)

#### Type Conversions

In [93]:
tf.constant(2.) + tf.constant(40)
tf.constant(2.) + tf.constant(40, dtype=tf.float64)

InvalidArgumentError: cannot compute AddV2 as input #1(zero-based) was expected to be a float tensor but is a int32 tensor [Op:AddV2] name: add/

In [98]:
t1 = tf.constant(2.) 
t2 = tf.constant(40, dtype=tf.float64)
tf.cast(t2, dtype=tf.float32)

<tf.Tensor: shape=(), dtype=float32, numpy=40.0>

#### Variables

In [6]:
v1 = tf.Variable([[1.,2.,3.], [4.,5.,6.]])
v1

<tf.Variable 'Variable:0' shape=(2, 3) dtype=float32, numpy=
array([[1., 2., 3.],
       [4., 5., 6.]], dtype=float32)>

In [7]:
v1.assign(2*v1)

<tf.Variable 'UnreadVariable' shape=(2, 3) dtype=float32, numpy=
array([[ 2.,  4.,  6.],
       [ 8., 10., 12.]], dtype=float32)>

In [8]:
v1[0,1].assign(42)

<tf.Variable 'UnreadVariable' shape=(2, 3) dtype=float32, numpy=
array([[ 2., 42.,  6.],
       [ 8., 10., 12.]], dtype=float32)>

In [115]:
v1[:,2].assign([0., 1.])

<tf.Variable 'UnreadVariable' shape=(2, 3) dtype=float32, numpy=
array([[ 2., 42.,  0.],
       [ 8., 10.,  1.]], dtype=float32)>

In [120]:
v1.scatter_nd_update(indices=[[0,0], [1,2]], updates=[100., 200.])

<tf.Variable 'UnreadVariable' shape=(2, 3) dtype=float32, numpy=
array([[100.,  42.,   0.],
       [  8.,  10., 200.]], dtype=float32)>

# Customizing Models and Training Algorithms

### Custom loss

In [3]:
def huber_fn(y_true, y_pred):
    error = y_true - y_pred
    is_small_error = tf.abs(error) < 1
    squared_loss = tf.square(error) / 2
    linear_loss = tf.abs(error) - 0.5
    return tf.where(is_small_error, squared_loss, linear_loss)

In [141]:
model.compile(loss=huber_fn, optimizer="nadam")
model.fit(test_x, test_y, epochs=1)

Train on 10 samples


<tensorflow.python.keras.callbacks.History at 0x7fbf9879d358>

In [142]:
model.save("my_model.h5")

### Saving and Loading Models That Contain Custom Components

In [143]:
model = keras.models.load_model("my_model.h5", custom_objects={"huber_fn": huber_fn})

Если необходимо реализовать функцию потерь с аргументами, то при показанном выше подходе аргументы не будут сохраняться. Чтобы сохранять аргументы функции потерь необходимо делать ее реализацию на базе класса keras.losses.Loss (см. 377).  

### Custom Activation Functions, Initializers, Regularizers, and Constraints

In [293]:
def my_softplus(z): #line tf.nn.softplus()
    return tf.math.log(tf.exp(z) + 1.0)

def my_glorot_initializer(shape, dtype=tf.float32):
    stddev = tf.sqrt(2. / (shape[0] + shape[1]))
    return tf.random.normal(shape, stddev=stddev, dtype=dtype)

def my_l1_regularizer(weights):
    return tf.reduce_sum(tf.abs(0.01*weights))
    
def my_positive_weights(weights): #is as tf.nn.relu()
    return tf.where(weights < 0., tf.zeros_like(weights),  weights)

In [179]:
layer = keras.layers.Dense(activation=my_softplus, units=30,
                          kernel_initializer=my_l1_regularizer, 
                          kernel_regularizer=my_l1_regularizer,
                          kernel_constraint=my_positive_weights)

### Custom Metrics 

In [194]:
model.compile(loss="mse", optimizer="nadam", metrics=[huber_fn])
model.fit(test_x, test_y, epochs=2)

Train on 10 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7fbfa0c52668>

Особый случай с бинарной классификацией (см. на стр. 381).

Например, 1й проход дал 5 позитивных предсказания из которых 1 ложно позитивный, итоговая точность 4/5 = 80%; 2й проход дал 3 ложно положительных, итоговая точность 0%. Т.о. средняя точность получается 40%, но это не верно, т.к. (4+0)/(5+3) = 50%.  Для решения задачи используется keras.metrics.Precision 

In [197]:
precision = keras.metrics.Precision()

#pass 1 batch of predictions and lables for binary classification task
precision([0, 1, 1, 1, 0, 1, 0, 1], [1, 1, 0, 1, 0, 1, 0, 1])

<tf.Tensor: shape=(), dtype=float32, numpy=0.8>

In [200]:
#pass 2 (emulation the next pass)
precision([0, 1, 0, 0, 1, 0, 1, 1], [1, 0, 1, 1, 0, 0, 0, 0])

<tf.Tensor: shape=(), dtype=float32, numpy=0.5>

In [201]:
precision.result()

<tf.Tensor: shape=(), dtype=float32, numpy=0.5>

In [203]:
precision.variables

[<tf.Variable 'true_positives:0' shape=(1,) dtype=float32, numpy=array([4.], dtype=float32)>,
 <tf.Variable 'false_positives:0' shape=(1,) dtype=float32, numpy=array([4.], dtype=float32)>]

In [204]:
precision.reset_states()

In [205]:
precision.variables

[<tf.Variable 'true_positives:0' shape=(1,) dtype=float32, numpy=array([0.], dtype=float32)>,
 <tf.Variable 'false_positives:0' shape=(1,) dtype=float32, numpy=array([0.], dtype=float32)>]

Пример собственной реализации потоковой метрики

In [4]:
class HuberMetric(keras.metrics.Metric):
    def __init__(self, threshold=1.0, **kwargs):
        super().__init__(**kwargs) # handles base args (e.g., dtype)
        self.threshold = threshold
        self.huber_fn = huber_fn
        self.total = self.add_weight("total", initializer="zeros")
        self.count = self.add_weight("count", initializer="zeros")
    def update_state(self, y_true, y_pred, sample_weight=None):
        #metric = self.huber_fn(y_true, y_pred)
        metric = y_true - y_pred
        self.total.assign_add(tf.reduce_sum(metric))
        self.count.assign_add(tf.cast(tf.size(y_true), tf.float32))
        print(self.total)
        print(self.count)
    def result(self):
        return self.total / self.count
    def get_config(self): #used to save threshold while model is saving 
        base_config = super().get_config()
        return {**base_config, "threshold": self.threshold}

In [282]:
test_x = tf.range(10, dtype=tf.float32)
test_y = tf.zeros(10)

model.compile(loss="mse", optimizer="nadam", metrics=[HuberMetric(2.0)])
model.fit(test_x, test_y, epochs=2, batch_size=1)

Train on 10 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7fbfa0052c50>

### Custom layers 

In [18]:
exp_layer = keras.layers.Lambda(lambda x: tf.exp(x))

In [22]:
model = keras.models.Sequential([
    keras.layers.Dense(30, activation="relu", input_shape=[1]),
    keras.layers.Dense(1),
    exp_layer
])
model.compile(loss="mse", optimizer="nadam")
model.fit(test_x, test_y, epochs=1)

Train on 10 samples


<tensorflow.python.keras.callbacks.History at 0x7fb588127128>

In [143]:
class MyDense(keras.layers.Layer):
    def __init__(self, units, activation=None, **kwargs):
        super().__init__(**kwargs)
        self.units = units
        self.activation = keras.activations.get(activation)

    def build(self, batch_input_shape):
        #print("in build() method")
        #print("batch_input_shape:", batch_input_shape)
        #print(batch_input_shape.as_list()[:-1])
        self.kernel = self.add_weight(
            name="kernel", shape=[batch_input_shape[-1], self.units],
            initializer="glorot_normal")
        self.bias = self.add_weight(
            name="bias", shape=[self.units], initializer="zeros")
        super().build(batch_input_shape) # must be at the end

    def call(self, X):
        #print("in call() method")
        return self.activation(X @ self.kernel + self.bias)

    def compute_output_shape(self, batch_input_shape):
        a = tf.TensorShape(batch_input_shape.as_list()[:-1] + [self.units])
        print("compute_output_shape() = ", a)
        return tf.TensorShape(batch_input_shape.as_list()[:-1] + [self.units])

    def get_config(self):
        base_config = super().get_config()
        return {**base_config, "units": self.units,
                "activation": keras.activations.serialize(self.activation)}

In [144]:
keras.activations.get("relu") #test, just interesting to look up

<function tensorflow.python.keras.activations.relu(x, alpha=0.0, max_value=None, threshold=0)>

In [148]:
model = keras.models.Sequential([
    MyDense(10, activation="relu", input_shape=[10]),
])

Если необходимо обеспечить различное поведение слоев в процессе обучения и тестирования (полезно в случаях, например, Dropout или BN), то  

In [149]:
class AddGaussianNoise(keras.layers.Layer):
    def __init__(self, stddev, **kwargs):
        super().__init__(**kwargs)
        self.stddev = stddev

    def call(self, X, training=None):
        if training:
            noise = tf.random.normal(tf.shape(X), stddev=self.stddev)
            return X + noise
        else: return X

    def compute_output_shape(self, batch_input_shape):
        return batch_input_shape

### Custom models

In [150]:
class ResidualBlock(keras.layers.Layer):
    def __init__(self, n_layers, n_neurons, **kwargs):
        super().__init__(**kwargs)
        self.n_layers = n_layers                                   
        self.n_neurons = n_neurons                                  
        self.hidden = [keras.layers.Dense(n_neurons, activation="elu",
                                          kernel_initializer="he_normal")
                       for _ in range(n_layers)]

    def call(self, inputs):
        Z = inputs
        for layer in self.hidden:
            Z = layer(Z)
        return inputs + Z
    
    def get_config(self):                                               
        base_config = super().get_config()                              
        return {**base_config,                                          
                "n_layers": self.n_layers, "n_neurons": self.n_neurons}

In [151]:
class ResidualRegressor(keras.models.Model):
    def __init__(self, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.output_dim = output_dim                                 
        self.hidden1 = keras.layers.Dense(30, activation="elu",
                                          kernel_initializer="he_normal")
        self.block1 = ResidualBlock(2, 30)
        self.block2 = ResidualBlock(2, 30)
        self.out = keras.layers.Dense(output_dim)

    def call(self, inputs):
        Z = self.hidden1(inputs)
        for _ in range(1 + 3):
            Z = self.block1(Z)
        Z = self.block2(Z)
        return self.out(Z)

    def get_config(self):                                           
        base_config = super().get_config()                           
        return {**base_config,                                     
                "output_dim": self.output_dim}                      

### Losses and Metrics Based on Model Internals

In [182]:
class ReconstructingRegressor(keras.models.Model):
    def __init__(self, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.hidden = [keras.layers.Dense(30, activation="selu",
                                          kernel_initializer="lecun_normal")
                       for _ in range(5)]
        self.out = keras.layers.Dense(output_dim)
        # TODO: check https://github.com/tensorflow/tensorflow/issues/26260
        #self.reconstruction_mean = keras.metrics.Mean(name="reconstruction_error")

    def build(self, batch_input_shape):
        n_inputs = batch_input_shape[-1]
        self.reconstruct = keras.layers.Dense(n_inputs)
        super().build(batch_input_shape)

    def call(self, inputs, training=None):
        Z = inputs
        for layer in self.hidden:
            Z = layer(Z)
        reconstruction = self.reconstruct(Z)
        recon_loss = tf.reduce_mean(tf.square(reconstruction - inputs))
        self.add_loss(0.05 * recon_loss)
        #if training:
        #    result = self.reconstruction_mean(recon_loss)
        #    self.add_metric(result)
        return self.out(Z)

### Computing Gradients Using Autodiff

In [186]:
def f(w1, w2):
    return 3*w1**2 + 2*w1*w2

def print_f_derivative(w1, w2):
    w1_dir = 6*w1 + 2*w2
    w2_dir = 2*w1
    print("w1 partial derivative:", w1_dir)
    print("w2 partial derivative:", w2_dir)

In [200]:
w1, w2 = 10,3
print_f_derivative(w1, w2)

w1 partial derivative: 66
w2 partial derivative: 20


In [191]:
eps = 1e-6
w1_dir = (f(w1 + eps, w2) - f(w1, w2)) / eps
w2_dir = (f(w1, w2 + eps) - f(w1, w2)) / eps

print("w1 partial derivative:", w1_dir)
print("w2 partial derivative:", w2_dir)

w1 partial derivative: 36.000003007075065
w2 partial derivative: 10.000000003174137


In [220]:
w1, w2 = tf.Variable(5.), tf.Variable(3.)
with tf.GradientTape() as tape:
    z = f(w1, w2)

gradients = tape.gradient(z, [w1, w2])
#gradients2 = tape.gradient(z, [w1, w2]) #RuntimeError here

In [221]:
gradients

[<tf.Tensor: shape=(), dtype=float32, numpy=36.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=10.0>]

approach if you want to call gradient() twice or more.

In [222]:
w1, w2 = tf.Variable(5.), tf.Variable(3.)
with tf.GradientTape(persistent=True) as tape:
    z = f(w1, w2)

dw1 = tape.gradient(z, w1)
dw2 = tape.gradient(z, w2)
del tape

In [223]:
dw1, dw2

(<tf.Tensor: shape=(), dtype=float32, numpy=36.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=10.0>)

Tape object traks only tf.Variable. It doesn't work for another types like tf.constant for example

In [225]:
c1, c2 = tf.constant(5.), tf.constant(3.)
with tf.GradientTape() as tape:
    z = f(c1, c2)
tape.gradient(z, [c1, c2])

[None, None]

but it has an approach for this

In [227]:
c1, c2 = tf.constant(5.), tf.constant(3.)
with tf.GradientTape() as tape:
    tape.watch(c1)
    tape.watch(c2)
    z = f(c1, c2)
    
tape.gradient(z, [c1, c2])

[<tf.Tensor: shape=(), dtype=float32, numpy=36.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=10.0>]

compute the gradient of a list of tensors

In [230]:
w1, w2 = tf.Variable(5.), tf.Variable(3.)
with tf.GradientTape() as tape:
    z1 = f(w1, w2)
    z2 = f(w1, w2)
    z3 = f(w1, w2)

gradients = tape.gradient([z1, z2, z3], [w1, w2])
#gradients2 = tape.gradient(z, [w1, w2]) #RuntimeError here

In [231]:
gradients #contains the sum of gradients

[<tf.Tensor: shape=(), dtype=float32, numpy=108.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=30.0>]

if we want to compute gradients separately we mast execute gradients() for each tensor and use persistent tape

In [235]:
w1, w2 = tf.Variable(5.), tf.Variable(3.)
with tf.GradientTape(persistent=True) as tape:
    z1 = f(w1, w2)
    z2 = f(w1, w2)
    z3 = f(w1, w2)

gw1 = tape.gradient(z1, [w1, w2])
gw2 = tape.gradient(z2, [w1, w2])
gw3 = tape.gradient(z3, [w1, w2])
del tape

In [236]:
gw1, gw2, gw3

([<tf.Tensor: shape=(), dtype=float32, numpy=36.0>,
  <tf.Tensor: shape=(), dtype=float32, numpy=10.0>],
 [<tf.Tensor: shape=(), dtype=float32, numpy=36.0>,
  <tf.Tensor: shape=(), dtype=float32, numpy=10.0>],
 [<tf.Tensor: shape=(), dtype=float32, numpy=36.0>,
  <tf.Tensor: shape=(), dtype=float32, numpy=10.0>])

example of partial defivatives of the partial derivatives

In [252]:
with tf.GradientTape(persistent=True) as hessian_tape:
    with tf.GradientTape() as jacobian_tape:
        z = f(w1, w2)
    jacobians = jacobian_tape.gradient(z, [w1, w2])
hessians = [hessian_tape.gradient(jacobian, [w1, w2])
           for jacobian in jacobians]
del hessian_tape

In [264]:
print("dz_dw1_dw1:", hessians[0][0])
print("dz_dw1_dw2:", hessians[0][1])
print("dz_dw2_dw1:", hessians[1][0])
print("dz_dw2_dw2:", hessians[1][1])

dz_dw1_dw1: tf.Tensor(6.0, shape=(), dtype=float32)
dz_dw1_dw2: tf.Tensor(2.0, shape=(), dtype=float32)
dz_dw2_dw1: tf.Tensor(2.0, shape=(), dtype=float32)
dz_dw2_dw2: None


Если необходимо не вычислять градиенты для части нейросети

In [269]:
def f2(w1, w2):
    return 3*w1**2 + tf.stop_gradient(2*w1*w2)

with tf.GradientTape() as tape:
    z = f2(w1, w2)
    
tape.gradient(z, [w1,w2])

[<tf.Tensor: shape=(), dtype=float32, numpy=30.0>, None]

show some numerical problems related computing gradients

In [319]:
def my_softplus(z): #line tf.nn.softplus()
    return tf.math.log(tf.exp(z) + 1.)

In [320]:
w = tf.Variable([100.])

my_softplus(w) # the result overloads the type capasity

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([inf], dtype=float32)>

In [321]:
# due to it the result of calulating gradients became NAN

with tf.GradientTape() as tape:
    z = my_softplus(w)

tape.gradient(z, [w])

[<tf.Tensor: shape=(1,), dtype=float32, numpy=array([nan], dtype=float32)>]

to resolve that issue we can manualy find the derivative of my_softmax that is just 1/(1 + 1/exp(x)). Next we tell TF to use our derivative with tf.custom_gradient

In [352]:
@tf.custom_gradient
def my_better_softplus(z):
    exp = tf.exp(z)
    def my_softplus_gradients(grad):
        return grad/(1+ 1/exp)
    return tf.math.log(exp + 1), my_softplus_gradients

with tf.GradientTape() as tape:
    z = my_better_softplus(w)
    
print(tape.gradient(z, [w]))

[<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.5000125], dtype=float32)>]


function my_softplus_gradients receive as argument 'grad' that where backpropagated so far, down to the softplus function. According chain rule we should multiply them with this function's gradients.


### Custom Training Loops

In [35]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

housing = fetch_california_housing()
X_train_full, X_test, y_train_full, y_test = train_test_split(
    housing.data, housing.target.reshape(-1, 1), random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

In [465]:
l2_reg = keras.regularizers.l2(0.05)
model = keras.models.Sequential([
    keras.layers.Dense(30, activation="elu", 
                       kernel_initializer="he_normal",
                      kernel_regularizer=l2_reg),
    keras.layers.Dense(1, kernel_regularizer=l2_reg)
])

In [395]:
def random_batch(x, y, batch_size=32):
    idx = np.random.randint(len(x), size=batch_size)
    return x[idx], y[idx]

In [396]:
def print_status_bar(iteration, total, loss, metrics=None):
    metrics = " - ".join(["{}: {:.4f}".format(m.name, m.result())
                         for m in [loss] + (metrics or [])])
    end = "" if iteration < total else "\n"
    print("\r{}/{} - ".format(iteration, total) + metrics,
          end=end)

In [466]:
n_epochs = 5
batch_size = 32
n_steps = len(X_train) // batch_size
optimizer = keras.optimizers.Nadam(lr=0.01)
loss_fn = keras.losses.mean_squared_error
mean_loss = keras.metrics.Mean()
metrics = [keras.metrics.MeanAbsoluteError()]

let's build the custom loop

In [467]:
for epoch in range(1, n_epochs + 1):
    print("Epoch {}/{}".format(epoch, n_epochs))
    for step in range(1, n_steps + 1):
        X_batch, y_batch = random_batch(X_train_scaled, y_train)
        with tf.GradientTape() as tape:
            y_pred = model(X_batch)
            main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))
            loss = tf.add_n([main_loss] + model.losses)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        mean_loss(loss)
        for metric in metrics:
            metric(y_batch, y_pred)
        print_status_bar(step * batch_size, len(y_train), mean_loss, metrics)
    print_status_bar(len(y_train), len(y_train), mean_loss, metrics)
    for metric in [mean_loss] + metrics:
        metric.reset_states()

Epoch 1/5


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

11610/11610 - mean: 1.4430 - mean_absolute_error: 0.5821
Epoch 2/5
11610/11610 - mean: 0.6749 - mean_absolute_error: 0.5240
Epoch 3/5
11610/11610 - mean: 0.6416 - mean_absolute_error: 0.5140
Epoch 4/5
11610/11610 - mean: 0.6337 - mean_absolute_error: 0.5143
Epoch 5/5
11610/11610 - mean: 0.6640 - mean_absolute_error: 0.5249


# TensorFlow Functions and Graphs

In [468]:
def cube(a):
    return a**3

useful things

In [470]:
auto1 = cube(2.)
auto2 = cube(tf.constant(2.))
func1 = tf.function(cube)
auto3 = func1(tf.constant(2.))
auto4 = func1(2.)
auto1, auto2, auto3, auto4, func1

(8.0,
 <tf.Tensor: shape=(), dtype=float32, numpy=8.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=8.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=8.0>,
 <tensorflow.python.eager.def_function.Function at 0x7fb5303de160>)

In [474]:
@tf.function
def tf_cube(a):
    return a**3

In [475]:
tf_cube.python_function(2)

8

In [477]:
def cube(a):
    return a**3

tf_func = tf.function(cube)

<tf.Tensor: shape=(), dtype=float32, numpy=8.0>

In [479]:
tf_cube(tf.constant(10))       #1
tf_cube(tf.constant(20))       #2
tf_cube(tf.constant([10, 20])) #3

<tf.Tensor: shape=(2,), dtype=int32, numpy=array([1000, 8000], dtype=int32)>

In [480]:
tf.autograph.to_code(tf_cube.python_function)

"def tf__tf_cube(a):\n  do_return = False\n  retval_ = ag__.UndefinedReturnValue()\n  with ag__.FunctionScope('tf_cube', 'fscope', ag__.ConversionOptions(recursive=True, user_requested=True, optional_features=(), internal_convert_user_code=True)) as fscope:\n    do_return = True\n    retval_ = fscope.mark_return_value(a ** 3)\n  do_return,\n  return ag__.retval(retval_)\n"

# Tasks

### task1

*How would you describe TensorFlow in a short sentence? What are its main features? Can you name other popular Deep Learning libraries?*

TF - это фреймвор, предоставляющий мощное API для математических вычислений (аналогично numpy), адаптированное на вычисления в области машинного обучения, включая оптимизацию вычислений, использование GPU, возмжоность запуска обученной модели на другой оборудовании и т.п. Другие популярные библиотеки: Theano,  Microsoft Cognitive Toolkit (CNTK), MXNet, Caffe2, Chainer.

### task2 

*Is TensorFlow a drop-in replacement for NumPy? What are the main differences between the two?*

Хоть TF и реализут большинство фукнциональности NumPy, но:
- названия функций отличаются;
- поведения фукнций отличаются, т.к. функции TF реализованы так, чтобы выполняться многопоточно, либо на GPU. Например tf.reduce_sum против np.sum или например, транспонирование в numpy фактически не осуществялется, а только предоставляет возможность обращаться к матрица, так, как если бы она была транспонирована, а вот в tf осуществляется фактически.
- TF имеет константные тензора, которые не могут изменяться.

### task3

*Do you get the same result with tf.range(10) and tf.constant( np.arange(10))?*

Нет, 1й возвращает тензор с типом int32, второй int64

### task4

*Can you name six other data structures available in TensorFlow, beyond regular tensors?*

- tf.SparseTensor
- tf.TensorArray
- tf.RaggedTensor
- tf.string
- tf.sets 
- tf.queue

### task5

*A custom loss function can be defined by writing a function or by subclassing the keras.losses.Loss class. When would you use each option?*

Наследование keras.losses.Loss используется, когда подсчет потерь на базе предсказанного значения и значения label недостаточно и нужно использовать гиперпараметры. Так же наследование дает больше возможностей при сохранении и загрузки модели.

### task 6

*Similarly, a custom metric can be defined in a function or a subclass of keras.metrics.Metric. When would you use each option?*

Аналогично ответу 5. 

Moreover, if computing the metric over a whole epoch is not equivalent to computing the mean metric over all batches in that epoch (e.g., as for the precision and recall metrics), then you should subclass the keras.metrics.Metric class and implement the __init__(), update_state(), and result() methods to keep track of a running metric during each epoch. You should also implement the reset_states() method unless all it needs to do is reset all variables to 0.0. If you want the state to be saved along with the model, then you should implement the get_config() method as well.

### tasl 7

*When should you create a custom layer versus a custom model?*

В целом различие в технологии. Мы должны представить модель в целом и отделить то, что является слоями от того, что является моделью. Технически, с точки зрения библиотеки слои и модель почти одно и то же, только модель обладает соовтетствующими функциями (compile, fit...)

You should distinguish the internal components of your model (i.e., layers or reusable blocks of layers) from the model itself (i.e., the object you will train). The former should subclass the keras.layers.Layer class, while the latter should subclass the keras.models.Model class.

### task 8

*What are some use cases that require writing your own custom training loop?*

Получить полный контроль за ходом обучения, чтобы быть уверенным что все выполняется с точностью так, как запланировано. Осуществить оладку, выводя доп. информацию в процессе обучения. Керас предоставляет много возможностей по реализации различных задач без необходимости выполнять custom loop: обратные вызовы, пользовательская регуляризация, пользовательские фукнции потерь и т.п., поэтому рекомендуется этого не делать, т.к. этот подход более подвержен ошибкам.

However, in some cases writing a custom training loop is necessary—for example, if you want to use different optimizers for different parts of your neural network, like in the Wide & Deep paper

### tasl 9

*Can custom Keras components contain arbitrary Python code, or must they be convertible to TF Functions?*

Необходимо конвертиться в TF фукнции, т.к. TF насыщает их TF операциями, которые могут быть использованы для построения и последующей оптимизации графа вычислений. Однако есть пути использования обычных операций Python/Numpy.

If you absolutely need to include arbitrary Python code in a custom component, you can either wrap it in a tf.py_function() operation (but this will reduce performance and limit your model’s portability) or set dynamic=True when creating the custom layer or model (or set run_eagerly=True when calling the model’s compile() method).

### task 10

*What are the main rules to respect if you want a function to be convertible to a TF Function?*


- Если используется вызов из внешней библиотеки (напр. Numpy), то такая фукнция будте вызываться как есть и не будет являться частью графа. Поэтому нужно убедиться, что используется tf.reduce_sum вместо np.sum и т.п. (подробнее на стр.  409) 
- TF function, если ей необходимо создать переменную, должна это сделать только при 1м вызове, иначе будет исключение. Создания переменных в фукнция нужно стараться избегать, лучше их создавать в слоях в методе build() ).  
- исходный код фукнции должен быть доступен TF для парсинга.  
- TF парсит циклы, которые выполняются по объектам тензора или dataset (напр. for i in tf.range(x)). 
- Для производительности следует предпочитать векторизованную реализацию, чем использовать циклы. 


### task11

*When would you need to create a dynamic Keras model? How do you do that? Why not make all your models dynamic?*

Creating a dynamic Keras model can be useful for debugging, as it will not compile any custom component to a TF Function, and you can use any Python debugger to debug your code. It can also be useful if you want to include arbitrary Python code in your model (or in your training code), including calls to external libraries. To make a model dynamic, you must set dynamic=True when creating it. Alternatively, you can set run_eagerly=True when calling the model’s compile() method. Making a model dynamic prevents Keras from using any of TensorFlow’s graph features, so it will slow down training and inference, and you will not have the possibility to export the computation graph, which will limit your model’s portability.

### task 12

*Implement a custom layer that performs Layer Normalization (we will use this type of layer in Chapter 15):*

*a. The build() method should define two trainable weights α and β, both of shape input_shape[-1:] and data type tf.float32. α should be initialized with 1s, and β with 0s.*

*b. The call() method should compute the mean μ and standard deviation σ of each instance’s features. For this, you can use tf.nn.moments(inputs, axes=-1, keepdims=True), which returns the mean μ and the variance σ2 of all instances (compute the square root of the variance to get the standard deviation). Then the function should compute and return **α⊗(X - μ)/(σ + ε) + β**, where ⊗ represents itemwise multiplication and ε is a smoothing term (small constant to avoid division by zero, e.g., 0.001).*

In [271]:
class Layer12(keras.layers.Layer):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
    
    def build(self, batch_input_shape):
        self.a = self.add_weight(name='a_weight', shape=batch_input_shape[-1:], dtype=tf.float32, initializer="ones")
        self.b = self.add_weight(name='b_weight', shape=batch_input_shape[-1:], dtype=tf.float32, initializer="zeros")
        super().build(batch_input_shape)
        
    def call(self, inputs, training=None):
        mean = tf.reduce_mean(inputs, axis=-1, keepdims=True)
        std  = tf.math.reduce_std(inputs, axis=-1, keepdims=True)
        calc = tf.math.subtract(inputs, mean)
        std_ = tf.math.add(std, 0.001)
        calc = tf.math.divide(calc, std_)
        calc = tf.math.multiply(self.a, calc) 
        return tf.math.add(calc, self.b)

*c. Ensure that your custom layer produces the same (or very nearly the same) output as the  keras.layers.LayerNormalization layer.*

In [391]:
tf.random.set_seed(42)
np.random.seed(42)
tf.keras.backend.clear_session

In [200]:
model1 = keras.models.Sequential([keras.layers.LayerNormalization(input_shape=[8])])
norm_data_ln = model1.predict(X_train)

In [204]:
model2 = keras.models.Sequential([Layer12(input_shape=[8])])
norm_data_my = model2.predict(X_train)

In [238]:
delta = norm_data_ln - norm_data_my
delta = np.round(delta, 4)
mse   = mean_squared_error(norm_data_ln, norm_data_my)
mse = np.sqrt(mse)

count = 0
for row in delta:
    for cell in row:
        if cell != 0:
            count+=1


print(count)
print(mse)

59
4.3577916e-06


Различия между полученным нормализованными данными не велики, чему свидетельствует значение переменной count для 11610*8 объектов. Так же расстояние межу нормализациями представлно в виде метрики rmse.

### task13

*Train a model using a custom training loop to tackle the Fashion MNIST dataset (see Chapter 10).*

In [604]:
(X_train_full, y_train_full), (X_test, y_test) = keras.datasets.fashion_mnist.load_data()

In [605]:
X_train_full = X_train_full.astype(np.float32)
X_valid, X_train = X_train_full[:5000], X_train_full[5000:]
y_valid, y_train = y_train_full[:5000], y_train_full[5000:]
X_test = X_test.astype(np.float32)

In [607]:
tf.random.set_seed(42)
np.random.seed(42)
tf.keras.backend.clear_session()

model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    Layer12(),
    keras.layers.Dense(100, activation="relu"),
    keras.layers.Dense(100, activation="relu"),
    keras.layers.Dense(10, activation="softmax"),
])

*a. Display the epoch, iteration, mean training loss, and mean accuracy over each epoch (updated at each iteration), as well as the validation loss and accuracy at the end of each epoch.*

In [608]:
def random_batch(x, y, batch_size=32):
    idx = np.random.randint(len(x), size=batch_size)
    return x[idx], y[idx]

def print_status_bar(iteration, total, loss, metric, val_loss=None, val_metric=None):
    print("\r{}/{}: loss - {:.4f}, accuracy - {:.4f}".format(iteration, total, loss, metric), end="")
    if iteration == total:
        print(", validation loss - {:.4f}, accuracy - {:.4f}".format(val_loss, val_metric), end="\n")
        
def m_accuracy(y_true, y_pred):
    y_pred = tf.argmax(y_pred, axis=1)#, output_type=y_true.dtype)
    counter = 0
    for i in range(len(y_true)):
        if tf.math.equal(y_pred[i], y_true[i]):
            counter = tf.add(counter, 1)
    return tf.divide(counter, len(y_true))

In [609]:
n_epochs = 5
batch_size = 32
n_steps = len(X_train) // (batch_size)
optimizer = keras.optimizers.Nadam(lr=0.01)
loss_fn  = keras.losses.SparseCategoricalCrossentropy()
mean_loss = keras.metrics.Mean()
mean_acc  = keras.metrics.SparseCategoricalAccuracy()

for epoch in range(1, n_epochs + 1):
    print("Epoch {}/{}".format(epoch, n_epochs))
    los_dic = []
    for step in range(1, n_steps + 1):
        X_batch, y_batch = random_batch(X_train, y_train)
        with tf.GradientTape() as tape:
            y_pred = model(X_batch)
            loss = loss_fn(y_batch, y_pred)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        mean_loss(loss)
        mean_acc(y_batch, y_pred)
        print_status_bar(step * batch_size, len(y_train), 
                         mean_loss.result().numpy(), 
                         mean_acc.result().numpy())
        
    y_pred   = model(X_valid)
    val_loss = loss_fn(y_valid, y_pred)
    val_acc  = m_accuracy(y_valid, y_pred)
    
    print_status_bar(len(y_train), len(y_train), 
                     mean_loss.result().numpy(), 
                     mean_acc.result().numpy(), 
                     val_loss, val_acc)
    
    mean_loss.reset_states()
    mean_acc.reset_states()

Epoch 1/5
55000/55000: loss - 0.5117, accuracy - 0.8204, validation loss - 0.4154, accuracy - 0.8524
Epoch 2/5
55000/55000: loss - 0.4194, accuracy - 0.8512, validation loss - 0.4906, accuracy - 0.8408
Epoch 3/5
55000/55000: loss - 0.4058, accuracy - 0.8568, validation loss - 0.4089, accuracy - 0.8594
Epoch 4/5
55000/55000: loss - 0.3911, accuracy - 0.8649, validation loss - 0.4215, accuracy - 0.8612
Epoch 5/5
55000/55000: loss - 0.3901, accuracy - 0.8633, validation loss - 0.4068, accuracy - 0.8664


*b. Try using a different optimizer with a different learning rate for the upper layers and the lower layers.*

In [625]:
tf.random.set_seed(42)
np.random.seed(42)
tf.keras.backend.clear_session()

model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    Layer12(),
    keras.layers.Dense(100, activation="relu"),
    keras.layers.Dense(100, activation="relu"),
    keras.layers.Dense(10, activation="softmax"),
])

for w in model.trainable_variables:
    print(w.name, w.shape)

layer12/a_weight:0 (784,)
layer12/b_weight:0 (784,)
dense/kernel:0 (784, 100)
dense/bias:0 (100,)
dense_1/kernel:0 (100, 100)
dense_1/bias:0 (100,)
dense_2/kernel:0 (100, 10)
dense_2/bias:0 (10,)


In [620]:
n_epochs = 5
batch_size = 32
n_steps = len(X_train) // (batch_size)
loss_fn  = keras.losses.SparseCategoricalCrossentropy()
mean_loss = keras.metrics.Mean()
mean_acc  = keras.metrics.SparseCategoricalAccuracy()

optimizer1 = keras.optimizers.Nadam(lr=0.01)
optimizer2 = tf.keras.optimizers.SGD(lr=1e-4, momentum=0.9)

for epoch in range(1, n_epochs + 1):
    print("Epoch {}/{}".format(epoch, n_epochs))
    los_dic = []
    for step in range(1, n_steps + 1):
        X_batch, y_batch = random_batch(X_train, y_train)
        with tf.GradientTape() as tape:
            y_pred = model(X_batch)
            loss = loss_fn(y_batch, y_pred)
        gradients = tape.gradient(loss, model.trainable_variables)
        
        optimizer1.apply_gradients(zip(gradients[:4], model.trainable_variables[:4]))
        optimizer2.apply_gradients(zip(gradients[4:], model.trainable_variables[4:]))
        
        mean_loss(loss)
        mean_acc(y_batch, y_pred)
        print_status_bar(step * batch_size, len(y_train), 
                         mean_loss.result().numpy(), 
                         mean_acc.result().numpy())
        
    y_pred   = model(X_valid)
    val_loss = loss_fn(y_valid, y_pred)
    val_acc  = m_accuracy(y_valid, y_pred)
    
    print_status_bar(len(y_train), len(y_train), 
                     mean_loss.result().numpy(), 
                     mean_acc.result().numpy(), 
                     val_loss, val_acc)
    
    mean_loss.reset_states()
    mean_acc.reset_states()

Epoch 1/5
55000/55000: loss - 0.4550, accuracy - 0.8383, validation loss - 0.4095, accuracy - 0.8574
Epoch 2/5
55000/55000: loss - 0.3583, accuracy - 0.8720, validation loss - 0.4423, accuracy - 0.8580
Epoch 3/5
55000/55000: loss - 0.3281, accuracy - 0.8809, validation loss - 0.4146, accuracy - 0.8616
Epoch 4/5
55000/55000: loss - 0.3167, accuracy - 0.8839, validation loss - 0.3889, accuracy - 0.8736
Epoch 5/5
55000/55000: loss - 0.3068, accuracy - 0.8882, validation loss - 0.3916, accuracy - 0.8754


Не знаю корректный ли подход, который я использовал выше (вроде норм судя по ряду выполненных тестов). Но в учебных материалах использовался другой подход. Сравним результаты

In [632]:
tf.random.set_seed(42)
np.random.seed(42)
tf.keras.backend.clear_session()

model_bottom = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    Layer12(),
    keras.layers.Dense(100, activation="relu"),
])
model_upper = keras.models.Sequential([
    keras.layers.Dense(100, activation="relu"),
    keras.layers.Dense(10, activation="softmax"),
])
model = keras.models.Sequential([
    model_bottom, model_upper
])

n_epochs = 5
batch_size = 32
n_steps = len(X_train) // (batch_size)
loss_fn  = keras.losses.SparseCategoricalCrossentropy()
mean_loss = keras.metrics.Mean()
mean_acc  = keras.metrics.SparseCategoricalAccuracy()

optimizer1 = keras.optimizers.Nadam(lr=0.01)
optimizer2 = tf.keras.optimizers.SGD(lr=1e-4, momentum=0.9)

for epoch in range(1, n_epochs + 1):
    print("Epoch {}/{}".format(epoch, n_epochs))
    los_dic = []
    for step in range(1, n_steps + 1):
        X_batch, y_batch = random_batch(X_train, y_train)
        with tf.GradientTape(persistent=True) as tape:
            y_pred = model(X_batch)
            loss = loss_fn(y_batch, y_pred)
        
        gradients = tape.gradient(loss, model_bottom.trainable_variables)
        optimizer1.apply_gradients(zip(gradients, model_bottom.trainable_variables))
        
        gradients = tape.gradient(loss, model_upper.trainable_variables)
        optimizer2.apply_gradients(zip(gradients, model_upper.trainable_variables))
        
        del gradients
        
        mean_loss(loss)
        mean_acc(y_batch, y_pred)
        print_status_bar(step * batch_size, len(y_train), 
                         mean_loss.result().numpy(), 
                         mean_acc.result().numpy())
        
    y_pred   = model(X_valid)
    val_loss = loss_fn(y_valid, y_pred)
    val_acc  = m_accuracy(y_valid, y_pred)
    
    print_status_bar(len(y_train), len(y_train), 
                     mean_loss.result().numpy(), 
                     mean_acc.result().numpy(), 
                     val_loss, val_acc)
    
    mean_loss.reset_states()
    mean_acc.reset_states()

Epoch 1/5
55000/55000: loss - 0.4550, accuracy - 0.8383, validation loss - 0.4095, accuracy - 0.8574
Epoch 2/5
55000/55000: loss - 0.3583, accuracy - 0.8720, validation loss - 0.4423, accuracy - 0.8580
Epoch 3/5
55000/55000: loss - 0.3281, accuracy - 0.8809, validation loss - 0.4146, accuracy - 0.8616
Epoch 4/5
55000/55000: loss - 0.3167, accuracy - 0.8839, validation loss - 0.3889, accuracy - 0.8736
Epoch 5/5
55000/55000: loss - 0.3068, accuracy - 0.8882, validation loss - 0.3916, accuracy - 0.8754


the same result here