# Chapter 12 exercise 12

### Implement a custom layer that performs Layer Normalization:

#### a. The build() method should define two trainable weights $\alpha$ and $\beta$, both of shape input_shape[-1:] and data type tf.float32. $\alpha$ should be initialized with 1s and $\beta$ with 0s

#### b. The call() method should compute the mean $\mu$ and standard deviation $\sigma$ of each instance's features. For this, you can use tf.nn.moments(inputs, axes=-1, keepdims=True), which returns the mean $\mu$ and the variance $\sigma^{2}$ of all instances (compute the square root of the variance to get the standard deviation). Then the function should compute and return $\alpha$ $\circ$ (X - $\mu$)/($\sigma$ + $\epsilon$) + $\beta$, where $\circ$ represents itemwise multiplication (*) and $\epsilon$ is a smoothing term (small constant to avoid division by zero, e.g., 0.001).
  
#### c. Ensure that your custom layer produces the same (or very nearly the same) output as the keras.layers.LayerNormalization layer.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras

%matplotlib inline
plt.style.use('ggplot')
plt.rcParams['figure.figsize']=(12,6)

In [2]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

housing = fetch_california_housing()
X_train_full, X_test, y_train_full, y_test = train_test_split(
    housing.data, housing.target.reshape(-1, 1), random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

In [3]:
class MyLayerNormalization(keras.layers.Layer):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        
        
    def build(self, batch_input_shape):
        self.alpha = self.add_weight(name="alpha", 
                                     shape=batch_input_shape[-1:],
                                     initializer="ones",
                                     dtype=tf.float32
                                     )
        
        self.beta = self.add_weight(name='beta', 
                                    shape=batch_input_shape[-1:], 
                                    initializer='zeros',
                                    dtype=tf.float32)
        super().build(batch_input_shape)
        
    def call(self, X):
        mu, var = tf.nn.moments(X, axes=-1, keepdims=True)
        sigma = tf.sqrt(var)
        
        return self.alpha * (X-mu)/(sigma + self.eps) + self.beta
    
    def compute_output_shape(self, batch_input_shape):
        return batch_input_shape
        
    def get_config(self):
        base_config = super().get_config()
        return {**base_config}    

In [4]:
keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

In [5]:
input_shape = X_train.shape[1:]

model = keras.models.Sequential([
    MyLayerNormalization(30, input_shape=input_shape),
    MyLayerNormalization(1)
])

TypeError: __init__() takes 1 positional argument but 2 were given

In [None]:
model.compile(loss='mse', optimizer='nadam')
model.fit(X_train_scaled, y_train, epochs=2, validation_data=(X_valid_scaled, y_valid))
model.evaluate(X_test_scaled, y_test)

## Author's solution

In [6]:
class LayerNormalization(keras.layers.Layer):
    def __init__(self, eps=0.001, **kwargs):
        super().__init__(**kwargs)
        self.eps = eps

    def build(self, batch_input_shape):
        self.alpha = self.add_weight(
            name="alpha", shape=batch_input_shape[-1:],
            initializer="ones")
        self.beta = self.add_weight(
            name="beta", shape=batch_input_shape[-1:],
            initializer="zeros")
        super().build(batch_input_shape) # must be at the end

    def call(self, X):
        mean, variance = tf.nn.moments(X, axes=-1, keepdims=True)
        return self.alpha * (X - mean) / (tf.sqrt(variance + self.eps)) + self.beta

    def compute_output_shape(self, batch_input_shape):
        return batch_input_shape

    def get_config(self):
        base_config = super().get_config()
        return {**base_config, "eps": self.eps}

In [7]:
keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

In [8]:
X = X_train.astype(np.float32)

custom_layer_norm = LayerNormalization()
keras_layer_norm = keras.layers.LayerNormalization()

tf.reduce_mean(keras.losses.mean_absolute_error(
    keras_layer_norm(X), custom_layer_norm(X)))

<tf.Tensor: shape=(), dtype=float32, numpy=4.817434e-08>

In [9]:
random_alpha = np.random.rand(X.shape[-1])
random_beta = np.random.rand(X.shape[-1])

custom_layer_norm.set_weights([random_alpha, random_beta])
keras_layer_norm.set_weights([random_alpha, random_beta])

tf.reduce_mean(keras.losses.mean_absolute_error(
    keras_layer_norm(X), custom_layer_norm(X)))

<tf.Tensor: shape=(), dtype=float32, numpy=1.927576e-08>

### Adding in training to try to figure out the issue with above custom layer

In [10]:
input_shape = X_train.shape[1:]

model = keras.models.Sequential([
    LayerNormalization(30, input_shape=input_shape),
    LayerNormalization(1)
])

In [11]:
model.compile(loss='mse', optimizer='nadam')
model.fit(X_train_scaled, y_train, epochs=2, validation_data=(X_valid_scaled, y_valid))
model.evaluate(X_test_scaled, y_test)

Epoch 1/2
Epoch 2/2


2.5483274459838867

## A different attempt

In [18]:
class MyLayerNormalization(keras.layers.Layer):
    def __init__(self):
        super().__init__()
        
        
    def build(self, batch_input_shape):
        self.alpha = self.add_weight(name="alpha", 
                                     shape=batch_input_shape[-1:],
                                     initializer="ones",
                                     dtype=tf.float32
                                     )
        
        self.beta = self.add_weight(name='beta', 
                                    shape=batch_input_shape[-1:], 
                                    initializer='zeros',
                                    dtype=tf.float32)
        super().build(batch_input_shape)
        
    def call(self, X):
        mu, var = tf.nn.moments(X, axes=-1, keepdims=True)
        sigma = tf.sqrt(var)
        
        return self.alpha * (X-mu)/(sigma + 0.0001) + self.beta
    
    def compute_output_shape(self, batch_input_shape):
        return batch_input_shape
        
    def get_config(self):
        base_config = super().get_config()
        return {**base_config}    

In [19]:
keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

In [20]:
input_shape = X_train.shape[1:]

model = keras.models.Sequential([
    LayerNormalization(30, input_shape=input_shape),
    LayerNormalization(1)
])

In [21]:
model.compile(loss='mse', optimizer='nadam')
model.fit(X_train_scaled, y_train, epochs=2, validation_data=(X_valid_scaled, y_valid))
model.evaluate(X_test_scaled, y_test)

Epoch 1/2
Epoch 2/2


2.5483274459838867

Issue has to do with __init__, but not sure why excluding **kwargs solves it.