In [1]:
import tensorflow as tf
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

# Load the California housing dataset; features are read from .data and
# targets from .target below.
housing = fetch_california_housing()

# First split: hold out a test set. The targets are reshaped into a single
# column so that they are 2-D, like the model's (n, 1) output.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    housing.data,
    housing.target.reshape(-1, 1),
    random_state=42,
)

# Second split: carve a validation set out of the remaining training data.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full,
    y_train_full,
    random_state=42,
)

### The Normalization Layer

In [2]:
# The normalization layer is the first layer of the model, so the model
# is fed the raw (unscaled) features.
norm_layer = tf.keras.layers.Normalization()

model = tf.keras.models.Sequential()
model.add(norm_layer)
model.add(tf.keras.layers.Dense(1))

model.compile(loss="mse", optimizer=tf.keras.optimizers.SGD(learning_rate=0.001))

# adapt() computes the mean and variance of each feature from the training set.
norm_layer.adapt(X_train)
history = model.fit(
    X_train,
    y_train,
    epochs=3,
    validation_data=(X_valid, y_valid),
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


Since the Normalization layer is part of the model, we can deploy this model to production without worrying about normalization again: the model handles it itself. However, this approach makes training slightly slower, because the inputs are re-normalized during every epoch. Alternatively, we can normalize the data once before training, which makes training faster.

In [3]:
# Standalone normalization layer, adapted on the training features only.
norm_layer = tf.keras.layers.Normalization()
norm_layer.adapt(X_train)

# Scale the training and validation features once, up front, using the
# statistics learned from the training set.
X_train_scaled, X_valid_scaled = (
    norm_layer(features) for features in (X_train, X_valid)
)

We can now train our model on the scaled data, this time without the Normalization layer.

In [4]:
# No normalization layer here: the inputs passed to fit() are already scaled.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(1))
model.compile(
    loss="mse",
    optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
)

history = model.fit(
    X_train_scaled,
    y_train,
    epochs=3,
    validation_data=(X_valid_scaled, y_valid),
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


This method should speed up training a bit. But now the model won't normalize its inputs when we deploy it. To fix this, we need to create a new model that wraps both the adapted Normalization layer and the trained model.

In [5]:
# Chain the adapted normalization layer and the trained model so that the
# combined model accepts raw (unscaled) features.
final_model = tf.keras.Sequential()
final_model.add(norm_layer)
final_model.add(model)

# Predict on the first three (unscaled) test instances.
X_new = X_test[:3]
y_pred = final_model(X_new)
y_pred

<tf.Tensor: shape=(3, 1), dtype=float32, numpy=
array([[0.8812941],
       [1.4023672],
       [2.0182023]], dtype=float32)>

In [6]:
# custom Normalization

import numpy as np

class MyNormalization(tf.keras.layers.Layer):
    """Minimal standardization layer: ``(x - mean) / (std + epsilon)``.

    The statistics are not learned during training; they must be computed
    once by calling ``adapt()`` on representative data before the layer is
    used.
    """

    def adapt(self, X):
        """Compute and store the statistics used by ``call()``.

        Args:
            X: array-like data. The mean and standard deviation are reduced
               along axis 0 with ``keepdims=True``, so the stored statistics
               keep a leading axis of size 1 and broadcast against batches.
        """
        self.mean_ = np.mean(X, axis=0, keepdims=True)
        self.std_ = np.std(X, axis=0, keepdims=True)

    def call(self, inputs):
        """Standardize ``inputs`` with the statistics stored by ``adapt()``.

        Args:
            inputs: batch of values to standardize.

        Returns:
            ``(inputs - mean_) / (std_ + epsilon)``.

        Raises:
            RuntimeError: if ``adapt()`` has not been called yet.
        """
        # Fail with an explicit message rather than an opaque AttributeError
        # when the layer is used before its statistics exist.
        if getattr(self, "mean_", None) is None or getattr(self, "std_", None) is None:
            raise RuntimeError(
                "MyNormalization must be adapted before use: "
                "call adapt(X) on the training data first."
            )
        # Small smoothing term that avoids division by zero for features
        # whose standard deviation is 0.
        eps = tf.keras.backend.epsilon()
        return (inputs - self.mean_) / (self.std_ + eps)


In [7]:
# Compute the custom layer's statistics from the training features.
my_norm = MyNormalization()
my_norm.adapt(X_train)

# Scale both splits with the training-set statistics.
X_train_scaled, X_valid_scaled = (
    my_norm(features) for features in (X_train, X_valid)
)

# Same single-unit linear model as before, trained on the pre-scaled data.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(1))
model.compile(
    loss="mse",
    optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
)

history = model.fit(
    X_train_scaled,
    y_train,
    epochs=3,
    validation_data=(X_valid_scaled, y_valid),
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


### The Discretization Layer
The Discretization layer maps a numerical feature to a categorical feature by assigning value ranges (bins) to categories. This is sometimes useful, especially when a feature has a highly non-linear relationship with the target. The following example converts the age feature into three categories: less than 18, 18 to 50 (50 excluded), and 50 or more.

In [8]:
# Five sample ages, one per row.
age = tf.constant([[10.0], [93.0], [18.0], [37.0], [5.0]])

# Two explicit boundaries define three bins.
discretize_layer = tf.keras.layers.Discretization(
    bin_boundaries=[18.0, 50.0],
)

age_categories = discretize_layer(age)
age_categories

<tf.Tensor: shape=(5, 1), dtype=int64, numpy=
array([[0],
       [2],
       [1],
       [1],
       [0]], dtype=int64)>

We could also set the number of bins and then call the adapt() method to let the layer find appropriate bin boundaries. For instance, if we set num_bins=3, then the bin boundaries will be located around the 33rd and 66th percentiles (in this example, at the values 10 and 37).

In [9]:
# Only the number of bins is given; adapt() derives the boundaries from the data.
discretize_layer = tf.keras.layers.Discretization(
    num_bins=3,
)
discretize_layer.adapt(age)

age_categories = discretize_layer(age)
age_categories

<tf.Tensor: shape=(5, 1), dtype=int64, numpy=
array([[1],
       [2],
       [1],
       [2],
       [0]], dtype=int64)>

### The CategoryEncoding Layer

In [10]:
# num_tokens=3 matches the three bin indices (0, 1, 2) shown for the
# discretization step above.
onehot_layer = tf.keras.layers.CategoryEncoding(
    num_tokens=3,
)
onehot_layer(age_categories)

<tf.Tensor: shape=(5, 3), dtype=float32, numpy=
array([[0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]], dtype=float32)>

If we encode more than one categorical feature at a time (which only makes sense if they all use the same categories), the CategoryEncoding class will perform multi-hot encoding by default.

In [11]:
# Each row holds two category indices for the same instance.
two_age_categories = np.array([
    [1, 0],
    [2, 2],
    [2, 0],
])
onehot_layer(two_age_categories)

<tf.Tensor: shape=(3, 3), dtype=float32, numpy=
array([[1., 1., 0.],
       [0., 0., 1.],
       [1., 0., 1.]], dtype=float32)>

### The StringLookup Layer

In [12]:
cities = ["Auckland", "Paris", "Paris", "San Francisco"]

# Build the vocabulary from the city names, then look up the same names
# passed as a single row.
str_lookup_layer = tf.keras.layers.StringLookup()
str_lookup_layer.adapt(cities)
str_lookup_layer([cities])

<tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[3, 1, 1, 2]], dtype=int64)>

In [13]:
# "Montreal" is not one of the cities the layer was adapted on.
str_lookup_layer([
    ["Paris", "Auckland", "Auckland", "Montreal"],
])

<tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[1, 3, 3, 0]], dtype=int64)>

Known categories are numbered starting from 1, from the most frequent category to the least frequent. Unknown categories are mapped to 0; in this case 'Montreal' is mapped to 0.

In [15]:
# Same lookup, but emitting one-hot vectors instead of integer ids.
str_lookup_layer = tf.keras.layers.StringLookup(
    output_mode="one_hot",
)
str_lookup_layer.adapt(cities)

# One city per row.
str_lookup_layer(
    [[city] for city in ("Paris", "Auckland", "Auckland", "Montreal")]
)



<tf.Tensor: shape=(4, 4), dtype=float32, numpy=
array([[0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.]], dtype=float32)>

If the training set is very large, it may be convenient to adapt the layer to just a random subset of the training set. In that case, the layer's adapt() method may miss some of the rarer categories. By default, it will map them all to 0, which makes them indistinguishable. To reduce this risk, we can set num_oov_indices to an integer greater than 1.

In [16]:
# Reserve five out-of-vocabulary buckets instead of a single one.
str_lookup_layer = tf.keras.layers.StringLookup(
    num_oov_indices=5,
)
str_lookup_layer.adapt(cities)

# Two known cities followed by three names the layer was not adapted on.
str_lookup_layer(
    [[name] for name in ("Paris", "Auckland", "Foo", "Bar", "Baz")]
)



<tf.Tensor: shape=(5, 1), dtype=int64, numpy=
array([[5],
       [7],
       [4],
       [3],
       [4]], dtype=int64)>

Since there are five OOV buckets, the ID of the first known category (Paris) is now 5. "Foo", "Bar" and "Baz" are unknown, so each of them gets mapped to one of the OOV buckets. "Bar" gets mapped to ID 3, but unfortunately "Foo" and "Baz" get mapped to the same bucket (ID 4). This is called a hashing collision.

### The Hashing Layer

In [17]:
# Hash each string into one of 10 bins; no adapt() step is involved.
hashing_layer = tf.keras.layers.Hashing(
    num_bins=10,
)
hashing_layer(
    [[city] for city in ("Paris", "Tokyo", "Auckland", "Montreal")]
)

<tf.Tensor: shape=(4, 1), dtype=int64, numpy=
array([[0],
       [1],
       [9],
       [1]], dtype=int64)>

We again got a hashing collision: "Tokyo" and "Montreal" get mapped to the same bucket. So it is usually better to stick with the StringLookup layer.