# Tensorflow Categorical Encoding

---
# Data

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds

# Construct the tf.data.Datasets.
# The training and validation sets are disjoint slices of the single 'train'
# split: the first 90% of the rows for training and the remaining 10% for
# validation, so that no validation row is ever seen during training.
train, info = tfds.load(
    'titanic:2.*.*',      # Name of the dataset
    with_info=True,       # Also return the information of the dataset
    shuffle_files=True,
    split='train[:90%]'   # First 90% of the rows
)
validation = tfds.load(
    'titanic:2.*.*',      # Name of the dataset
    with_info=False,      # The dataset information was already returned above
    shuffle_files=True,
    split='train[90%:]'   # Last 10% of the rows (does not overlap with train)
)
info

2021-11-01 21:52:56.654208: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


tfds.core.DatasetInfo(
    name='titanic',
    full_name='titanic/2.0.0',
    description="""
    Dataset describing the survival status of individual passengers on the Titanic. Missing values in the original dataset are represented using ?. Float and int missing values are replaced with -1, string missing values are replaced with 'Unknown'.
    """,
    homepage='https://www.openml.org/d/40945',
    data_path='/home/oonisim/tensorflow_datasets/titanic/2.0.0',
    download_size=114.98 KiB,
    dataset_size=532.14 KiB,
    features=FeaturesDict({
        'features': FeaturesDict({
            'age': tf.float32,
            'boat': tf.string,
            'body': tf.int32,
            'cabin': tf.string,
            'embarked': ClassLabel(shape=(), dtype=tf.int64, num_classes=4),
            'fare': tf.float32,
            'home.dest': tf.string,
            'name': tf.string,
            'parch': tf.int32,
            'pclass': ClassLabel(shape=(), dtype=tf.int64, num_classes=3),
       

In [2]:
# Turn each example dictionary into a (features, label) pair, using the
# 'survived' entry as the label.
train = train.map(
    lambda example: (example['features'], example['survived'])
)
tf.data.experimental.get_structure(train)

({'age': TensorSpec(shape=(), dtype=tf.float32, name=None),
  'boat': TensorSpec(shape=(), dtype=tf.string, name=None),
  'body': TensorSpec(shape=(), dtype=tf.int32, name=None),
  'cabin': TensorSpec(shape=(), dtype=tf.string, name=None),
  'embarked': TensorSpec(shape=(), dtype=tf.int64, name=None),
  'fare': TensorSpec(shape=(), dtype=tf.float32, name=None),
  'home.dest': TensorSpec(shape=(), dtype=tf.string, name=None),
  'name': TensorSpec(shape=(), dtype=tf.string, name=None),
  'parch': TensorSpec(shape=(), dtype=tf.int32, name=None),
  'pclass': TensorSpec(shape=(), dtype=tf.int64, name=None),
  'sex': TensorSpec(shape=(), dtype=tf.int64, name=None),
  'sibsp': TensorSpec(shape=(), dtype=tf.int32, name=None),
  'ticket': TensorSpec(shape=(), dtype=tf.string, name=None)},
 TensorSpec(shape=(), dtype=tf.int64, name=None))

In [3]:
# Count the training examples by walking through the dataset once.
count = 0
for count, row in enumerate(train, start=1):
    pass

count

2021-11-01 21:52:57.193937: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


1178

In [4]:
# Turn each validation example dictionary into a (features, label) pair.
validation = validation.map(
    lambda example: (example['features'], example['survived'])
)

In [5]:
# Count the validation examples by walking through the dataset once.
count = 0
for count, row in enumerate(validation, start=1):
    pass

count

131

## Examine dataset

In [6]:
# Pull the first batch of five examples to look at the feature names and values.
train_features, label_batch = next(iter(train.batch(5)))
print('Features:', list(train_features.keys()))
print('A batch of ages:', train_features['age'])
print('A batch of targets:', label_batch )

Features: ['age', 'boat', 'body', 'cabin', 'embarked', 'fare', 'home.dest', 'name', 'parch', 'pclass', 'sex', 'sibsp', 'ticket']
A batch of ages: tf.Tensor([30. 37. 28. 18. -1.], shape=(5,), dtype=float32)
A batch of targets: tf.Tensor([0 0 1 0 0], shape=(5,), dtype=int64)


## Keras layer to convert categorical into MHE

Convert a categorical column of a TF dataset (a single TF Tensor) into Multi Hot Encoding (MHE) columns (a single Tensor having multiple columns).

In [7]:
def get_category_encoding_layer(dataset, name, dtype, max_tokens=None, oov_token=None):
    """Create a callable that converts a column into Multi Hot Encoding.
    The callable functions as below.
    1. Convert string/integer in the target column (dataset[name]) into indices.
       e.g. ['cat', 'dog', 'fish', 'bird', 'ant'] into [0,1,2,3,4]
    2. Convert indices in the column into Multi Hot Encoding.

    Args:
        dataset:
            TF Dataset yielding (features, label) pairs whose features hold the
            target column. The lookup vocabulary is fitted against that column.
        name: The name that identifies the target column in the dataset.
        dtype:
            'string' to index the column with a StringLookup layer. Any other
            value indexes the column with an IntegerLookup layer.
        max_tokens:
            Only the top max_tokens most frequent tokens are used to create the vocabulary.
            All others will be treated as out-of-vocabulary (OOV).
        oov_token:
            Token that stands for out-of-vocabulary values. It must be a str for
            a string column and an int for an integer column. When it is None or
            of the wrong type, '[UNK]' (string) or -1 (integer) is used instead.

    Returns: Callable that takes a column tensor and returns its Multi Hot Encoding.
    """
    if dtype == 'string':
        # Create a layer that turns strings into integer indices.
        if not isinstance(oov_token, str):
            oov_token = '[UNK]'
        lookup = tf.keras.layers.StringLookup(max_tokens=max_tokens, oov_token=oov_token)
    else:
        # Otherwise, create a layer that turns integer values into integer indices.
        # The OOV token of an integer lookup has to be an integer itself.
        if not isinstance(oov_token, int):
            oov_token = -1
        lookup = tf.keras.layers.IntegerLookup(max_tokens=max_tokens, oov_token=oov_token)

    # Extract the target feature column by "name" from the "dataset"
    feature = dataset.map(lambda features, label: features[name])

    # Fit the lookup table (value -> index) to the values in the feature column.
    lookup.adapt(feature)

    # Encode the integer indices. Multi Hot to save the space.
    encoder = tf.keras.layers.CategoryEncoding(num_tokens=lookup.vocabulary_size(), output_mode='multi_hot')

    def f(column):
        """Apply multi-hot encoding"""
        return encoder(lookup(column))

    return f

In [8]:
# Check the conversion of the string categorical 'cabin' column into MHE
tensor_column_categorical_cabin = tf.constant(
    [[cabin.numpy()] for cabin in train_features['cabin']]
)

test_cabin_layer = get_category_encoding_layer(
    dataset=train, name='cabin', dtype='string', max_tokens=5
)
tensor_column_mhe_cabin = test_cabin_layer(tensor_column_categorical_cabin)

# Print each original value next to its encoding.
for raw, encoded in zip(tensor_column_categorical_cabin, tensor_column_mhe_cabin):
    print("{} : {}".format(raw.numpy(), encoded.numpy()))

[b'Unknown'] : [0. 1. 0. 0. 0.]
[b'Unknown'] : [0. 1. 0. 0. 0.]
[b'Unknown'] : [0. 1. 0. 0. 0.]
[b'Unknown'] : [0. 1. 0. 0. 0.]
[b'Unknown'] : [0. 1. 0. 0. 0.]


In [9]:
# Check the conversion of the integer categorical 'embarked' column into MHE
tensor_column_categorical_embarked = tf.constant(
    [[col.numpy()] for col in train_features['embarked']]
)

test_embarked_layer = get_category_encoding_layer(
    dataset=train, name='embarked', dtype='int64', max_tokens=None
)
tensor_column_mhe_embarked = test_embarked_layer(tensor_column_categorical_embarked)

# Print each original value next to its encoding.
for raw, encoded in zip(tensor_column_categorical_embarked, tensor_column_mhe_embarked):
    print("{} : {}".format(raw.numpy(), encoded.numpy()))

# The test objects are not needed any further.
del test_embarked_layer, tensor_column_categorical_embarked, tensor_column_mhe_embarked

[2] : [0. 1. 0. 0. 0.]
[2] : [0. 1. 0. 0. 0.]
[2] : [0. 1. 0. 0. 0.]
[2] : [0. 1. 0. 0. 0.]
[0] : [0. 0. 1. 0. 0.]


## Keras layer to normalize numeric values

In [10]:
def get_normalization_layer(name, dataset):
    """Create a Keras Normalization layer adapted to one numeric column.

    Args:
        name: The name that identifies the target column in the dataset.
        dataset: TF Dataset yielding (features, label) pairs whose features hold the target column.

    Returns: Normalization layer whose statistics were learned from the target column.
    """
    def select_column(features, label):
        """Keep only the target column of an example."""
        return features[name]

    # axis=None: a single mean/variance is computed over all the values.
    layer = tf.keras.layers.Normalization(axis=None)
    layer.adapt(dataset.map(select_column))
    return layer

In [11]:
# Check the normalization of the numeric 'age' column
tensor_column_categorical_age = tf.constant(
    [[col.numpy()] for col in train_features['age']]
)
test_norm_layer = get_normalization_layer('age', train)
tensor_column_mhe_age = test_norm_layer(tensor_column_categorical_age)

# Print each original value next to its normalized value.
for raw, normalized in zip(tensor_column_categorical_age, tensor_column_mhe_age):
    print("{} : {}".format(raw.numpy(), normalized.numpy()))

[30.] : [0.33245727]
[37.] : [0.7231315]
[28.] : [0.22083609]
[18.] : [-0.3372699]
[-1.] : [-1.3976712]


---
# Training

## Split data into training, validation, and test

In [12]:
batch_size = 32
# Shuffle the individual examples BEFORE batching. Shuffling after batching
# only changes the order of the batches and never mixes examples between them.
# The buffer is larger than the training set (1178 examples), so the shuffle
# covers the whole set.
shuffle_buffer_size = 2048
train = train.shuffle(buffer_size=shuffle_buffer_size).batch(batch_size).prefetch(1)
# The validation set is only evaluated, so its order does not matter: no shuffle.
validation = validation.batch(batch_size).prefetch(1)

## Keras model  

In [13]:
# Accumulators filled by the preprocessing cells below: the Keras Input of each
# feature, and the corresponding preprocessed (normalized / encoded) tensor.
all_inputs, encoded_features = [], []

### Horizontal Keras preprocessing layers for numerical normalization

In [14]:
# Numerical features: each gets its own Input and a Normalization layer
# adapted to the training data.
for header in ['age', 'fare']:
    normalization_layer = get_normalization_layer(name=header, dataset=train)
    numeric_col = tf.keras.Input(shape=(1,), name=header)
    encoded_numeric_col = normalization_layer(numeric_col)

    all_inputs.append(numeric_col)
    encoded_features.append(encoded_numeric_col)

### Horizontal Keras preprocessing layers for numerical categorical into MHE

In [15]:
# Integer categorical features: each gets its own Input and a category encoding
# fitted to the training data. All the columns share the same settings, so a
# single loop handles them.
for header in ['pclass', 'sex', 'sibsp', 'embarked']:
    numeric_input_feature = tf.keras.Input(shape=(1,), name=header, dtype='int64')
    numeric_category_encoding_layer = get_category_encoding_layer(
        name=header,
        dataset=train,
        dtype='int64',
        max_tokens=None
    )
    categorically_encoded_feature = numeric_category_encoding_layer(numeric_input_feature)
    all_inputs.append(numeric_input_feature)
    encoded_features.append(categorically_encoded_feature)

### Horizontal Keras preprocessing layers for String categorical into MHE

In [16]:
# String categorical features: each gets its own Input and a category encoding
# fitted to the training data.
string_categorical_columns = ['boat', 'cabin']

for column_name in string_categorical_columns:
    # Encoder built from the values of the column in the training data
    string_category_encoding_layer = get_category_encoding_layer(
        dataset=train,
        name=column_name,
        dtype='string',
        max_tokens=5,
        oov_token='[UNK]',
    )

    # Symbolic input of the column and its categorical encoding
    string_input_feature = tf.keras.Input(shape=(1,), name=column_name, dtype='string')
    categorically_encoded_feature = string_category_encoding_layer(string_input_feature)

    all_inputs.append(string_input_feature)
    encoded_features.append(categorically_encoded_feature)

In [17]:
# Join all the preprocessed features into a single tensor, then stack a small
# classifier on top: one hidden layer, dropout, and a single output unit
# without activation.
all_features = tf.keras.layers.concatenate(encoded_features)
x = tf.keras.layers.Dense(units=32, activation="relu")(all_features)
x = tf.keras.layers.Dropout(rate=0.5)(x)
output = tf.keras.layers.Dense(units=1)(x)

model = tf.keras.Model(inputs=all_inputs, outputs=output)

In [18]:
model.compile(
    optimizer='adam',
    # The model outputs raw logits (the last Dense layer has no activation).
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    # The accuracy has to threshold the logits at 0.0, which corresponds to a
    # probability of 0.5. The plain "accuracy" string would threshold the
    # logits themselves at 0.5. The metric keeps the name 'accuracy' so the
    # training logs keep the same keys.
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)]
)

## Keras Model Training

In [19]:
# Draw the model graph with the tensor shapes.
# `rankdir='LR'` lays the graph out horizontally (left to right).
tf.keras.utils.plot_model(
    model,
    show_shapes=True,
    rankdir="LR",
)

('You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) ', 'for plot_model/model_to_dot to work.')


In [23]:
# Stop once the validation loss has not improved for 5 epochs in a row and
# bring back the weights of the best epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

model.fit(
    x=train,
    validation_data=validation,
    epochs=50,
    callbacks=[early_stopping],
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Restoring model weights from the end of the best epoch.
Epoch 00017: early stopping


<keras.callbacks.History at 0x7f0680e604f0>

In [21]:
!mkdir -p model
model.save('model/titanic_classifier_model')
reloaded_model = tf.keras.models.load_model('model/titanic_classifier_model')

#del train, model

2021-11-01 21:53:17.796506: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: model/pet_classifier_model/assets


INFO:tensorflow:Assets written to: model/pet_classifier_model/assets


# Prediction