In [4]:
import tensorflow as tf
import tensorflow.keras as keras

## One-Hot Encoding

In [10]:
# Known categories; each string is paired with its position in this list.
vocab = ["<1H OCEAN", "INLAND", "NEAR OCEAN", "NEAR BAY", "ISLAND"]
indices = tf.range(0, len(vocab), dtype=tf.int64)
# Initializer mapping category strings (keys) to int64 indices (values).
table_init = tf.lookup.KeyValueTensorInitializer(keys=vocab, values=indices)
table_init

<tensorflow.python.ops.lookup_ops.KeyValueTensorInitializer at 0x7f4025743e50>

In [13]:
# Print only the public attributes of the initializer (skip underscore-prefixed names).
for attr_name in dir(table_init):
    if attr_name.startswith("_"):
        continue
    print(attr_name)

initialize
key_dtype
value_dtype


In [14]:
# Number of extra buckets reserved for keys that are not in `vocab`.
n_oov_buckets = 2
table = tf.lookup.StaticVocabularyTable(
    initializer=table_init,
    num_oov_buckets=n_oov_buckets,
)
table

<tensorflow.python.ops.lookup_ops.StaticVocabularyTable at 0x7f403431f150>

In [15]:
# Print only the public attributes of the lookup table (skip underscore-prefixed names).
for attr_name in dir(table):
    if attr_name.startswith("_"):
        continue
    print(attr_name)

key_dtype
lookup
name
resource_handle
size
value_dtype


In [16]:
# Sample inputs: exact vocabulary matches, a lower-case variant, and an unknown category.
categories = tf.constant(
    [
        "NEAR BAY",
        "near bay",
        "DESERT",
        "INLAND",
        "INLAND",
    ]
)
# Map each string to its vocabulary index (or to an OOV bucket index).
cat_indices = table.lookup(keys=categories)
cat_indices

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([3, 6, 5, 1, 1])>

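Note that the lookup is **case-sensitive**: `"near bay"` does not match `"NEAR BAY"` (index 3) and is assigned to an OOV bucket (index 6) instead.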

In [17]:
# One column per vocabulary entry plus one per OOV bucket.
cat_one_hot = tf.one_hot(
    indices=cat_indices,
    depth=n_oov_buckets + len(vocab),
)
cat_one_hot

<tf.Tensor: shape=(5, 7), dtype=float32, numpy=
array([[0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0.]], dtype=float32)>

**(?)** Is `keras.layers.TextVectorization` available yet?

In [20]:
# Show the installed TensorFlow version.
tf.__version__

'2.4.1'

In [21]:
# Names exported by keras.layers that start with "T".
[name for name in dir(keras.layers) if name.startswith("T")]

['ThresholdedReLU', 'TimeDistributed']

In [22]:
# Names exported by the experimental preprocessing namespace that start with "T".
[
    name
    for name in dir(keras.layers.experimental.preprocessing)
    if name.startswith("T")
]

['TextVectorization']

**(?)** Try the `adapt` and `call` methods of `TextVectorization`.


In [23]:
# Instantiate TextVectorization (from the experimental preprocessing namespace) with no arguments.
tv_layer = keras.layers.experimental.preprocessing.TextVectorization()

In [24]:
# Only displays the bound `adapt` method; it is not called here, so nothing is adapted yet.
tv_layer.adapt

<bound method TextVectorization.adapt of <tensorflow.python.keras.layers.preprocessing.text_vectorization.TextVectorization object at 0x7f40244c0dd0>>

## Embeddings


### Manually
- randomly initialized
- one row per category and per oov bucket
- one column per embedding dimension

In [28]:
# Size of each embedding vector.
embedding_dim = 2
# Fix the global seed before drawing the initial values.
tf.random.set_seed(42)
# One row per category and per OOV bucket, one column per embedding dimension.
embedding_init = tf.random.uniform(
    shape=[len(vocab) + n_oov_buckets, embedding_dim]
)
embedding_init

<tf.Tensor: shape=(7, 2), dtype=float32, numpy=
array([[0.6645621 , 0.44100678],
       [0.3528825 , 0.46448255],
       [0.03366041, 0.68467236],
       [0.74011743, 0.8724445 ],
       [0.22632635, 0.22319686],
       [0.3103881 , 0.7223358 ],
       [0.13318717, 0.5480639 ]], dtype=float32)>

In [29]:
# Wrap the initial values in a tf.Variable.
embedding_matrix = tf.Variable(initial_value=embedding_init)
embedding_matrix

<tf.Variable 'Variable:0' shape=(7, 2) dtype=float32, numpy=
array([[0.6645621 , 0.44100678],
       [0.3528825 , 0.46448255],
       [0.03366041, 0.68467236],
       [0.74011743, 0.8724445 ],
       [0.22632635, 0.22319686],
       [0.3103881 , 0.7223358 ],
       [0.13318717, 0.5480639 ]], dtype=float32)>

In [30]:
# Re-display the category strings that were passed to table.lookup.
categories

<tf.Tensor: shape=(5,), dtype=string, numpy=
array([b'NEAR BAY', b'near bay', b'DESERT', b'INLAND', b'INLAND'],
      dtype=object)>

In [31]:
# Re-display the indices returned by table.lookup(categories).
cat_indices

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([3, 6, 5, 1, 1])>

In [32]:
# Select the embedding_matrix row for each category index.
tf.nn.embedding_lookup(params=embedding_matrix, ids=cat_indices)

<tf.Tensor: shape=(5, 2), dtype=float32, numpy=
array([[0.74011743, 0.8724445 ],
       [0.13318717, 0.5480639 ],
       [0.3103881 , 0.7223358 ],
       [0.3528825 , 0.46448255],
       [0.3528825 , 0.46448255]], dtype=float32)>

### Using `keras`
More precisely, `keras.layers.Embedding`.

In [35]:
# Reset the global seed before the layer is built on its first call.
tf.random.set_seed(42)
# input_dim = vocabulary entries + OOV buckets; output_dim = embedding size.
embedding = keras.layers.Embedding(len(vocab) + n_oov_buckets, embedding_dim)
embedding(cat_indices)

<tf.Tensor: shape=(5, 2), dtype=float32, numpy=
array([[ 0.02401174,  0.03724445],
       [-0.03668128,  0.00480639],
       [-0.01896119,  0.02223358],
       [-0.01471175, -0.00355174],
       [-0.01471175, -0.00355174]], dtype=float32)>