## TextVectorization

In [1]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import TextVectorization
import string
import re

In [2]:
sample1 = 'We are learning AI'
sample2 = 'AI is a CS topic'

In [3]:
vocab_size = 8
sequence_length = 5

vectorize_layer = TextVectorization(max_tokens=vocab_size,
                                    output_mode="int",
                                    output_sequence_length=sequence_length)

vectorize_layer.adapt([sample1, sample2])

In [4]:
vocabs = vectorize_layer.get_vocabulary()
print(vocabs)

['', '[UNK]', 'ai', 'we', 'topic', 'learning', 'is', 'cs']


In [5]:
sample1_vector = vectorize_layer('We are learning AI')
print(sample1_vector)

tf.Tensor([3 1 5 2 0], shape=(5,), dtype=int64)


In [6]:
sample2_vector = vectorize_layer('AI is a CS topic')
print(sample2_vector)

tf.Tensor([2 6 1 7 4], shape=(5,), dtype=int64)


In [7]:
test_vector = vectorize_layer('Learning AI is difficult!')
print(test_vector) #???

tf.Tensor([5 2 6 1 0], shape=(5,), dtype=int64)


In [8]:
# decode

index_vocab = {i: v for i, v in enumerate(vocabs)}
print(index_vocab)

{0: '', 1: '[UNK]', 2: 'ai', 3: 'we', 4: 'topic', 5: 'learning', 6: 'is', 7: 'cs'}


In [11]:
a_vector = [5, 2, 6, 1, 0]
print(a_vector)

[5, 2, 6, 1, 0]


In [12]:
text = [index_vocab[i] for i in a_vector]
print(text)

['learning', 'ai', 'is', '[UNK]', '']


## Using [Start] Token

In [1]:
embedding_dim = 4
vocab_size = 8
sequence_length = 5

import tensorflow as tf
from tensorflow.keras.layers import Embedding

layer = Embedding(vocab_size, embedding_dim)

In [2]:
a_vector = [[4., 3.0, 1.0, 5.0, 4.0]]
a_vector = tf.convert_to_tensor(a_vector)
print(a_vector)

tf.Tensor([[4. 3. 1. 5. 4.]], shape=(1, 5), dtype=float32)


In [3]:
output = layer(a_vector)
print(output)

tf.Tensor(
[[[-0.02545398 -0.01913146  0.01199605 -0.03054712]
  [ 0.03966483 -0.01925439  0.0141543   0.0195548 ]
  [ 0.03752751  0.01604519  0.03197367 -0.01292641]
  [-0.01600847 -0.02929524  0.01094943  0.03886148]
  [-0.02545398 -0.01913146  0.01199605 -0.03054712]]], shape=(1, 5, 4), dtype=float32)


In [4]:
layer.weights[0]

<tf.Variable 'embedding/embeddings:0' shape=(8, 4) dtype=float32, numpy=
array([[ 0.04684255, -0.00200953, -0.04727422,  0.00881772],
       [ 0.03752751,  0.01604519,  0.03197367, -0.01292641],
       [-0.00208209,  0.03913813,  0.0018545 ,  0.01120707],
       [ 0.03966483, -0.01925439,  0.0141543 ,  0.0195548 ],
       [-0.02545398, -0.01913146,  0.01199605, -0.03054712],
       [-0.01600847, -0.02929524,  0.01094943,  0.03886148],
       [-0.02353916, -0.04173622,  0.01859237, -0.00422686],
       [-0.02809455, -0.03801959,  0.04050454,  0.02462274]],
      dtype=float32)>