# Word indexing

Convert words in a corpus into integer word indices as a preprocess to vectorize a word (1-gram), n-grams, or  a sentence.

In [2]:
import numpy as np
import tensorflow as tf

---

# Simple implementation

In [34]:
from typing import (
    Dict,
    List
)
import re
import string

def standardize(text: str) -> str:
    """Standardize the text
    1. Lower the string
    2. Remove punctuation
    3. Remove white space, new lines, carriage returns
    Args:
        text: sequence of words
    Returns
        standardized: standardized text
    """
    assert isinstance(text, str) and len(text) > 0
    replacement = " "
    pattern: str = '[%s%s]+' % (re.escape(string.punctuation), r"\s")

    standardized: str = re.compile(pattern).sub(repl=replacement, string=text).lower().strip()
    assert len(standardized) > 0, f"Text [{text}] needs words other than punctuations."
    return standardized


def word_indexing(corpus: str):
    """Generate word indices
    Args:
        corpus: A string including sentences to process.
    Returns:
        vocabulary: unique words in the corpus
        id_to_word: word index to word mapping
        word_to_id: word to word index mapping
    """
    words = standardize(corpus).split()
    vocabulary = ['UNK'] + list(set(words))
    id_to_word: Dict[int, str] = dict(enumerate(vocabulary))
    word_to_id: Dict[str, int] = dict(zip(id_to_word.values(), id_to_word.keys()))

    return words, vocabulary, id_to_word, word_to_id


def text_to_sequence(
        corpus,
        word_to_id: dict
) -> List[str]:
    """Generate integer sequence word
    Args:
        corpus: A string including sentences to process.
        word_to_id: word to integer index mapping
    Returns:
        sequence:
            word indices to every word in the originlal corpus as as they appear in it.
            The objective of sequence is to preserve the original corpus but as numerical indices.
    """
    return [word_to_id.get(w, 0) for w in standardize(corpus).split()]


In [35]:
corpus = """
The ruler of Britain, enters his throne room and announces his plan to 
divide the kingdom among his three daughters. He intends to give up the 
responsibilities of government and spend his old age visiting his children. 
He commands his daughters to say which of them loves him the most, 
promising to give the greatest share to that daughter."""

words, vocabulary, id_to_word, word_to_id = word_indexing(corpus)
print(vocabulary)
text_to_sequence(corpus, word_to_id)

['UNK', 'announces', 'promising', 'greatest', 'room', 'up', 'children', 'his', 'visiting', 'commands', 'government', 'kingdom', 'throne', 'responsibilities', 'britain', 'the', 'share', 'daughters', 'him', 'them', 'plan', 'loves', 'spend', 'among', 'to', 'say', 'divide', 'most', 'and', 'give', 'old', 'that', 'enters', 'intends', 'ruler', 'of', 'daughter', 'he', 'age', 'three', 'which']


[15,
 34,
 35,
 14,
 32,
 7,
 12,
 4,
 28,
 1,
 7,
 20,
 24,
 26,
 15,
 11,
 23,
 7,
 39,
 17,
 37,
 33,
 24,
 29,
 5,
 15,
 13,
 35,
 10,
 28,
 22,
 7,
 30,
 38,
 8,
 7,
 6,
 37,
 9,
 7,
 17,
 24,
 25,
 40,
 35,
 19,
 21,
 18,
 15,
 27,
 2,
 24,
 29,
 15,
 3,
 16,
 24,
 31,
 36]

# Keras TextVectorization layer
Use TextVectorization to create word indices to the words in a corpus.

> transforms a batch of strings into either 
> * a list of token indices (one sample = 1D tensor of integer token indices) or 
> * a dense representation (one sample = 1D tensor of float values representing data about the sample's 


* [TextVectorization layer](https://keras.io/api/layers/preprocessing_layers/core_preprocessing_layers/text_vectorization/)

```
tf.keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=None,
    standardize="lower_and_strip_punctuation",
    split="whitespace",
    ngrams=None,
    output_mode="int",
    output_sequence_length=None,
    pad_to_max_tokens=True,
    vocabulary=None,
    **kwargs
)
```

1. standardize each sample (usually lowercasing + punctuation stripping) 
2. split each sample into substrings (usually words) 
3. recombine substrings into tokens (usually ngrams) 
4. index tokens (associate a unique int value with each token) 
5. transform each sample using this index, either into a vector of ints or a dense float vector.


## Standardization

lowercasing + punctuation stripping by default as ```standardize="lower_and_strip_punctuation"```.

## Output modes
### Integer index

Index to the word position placed in a sequence.

In [10]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
corpus = np.array([
    ["This is the 1st sample."], 
    ["And here's the 2nd sample."]
])
print(f"corpus is \n{corpus}\n")

# --------------------------------------------------------------------------------
# Fit to the words in the corpus
# --------------------------------------------------------------------------------
vectorizer = TextVectorization(
    output_mode="int",
    ngrams=None           # 1 word = 1 token
)
vectorizer.adapt(corpus)

# --------------------------------------------------------------------------------
# Indices to the words
# --------------------------------------------------------------------------------
word_indices = vectorizer(corpus)
word_indices = tf.cast(word_indices, dtype=tf.int32)

print(f"vocabulary:{vectorizer.get_vocabulary()}\n")
print(f"word index sequence of the corpus:\n{word_indices}\n")

# --------------------------------------------------------------------------------
# Invert the indices to words for the first sentence
# --------------------------------------------------------------------------------
index_to_word = tf.reshape(tf.constant(vectorizer.get_vocabulary()), (-1, 1))
print(f"index_to_word:\n{index_to_word}\n")

first_sentence_indeces = word_indices[0][:, tf.newaxis]
print(
    "First sentence in the corpus is %s" 
    % tf.gather_nd(index_to_word, indices=first_sentence_indeces)
)


corpus is 
[['This is the 1st sample.']
 ["And here's the 2nd sample."]]

vocabulary:['', '[UNK]', 'the', 'sample', 'this', 'is', 'heres', 'and', '2nd', '1st']

word index sequence of the corpus:
[[4 5 2 9 3]
 [7 6 2 8 3]]

index_to_word:
[[b'']
 [b'[UNK]']
 [b'the']
 [b'sample']
 [b'this']
 [b'is']
 [b'heres']
 [b'and']
 [b'2nd']
 [b'1st']]

First sentence in the corpus is tf.Tensor(
[[b'this']
 [b'is']
 [b'the']
 [b'1st']
 [b'sample']], shape=(5, 1), dtype=string)


### Integer index to n-gram

Token is up-to N grams, e.g. 

Example: for a corpus "I am a cat" where N=2, both 1-gram and 2-grams will be tokens. Indices include both to the 1-gram and 2-gram tokens.

* 1-gram tokens = ("i", "am", "a", "cat")
* 2-gram tokens = ("i am", "am a", "a cat")

In [11]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
corpus = np.array([
    ["This is the 1st sample."], 
    ["And here's the 2nd sample."]
])
print(f"corpus is \n{corpus}\n")

# --------------------------------------------------------------------------------
# Fit to the words in the corpus
# --------------------------------------------------------------------------------
vectorizer = TextVectorization(
    output_mode="int", 
    ngrams=2             # 1 token == 1-gram or 2-grams
)
vectorizer.adapt(corpus)

# --------------------------------------------------------------------------------
# Indices to the words
# --------------------------------------------------------------------------------
word_indices = vectorizer(corpus)
word_indices = tf.cast(word_indices, dtype=tf.int32)

print(f"vocabulary:{vectorizer.get_vocabulary()}\n")
print(f"word index sequence of the corpus:\n{word_indices}\n")

# --------------------------------------------------------------------------------
# Invert the indices to words for the first sentence
# --------------------------------------------------------------------------------
index_to_word = tf.reshape(tf.constant(vectorizer.get_vocabulary()), (-1, 1))
print(f"index_to_word:\n{index_to_word}\n")

first_sentence_indeces = word_indices[0][:, tf.newaxis]
print(
    "First sentence in the corpus is %s" 
    % tf.gather_nd(index_to_word, indices=first_sentence_indeces)
)


corpus is 
[['This is the 1st sample.']
 ["And here's the 2nd sample."]]

vocabulary:['', '[UNK]', 'the', 'sample', 'this is', 'this', 'the 2nd', 'the 1st', 'is the', 'is', 'heres the', 'heres', 'and heres', 'and', '2nd sample', '2nd', '1st sample', '1st']

word index sequence of the corpus:
[[ 5  9  2 17  3  4  8  7 16]
 [13 11  2 15  3 12 10  6 14]]

index_to_word:
[[b'']
 [b'[UNK]']
 [b'the']
 [b'sample']
 [b'this is']
 [b'this']
 [b'the 2nd']
 [b'the 1st']
 [b'is the']
 [b'is']
 [b'heres the']
 [b'heres']
 [b'and heres']
 [b'and']
 [b'2nd sample']
 [b'2nd']
 [b'1st sample']
 [b'1st']]

First sentence in the corpus is tf.Tensor(
[[b'this']
 [b'is']
 [b'the']
 [b'1st']
 [b'sample']
 [b'this is']
 [b'is the']
 [b'the 1st']
 [b'1st sample']], shape=(9, 1), dtype=string)


### OHE to 2-grams 

In [14]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

corpus = np.array([["This is the 1st sample."], ["And here's the 2nd sample."]])
vectorizer = TextVectorization(output_mode="binary", ngrams=2)
vectorizer.adapt(corpus)

# --------------------------------------------------------------------------------
# OHE bit-map to tell if the corresponding token is in the sentence.
# --------------------------------------------------------------------------------
# [0, 1, 1, ... ] means the sentence includes the 1st and 2nd but not 0th token.
# 0-th: [UNK]
# 1st: 'the'
# 2nd: 'sample'
# --------------------------------------------------------------------------------
sequence = vectorizer(corpus)
print(f"integer encoded corpus:\n{sequence}\n")
print(f"vocabulary:\n{vectorizer.get_vocabulary()}")

integer encoded corpus:
[[0. 1. 1. 1. 1. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 1. 1.]
 [0. 1. 1. 0. 0. 1. 0. 0. 0. 1. 1. 1. 1. 1. 1. 0. 0.]]

vocabulary:
['[UNK]', 'the', 'sample', 'this is', 'this', 'the 2nd', 'the 1st', 'is the', 'is', 'heres the', 'heres', 'and heres', 'and', '2nd sample', '2nd', '1st sample', '1st']
