In [1]:
import string
import numpy as np
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [2]:
SAMPLES = ['The cat sat on the mat.', 'The dog ate my homework.']

## 1. Word-level one-hot encoding toy example

In [3]:
token_index = {}
for sample in SAMPLES:
    for word in sample.split():
        if word not in token_index:
            token_index[word] = len(token_index) + 1

print("token_index = {}".format(token_index))

token_index = {'The': 1, 'cat': 2, 'sat': 3, 'on': 4, 'the': 5, 'mat.': 6, 'dog': 7, 'ate': 8, 'my': 9, 'homework.': 10}


In [4]:
max_length = 10
results = np.zeros((len(SAMPLES), max_length, max(token_index.values()) + 1))

for i, sample in enumerate(SAMPLES):
    for j, word in list(enumerate(sample.split()))[:max_length]:
        index = token_index.get(word)
        results[i, j, index] = 1

print("results.shape = {}".format(results.shape))
print("results = \n{}".format(results))

results.shape = (2, 10, 11)
results = 
[[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]

 [[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]]


## 2. Character-level one-hot encoding toy example

In [5]:
chars = string.printable
token_index = dict(zip(chars, range(1, len(chars) + 1)))
print("token_index = {}".format(token_index))

token_index = {'0': 1, '1': 2, '2': 3, '3': 4, '4': 5, '5': 6, '6': 7, '7': 8, '8': 9, '9': 10, 'a': 11, 'b': 12, 'c': 13, 'd': 14, 'e': 15, 'f': 16, 'g': 17, 'h': 18, 'i': 19, 'j': 20, 'k': 21, 'l': 22, 'm': 23, 'n': 24, 'o': 25, 'p': 26, 'q': 27, 'r': 28, 's': 29, 't': 30, 'u': 31, 'v': 32, 'w': 33, 'x': 34, 'y': 35, 'z': 36, 'A': 37, 'B': 38, 'C': 39, 'D': 40, 'E': 41, 'F': 42, 'G': 43, 'H': 44, 'I': 45, 'J': 46, 'K': 47, 'L': 48, 'M': 49, 'N': 50, 'O': 51, 'P': 52, 'Q': 53, 'R': 54, 'S': 55, 'T': 56, 'U': 57, 'V': 58, 'W': 59, 'X': 60, 'Y': 61, 'Z': 62, '!': 63, '"': 64, '#': 65, '$': 66, '%': 67, '&': 68, "'": 69, '(': 70, ')': 71, '*': 72, '+': 73, ',': 74, '-': 75, '.': 76, '/': 77, ':': 78, ';': 79, '<': 80, '=': 81, '>': 82, '?': 83, '@': 84, '[': 85, '\\': 86, ']': 87, '^': 88, '_': 89, '`': 90, '{': 91, '|': 92, '}': 93, '~': 94, ' ': 95, '\t': 96, '\n': 97, '\r': 98, '\x0b': 99, '\x0c': 100}


In [6]:
max_length = 50
results = np.zeros((len(SAMPLES), max_length, max(token_index.values()) + 1))

for i, sample in enumerate(SAMPLES):
    for j, char in list(enumerate(sample))[:max_length]:
        index = token_index.get(char)
        results[i, j, index] = 1
        
print("results.shape = {}".format(results.shape))
print("results = \n{}".format(results))
print("first sample indices = {}".format(np.argmax(results[0], axis=1)))

results.shape = (2, 50, 101)
results = 
[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]]
first sample indices = [56 18 15 95 13 11 30 95 29 11 30 95 25 24 95 30 18 15 95 23 11 30 76  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0]


## 3. Word-level one-hot encoding with Keras

In [7]:
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(SAMPLES)
print("word_index = {}".format(tokenizer.word_index))

word_index = {'the': 1, 'cat': 2, 'sat': 3, 'on': 4, 'mat': 5, 'dog': 6, 'ate': 7, 'my': 8, 'homework': 9}


In [8]:
seqs = tokenizer.texts_to_sequences(SAMPLES)
print("seqs = {}".format(seqs))

seqs = [[1, 2, 3, 4, 1, 5], [1, 6, 7, 8, 9]]


In [9]:
one_hot = tokenizer.texts_to_matrix(SAMPLES, mode='binary')
print("one_hot = \n{}".format(one_hot))

one_hot = 
[[0. 1. 1. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]]


## 4.  Word-level one-hot encoding with hashing

In [10]:
dims = 1000
max_length = 10
results = np.zeros((len(SAMPLES), max_length, dims))

for i, sample in enumerate(SAMPLES):
    for j, word in list(enumerate(sample.split()))[:max_length]:
        index = abs(hash(word)) % dims
        results[i, j, index] = 1

print("results.shape = {}".format(results.shape))
print("results = \n{}".format(results))
print("first sample indices = {}".format(np.argmax(results[0], axis=1)))

results.shape = (2, 10, 1000)
results = 
[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]]
first sample indices = [708 723 557 445  68 677   0   0   0   0]
