In [1]:
import numpy as np

In [2]:
# Toy corpus: three short sentences that every later cell encodes.
samples = [
    'The cat sat on the mat',
    'The dog ate the homework',
    'The flight to Denver has a cat and a dog',
]

In [3]:
# Build the vocabulary: map every distinct whitespace-separated token in
# `samples` to an integer index, printing each token the first time it is seen.
# Indices start at 1, so index 0 is never assigned to a real token.
# Matching is case-sensitive ('The' and 'the' receive separate indices).
tokens = {}
for sentence in samples:
    for word in sentence.split():
        if word in tokens:
            continue  # already indexed; keep the first-seen index
        idx = len(tokens) + 1
        tokens[word] = idx
        print('{word}:{idx}'.format(word=word, idx=idx))

The:1
cat:2
sat:3
on:4
the:5
mat:6
dog:7
ate:8
homework:9
flight:10
to:11
Denver:12
has:13
a:14
and:15


In [4]:
# One-hot encode every sample into a (sample, word position, token index) tensor.
max_words = 20  # only the first `max_words` words of each sentence are encoded

vocab_size = len(tokens) + 1  # +1 because token indices start at 1; slot 0 stays unused
one_hot_encoding = np.zeros((len(samples), max_words, vocab_size))
for sentence_idx, sentence in enumerate(samples):
    words = sentence.split()[:max_words]
    for word_idx, word in enumerate(words):
        # Positions past the end of a sentence are left as all-zero rows.
        one_hot_encoding[sentence_idx, word_idx, tokens[word]] = 1

In [5]:
# Invert the vocabulary (index -> token) so encodings can be mapped back to words.
idx_to_token = dict(zip(tokens.values(), tokens.keys()))
# Index 0 is not assigned to any token; map it to the empty string.
idx_to_token[0] = ''

In [6]:
# Convert each one-hot encoded sample back into a sentence and print it.
for sentence_encoding in one_hot_encoding:
    decoded_words = []
    for word_encoding in sentence_encoding:
        # Positions of the non-zero entries in this word's vector.
        (hot_positions,) = word_encoding.nonzero()
        if hot_positions.size == 0:
            # All-zero vector: no word at this position, nothing to decode.
            continue
        decoded_words.append(idx_to_token[hot_positions[0]])
    print(' '.join(decoded_words))

The cat sat on the mat
The dog ate the homework
The flight to Denver has a cat and a dog


### Using Keras for One-Hot Encoding

In [7]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [8]:
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [9]:
# Create a tokenizer capped at num_words=20, then build its vocabulary from `samples`.
# NOTE(review): per the Keras docs the cap applies to the most frequent words and
# index 0 is reserved, so fewer than 20 distinct words are usable — confirm if the cap matters.
tokenizer = Tokenizer(num_words = 20) # vocabulary cap is 20 here (this comment previously said 1000)
tokenizer.fit_on_texts(samples)

In [10]:
# Convert each text into a list of integer word indices (one list per sample, in word order)
tokenizer.texts_to_sequences(samples)

[[1, 2, 5, 6, 1, 7], [1, 3, 8, 1, 9], [1, 10, 11, 12, 13, 4, 2, 14, 4, 3]]

In [11]:
# Encode each sample as a single 0/1 row with a 1 at the index of every word it contains.
# Note: this is one multi-hot (bag-of-words) vector per sample, not a one-hot vector per
# word, so word order and repeat counts are not represented.
tokenizer.texts_to_matrix(samples)

array([[0., 1., 1., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 1., 0., 1., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 0.,
        0., 0., 0., 0.]])

In [12]:
# Inspect the tokenizer's word -> integer index mapping. Keys appear lowercased
# (e.g. 'denver', 'the'), unlike the case-sensitive manual vocabulary built earlier.
tokenizer.word_index

{'a': 4,
 'and': 14,
 'ate': 8,
 'cat': 2,
 'denver': 12,
 'dog': 3,
 'flight': 10,
 'has': 13,
 'homework': 9,
 'mat': 7,
 'on': 6,
 'sat': 5,
 'the': 1,
 'to': 11}