# Processing Language

## Defining understanding as tokenization

In [1]:
import numpy as np

# Toy corpus: three short sentences reused by every tokenization example below.
texts = [
    "My dog gets along with cats",
    "That cat is vicious",
    "My dog is happy when it is lunch",
]

## Putting all the documents into a bag

In [2]:
# Collect the case-insensitive vocabulary of the corpus.
# str.split() with no argument splits on any run of whitespace and never
# yields empty strings, so repeated, leading or trailing spaces cannot add a
# bogus "" entry and inflate the count (split(" ") would).
unique_words = {word.lower() for phrase in texts for word in phrase.split()}
print(f"There are {len(unique_words)} unique words")

There are 14 unique words


In [3]:
from keras.preprocessing.text import Tokenizer

# One slot more than the number of distinct words: the word index printed
# after fitting starts at 1, so column 0 of the matrices stays unused.
vocabulary_size = 1 + len(unique_words)
tokenizer = Tokenizer(num_words=vocabulary_size)

Using TensorFlow backend.


In [4]:
# Learn the vocabulary from the corpus, then show the index -> word mapping.
# In the printed mapping the numbering starts at 1 and the most repeated
# word ('is') comes first.
tokenizer.fit_on_texts(texts)
index_to_word = tokenizer.index_word
print(index_to_word)

{1: 'is', 2: 'my', 3: 'dog', 4: 'gets', 5: 'along', 6: 'with', 7: 'cats', 8: 'that', 9: 'cat', 10: 'vicious', 11: 'happy', 12: 'when', 13: 'it', 14: 'lunch'}


In [5]:
# Binary bag-of-words: one row per text, one column per word index.
# A 1 only marks that the word occurs in the text; repeats are not counted
# ('is' appears twice in the third text but its cell is still 1).
presence_matrix = tokenizer.texts_to_matrix(texts, mode="binary")
print(presence_matrix)

[[0. 0. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0.]
 [0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1.]]


In [6]:
# Same document/word layout as the binary matrix, but each cell holds a
# TF-IDF weight; values are rounded to one decimal to keep the printout compact.
tfidf_matrix = tokenizer.texts_to_matrix(texts, mode='tfidf')
print(np.round(tfidf_matrix, decimals=1))

[[0.  0.  0.7 0.7 0.9 0.9 0.9 0.9 0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.7 0.  0.  0.  0.  0.  0.  0.9 0.9 0.9 0.  0.  0.  0. ]
 [0.  1.2 0.7 0.7 0.  0.  0.  0.  0.  0.  0.  0.9 0.9 0.9 0.9]]


In [7]:
# Encode every text as the list of its word indices, in reading order
# (unlike the matrices above, word order and repetitions are preserved).
sequences = tokenizer.texts_to_sequences(texts)
print(sequences)

[[2, 3, 4, 5, 6, 7], [8, 9, 1, 10], [2, 3, 1, 11, 12, 13, 1, 14]]
