# TensorFlow Encoding Sentences

In [1]:
# importing packages
import tensorflow as tf
import nltk

from tensorflow import keras
from nltk import word_tokenize, sent_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

### Use the following document:

In [2]:
# the text to encode: three sentences, written as adjacent string literals
# (one group per sentence) that Python joins into a single string
document = (
    "Apple today unveiled completely redesigned Everyone Can Code curriculum "
    "to help introduce more elementary and middle school students to the "
    "world of coding. "
    "Now available, the new curriculum includes even more resources for "
    "teachers, a brand new guide for students and updated Swift Coding Club "
    "materials. "
    "Today millions of students worldwide already use Everyone Can Code "
    "curriculum to bring their ideas to life and develop important skills "
    "including creativity, collaboration and problem solving. "
)
document

'Apple today unveiled completely redesigned Everyone Can Code curriculum to help introduce more elementary and middle school students to the world of coding. Now available, the new curriculum includes even more resources for teachers, a brand new guide for students and updated Swift Coding Club materials. Today millions of students worldwide already use Everyone Can Code curriculum to bring their ideas to life and develop important skills including creativity, collaboration and problem solving. '

In [3]:
# split the document into a list of its individual sentences
# NOTE(review): sent_tokenize presumably needs NLTK's Punkt sentence-tokenizer data to be
# installed already (e.g. via nltk.download) -- confirm on a fresh environment
list_of_sentences = sent_tokenize(document)

In [4]:
# display the sentence list to check how the document was split
list_of_sentences

['Apple today unveiled completely redesigned Everyone Can Code curriculum to help introduce more elementary and middle school students to the world of coding.',
 'Now available, the new curriculum includes even more resources for teachers, a brand new guide for students and updated Swift Coding Club materials.',
 'Today millions of students worldwide already use Everyone Can Code curriculum to bring their ideas to life and develop important skills including creativity, collaboration and problem solving.']

### Tokenize the sentences

In [5]:
# build a tokenizer with no num_words limit, so every word in the corpus is kept;
# words that were not seen during fitting are encoded with the "<oov>" token
tokenizer = Tokenizer(oov_token="<oov>")

# learn the vocabulary from the sentences
# (the resulting word index is lowercase with punctuation removed)
tokenizer.fit_on_texts(list_of_sentences)

###  Create a word index

In [6]:
# word_index is the dictionary the tokenizer built: word (key) --> integer token (value)
# "<oov>" takes token 1, so the real words start at 2
word_index = tokenizer.word_index
print(word_index)
# number of entries in the dictionary (the "<oov>" entry is counted too)
print(len(word_index))

{'<oov>': 1, 'to': 2, 'and': 3, 'curriculum': 4, 'students': 5, 'today': 6, 'everyone': 7, 'can': 8, 'code': 9, 'more': 10, 'the': 11, 'of': 12, 'coding': 13, 'new': 14, 'for': 15, 'apple': 16, 'unveiled': 17, 'completely': 18, 'redesigned': 19, 'help': 20, 'introduce': 21, 'elementary': 22, 'middle': 23, 'school': 24, 'world': 25, 'now': 26, 'available': 27, 'includes': 28, 'even': 29, 'resources': 30, 'teachers': 31, 'a': 32, 'brand': 33, 'guide': 34, 'updated': 35, 'swift': 36, 'club': 37, 'materials': 38, 'millions': 39, 'worldwide': 40, 'already': 41, 'use': 42, 'bring': 43, 'their': 44, 'ideas': 45, 'life': 46, 'develop': 47, 'important': 48, 'skills': 49, 'including': 50, 'creativity': 51, 'collaboration': 52, 'problem': 53, 'solving': 54}
54


### Encode the sentences into sequences

In [7]:
# encode each sentence as a sequence: a list of the integer tokens of its words
sequences = tokenizer.texts_to_sequences(list_of_sentences)
print(sequences)

[[16, 6, 17, 18, 19, 7, 8, 9, 4, 2, 20, 21, 10, 22, 3, 23, 24, 5, 2, 11, 25, 12, 13], [26, 27, 11, 14, 4, 28, 29, 10, 30, 15, 31, 32, 33, 14, 34, 15, 5, 3, 35, 36, 13, 37, 38], [6, 39, 12, 5, 40, 41, 42, 7, 8, 9, 4, 2, 43, 44, 45, 2, 46, 3, 47, 48, 49, 50, 51, 52, 3, 53, 54]]


###  Add padding to the sequences so that they are of equal length

In [8]:
# pad the sequences with zeros so they form a matrix with equal-length rows
# pad_sequences pads at the front ("pre") by default; padding="post" is passed here
# so the zeros are appended after the tokens instead
padded = pad_sequences(sequences, padding="post")
print(padded)

[[16  6 17 18 19  7  8  9  4  2 20 21 10 22  3 23 24  5  2 11 25 12 13  0
   0  0  0]
 [26 27 11 14  4 28 29 10 30 15 31 32 33 14 34 15  5  3 35 36 13 37 38  0
   0  0  0]
 [ 6 39 12  5 40 41 42  7  8  9  4  2 43 44 45  2 46  3 47 48 49 50 51 52
   3 53 54]]


### Provide your own Apple-related sentence, and encode it into a sequence

In [9]:
# words that are not in the fitted vocabulary ("oov" = out of vocabulary) are encoded as 1
my_sentence = "Apple is an incredibly creative company with millions teachers and students using their products."
# texts_to_sequences takes a list of texts, so the single sentence is wrapped in a list
tokenizer.texts_to_sequences([my_sentence])

[[16, 1, 1, 1, 1, 1, 1, 39, 31, 3, 5, 1, 44, 1]]