# TensorFlow - Tokenizing Review

In [1]:
# importing packages
import pandas as pd

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Get data

In [2]:
# Load the Yelp ratings table (44530 rows in the recorded run) and pull the columns
# used by this notebook out as NumPy arrays.
yelp_ratings = pd.read_csv("./data/yelp_ratings.csv")

# `.to_numpy()` always yields an ndarray, whereas `.values` can hand back a pandas
# extension array depending on the column dtype.
yelp_reviews = yelp_ratings["text"].to_numpy()  # raw review text
yelp_reviews_labels = yelp_ratings["sentiment"].to_numpy()  # "sentiment" column
yelp_reviews_stars = yelp_ratings["stars"].to_numpy()  # "stars" column

## Tokenize sentences

In [3]:
# Vocabulary cap handed to the Tokenizer. Keras only emits ids 1..NUM_WORDS-1, and
# id 1 is reserved for the out-of-vocabulary token, so at most NUM_WORDS - 2 real words
# keep their own id; every other word is encoded as OOV_TOKEN.
NUM_WORDS = 1000
OOV_TOKEN = "<oov>"

tokenizer = Tokenizer(num_words=NUM_WORDS, oov_token=OOV_TOKEN)

# Build the vocabulary from the corpus. With the Tokenizer defaults, text is lower-cased
# and punctuation is filtered out before word counts are collected.
tokenizer.fit_on_texts(yelp_reviews)

## View the word index

In [4]:
# `word_index` maps each word seen during fitting to its integer id (lower id = more
# frequent word). It holds the whole fitted vocabulary, not only the words under the
# `num_words` cap, so report its size and preview the first entries instead of
# printing the entire dictionary.
word_index = tokenizer.word_index
print(f"vocabulary size: {len(word_index)}")
dict(list(word_index.items())[:20])



## Generate sequences

In [5]:
# Encode every review as a list of integer word ids; words outside the tokenizer's
# kept vocabulary come back as the OOV id.
sequences = tokenizer.texts_to_sequences(texts=yelp_reviews)

## Sample text

In [6]:
# Look at the raw text of one review (position 10) before it is encoded.
sample_idx = 10
yelp_reviews[sample_idx]

"Wow. So surprised at the one and two star reviews!  We started with the most tender calamari. Although the marinara sauce was a bit bland, but a touch of salt made it just right. My husband had the veal with peppers and said it was so delicious and tender. The mashed potatoes were perfect. I had the salmon Diablo which was also delicious. Our salad was beautiful! Dressing was served on the salad and it was a nice amount. We ended our delicious meal with a piece of tiramisu. Our server Matt was right on!! Very pleasant and knowledgeable about the menu. Our appetizer, salad and entrees were timed perfectly. I love salad and did not mind that my entree was served while I was still eating it! No problem it let my dinner cool to just the right temp for me to eat it comfortably. \nI wonder sometimes if people just don't appreciate relaxing and taking time to eat a wonderful and beautifully prepared meal.  A wonderful atmosphere. So relaxing. The chairs are super comfortable too!!! We will c

## Text to sequence

In [7]:
# Show the integer encoding produced for the review at position 10.
sample_sequence = sequences[10]
print(sample_sequence)

[638, 25, 688, 27, 2, 48, 3, 135, 406, 345, 18, 458, 14, 2, 177, 788, 1, 536, 2, 1, 183, 7, 5, 212, 830, 21, 5, 958, 8, 974, 118, 11, 47, 138, 13, 321, 23, 2, 1, 14, 1, 3, 128, 11, 7, 25, 115, 3, 788, 2, 1, 677, 28, 250, 4, 23, 2, 730, 1, 68, 7, 69, 115, 43, 208, 7, 466, 1, 7, 407, 20, 2, 208, 3, 11, 7, 5, 86, 552, 18, 543, 43, 115, 213, 14, 5, 857, 8, 1, 43, 245, 1, 7, 138, 20, 36, 720, 3, 660, 61, 2, 129, 43, 683, 208, 3, 1, 28, 1, 537, 4, 90, 208, 3, 94, 24, 563, 16, 13, 1, 7, 407, 163, 4, 7, 157, 429, 11, 67, 465, 11, 317, 13, 218, 411, 6, 47, 2, 138, 1, 10, 32, 6, 175, 11, 1, 4, 1, 742, 41, 117, 47, 84, 1, 1, 3, 615, 44, 6, 175, 5, 354, 3, 1, 728, 213, 5, 354, 266, 25, 1, 2, 1, 26, 191, 579, 91, 18, 58, 1, 31, 51, 168, 11, 5, 113, 84, 88, 59, 76, 2, 345, 5, 1, 8, 1, 1, 1, 135, 115, 1, 3, 444, 10, 1, 82, 14, 11]


## Add padding

In [8]:
# Zero-pad every sequence to the length of the longest one so they stack into a single
# 2-D integer array. `pad_sequences` pads at the front ("pre") unless told otherwise;
# "post" is requested here so the zeros are appended after the real tokens.
padded = pad_sequences(
    sequences,
    padding="post",
)

print(padded[10])

[638  25 688 ...   0   0   0]


In [9]:
# Dimensions of the padded matrix: (number of reviews, padded sequence length).
padded_shape = padded.shape
padded_shape

(44530, 1003)