In [48]:
import tensorflow as tf
import tensorflow_datasets as tfds 
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Import data

In [49]:
# Load the IMDB reviews dataset together with its metadata object.
#   as_supervised=True -> every record is yielded as an (input, label) pair
#   with_info=True     -> tfds.load also returns the metadata object `info`
imdb, info = tfds.load(
    "imdb_reviews",
    as_supervised=True,
    with_info=True,
)
# `imdb` is a dict mapping each split name to a <tf.data.Dataset>;
# the bare expression below displays it.
imdb

{'test': <PrefetchDataset shapes: ((), ()), types: (tf.string, tf.int64)>,
 'train': <PrefetchDataset shapes: ((), ()), types: (tf.string, tf.int64)>,
 'unsupervised': <PrefetchDataset shapes: ((), ()), types: (tf.string, tf.int64)>}

In [50]:
# Print the dataset description, its feature schema and its splits, one per line.
print(info.description, info.features, info.splits, sep="\n")

Large Movie Review Dataset.
This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.
FeaturesDict({
    'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
    'text': Text(shape=(), dtype=tf.string),
})
{'test': <tfds.core.SplitInfo num_examples=25000>, 'train': <tfds.core.SplitInfo num_examples=25000>, 'unsupervised': <tfds.core.SplitInfo num_examples=50000>}


* Each of the 25000 records in the <tf.data.Dataset> (for the train set and test set) is stored as a FeaturesDict. 
* The FeaturesDict consists of a **string tensor** called "text" (containing the review) and an **integer tensor** called "label"(containing the label). 
    * Refer to https://www.tensorflow.org/guide/tensor for information about string tensors in the FeaturesDict
* We need to convert the string tensor and the integer tensor of each record to a np array.

In [60]:
def _dataset_to_lists(dataset):
    """Materialise a dataset of (review, label) pairs into two parallel lists.

    Args:
        dataset: Iterable yielding ``(review, label)`` pairs whose items expose
            a ``.numpy()`` method (here: the eagerly iterated train/test splits).

    Returns:
        A ``(reviews, labels)`` tuple: ``reviews`` is a list of ``str`` and
        ``labels`` is a list of whatever ``label.numpy()`` returns.
    """
    reviews, labels = [], []
    for review, label in dataset:
        # Tensors are explicitly converted to NumPy values with .numpy().
        # For a review this gives a byte string (<class 'bytes'>), e.g.
        #   b"This was an absolutely terrible movie. Don't ..."
        #
        # A string is a sequence of characters (unicode symbols); a byte string
        # is a sequence of bytes, i.e. what can actually be stored on disk.
        # The mapping between bytes and unicode symbols is an encoding, and
        # there are many of them. We turn the byte string into an actual
        # string with its decode() function.
        reviews.append(review.numpy().decode("utf8"))
        labels.append(label.numpy())
    return reviews, labels


train_data, test_data = imdb['train'], imdb['test']

# Same conversion for both splits, so it lives in one helper instead of two
# copy-pasted loops.
reviews_train, labels_train = _dataset_to_lists(train_data)
reviews_test, labels_test = _dataset_to_lists(test_data)

In [52]:
# Sanity check: show the first decoded training review.
print(reviews_train[0])

This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.


In [64]:
# Inspect the element type of the labels list (see the output below).
print(type(labels_train[0]))

<class 'numpy.int64'>


In [65]:
# The labels are fed to the model directly, and the model expects NumPy
# arrays, so both label lists are turned into arrays here.
#
# The reviews can stay plain lists of strings: they are not fed to the model
# as-is. The tokenizing/padding step below takes the lists of strings and it is
# pad_sequences that produces the NumPy array.
labels_train, labels_test = map(np.array, (labels_train, labels_test))

In [66]:
# Display the converted test labels (NumPy abbreviates the array repr).
labels_test

array([1, 1, 0, ..., 0, 1, 1])

# Tokenize words and vectorize sentences

In [53]:
# Settings shared by the tokenizer, the padding step and the embedding layer.
#   vocab_size    - passed to the Tokenizer as num_words and to the Embedding layer
#   embedding_dim - length of the vector learned for each token
#   max_length    - every sequence is padded/truncated to this many tokens
vocab_size, embedding_dim, max_length = 10000, 16, 120

#   trunc_type - which end of an over-long sequence is cut off
#   oov_tok    - placeholder used for words that are not part of the vocab_size words
trunc_type, oov_tok = 'post', "UNK"

In [54]:
# Build the vocabulary from the TRAINING reviews only.
tokenizer = Tokenizer(num_words = vocab_size, oov_token= oov_tok)
tokenizer.fit_on_texts(reviews_train)
word_index = tokenizer.word_index

# Training reviews -> lists of token indices -> fixed-length array.
seqs_train = tokenizer.texts_to_sequences(reviews_train)
padded_train = pad_sequences(seqs_train, maxlen=max_length, truncating=trunc_type)

# The test reviews are tokenized with the word_index that was fit on the
# training words, so they will contain out-of-vocabulary tokens.
seqs_test = tokenizer.texts_to_sequences(reviews_test)
# Truncate exactly like the training data. Without truncating=trunc_type,
# pad_sequences falls back to its default ('pre') and would keep the END of
# long test reviews while training kept the BEGINNING.
padded_test = pad_sequences(seqs_test, maxlen=max_length, truncating=trunc_type)

# Create model
## Create a 16-dimensional embedding, for each token (i.e. each tokenized word) in each padded sequence (i.e. each vectorized sentence)

In [59]:
# Assemble the classifier layer by layer.
model = tf.keras.Sequential()

# Embedding: one embedding_dim-dimensional vector per token index.
# input_length (the length of each sequence fed to the model) must be specified
# here; it is 120 because the sequences were padded with maxlen=max_length=120.
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))

# Flatten the per-token vectors of a sequence into a single vector.
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(6, activation = "relu"))

# A single output node; "sigmoid" pushes it towards 0 or 1, corresponding to
# the two classes (positive or negative).
model.add(tf.keras.layers.Dense(1, activation = "sigmoid"))

model.compile(
    loss = "binary_crossentropy",
    optimizer = "adam",
    metrics =["accuracy"],
)
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 120, 16)           160000    
_________________________________________________________________
flatten_1 (Flatten)          (None, 1920)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 11526     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 7         
Total params: 171,533
Trainable params: 171,533
Non-trainable params: 0
_________________________________________________________________


# Train

In [81]:
# Confirm the labels form a 1-D array with one entry per training review.
labels_train.shape

(25000,)

In [67]:
# Number of passes over the training data.
num_epochs = 10

# Fit on the padded training sequences; the padded test set is used as the
# validation data reported after each epoch.
model.fit(
    x=padded_train,
    y=labels_train,
    epochs=num_epochs,
    validation_data=(padded_test, labels_test),
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x163cdb550>

## The weights of the embedding layer are a lookup table: row *i* is the 16-dimensional embedding of token index *i*, for each of the 10000 (`vocab_size`) token indices kept from our word_index

In [68]:
# The Embedding layer is the first layer of the model, i.e. model.layers[0].
embedder = model.layers[0]
print(embedder)

<tensorflow.python.keras.layers.embeddings.Embedding object at 0x1691615d0>


In [69]:
# get_weights() returns a list; for this layer it holds a single float32 matrix.
embedder.get_weights()

[array([[ 0.02279906,  0.00532914, -0.04177454, ...,  0.03878019,
         -0.02602139, -0.00987016],
        [-0.00317021,  0.02098193,  0.01750253, ...,  0.13743086,
         -0.09832368,  0.00272052],
        [ 0.00336432,  0.0446123 , -0.01018012, ...,  0.17973503,
         -0.14524712, -0.02155509],
        ...,
        [-0.06268571, -0.00610817,  0.01784846, ..., -0.0132672 ,
          0.07257691, -0.1105175 ],
        [ 0.05336674,  0.10014763, -0.11606579, ...,  0.07198753,
         -0.03517614, -0.02291496],
        [-0.01840573,  0.02778517, -0.02546399, ...,  0.13565573,
          0.17252141, -0.05067727]], dtype=float32)]

In [71]:
# Shape of the embedding matrix: (vocab_size, embedding_dim).
embedder.get_weights()[0].shape 

(10000, 16)

In [74]:
# Input to the layer: batches (None) of sequences of max_length token indices.
embedder.input_shape

(None, 120)

In [73]:
# Output of the layer: one embedding_dim-long vector per token of each sequence.
embedder.output_shape

(None, 120, 16)

# Visualize word embeddings

In [80]:
# Invert word_index (word -> index) into an index -> word lookup.
reverse_word_index = {index: word for word, index in word_index.items()}
def decode_review(paddedseq):
    """Translate a padded sequence of token indices back into text.

    Each index is looked up in ``reverse_word_index``. dict.get(key, "?") is
    used rather than dict[key] so that a missing key yields "?" instead of an
    error. Thus all the zeros in the padded sequence (i.e. the pad characters)
    come out as "?".

    Args:
        paddedseq: Iterable of token indices.

    Returns:
        The looked-up words joined by single spaces.
    """
    words = (reverse_word_index.get(token_id, "?") for token_id in paddedseq)
    return " ".join(words)
# Compare, one per line: the raw review, its padded index sequence, and the
# text recovered from that sequence.
print(
    reviews_train[0],
    padded_train[0],
    decode_review(padded_train[0]),
    sep="\n",
)

This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.
[   0    0    0   12   14   33  425  392   18   90   28    1    9   32
 1366 3585   40  486    1  197   24   85  154   19   12  213  329   28
   66  247  215    9  477   58   66   85  114   98   22 5675   12 1322
  643  767   12   18    7   33  400 8170  176 2455  416    2   89 1231
  137 

In [76]:
import io

# Export the learned embeddings as two parallel TSV files:
#   vecs.tsv - one line per word, the embedding values separated by tabs
#   meta.tsv - one line per word, the word itself (same order as vecs.tsv)

# The lookup table does not change while exporting, so fetch it once instead
# of calling get_weights() on every iteration.
embedding_matrix = embedder.get_weights()[0]

# `with` guarantees both files are closed even if the loop raises.
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
  # Start at 1: index 0 is the padding value and has no entry in
  # reverse_word_index.
  for word_num in range(1, vocab_size):
    word = reverse_word_index[word_num]

    # lookup table - visit the row of the table which corresponds to the vector of that word
    embeddings = embedding_matrix[word_num]

    out_m.write(word + "\n")
    out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")

In [77]:
# Offer the exported files for download when running inside Google Colab;
# anywhere else the import fails and this cell does nothing.
try:
    from google.colab import files
except ImportError:
    pass
else:
    for tsv_name in ('vecs.tsv', 'meta.tsv'):
        files.download(tsv_name)

Upload the tsv files (`vecs.tsv` and `meta.tsv`) to the TensorFlow Embedding Projector to see the word embeddings in 3D embedding space:
https://projector.tensorflow.org/