<a href="https://colab.research.google.com/github/martin-fabbri/colab-notebooks/blob/master/deeplearning.ai/tf/c3_w2_word_embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Visualize word embeddings

In [6]:
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tf.__version__, tfds.__version__, np.__version__

('2.3.0', '4.0.1', '1.18.5')

In [2]:
imdb, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)

[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Completed...', max=1.0, style=Progre…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Size...', max=1.0, style=ProgressSty…







HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete5B8101/imdb_reviews-train.tfrecord


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete5B8101/imdb_reviews-test.tfrecord


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete5B8101/imdb_reviews-unsupervised.tfrecord


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))



[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [3]:
train_data, test_data = imdb['train'], imdb['test']

training_sentences = []
training_labels = []

testing_sentences = []
testing_labels = []

for s, l in train_data:
  training_sentences.append(s.numpy().decode('utf8'))
  training_labels.append(l.numpy())

for s, l in test_data:
  testing_sentences.append(s.numpy().decode('utf8'))
  testing_labels.append(l.numpy())

training_labels = np.array(training_labels)
testing_labels = np.array(testing_labels)

len(training_sentences), training_labels.shape

(25000, (25000,))

In [4]:
training_labels[:5]

array([0, 0, 0, 1, 1])

In [5]:
training_sentences[0]

"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."

In [7]:
vocab_size = 10000
embedding_dim = 16
max_length = 120
trunc_type = 'post'
oov_tok = '<oov>'

In [11]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
[t for t in list(word_index.items())[:5]]

[('<oov>', 1), ('the', 2), ('and', 3), ('a', 4), ('of', 5)]

In [20]:
sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type)

type(sequences), len(sequences), sequences[0][:5], len(padded[0]), padded[0][-5:]

(list,
 25000,
 [12, 14, 33, 425, 392],
 120,
 array([  98, 1197,  867,  141,   10], dtype=int32))

In [23]:
testing_sentences

[[48,
  24,
  106,
  13,
  95,
  4066,
  16,
  740,
  5065,
  10,
  14,
  312,
  5,
  2,
  579,
  349,
  16,
  1847,
  1257,
  1,
  16,
  668,
  7666,
  5531,
  1,
  761,
  6,
  13,
  1026,
  1,
  1,
  425,
  478,
  1,
  4,
  1,
  327,
  3560,
  20,
  229,
  3,
  15,
  5742,
  3,
  15,
  1620,
  15,
  99,
  5,
  2,
  3550,
  100,
  11,
  772,
  1498,
  12,
  252,
  235,
  11,
  217,
  2,
  366,
  6454,
  3,
  58,
  93,
  11,
  90,
  102,
  11,
  1498,
  177,
  12,
  252,
  36,
  6,
  1126,
  1,
  674,
  7,
  4387,
  1,
  4,
  1,
  327,
  7,
  36,
  8300,
  366,
  5,
  1403,
  1,
  13,
  29,
  60,
  26,
  6,
  867,
  178,
  17,
  4,
  1037,
  5,
  12,
  227,
  3,
  79,
  4,
  345,
  32,
  345,
  5159,
  5,
  10,
  6,
  1314,
  1143,
  2,
  5619,
  1,
  3,
  1,
  5,
  10,
  173,
  322,
  7,
  1293,
  3938,
  4,
  788,
  1909,
  5,
  4,
  250,
  2673,
  165,
  3,
  2,
  352,
  30,
  185,
  24,
  1154,
  223,
  599,
  5,
  2,
  118,
  2,
  348,
  1382,
  7675,
  29,
  1,
  871,
  37,
  4,


In [24]:
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sentences, maxlen=max_length, 
                               truncating=trunc_type)
len(testing_sequences), len(testing_padded[0]) 

AttributeError: ignored