In [0]:
%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
from scipy.spatial.distance import cdist

In [0]:
# Note: `from tf.keras.models import Sequential` does not work because `tf`
# is only a local alias; an import statement needs the real module name.
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [0]:
tf.__version__

'1.14.0'

In [0]:
tf.keras.__version__

'2.2.4-tf'

In [0]:
import imdb

In [0]:
imdb.maybe_download_and_extract()

- Download progress: 100.0%
Download finished. Extracting files.
Done.


In [0]:
x_train_text, y_train = imdb.load_data(train=True)
x_test_text, y_test = imdb.load_data(train=False)

In [0]:
print("Train-set size: ", len(x_train_text))
print("Test-set size:  ", len(x_test_text))

Train-set size:  25000
Test-set size:   25000


In [0]:
data_text = x_train_text + x_test_text

In [0]:
x_train_text[1]

"If you cannot enjoy a chick flick, stop right now. If, however, you enjoy films that illustrate complex characters and provide extraordinary acting, read on.<br /><br />Ann Grant Lord is dying. Her two daughters arrive to be at her bedside. Ann begins talking about people from her past of whom the daughters are unaware, and they question as to whether these lost acquaintances are real or imagined. They come to realize that these people from their mother's past are, indeed, real.<br /><br />The story shifts, basically, between 1953 and circa 2000 with a few glimpses at Ann's life between those years. It was in 1953 that Ann met the love of her life and experienced her life's greatest tragedy.<br /><br />One of Ann's two best friends from college, Lila, is being married. Ann's other best friend is Lila's brother, Buddy. Lila and Buddy are the children of a rich Newport family, whereas Ann is a cabaret singer living in Greenwich Village who wants to be a free spirit but is still bound by

In [0]:
y_train[1]

1.0

In [0]:
num_words = 10000

In [0]:
tokenizer = Tokenizer(num_words=num_words)

In [0]:
%%time
tokenizer.fit_on_texts(data_text)

CPU times: user 11.7 s, sys: 94.1 ms, total: 11.8 s
Wall time: 11.8 s


In [0]:
# When no vocabulary limit was given, derive the size from the fitted
# tokenizer. Word indices in tokenizer.word_index start at 1 and index 0 is
# used for padding, so the number of distinct token values (and hence the
# number of rows the embedding layer needs) is the word count plus one.
if num_words is None:
    num_words = len(tokenizer.word_index) + 1

In [0]:
tokenizer.word_index

{'the': 1,
 'and': 2,
 'a': 3,
 'of': 4,
 'to': 5,
 'is': 6,
 'br': 7,
 'in': 8,
 'it': 9,
 'i': 10,
 'this': 11,
 'that': 12,
 'was': 13,
 'as': 14,
 'for': 15,
 'with': 16,
 'movie': 17,
 'but': 18,
 'film': 19,
 'on': 20,
 'not': 21,
 'you': 22,
 'are': 23,
 'his': 24,
 'have': 25,
 'be': 26,
 'one': 27,
 'he': 28,
 'all': 29,
 'at': 30,
 'by': 31,
 'an': 32,
 'they': 33,
 'so': 34,
 'who': 35,
 'from': 36,
 'like': 37,
 'or': 38,
 'just': 39,
 'her': 40,
 'out': 41,
 'about': 42,
 'if': 43,
 "it's": 44,
 'has': 45,
 'there': 46,
 'some': 47,
 'what': 48,
 'good': 49,
 'when': 50,
 'more': 51,
 'very': 52,
 'up': 53,
 'no': 54,
 'time': 55,
 'my': 56,
 'even': 57,
 'would': 58,
 'she': 59,
 'which': 60,
 'only': 61,
 'really': 62,
 'see': 63,
 'story': 64,
 'their': 65,
 'had': 66,
 'can': 67,
 'me': 68,
 'well': 69,
 'were': 70,
 'than': 71,
 'much': 72,
 'we': 73,
 'bad': 74,
 'been': 75,
 'get': 76,
 'do': 77,
 'great': 78,
 'other': 79,
 'will': 80,
 'also': 81,
 'into': 82,
 'p

In [0]:
x_train_tokens = tokenizer.texts_to_sequences(x_train_text)

In [0]:
x_train_text[1]

"If you cannot enjoy a chick flick, stop right now. If, however, you enjoy films that illustrate complex characters and provide extraordinary acting, read on.<br /><br />Ann Grant Lord is dying. Her two daughters arrive to be at her bedside. Ann begins talking about people from her past of whom the daughters are unaware, and they question as to whether these lost acquaintances are real or imagined. They come to realize that these people from their mother's past are, indeed, real.<br /><br />The story shifts, basically, between 1953 and circa 2000 with a few glimpses at Ann's life between those years. It was in 1953 that Ann met the love of her life and experienced her life's greatest tragedy.<br /><br />One of Ann's two best friends from college, Lila, is being married. Ann's other best friend is Lila's brother, Buddy. Lila and Buddy are the children of a rich Newport family, whereas Ann is a cabaret singer living in Greenwich Village who wants to be a free spirit but is still bound by

In [0]:
np.array(x_train_tokens[1])

array([  43,   22,  576,  354,    3, 2164,  493,  542,  203,  146,   43,
        188,   22,  354,  104,   12, 9015, 1300,  102,    2, 1688, 2499,
        113,  339,   20,    7,    7, 2137, 2567, 1779,    6, 1685,   40,
        105, 3082, 3677,    5,   26,   30,   40, 2137,  836,  681,   42,
         83,   36,   40,  509,    4,  922,    1, 3082,   23, 5147,    2,
         33,  907,   14,    5,  723,  132,  432,   23,  144,   38, 4033,
         33,  215,    5,  959,   12,  132,   83,   36,   65, 4233,  509,
         23,  864,  144,    7,    7,    1,   64, 6594,  688,  201, 7364,
          2, 8089, 3359,   16,    3,  171, 6953,   30,  114,  201,  143,
        153,    9,   13,    8, 7364,   12, 2137, 1819,    1,  112,    4,
         40,  114,    2, 2339,   40, 6514,  811, 1637,    7,    7,   27,
          4,  105,  116,  350,   36, 1121,    6,  109, 1062,   79,  116,
        444,    6,  626, 1867,    2, 1867,   23,    1,  466,    4,    3,
        986,  236, 2962, 2137,    6,    3, 1843,  5

In [0]:
x_test_tokens = tokenizer.texts_to_sequences(x_test_text)


In [0]:
# Number of tokens in every sequence, training-set followed by test-set,
# collected into a single NumPy array.
num_tokens = np.array(list(map(len, x_train_tokens + x_test_tokens)))

In [0]:
np.mean(num_tokens)


221.27716

In [0]:
np.max(num_tokens)


2208

In [0]:
# Sequence length used for padding/truncation: the mean token count plus
# two standard deviations, truncated to an integer.
max_tokens = int(num_tokens.mean() + 2 * num_tokens.std())
max_tokens

544

In [0]:
# Fraction of all sequences that are strictly shorter than max_tokens.
np.mean(num_tokens < max_tokens)

0.94532

In [0]:
pad = 'pre'

In [0]:
x_train_pad = pad_sequences(x_train_tokens, maxlen=max_tokens,
                            padding=pad, truncating=pad)

In [0]:
x_test_pad = pad_sequences(x_test_tokens, maxlen=max_tokens,
                           padding=pad, truncating=pad)

In [0]:
x_train_pad.shape

(25000, 544)

In [0]:
x_test_pad.shape


(25000, 544)

In [0]:
np.array(x_train_tokens[1])

array([  43,   22,  576,  354,    3, 2164,  493,  542,  203,  146,   43,
        188,   22,  354,  104,   12, 9015, 1300,  102,    2, 1688, 2499,
        113,  339,   20,    7,    7, 2137, 2567, 1779,    6, 1685,   40,
        105, 3082, 3677,    5,   26,   30,   40, 2137,  836,  681,   42,
         83,   36,   40,  509,    4,  922,    1, 3082,   23, 5147,    2,
         33,  907,   14,    5,  723,  132,  432,   23,  144,   38, 4033,
         33,  215,    5,  959,   12,  132,   83,   36,   65, 4233,  509,
         23,  864,  144,    7,    7,    1,   64, 6594,  688,  201, 7364,
          2, 8089, 3359,   16,    3,  171, 6953,   30,  114,  201,  143,
        153,    9,   13,    8, 7364,   12, 2137, 1819,    1,  112,    4,
         40,  114,    2, 2339,   40, 6514,  811, 1637,    7,    7,   27,
          4,  105,  116,  350,   36, 1121,    6,  109, 1062,   79,  116,
        444,    6,  626, 1867,    2, 1867,   23,    1,  466,    4,    3,
        986,  236, 2962, 2137,    6,    3, 1843,  5

In [0]:
x_train_pad[1]

array([ 199,  114,    2, 5251,    6,  765,    5,  887,  264,    5,  390,
         53,   40, 1833,   14,  399,  416,    2,  325, 7365,  263,  211,
        436, 4848,  576, 4234,    3,  639,    7,    7, 9207,   29,    4,
        132, 1497,   82,    3, 6418,    4, 5030,  153,    2,   22,   76,
         32, 1723,  163,   30,  908,   92, 1196,    2,   92,  286,  686,
          1, 3281,    2, 1640,    4,    1, 1300,   83,  571,    7,    7,
         29,    4,    1,  113,    8, 2290,    6,  320,   18,   46,   23,
         47, 2499,  367,    2,  134,  340,   16,  105,  960,  236, 1497,
         12,   94,   11,   19,   34,   52,   52,  310,    7,    7, 3710,
       5878,  297,    1, 3922, 2137,    2,   59,  124,    9,    8,    3,
        396,   12,  700,  276,   32, 1105,  257,    4,  143,  209,   35,
          6, 8230,   31,   48,   59,    6,  442,    5,   77,   14, 3336,
          5,   48,   59,  487,    5,   77,   40,  241,    6,   21,  736,
       2508,    7,    7, 6732, 8459,  297,    1, 16

In [0]:
# Reverse lookup of the tokenizer vocabulary: integer token -> word.
idx = tokenizer.word_index
inverse_map = {token: word for word, token in idx.items()}

In [0]:
def tokens_to_string(tokens):
    """
    Convert a sequence of integer tokens back into text.

    Tokens equal to 0 are skipped (presumably the padding value); every
    other token is looked up in `inverse_map` and the resulting words are
    joined with single spaces.
    """
    return " ".join(inverse_map[tok] for tok in tokens if tok != 0)

In [0]:
x_train_text[1]

"If you cannot enjoy a chick flick, stop right now. If, however, you enjoy films that illustrate complex characters and provide extraordinary acting, read on.<br /><br />Ann Grant Lord is dying. Her two daughters arrive to be at her bedside. Ann begins talking about people from her past of whom the daughters are unaware, and they question as to whether these lost acquaintances are real or imagined. They come to realize that these people from their mother's past are, indeed, real.<br /><br />The story shifts, basically, between 1953 and circa 2000 with a few glimpses at Ann's life between those years. It was in 1953 that Ann met the love of her life and experienced her life's greatest tragedy.<br /><br />One of Ann's two best friends from college, Lila, is being married. Ann's other best friend is Lila's brother, Buddy. Lila and Buddy are the children of a rich Newport family, whereas Ann is a cabaret singer living in Greenwich Village who wants to be a free spirit but is still bound by

In [0]:
tokens_to_string(x_train_tokens[1])

"if you cannot enjoy a chick flick stop right now if however you enjoy films that illustrate complex characters and provide extraordinary acting read on br br ann grant lord is dying her two daughters arrive to be at her ann begins talking about people from her past of whom the daughters are unaware and they question as to whether these lost are real or imagined they come to realize that these people from their mother's past are indeed real br br the story shifts basically between 1953 and circa 2000 with a few glimpses at life between those years it was in 1953 that ann met the love of her life and experienced her life's greatest tragedy br br one of two best friends from college is being married other best friend is brother buddy and buddy are the children of a rich family whereas ann is a singer living in village who wants to be a free spirit but is still bound by many of those 1950's conventions br br soon after ann arrives to be maid of honor at wedding she meets the person who wi

In [0]:
model = Sequential()

In [0]:
embedding_size = 8

In [0]:
# First layer: an embedding that turns each integer token into a trainable
# vector of length embedding_size. The layer is given an explicit name so
# that it can be retrieved with model.get_layer('layer_embedding') later on.
model.add(Embedding(input_dim=num_words,
                    output_dim=embedding_size,
                    input_length=max_tokens,
                    name='layer_embedding'))

W0807 12:34:29.607911 140148265138048 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [0]:
model.add(GRU(units=16, return_sequences=True))

W0807 12:34:33.540786 140148265138048 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [0]:
model.add(GRU(units=8, return_sequences=True))

In [0]:
model.add(GRU(units=4))

In [0]:
model.add(Dense(1, activation='sigmoid'))

In [0]:
optimizer = Adam(lr=1e-3)

In [0]:
model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

W0807 12:34:47.179125 140148265138048 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [0]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer_embedding (Embedding)  (None, 544, 8)            80000     
_________________________________________________________________
gru (GRU)                    (None, 544, 16)           1200      
_________________________________________________________________
gru_1 (GRU)                  (None, 544, 8)            600       
_________________________________________________________________
gru_2 (GRU)                  (None, 4)                 156       
_________________________________________________________________
dense (Dense)                (None, 1)                 5         
Total params: 81,961
Trainable params: 81,961
Non-trainable params: 0
_________________________________________________________________


In [0]:
%%time
model.fit(x_train_pad, y_train,
          validation_split=0.05, epochs=4, batch_size=64)

Train on 23750 samples, validate on 1250 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
CPU times: user 35min 18s, sys: 2min 1s, total: 37min 20s
Wall time: 20min 11s


<tensorflow.python.keras.callbacks.History at 0x7f7690591e10>

In [0]:
%%time
result = model.evaluate(x_test_pad, y_test)

CPU times: user 1min 48s, sys: 1.64 s, total: 1min 49s
Wall time: 1min 33s


In [0]:
print("Accuracy: {0:.2%}".format(result[1]))

Accuracy: 85.72%


In [0]:
%%time
y_pred = model.predict(x=x_test_pad[0:1000])
y_pred = y_pred.T[0]

NameError: ignored

In [0]:
# Threshold the predicted values at 0.5 to obtain class labels (1.0 / 0.0).
cls_pred = np.where(y_pred > 0.5, 1.0, 0.0)

In [0]:
cls_true = np.array(y_test[0:1000])

In [0]:
# Positions (along the first axis) where the predicted class differs from
# the true class.
incorrect = np.nonzero(cls_pred != cls_true)[0]

In [0]:
len(incorrect)

190

In [0]:
idx = incorrect[0]
idx

0

In [0]:
text = x_test_text[idx]
text

"I SELL THE DEAD (2009) **1/2 Dominic Monaghan, Larry Fessenden, Ron Perlman, Angus Scrimm, John Speredakos, Eileen Colgan, Brenda Cooney. Uneven blend of horror and comedy that poses as a valentine to the '60s horror films by Roger Corman and Hammer Studios, with two cretinous grave robbers (Monaghan and Fessenden) facing final punishments for the crimes via the guillotine but not before their tales of the occult can be recalled in flashbacks. Amusing and a few well sprinkled jolts but really a mess of a B-movie trying in vein to be a cult classic largely thanks to the casting of genre vets Perlman and Scrimm to no avail; a good rental for Halloween. (Dir: Glenn McQuaid)"

In [0]:
y_pred[idx]

0.037938207

In [0]:
cls_true[idx]

1.0

In [0]:
# Hand-written sample reviews used to try out the trained model.
texts = [
    "This movie is fantastic! I really like it because it is so good!",
    "Good movie!",
    "Maybe I like this movie.",
    "Meh ...",
    "If I were a drunk teenager then this movie might be good.",
    "Bad movie!",
    "Not a good movie!",
    "This movie really sucks! Can I get my money back please?",
]

# Keep the individual names available as well.
(text1, text2, text3, text4,
 text5, text6, text7, text8) = texts

In [0]:
tokens = tokenizer.texts_to_sequences(texts)

In [0]:
tokens_pad = pad_sequences(tokens, maxlen=max_tokens,
                           padding=pad, truncating=pad)
tokens_pad.shape

(8, 544)

In [0]:
model.predict(tokens_pad)

array([[0.96067977],
       [0.89856577],
       [0.75594366],
       [0.84377855],
       [0.67702633],
       [0.42514688],
       [0.8552542 ],
       [0.25050125]], dtype=float32)

In [0]:
layer_embedding = model.get_layer('layer_embedding')

In [0]:
weights_embedding = layer_embedding.get_weights()[0]

In [0]:
weights_embedding.shape

(10000, 8)

In [0]:
token_good = tokenizer.word_index['good']
token_good

49

In [0]:
token_great = tokenizer.word_index['great']
token_great

78

In [0]:
weights_embedding[token_good]

array([ 0.06990868, -0.0635757 ,  0.00651114,  0.02910687,  0.02221848,
       -0.08204393, -0.04282869,  0.00865578], dtype=float32)

In [0]:
weights_embedding[token_great]

array([ 0.10235833, -0.1256574 ,  0.16474661,  0.11950813,  0.10953443,
       -0.15126067, -0.13183455,  0.12016096], dtype=float32)

In [0]:
token_bad = tokenizer.word_index['bad']
token_horrible = tokenizer.word_index['horrible']

In [0]:
weights_embedding[token_bad]

array([-0.15679997,  0.09429009, -0.13303772, -0.08758956, -0.06235645,
        0.08507401,  0.08585963, -0.1390407 ], dtype=float32)

In [0]:
weights_embedding[token_horrible]

array([-0.14237069,  0.20652135, -0.12281132, -0.1362023 , -0.16818562,
        0.17382565,  0.1984778 , -0.20665692], dtype=float32)

In [0]:
def print_sorted_words(word, metric='cosine', k=10):
    """
    Print the words in the vocabulary sorted according to their
    embedding-distance to the given word.
    Different metrics can be used, e.g. 'cosine' or 'euclidean'.

    Args:
        word: The word to compare against. It is looked up in
            tokenizer.word_index, so an unknown word raises KeyError.
        metric: Name of the distance metric, passed on to cdist.
        k: Number of words to print from the top and from the bottom
            of the sorted list (default 10).

    Returns:
        None. The result is printed.
    """

    # Get the token (i.e. integer ID) for the given word.
    token = tokenizer.word_index[word]

    # Get the embedding for the given word. Note that the
    # embedding-weight-matrix is indexed by the word-tokens
    # which are integer IDs.
    embedding = weights_embedding[token]

    # Calculate the distance between the embedding for this word
    # and every row of the embedding-matrix.
    distances = cdist(weights_embedding, [embedding],
                      metric=metric).T[0]

    # Get an index sorted according to the embedding-distances.
    # These are the tokens (integer IDs) for words in the vocabulary.
    sorted_index = np.argsort(distances)

    # Row 0 does not correspond to a word, so it is removed from the
    # index itself. Doing this BEFORE the distances and words are looked
    # up keeps the two lists the same length and aligned; filtering only
    # the words would pair words with the wrong distances.
    sorted_index = sorted_index[sorted_index != 0]

    # Distances and words in the same (sorted) order.
    sorted_distances = distances[sorted_index]
    sorted_words = [inverse_map[token_id] for token_id in sorted_index]

    # Helper-function for printing words and embedding-distances.
    def _print_words(words, word_distances):
        for other_word, distance in zip(words, word_distances):
            print("{0:.3f} - {1}".format(distance, other_word))

    # Guard against negative k, and compute the start of the bottom
    # window explicitly because a slice like [-0:] would select everything.
    k = max(int(k), 0)
    tail_start = max(len(sorted_words) - k, 0)

    print("Distance from '{0}':".format(word))

    # Print the words with smallest embedding-distance.
    _print_words(sorted_words[:k], sorted_distances[:k])

    print("...")

    # Print the words with highest embedding-distance.
    _print_words(sorted_words[tail_start:], sorted_distances[tail_start:])

In [0]:
print_sorted_words('great', metric='cosine')

NameError: ignored

In [0]:
print_sorted_words('worst', metric='cosine')

Distance from 'worst':
0.000 - worst
0.005 - stupidity
0.005 - boredom
0.005 - wasting
0.005 - annoying
0.005 - baldwin
0.006 - moronic
0.006 - conflict
0.006 - habits
0.006 - christians
...
1.992 - stress
1.992 - kubrick
1.992 - both
1.993 - adored
1.993 - realistic
1.993 - 8
1.995 - helps
1.995 - safety
1.995 - refreshing
1.997 - fortunate
