# Download and Preview Data

In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow as tf
import numpy as np

In [2]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [6]:
# Custom Python module that supplies the dataset helpers used in this notebook.
import imdb

# Fetch and unpack the dataset. Judging by the helper's name it presumably
# skips work that has already been done — TODO confirm in the imdb module.
imdb.maybe_download_and_extract()

- Download progress: 100.0%
Download finished. Extracting files.
Done.


In [7]:
# Load the review texts together with their labels for each split.
x_train_text, y_train = imdb.load_data(train=True)
x_test_text, y_test = imdb.load_data(train=False)

# Turn both label collections into NumPy arrays in a single statement.
y_train, y_test = np.array(y_train), np.array(y_test)

In [8]:
# Report how many reviews each split contains.
print("Train-set size: ", len(x_train_text))
print("Test-set size:  ", len(x_test_text))

Train-set size:  25000
Test-set size:   25000


In [9]:
# Combine both splits into one corpus; the tokenizer is fitted on it below.
data_text = x_train_text + x_test_text

In [10]:
# Preview one raw training review (index 1) before any preprocessing.
x_train_text[1]

'I really love this movie, saw it again last week after 3 years or so. This movie is perfect, great acting, great story, great directing/camera-work/music. It is a gift to show it to someone you love. too bad jaco van dormael did not make more movies after this one. Top 5 work. Really!!<br /><br />Today, it\'s 3 years and 3 days later then the comment above. it was never posted because it was not more than 10 lines. Anyway, i saw "le huitieme jour" again yesterday. This is with no doubt in my movie top 3. together with "Cinema Paradiso" which is also a masterpiece. The soundtrack is also really good. I am really curious about "jaco von dormael\'s" new movie. I hope it will complete my movie top 3. If you see this movie, rent it. Or even better. buy it. Because you will want to see it again.'

# Preprocessing

In [11]:
# Tokenize text data: build the word index, with the vocabulary limited by
# num_words=10000.
# NOTE(review): data_text contains the test reviews as well, so the vocabulary
# is fitted on test text too — confirm this is intended.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(data_text)

In [12]:
# Encode each split as lists of integer tokens using the fitted tokenizer
# (train first, then test).
x_train_tokens, x_test_tokens = map(
    tokenizer.texts_to_sequences, (x_train_text, x_test_text)
)

In [13]:
# The same review (index 1) as integer tokens, wrapped in an array for a
# compact display.
np.array(x_train_tokens[1])

array([  10,   62,  112,   11,   17,  210,    9,  172,  238, 1237,  100,
        338,  153,   38,   34,   11,   17,    6,  399,   78,  113,   78,
         64,   78,  975,  355,  158,  207,    9,    6,    3, 3746,    5,
        119,    9,    5,  296,   22,  112,   96,   74, 1079,  115,   21,
         94,   51,   97,  100,   11,   27,  342,  447,  158,   62,    7,
          7,  621,   44,  338,  153,    2,  338,  483,  305,   91,    1,
        929,  729,    9,   13,  110, 5370,   84,    9,   13,   21,   51,
         71,  156,  409,  548,   10,  210, 2938,  172, 4148,   11,    6,
         16,   54,  798,    8,   56,   17,  342,  338,  294,   16,  443,
         60,    6,   81,    3,  922,    1,  738,    6,   81,   62,   49,
         10,  235,   62, 2119,   42, 2944,  168,   17,   10,  433,    9,
         80,  597,   56,   17,  342,  338,   43,   22,   63,   11,   17,
        817,    9,   38,   57,  126,  786,    9,   84,   22,   80,  178,
          5,   63,    9,  172])

In [14]:
# Token count of every review (train followed by test), then the mean and
# the maximum of those counts.
num_tokens = np.array([len(seq) for seq in x_train_tokens + x_test_tokens])
print(num_tokens.mean())
print(num_tokens.max())

221.27716
2209


In [15]:
# Length cap used for padding/truncation: mean plus two standard deviations
# of the token counts, truncated to an integer. Nothing is removed here;
# longer reviews are cut down later when the sequences are padded.
max_tokens = int(num_tokens.mean() + 2 * num_tokens.std())
max_tokens

544

In [16]:
# Fraction of reviews shorter than max_tokens (about 94% of the texts).
# The mean of a boolean mask is the share of True entries.
np.mean(num_tokens < max_tokens)

0.94532

In [17]:
# Pad and truncate at the front of each sequence.
pad = 'pre'

# Bring every sequence in both splits to exactly max_tokens entries, using
# identical settings for train and test.
x_train_pad, x_test_pad = (
    pad_sequences(sequences, maxlen=max_tokens, padding=pad, truncating=pad)
    for sequences in (x_train_tokens, x_test_tokens)
)

In [18]:
# Both padded matrices should be (number of reviews, max_tokens).
print(x_train_pad.shape)
print(x_test_pad.shape)

(25000, 544)
(25000, 544)


In [19]:
# Padded version of review 1: zeros fill the front because pad = 'pre'.
x_train_pad[1]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [20]:
# word -> integer token mapping learned by the tokenizer.
idx = tokenizer.word_index
# Reverse lookup (integer token -> word), used to decode sequences back to text.
inverse_map = {token: word for word, token in idx.items()}

In [21]:
def tokens_to_string(tokens):
    """Decode a sequence of integer tokens into a space-separated string.

    Tokens equal to 0 (the filler value seen in the padded sequences) are
    skipped; every other token is looked up in the module-level
    ``inverse_map``.

    Args:
        tokens: Iterable of integer tokens.

    Returns:
        The decoded words joined by single spaces; an empty string when no
        non-zero token is present.

    Raises:
        KeyError: If a non-zero token is missing from ``inverse_map``.
    """
    return " ".join(inverse_map[tok] for tok in tokens if tok != 0)

In [22]:
# Round-trip check: decode the tokens of review 1 back into words.
tokens_to_string(x_train_tokens[1])

"i really love this movie saw it again last week after 3 years or so this movie is perfect great acting great story great directing camera work music it is a gift to show it to someone you love too bad van did not make more movies after this one top 5 work really br br today it's 3 years and 3 days later then the comment above it was never posted because it was not more than 10 lines anyway i saw le again yesterday this is with no doubt in my movie top 3 together with cinema which is also a masterpiece the soundtrack is also really good i am really curious about von new movie i hope it will complete my movie top 3 if you see this movie rent it or even better buy it because you will want to see it again"

# Model Training

In [23]:
# Seed TensorFlow before any layer exists so that weight initialisation and
# batch shuffling are as repeatable as the backend allows when the notebook
# is re-run from a fresh kernel.
SEED = 42
tf.random.set_seed(SEED)

# Empty layer stack; the following cells append layers to it.
model = Sequential()

In [24]:
# Length of the vector learned for each token.
embedding_size = 8
# Take the vocabulary size from the tokenizer instead of repeating the
# literal, so the embedding table always matches the range of token ids the
# tokenizer can emit.
num_words = tokenizer.num_words

# Map every integer token of a padded review to a trainable dense vector.
model.add(Embedding(input_dim=num_words,
                    output_dim=embedding_size,
                    input_length=max_tokens,
                    name='layer_embedding'))

In [25]:
# Three stacked GRU layers of decreasing width. The first two return the full
# sequence so the next GRU has a sequence to consume; the last returns only
# its final state.
for gru_units, gru_return_seq in ((16, True), (8, True), (4, False)):
    model.add(GRU(units=gru_units, return_sequences=gru_return_seq))

# Single sigmoid unit: one output in [0, 1] per review.
model.add(Dense(1, activation='sigmoid'))

In [26]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer_embedding (Embedding)  (None, 544, 8)            80000     
_________________________________________________________________
gru (GRU)                    (None, 544, 16)           1248      
_________________________________________________________________
gru_1 (GRU)                  (None, 544, 8)            624       
_________________________________________________________________
gru_2 (GRU)                  (None, 4)                 168       
_________________________________________________________________
dense (Dense)                (None, 1)                 5         
Total params: 82,045
Trainable params: 82,045
Non-trainable params: 0
_________________________________________________________________


In [27]:
# `learning_rate` is the supported keyword; `lr` is a deprecated alias in
# tf.keras and is rejected by newer Keras releases.
optimizer = Adam(learning_rate=1e-3)

# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# as the reported metric.
model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

In [28]:
%%time
model.fit(x_train_pad, y_train,
          validation_split=0.05, epochs=10, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 3min 22s, sys: 5.28 s, total: 3min 27s
Wall time: 3min 25s


<tensorflow.python.keras.callbacks.History at 0x7fc57071f278>

In [30]:
# Evaluate on the full padded test set. The accuracy is read from index 1 of
# the result in the next cell (the model was compiled with metrics=['accuracy']).
result = model.evaluate(x_test_pad, y_test)



In [31]:
# Show the test accuracy as a percentage with two decimals.
print("Accuracy: {0:.2%}".format(result[1]))

Accuracy: 85.55%


# Model Results

In [32]:
# Predict the first 1000 test reviews and keep the single output column as a
# flat 1-D array of scores.
y_pred = model.predict(x=x_test_pad[:1000])[:, 0]

In [33]:
# Threshold the sigmoid outputs at 0.5 in one vectorised step (1.0 = above the
# threshold, 0.0 otherwise) instead of looping over them in Python.
cls_pred = (y_pred > 0.5).astype(float)
# Take exactly as many labels as there are predictions, so this slice cannot
# drift out of sync with the one used for prediction.
cls_true = np.array(y_test[:len(y_pred)])

# See misclassified texts: positions where prediction and label disagree.
incorrect = np.where(cls_pred != cls_true)[0]

In [34]:
# Number of misclassified reviews among those predicted above.
len(incorrect)

153

In [46]:
# Position of the first misclassified review within the predicted slice.
# NOTE: this rebinds `idx`, which earlier in the notebook held
# tokenizer.word_index.
idx = incorrect[0]
idx

18

In [47]:
# Show the misclassified review's raw text next to the model's score and the
# true label.
text = x_test_text[idx]
print("TEXT:", text)
print("Predicted: ", y_pred[idx])
print("True class: ", cls_true[idx])

TEXT: Well, first of all - i am a big fanatic of horror movies, but however - I am pretty sick of all those damn American horror movies. They are all about the same thing - blood and violence. It's not even creepy. Well, it's nothin wrong with the blood and all that - doesn't even bother me - but that's not what makes a movie creepy! That's why I find this movie entertaining - it's fun to see a satire which is making fun of the koncept "main horror USA". American splatter/gore-movies, they are not suppose to be creepy, only funny. That's OK. But when they're suppose to be "creepy", it mostly gets pathetic. However, there are a few great american horror movies (Poltergeist, Psycho, Birds), but in the end it's all the same thing. That's why this movie came as a relief. Evil Ed is not just a cult movie - it's a classic! I can't wait untill master director Anders Jacobsson makes another goddamn splatter movie! Untill than I have to watch Evil Ed again - and again - and again! But I don't c

In [38]:
# Hand-written sample reviews used for a quick qualitative check of the model
# on new text.
text1 = "This movie is fantastic! I really like it because it is so good!"
text2 = "Good movie!"
text3 = "Maybe I like this movie."
text4 = "Meh ..."
text5 = "If I were a drunk teenager then this movie might be good."
text6 = "Bad movie!"
text7 = "Not a good movie!"
text8 = "This movie really sucks! Can I get my money back please?"
# Collected in a list so they can be tokenised and predicted as one batch.
texts = [text1, text2, text3, text4, text5, text6, text7, text8]

In [39]:
# Preprocess the sample reviews with the same tokenizer and the same
# padding/truncation settings that were used for the training data.
tokens = tokenizer.texts_to_sequences(texts)
tokens_pad = pad_sequences(tokens, maxlen=max_tokens,
                           padding=pad, truncating=pad)
# Expect (number of sample texts, max_tokens).
tokens_pad.shape

(8, 544)

In [41]:
# Score the padded sample reviews with the trained model.
preds = model.predict(tokens_pad)

In [43]:
# Print each sample review with its predicted score. Iterating over the pairs
# directly removes the hard-coded count of 8, so the loop stays correct when
# sample texts are added or removed.
for sample_text, sample_pred in zip(texts, preds):
    print(sample_text)
    print("Predicted: ", sample_pred)
    print()

This movie is fantastic! I really like it because it is so good!
Predicted:  [0.9770672]

Good movie!
Predicted:  [0.90219927]

Maybe I like this movie.
Predicted:  [0.23454425]

Meh ...
Predicted:  [0.77514803]

If I were a drunk teenager then this movie might be good.
Predicted:  [0.0375695]

Bad movie!
Predicted:  [0.09728283]

Not a good movie!
Predicted:  [0.63216877]

This movie really sucks! Can I get my money back please?
Predicted:  [0.02857998]

