In [1]:
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import random
from scipy.spatial.distance import cdist

In [2]:
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [3]:
import imdb

In [4]:
imdb.maybe_download_and_extract()

Data has apparently already been downloaded and unpacked.


## Load training and test sets

In [5]:
x_train, y_train = imdb.load_data(train=True)
x_dev, y_dev = imdb.load_data(train=False)

In [6]:
print("\nTrain set size: ", len(x_train))
print("\nDev set size: ", len(x_dev))


Train set size:  25000

Dev set size:  25000


Not a very good split of train and dev sets. We'll split the dataset into 80-20 ratio

In [7]:
data_text = x_train + x_dev
data_labels = y_train + y_dev

In [8]:
random.seed(1)
idx = list(range(len(data_text)))
shuffled = random.shuffle(idx)
cut = int(0.8*len(data_text))

x_train_text = [data_text[i] for i in idx[:cut]]
y_train = [data_labels[i] for i in idx[:cut]]
x_dev_text = [data_text[i] for i in idx[cut:]]
y_dev = [data_labels[i] for i in idx[cut:] ]

In [9]:
print("\nTrain set size: ", len(x_train_text))
print("\nTest set size: ", len(x_dev_text))


Train set size:  40000

Test set size:  10000


## Tokenizer

In [10]:
num_words = 10000 #Max no of words to be in the vocabulary

In [11]:
tokenizer = Tokenizer(num_words= num_words)

In [12]:
tokenizer.fit_on_texts(data_text)

Words are tokenized based on frequency of occurence

In [13]:
tokenizer.word_index

{'the': 1,
 'and': 2,
 'a': 3,
 'of': 4,
 'to': 5,
 'is': 6,
 'br': 7,
 'in': 8,
 'it': 9,
 'i': 10,
 'this': 11,
 'that': 12,
 'was': 13,
 'as': 14,
 'for': 15,
 'with': 16,
 'movie': 17,
 'but': 18,
 'film': 19,
 'on': 20,
 'not': 21,
 'you': 22,
 'are': 23,
 'his': 24,
 'have': 25,
 'be': 26,
 'one': 27,
 'he': 28,
 'all': 29,
 'at': 30,
 'by': 31,
 'an': 32,
 'they': 33,
 'so': 34,
 'who': 35,
 'from': 36,
 'like': 37,
 'or': 38,
 'just': 39,
 'her': 40,
 'out': 41,
 'about': 42,
 'if': 43,
 "it's": 44,
 'has': 45,
 'there': 46,
 'some': 47,
 'what': 48,
 'good': 49,
 'when': 50,
 'more': 51,
 'very': 52,
 'up': 53,
 'no': 54,
 'time': 55,
 'my': 56,
 'even': 57,
 'would': 58,
 'she': 59,
 'which': 60,
 'only': 61,
 'really': 62,
 'see': 63,
 'story': 64,
 'their': 65,
 'had': 66,
 'can': 67,
 'me': 68,
 'well': 69,
 'were': 70,
 'than': 71,
 'much': 72,
 'we': 73,
 'bad': 74,
 'been': 75,
 'get': 76,
 'do': 77,
 'great': 78,
 'other': 79,
 'will': 80,
 'also': 81,
 'into': 82,
 'p

In [14]:
x_train_tokens = tokenizer.texts_to_sequences(x_train_text) #Tokenizing the the training text

In [15]:
x_train_text[1]

"Dr Tarr's Torture Dungeon is about a journalist who travels to an insane asylum to write about a new technique they use with their patients. However, the journalist soon finds out that things are not what they seem to be, and the asylum is being run by the patients, and the doctors are in cages. First of all, some parts of the film are just plain boring and just makes you want to fall asleep, and the interesting parts are interesting for all the wrong reasons(A guy who thinks he's a chicken, for instance). I have to admit that the story is actually pretty good, but the film itself bombs. The music of the film is really odd and like something you would hear in some insane comedy, and yes, there is a scene involving dancing chicken men, which pretty much made me want to shut off the screen. Watch this film at your own risk! <br /><br />Rated: R for Violence and Nudity."

**Each number in the next cell corresponds to the token number of the word in the sentence above**

In [16]:
np.array(x_train_tokens[1])

array([ 797, 1633, 9668,    6,   42,    3, 4419,   35, 3525,    5,   32,
       2015, 5095,    5,  894,   42,    3,  168, 2977,   33,  356,   16,
         65, 4562,  188,    1, 4419,  526,  688,   41,   12,  177,   23,
         21,   48,   33,  308,    5,   26,    2,    1, 5095,    6,  109,
        495,   31,    1, 4562,    2,    1, 4931,   23,    8,   86,    4,
         29,   47,  516,    4,    1,   19,   23,   39, 1043,  348,    2,
         39,  162,   22,  178,    5,  778, 2401,    2,    1,  218,  516,
         23,  218,   15,   29,    1,  357,  994,    3,  219,   35, 1252,
        237,    3, 4284,   15, 1951,   10,   25,    5,  987,   12,    1,
         64,    6,  160,  180,   49,   18,    1,   19,  392, 5514,    1,
        207,    4,    1,   19,    6,   62, 1036,    2,   37,  137,   22,
         58,  846,    8,   47, 2015,  202,    2,  422,   46,    6,    3,
        129, 1195, 1103, 4284,  345,   60,  180,   72,   90,   68,  178,
          5, 2908,  122,    1,  258,  103,   11,   

In [17]:
x_dev_tokens = tokenizer.texts_to_sequences(x_dev_text) #Tokenizing the dev set 

## Padding and Truncating

The range of the sequence lengths in the dataset is quite big. RNNs can theoretically run on variable sequence lenghts, but:
1. Due to the implementation in Tensorflow, 
2. The usage of batches to optimize the loss function (easier to represent if it's of uniform size)
3. For the sake of efficiency 

we'll be using sequences of fixed length. We'll convert the data to fixed length by padding or truncating the sequences.

What size do we use for the sequence? If we assume a normal distribution for the lengths of the sequences i.e most sequences are of average length and very few with either very high or very low sequence lengths, then we can use a length of **mean + 2*standard_deviation** which covers 95% of the data

In [18]:
num_tokens = [len(tokens) for tokens in x_train_tokens+x_dev_tokens]
num_tokens = np.array(num_tokens)
num_tokens.shape

(50000,)

In [19]:
print("max length: ", np.max(num_tokens))


max length:  2209


In [20]:
np.mean(num_tokens)

221.27716

In [21]:
max_tokens = np.mean(num_tokens) + 2*np.std(num_tokens)
max_tokens = int(max_tokens)
max_tokens

544

In [22]:
np.sum(num_tokens<=max_tokens)/len(num_tokens)

0.94556

#### We're using pre padding here which means 0s will be added at the beginning of the sequence


In [23]:
pad = 'pre'

In [24]:
x_train_pad = pad_sequences(x_train_tokens, maxlen=max_tokens, padding=pad, truncating=pad)
x_dev_pad = pad_sequences(x_dev_tokens, maxlen=max_tokens, padding=pad, truncating=pad)

In [25]:
print(x_train_pad.shape)
print(x_dev_pad.shape)

(40000, 544)
(10000, 544)


## Integer to word dictionary (Inverse of tokenizer)

In [26]:
idx = tokenizer.word_index 
inverse_map = dict(zip(idx.values(), idx.keys()))


In [27]:
def tokens_to_string(tokens):
    words = [inverse_map[token] for token in tokens if token!=0]
    text = " ".join(words)
    return text

In [28]:
tokens_to_string(x_train_tokens[1])

"dr torture dungeon is about a journalist who travels to an insane asylum to write about a new technique they use with their patients however the journalist soon finds out that things are not what they seem to be and the asylum is being run by the patients and the doctors are in first of all some parts of the film are just plain boring and just makes you want to fall asleep and the interesting parts are interesting for all the wrong reasons a guy who thinks he's a chicken for instance i have to admit that the story is actually pretty good but the film itself bombs the music of the film is really odd and like something you would hear in some insane comedy and yes there is a scene involving dancing chicken men which pretty much made me want to shut off the screen watch this film at your own risk br br rated r for violence and nudity"

## Building the model

In [29]:
model = Sequential() #Using a sequential model RNN

Although we've converted the words to tokens, we can't input this format to the RNN. We represnet the words as vectors (this is called embedding). There are various ways to generate this vector, the popular ones being Word2Vec and GloVe.

This is similar to learning features in an image. Words having similar meanings will have similar vectors or embeddings


In [30]:
embedding_size = 10 #Size of the embedding vector

In [31]:
model.add(Embedding(input_dim=num_words, output_dim=embedding_size, input_length=max_tokens, name='layer_embedding'))

Instructions for updating:
Colocations handled automatically by placer.


In [32]:
# 3 GRU layers
model.add(GRU(units=16, return_sequences=True))
model.add(GRU(units=8, return_sequences=True))
model.add(GRU(units=4))

In [33]:
# 1 densely connected layer with a single output unit and sigmoid activation for binary classification
model.add(Dense(1, activation='sigmoid'))

In [34]:
# Adam optimizer
optimizer = Adam(lr=0.01)

In [35]:
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

## Summary of the model architecture

In [36]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer_embedding (Embedding)  (None, 544, 10)           100000    
_________________________________________________________________
gru (GRU)                    (None, 544, 16)           1296      
_________________________________________________________________
gru_1 (GRU)                  (None, 544, 8)            600       
_________________________________________________________________
gru_2 (GRU)                  (None, 4)                 156       
_________________________________________________________________
dense (Dense)                (None, 1)                 5         
Total params: 102,057
Trainable params: 102,057
Non-trainable params: 0
_________________________________________________________________


## Train the model

In [37]:
model.fit(x_train_pad, y_train, validation_split=0.05, epochs=3, batch_size=64)

Train on 38000 samples, validate on 2000 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x1d1914f53c8>

## Evaluate the model on the dev set 

The accuracy can be improved by:
1. Using higher dimensions for embedding
2. Training for more epochs
3. Obtaining a bigger dataset

In [38]:
result=model.evaluate(x_dev_pad, y_dev)



In [39]:
print("Accuracy {0:.2%}".format(result[1]))

Accuracy 89.91%


## Misclassified samples


In [40]:
y_pred = model.predict(x_dev_pad[:1000]) #Predicted label of the 1st 1000 dev samples
y_pred = y_pred.T[0]

In [45]:
class_pred = np.array([1.0 if p>0.5 else 0.0 for p in y_pred]) #Setting a threshold of 0.5 for binary classification

In [46]:
class_true = y_dev[:1000] #True labels for the 1000 dev samples
np.array(class_true).shape

(1000,)

In [47]:
incorrect = np.where(class_pred != class_true) #Get the indeces of the samples with incorrect predictions
incorrect = incorrect[0]

In [48]:
len(incorrect)

97

In [49]:
idx = incorrect[1]
idx

11

In [51]:
x_dev_text[idx]

'They are hunted and starving. They are completely demoralized and yet they press on through sheer inertia. This film tries to answer the question "How far will human beings go to survive?" Hopelessness emanates from every of this film and like so many japanese films of this time, it condemns the blind military loyalty that pressed the japanese people into war.'

In [52]:
y_dev[idx]

1.0

In [53]:
y_pred[idx]

0.2675526

## New data

In [58]:
text1 = "Excellent movie" # Positive(1)
text2 = "Movie was good and entertaining" # Positive(1)
text3 = "Not a particularly good or entertaining movie" # Negative(0). But the algprithm might get confused 
                                                                    #  because of the order of words
text4 = "Neutral" # Neutral(Should be close to 0.5)
text5 = "Horrible experience" # Negative(0)

text = [text1, text2, text3, text4, text5 ]

In [59]:
new_text = tokenizer.texts_to_sequences(text)

In [60]:
new_text_pad = pad_sequences(new_text, maxlen=max_tokens, padding=pad, truncating=pad)

In [61]:
y_new = model.predict(new_text_pad)
y_new

array([[0.829178  ],
       [0.8697962 ],
       [0.31889778],
       [0.14172691],
       [0.03660696]], dtype=float32)