In [1]:
from __future__ import print_function

import warnings
warnings.filterwarnings("ignore")

import logging
import numpy as np

%matplotlib inline

### Setting up random state for reproducibility

In [2]:
RANDOM_STATE = 1234
np.random.seed(RANDOM_STATE)

### Setting up logger

In [3]:
# Logging configuration

logger = logging.getLogger(__name__)

handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s %(name)-5s %(levelname)-5s %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)

### Defining constants

In [4]:
VOCABULARY_SIZE = 5000
EMBEDDING_SIZE = 300
BATCH_SIZE = 64
NUM_EPOCHS = 10
MAX_REVIEW_LENGTH = 300

### Load Word2Vec model


Following code loads Word2Vec model for getting embeddings for words

In [5]:
from gensim.models import KeyedVectors

w2v = KeyedVectors.load_word2vec_format('~/embeddings/GoogleNews-vectors-negative300.bin', binary=True)

### Restrict maximum features

We restrict the maximum number of features a.k.a. our inputs to be 5000. So only top 5000 words will be chosen from IMDB dataset. `load_data` automatically does a 50:50 train test split.

In [6]:
from keras.datasets import imdb

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=VOCABULARY_SIZE)

Using TensorFlow backend.


In [7]:
logger.debug('Length of X_train: %(len)s', {'len': len(x_train)})
logger.debug('Length of X_test: %(len)s', {'len': len(x_test)})

2018-04-22 16:54:49,589 __main__ DEBUG Length of X_train: 25000
2018-04-22 16:54:49,590 __main__ DEBUG Length of X_test: 25000


In [8]:
from keras.preprocessing import sequence

X_train = sequence.pad_sequences(x_train, maxlen=MAX_REVIEW_LENGTH)
X_test = sequence.pad_sequences(x_test, maxlen=MAX_REVIEW_LENGTH)

In [9]:
logger.debug('Shape of X_train: %(shape)s', {'shape': X_train.shape})
logger.debug('Shape of X_test: %(shape)s', {'shape': X_test.shape})

2018-04-22 16:54:50,459 __main__ DEBUG Shape of X_train: (25000, 300)
2018-04-22 16:54:50,460 __main__ DEBUG Shape of X_test: (25000, 300)


### Embedding matrix

In [10]:
word_to_index = imdb.get_word_index()

In [11]:
index_to_word = {v : k  for k, v in word_to_index.items()}

In [12]:
embedding_weights = np.zeros((VOCABULARY_SIZE, EMBEDDING_SIZE))

null_embeddings = 0
for word, index in word_to_index.items():
    if index > VOCABULARY_SIZE:
        continue

    try:
        embedding_weights[index, :] = w2v[word]
    except KeyError:
        null_embeddings += 1 # Keep count of null embeddings

In [13]:
null_embeddings

154

### Simple LSTM

In [14]:
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.layers.embeddings import Embedding

model = Sequential()
model.add(Embedding(VOCABULARY_SIZE, EMBEDDING_SIZE, weights=[embedding_weights], trainable=True))
model.add(LSTM(128, dropout = 0.5, recurrent_dropout=0.5))
model.add(Dense(1, activation='sigmoid', kernel_initializer='glorot_normal'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [15]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 300)         1500000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               219648    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 1,719,777
Trainable params: 1,719,777
Non-trainable params: 0
_________________________________________________________________


In [16]:
model.fit(X_train, y_train, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS, validation_data=(X_test, y_test))

Train on 25000 samples, validate on 25000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f111cfe34a8>