In [1]:
from __future__ import print_function

import warnings
warnings.filterwarnings("ignore")

import logging
import numpy as np

%matplotlib inline

### Setting up random state for reproducibility

In [2]:
import random

# Single seed shared by the random number generators seeded in this cell.
RANDOM_STATE = 1234

# Seed both Python's built-in generator and NumPy's global generator; seeding
# NumPy alone leaves anything that draws from the `random` module unseeded.
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

# NOTE(review): the deep-learning backend keeps its own RNG, which is not
# seeded here -- confirm whether it must be seeded too for full reproducibility.

### Setting up logger

In [3]:
# Logging configuration
#
# Notebook-wide logger that writes DEBUG-and-above records through a stream
# handler using a "<time> <logger name> <level> <message>" layout.

logger = logging.getLogger(__name__)

# `getLogger` returns the same object on every call, so re-running this cell
# would otherwise stack another StreamHandler on it and print each message
# several times. Remove handlers left over from a previous run first.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)

handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s %(name)-5s %(levelname)-5s %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)

### Load Word2Vec model


The following code loads a Word2Vec model, which is used to obtain embeddings for words.

In [4]:
from gensim.models import KeyedVectors

# Path to the vectors stored in binary word2vec format (Google News vectors,
# going by the file name).
w2v_path = '~/embeddings/GoogleNews-vectors-negative300.bin'

# Load the vectors into a KeyedVectors instance.
w2v = KeyedVectors.load_word2vec_format(w2v_path, binary=True)

### Restrict maximum features

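We restrict the maximum number of features (i.e., the vocabulary size of our inputs) to 5000, so only the 5000 most frequent words in the IMDB dataset are kept. `load_data` automatically returns a 50:50 train/test split. We also fix a maximum review length of 500 tokens, which is used later when padding the sequences.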

In [5]:
# max_features      -> passed as `num_words` when loading the IMDB data
# max_review_length -> passed as `maxlen` when padding the review sequences
max_features, max_review_length = 5000, 500

In [6]:
from keras.datasets import imdb

# `load_data` returns a (train, test) pair, each itself an (inputs, labels)
# pair; the vocabulary is capped via `num_words`.
train_split, test_split = imdb.load_data(num_words=max_features)
x_train, y_train = train_split
x_test, y_test = test_split

Using TensorFlow backend.


In [8]:
# Report how many reviews each raw split contains.
for message, reviews in (
    ('Length of X_train: %(len)s', x_train),
    ('Length of X_test: %(len)s', x_test),
):
    logger.debug(message, {'len': len(reviews)})

2018-04-04 22:28:20,481 __main__ DEBUG Length of X_train: 25000
2018-04-04 22:28:20,482 __main__ DEBUG Length of X_test: 25000


In [9]:
from keras.preprocessing import sequence

# Bring every review to exactly `max_review_length` entries. The raw
# `x_train` / `x_test` lists are left untouched; the padded arrays get the
# capitalised names.
X_train, X_test = (
    sequence.pad_sequences(reviews, maxlen=max_review_length)
    for reviews in (x_train, x_test)
)

In [10]:
# Report the array shape of each padded split.
for message, padded in (
    ('Shape of X_train: %(shape)s', X_train),
    ('Shape of X_test: %(shape)s', X_test),
):
    logger.debug(message, {'shape': padded.shape})

2018-04-04 22:28:24,243 __main__ DEBUG Shape of X_train: (25000, 500)
2018-04-04 22:28:24,244 __main__ DEBUG Shape of X_test: (25000, 500)


In [12]:
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.layers.embeddings import Embedding

model = Sequential()