In [10]:
import os
import numpy as np
import bcolz
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.layers.core import Dropout
from keras.preprocessing import sequence
from keras.utils.data_utils import get_file
from keras.optimizers import Adam
import pickle

# Seed NumPy's global random number generator so NumPy-driven randomness
# repeats between runs.
seed = 7
np.random.seed(seed)

In [2]:
# Directory holding the GloVe text files; it is joined with
# 'glove.6B.100d.txt' when the word vectors are loaded.
GLOVE_DIR = './data/glove.6B/'

In [3]:
# Number of components per vector; must match the GloVe file read below.
EMBEDDING_DIM = 100

# Parse the GloVe text file into a {token: float32 vector} lookup. Each line
# is split on whitespace: the first field is the token, the remaining fields
# are the vector components.
embeddings_index = {}
# `with` guarantees the file handle is released even if a line fails to parse.
# NOTE(review): the file is opened with the platform default encoding;
# confirm that is appropriate if this is run under Python 3.
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [4]:
# word -> integer index mapping shipped with the Keras IMDB dataset.
idx = imdb.get_word_index()

print('Found %s word indices.' % (len(idx),))

Found 88584 word indices.


In [5]:
# Words ordered by ascending index value; peek at the first ten.
idx_arr = list(idx)
idx_arr.sort(key=idx.get)
idx_arr[:10]

[u'the', u'and', u'a', u'of', u'to', u'is', u'br', u'in', u'it', u'i']

In [6]:
# Inverse lookup: integer index -> word. `.items()` is used instead of the
# Python-2-only `.iteritems()` so the cell runs on both Python 2 and 3.
idx2word = {v: k for k, v in idx.items()}

In [7]:
# Vocabulary cap passed to the loader as num_words, and the fixed review
# length (in tokens) that every sequence is padded/truncated to.
top_words = 20000
max_words = 1000

# Load the train/test splits restricted to the top_words vocabulary.
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

# Bring both splits to exactly max_words tokens per review.
X_train, X_test = (
    sequence.pad_sequences(split, maxlen=max_words)
    for split in (X_train, X_test)
)

In [8]:
# imdb.load_data() shifts every word rank by `index_from` (default 3) to make
# room for its reserved ids (0 = padding, 1 = start-of-sequence, 2 = OOV),
# whereas imdb.get_word_index() returns the unshifted ranks. Token id `i` in
# the loaded data therefore corresponds to the word with rank `i - 3`; the
# lookup below undoes that shift so each row lines up with the ids the
# Embedding layer actually receives.
INDEX_FROM = 3

embedding_matrix = np.zeros((top_words, EMBEDDING_DIM))

# Rows 0..INDEX_FROM stay all-zero: they are the reserved / unused ids.
for i in range(INDEX_FROM + 1, len(embedding_matrix)):
    # .get() avoids a KeyError if top_words exceeds the dataset vocabulary.
    word = idx2word.get(i - INDEX_FROM)
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [12]:
# Build the classifier one layer at a time.
model = Sequential()
# Embedding initialised from embedding_matrix and excluded from training.
model.add(Embedding(top_words, EMBEDDING_DIM, input_length=max_words,
                    weights=[embedding_matrix], trainable=False))
model.add(Dropout(0.25))
# 64 filters of width 5 over the token axis; 'same' padding.
model.add(Conv1D(64, 5, padding='same', activation='relu'))
model.add(Dropout(0.25))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.7))
# Single sigmoid unit for the binary label.
model.add(Dense(1, activation='sigmoid'))

In [13]:
# Binary cross-entropy objective, Adam with its default settings, and
# accuracy reported alongside the loss.
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(),
    metrics=['accuracy']
)

In [None]:
# One pass over the training data in batches of 64; the test split is passed
# as the validation data.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=1,
    batch_size=64
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/1
 1984/25000 [=>............................] - ETA: 344s - loss: 0.9006 - acc: 0.4980