# Glove to word vectors

In [1]:
import pickle
import numpy as np

In [2]:
filename = 'glove.6B.100d.pkl'

In [3]:
with open(filename, 'rb') as f:
    glove = pickle.load(f)

In [4]:
print(len(glove.keys()))

400000


# Reuters' IMDB dataset

In [5]:
from keras.datasets import reuters

Using TensorFlow backend.


In [None]:
(x_train, y_train), (x_test, y_test) = reuters.load_data(path="reuters.npz",
                                                      num_words=None,
                                                      skip_top=0,
                                                      maxlen=None,
                                                      seed=113,
                                                      start_char=1,
                                                      oov_char=2,
                                                      index_from=3)

Downloading data from https://s3.amazonaws.com/text-datasets/reuters.npz

In [None]:
word_index = reuters.get_word_index()

In [None]:
word_freq = sorted(word_index, key=word_index.get)
idx2word = {word_index[w] : w for w in word_freq}

### Peek the reuters newswire

Note to subtract the index_from and start_char

In [None]:
' '.join([idx2word[i - 3] for i in x_train[0][1:]])

### Set the vocab size to 5k most commonly used words

In [10]:
vocab_size = 5000

train = [np.array([i - 3 if i <= vocab_size - 1 - 3 else vocab_size - 1 - 3 for i in cmt[1:]]) for cmt in x_train]
test = [np.array([i - 3 if i <= vocab_size - 1 - 3 else vocab_size - 1 - 3 for i in cmt[1:]]) for cmt in x_test]

In [11]:
print(' '.join([idx2word[i] for i in train[0]]))
print('--------------')
print(' '.join([idx2word[i] for i in test[0]]))

this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert orange is an amazing actor and now the same being director orange father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for orange and would recommend it to everyone to watch and the fly orange was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also orange to the two little orange that played the orange of norman and paul they were just brilliant children are often left out of the orange list i think because the stars that play them all grown up are such a big orange for the whole film but these children are amazing and should be orange for what th

### Padding each review to 500

In [12]:
seq_len = 500

from keras.preprocessing.sequence import pad_sequences
train = pad_sequences(train, maxlen=seq_len, value=0)
test = pad_sequences(test, maxlen=seq_len, value=0)

In [13]:
print(train.shape)

(25000, 500)


### Create embedding matrix

In [14]:
embedding_matrix = np.zeros((vocab_size, 100))

for i in range(1, vocab_size):
    w = idx2word[i]
    v = glove.get(w)
    if v is not None:
        embedding_matrix[i] = v
    else:
        embedding_matrix[i] = np.random.normal(scale=.6, size=(100,))
embedding_matrix[-1] = np.random.normal(scale=.6, size=(100,))

# Keras Embedding Layer

In [28]:
from keras.layers import Input, Flatten, Dropout, Dense, Embedding, Conv1D, MaxPooling1D, SpatialDropout1D, GlobalMaxPooling1D, Activation
from keras.models import Model, Sequential
from keras.layers.normalization import BatchNormalization
from keras.optimizers import Adam
from keras.layers.recurrent import LSTM

In [16]:
embedding_layer = Embedding(
    vocab_size,
    100,
    weights = [embedding_matrix],
    input_length = seq_len,
    trainable=False
)

In [36]:
sequence_input = Input(shape=(seq_len,), dtype=np.int32)
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = GlobalMaxPooling1D()(x)  # global max pooling
x = Dense(128, activation='relu')(x)
y = Dense(46, activation='softmax')(x)

model = Model(sequence_input, y)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, 500)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 500, 100)          500000    
_________________________________________________________________
conv1d_13 (Conv1D)           (None, 496, 128)          64128     
_________________________________________________________________
max_pooling1d_10 (MaxPooling (None, 99, 128)           0         
_________________________________________________________________
conv1d_14 (Conv1D)           (None, 95, 128)           82048     
_________________________________________________________________
max_pooling1d_11 (MaxPooling (None, 19, 128)           0         
_________________________________________________________________
conv1d_15 (Conv1D)           (None, 15, 128)           82048     
__________

In [37]:
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['acc']
)

In [38]:
history = model.fit(train, y_train, epochs=3, validation_data=(test, y_test))

Train on 25000 samples, validate on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
