In [43]:
from keras.datasets import imdb
from keras import preprocessing

In [57]:
# Max words to use as features
max_features = 10000

# Max length of review (truncate to this size)
max_length = 20

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print("Number of training samples:", len(x_train))

Number of training samples: 25000


The data is loaded as a list of integers, where x_train[i] is a given review of different length. We'll cut the reviews to only the first 20 words. Shorter reviews will be padded with 0s up to 20 tokens. The following lines will turn a list of integers into a 2D integer tensor of shape (samples, max_length).

In [63]:
x_train = preprocessing.sequence.pad_sequences(x_train, maxlen=max_length)
x_test = preprocessing.sequence.pad_sequences(x_test, maxlen=max_length)
print(x_train.shape)

(25000, 20)


In [76]:
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding

model = Sequential()
model.add(Embedding(25000, 8, input_length=max_length))

model.add(Flatten())

# model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 20, 8)             200000    
_________________________________________________________________
flatten_6 (Flatten)          (None, 160)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 161       
Total params: 200,161
Trainable params: 200,161
Non-trainable params: 0
_________________________________________________________________


In [77]:
history = model.fit(x_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [90]:
# I think this is how you can access the embeddings..
emb = model.layers[0]
weights = emb.get_weights()
weights[0][0]

array([-0.06838239,  0.02226386,  0.03244511,  0.04004373, -0.14269125,
        0.09330799, -0.0924679 ,  0.0438177 ], dtype=float32)

## Using pretrained word embeddings

In this case we'll use the raw data of the IMDB dataset, not the one included in the Keras module.

In [91]:
import os

imdb_dir = '/Users/Misko/PycharmProjects/Playfield/data/aclImdb/'
train_dir = os.path.join(imdb_dir, 'train')

labels = []
texts = []

for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    for fname in os.listdir(dir_name):
        if fname[-4:] == ".txt":
            f = open(os.path.join(dir_name, fname))
            texts.append(f.read())
            f.close()
            if label_type == 'neg':
                labels.append(0)
            else:
                labels.append(1)

25000

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

# Max review length
maxlen =  100
# Use a small training dataset, since we're using pretrained embeddings
training_samples = 200
validation_samples = 10000
# Use only the top 10000 words in the dataset
max_words = 10000

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)

# Convert strings to integer indices
sequences = tokenizer.texts_to_sequences(texts)

# word_index will contain all the unique words in the text, ranked by appearance
word_index = tokenizer.word_index
print("Found %d unique tokens." % len(word_index))

In [119]:
# Cutoff to a `maxlen` length of review
# It seems that in case a sequence is longer than maxlen, the `pad_sequences` method will keep the last 
# maxlen tokens and drop the rest at the start of the sequence. 
# It might be better to do it the other way around?
data = pad_sequences(sequences, maxlen=maxlen)

labels = np.asarray(labels)

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# Randomly shuffle the samples and corresponding labels
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

x_train = data[:training_samples]
y_train = labels[:training_samples]
x_val = data[training_samples: training_samples + validation_samples]
y_val = labels[training_samples: training_samples + validation_samples]

Shape of data tensor: (25000, 100)
Shape of label tensor: (25000,)


We can now use the GloVe embeddings, which were previously downloaded from this [link](https://nlp.stanford.edu/projects/glove/). We'll use the GloVe embeddings using 100 dimensions ie., each word (of the 400000 ones) will be encoded in a 100D vector.

In [120]:
glove_dir = '/Users/Misko/PycharmProjects/Playfield/data/glove.6B'

embeddings_index = {}
f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype = 'float32')
    embeddings_index[word] = coefs
f.close()

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [134]:
word_index.pop('an')

32

In [122]:
embedding_dim = 100

embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word)
        
        

(100,)

In [42]:
import numpy as np

samples = ['The cat sat on the mat.', 'The dog ate my homework.']
dimensionality = 1000
max_length = 10
results = np.zeros((len(samples), max_length, dimensionality))
for i, sample in enumerate(samples):
    for j, word in list(enumerate(sample.split()))[:max_length]:
        index = abs(hash(word)) % dimensionality
        results[i, j, index] = 1.
        

(2, 10, 1000)


array([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]