# Lesson 5 - NLP

In [1]:
from theano.sandbox import cuda

Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5103)


In [2]:
%matplotlib inline
import utils; reload(utils)
from utils import *
from __future__ import division, print_function
import keras

Using Theano backend.


In [3]:
# Trailing slash matters: weight file names are appended to this prefix by plain string concatenation.
model_path = "kaggle_data/imdb/models/"

### Set up data

In [6]:
# imdb sentiment analysis dset is built into keras (like mnist)
from keras.datasets import imdb

In [7]:
idx = imdb.get_word_index()

In [12]:
# it's just a dict, words are keys, index is the value
# BUT - they have also been indexed by frequency of occurrence, so idx nr 3 is the 3rd most common word
type(idx)

dict

In [14]:
len(idx)

88584

In [17]:
# we can obtain the list of words from the idx
# we're using the get() method which will take a key and lookup the value of that key in the idx. So again, frequency.
idx_arr = sorted(idx, key=idx.get)
idx_arr[:10]

['the', 'and', 'a', 'of', 'to', 'is', 'br', 'in', 'it', 'i']

In [29]:
# Build the reverse lookup (index -> word) so that encoded reviews can be decoded back to text.
idx2word = dict((index, word) for word, index in idx.items())
# Same inversion through the lazy Python 2 iterator; kept under its own name so the two can be compared.
idx3word = dict((index, word) for word, index in idx.iteritems())

In [30]:
idx2word == idx3word

True

In [31]:
# Load the actual data: the full, already tokenised IMDB set (get_file downloads and caches it).
path = get_file('imdb_full.pkl',
                origin='https://s3.amazonaws.com/text-datasets/imdb_full.pkl',
                md5_hash='d091312047c43cf9e4e38fef92437263')
# NOTE(review): pickle.load can execute arbitrary code - acceptable here only because the file
# comes from a fixed origin and an md5 hash is passed to get_file.
# The context manager guarantees the file handle is closed once the data is in memory.
with open(path, 'rb') as f:
    (x_train, labels_train), (x_test, labels_test) = pickle.load(f)

In [32]:
len(x_train)

25000

In [34]:
# let's grab the first review (the one about bromwell high) and decode it to words
x_train[0][:10]

[23022, 309, 6, 3, 1069, 209, 9, 2175, 30, 1]

In [35]:
# Decode the first review: look up each token id in idx2word and join the words with spaces.
review_in_words = " ".join(idx2word[token] for token in x_train[0])
review_in_words

"bromwell high is a cartoon comedy it ran at the same time as some other programs about school life such as teachers my 35 years in the teaching profession lead me to believe that bromwell high's satire is much closer to reality than is teachers the scramble to survive financially the insightful students who can see right through their pathetic teachers' pomp the pettiness of the whole situation all remind me of the schools i knew and their students when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled at high a classic line inspector i'm here to sack one of your teachers student welcome to bromwell high i expect that many adults of my age think that bromwell high is far fetched what a pity that it isn't"

In [36]:
# let's look at the labels (we're looking at 2 categories - positive and negative reviews)
labels_train[:10]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [37]:
# from what I see we might wanna shuffle before training.
labels_train[-10::]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

#### Reduce dimensionality by turning less frequent word tokens into the same thing

In [38]:
import numpy as np

In [42]:
vocab_size = 5000
# Clip the token ids: anything at or above vocab_size - 1 collapses into that single "rare word" id.
trn = [np.minimum(np.array(review), vocab_size - 1) for review in x_train]

In [44]:
# you can see that all of the less frequent words (index 4999 and above) became 4999.
# That's because we clip at vocab_size - 1, which keeps every id within 0..4999 (5000 distinct values)
trn[0]

array([4999,  309,    6,    3, 1069,  209,    9, 2175,   30,    1,  169,   55,   14,   46,   82,
       4999,   41,  393,  110,  138,   14, 4999,   58, 4477,  150,    8,    1, 4999, 4999,  482,
         69,    5,  261,   12, 4999, 4999, 2003,    6,   73, 2436,    5,  632,   71,    6, 4999,
          1, 4999,    5, 2004, 4999,    1, 4999, 1534,   34,   67,   64,  205,  140,   65, 1232,
       4999, 4999,    1, 4999,    4,    1,  223,  901,   29, 3024,   69,    4,    1, 4999,   10,
        694,    2,   65, 1534,   51,   10,  216,    1,  387,    8,   60,    3, 1472, 3724,  802,
          5, 3521,  177,    1,  393,   10, 1238, 4999,   30,  309,    3,  353,  344, 2989,  143,
        130,    5, 4999,   28,    4,  126, 4999, 1472, 2375,    5, 4999,  309,   10,  532,   12,
        108, 1470,    4,   58,  556,  101,   12, 4999,  309,    6,  227, 4187,   48,    3, 2237,
         12,    9,  215])

In [45]:
# Apply the same rare-word clipping to the test reviews.
test = [np.minimum(np.array(r), vocab_size - 1) for r in x_test]
test[0]

array([  10,  432,    2,  216,   11,   17,  233,  311,  100,  109, 4999,    5,   31,    3,  168,
        366,    4, 1920,  634,  971,   12,   10,   13, 4999,    5,   64,    9,   85,   36,   48,
         10,  694,    4, 4999, 4999,   26,   13,   61,  499,    5,   78,  209,   10,   13,  352,
       4999,  253,    1,  106,    4, 3270, 4999,   52,   70,    2, 1839, 4999,  253, 1019, 4999,
         16,  138, 4999,    1, 1910,    4,    3,   49,   17,    6,   12,    9,   67, 2885,   16,
        260, 1435,   11,   28,  119,  615,   12,    1,  433,  747,   60,   13, 2959,   43,   13,
       3080,   31, 2126,  312,    1,   83,  317,    4,    1,   17,    2,   68, 1678,    5, 1671,
        312,    1,  330,  317,  134, 4999,    1,  747,   10,   21,   61,  216,  108,  369,    8,
       1671,   18,  108,  365, 2068,  346,   14,   70,  266, 2721,   21,    5,  384,  256,   64,
         95, 2575,   11,   17,   13,   84,    2,   10, 1464,   12,   22,  137,   64,    9,  156,
         22, 1916])

#### Show length of reviews
Let's figure out the max, min and average review length, so that we can again reduce some dimensions.

In [48]:
# Collect the length of every review into an array of lengths.
# A list comprehension is used rather than np.array(map(len, trn)): on Python 3 map() returns an
# iterator, which numpy would wrap as a single object instead of an array of lengths.
lens = np.array([len(review) for review in trn])

In [50]:
max(lens), min(lens), lens.mean()

(2493, 10, 237.71364)

In [51]:
# Seems that 500 words per review might be enough (given a mean of 240)

In [52]:
# there's a keras preprocessing function that helps turn vectors to a specific length (trimming and padding with
# a pre-specified padding char/number)
from keras.preprocessing import sequence
pad_length = 500
pad_thingy = 0

# padding with 0 is safe: no real word has index 0 (the most common word, "the", is encoded as 1 - see the
# decoded first review), so 0 acts purely as a filler token
trn = sequence.pad_sequences(trn, maxlen=pad_length, value=pad_thingy)

In [54]:
# by default it pre-pads
trn[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,

In [55]:
# repeat for test
test = sequence.pad_sequences(test, maxlen=pad_length, value=pad_thingy)
test[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,

In [56]:
test.shape

(25000, 500)

## Create simple models
We'll start with a single hidden layer model, for a benchmark.

### Single Hidden NN
Note that we can't expect much by just feeding raw word indexes into the NN; we have to use an embedding, otherwise "the" would be worth 1 and "bromwell" would be worth 4999 - arbitrary numbers that say nothing about what the words mean.

In [59]:
# imports
from keras.layers import Dense, Dropout, Convolution1D, MaxPooling1D, Embedding, Flatten
from keras.models import Sequential
from keras.optimizers import Adam

In [68]:
# Embedding arguments: the vocabulary size (rows are looked up by word index), the number of
# latent factors learned per word, and the (padded) length of every review.
model = Sequential()
model.add(Embedding(vocab_size, 32, input_length=pad_length))
# the embedding yields a 500 words x 32 factors matrix per review; Dense layers need a flat vector
model.add(Flatten())
model.add(Dense(100, activation="relu"))
model.add(Dropout(.7))
model.add(Dense(1, activation="sigmoid"))

In [69]:
model.compile(loss="binary_crossentropy", optimizer=Adam(), metrics=["accuracy"])

In [70]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_2 (Embedding)          (None, 500, 32)       160000      embedding_input_2[0][0]          
____________________________________________________________________________________________________
flatten_2 (Flatten)              (None, 16000)         0           embedding_2[0][0]                
____________________________________________________________________________________________________
dense_3 (Dense)                  (None, 100)           1600100     flatten_2[0][0]                  
____________________________________________________________________________________________________
dropout_2 (Dropout)              (None, 100)           0           dense_3[0][0]                    
___________________________________________________________________________________________

In [71]:
model.fit(trn, labels_train, batch_size=32, nb_epoch=5, 
          validation_data=(test, labels_test))

Train on 25000 samples, validate on 25000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fe3ebee1550>

### Conv + maxpooling
Is likely to work better since we have ordered data.

In [73]:
model = Sequential()
model.add(Embedding(vocab_size, 32, input_length=pad_length))
# no Flatten() yet: the convolution works on the (words x factors) matrix, much like on an image;
# we flatten only after the convolution/pooling stage
model.add(Convolution1D(64, 5, border_mode="same", activation="relu"))  # 64 filters of size 5
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(100, activation="relu"))
model.add(Dropout(.7))
model.add(Dense(1, activation="sigmoid"))

In [74]:
model.compile(loss="binary_crossentropy", optimizer=Adam(), metrics=["accuracy"])

In [75]:
model.fit(trn, labels_train, validation_data=(test, labels_test), batch_size=32, nb_epoch=5)

Train on 25000 samples, validate on 25000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fe3dae93710>

In [82]:
# we can adjust and use dropout within the embedding too (effectively dropping certain words)
conv2 = Sequential([
        Embedding(vocab_size, 32, input_length=pad_length, dropout=0.2), # adding dropout within the embedding
        Dropout(.2),
        Convolution1D(64, 5, border_mode="same", activation="relu"),
        Dropout(.2),
        MaxPooling1D(),
        Flatten(),
        Dense(100, activation="relu"),
        Dropout(.7),
        Dense(1, activation="sigmoid")
    ])

In [83]:
conv2.compile(loss="binary_crossentropy", optimizer=Adam(), metrics=["accuracy"])

In [84]:
conv2.fit(trn, labels_train, validation_data=(test, labels_test), batch_size=64, nb_epoch=5)

Train on 25000 samples, validate on 25000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fe3d1712b50>

In [85]:
conv2.save_weights(model_path + "conv2.h5")

In [86]:
conv2.load_weights(model_path + "conv2.h5")

In [87]:
# zippity doo-dah, we've beaten academia
# now we'll try to use the glove embeddings instead of training our own

### Pretrained Glove Embeddings
Let's use those and see what happens.

In [88]:
def load_vectors(loc):
    """Load a preprocessed word-vector set stored under the path prefix `loc`.

    Reads three files: `loc + '.dat'` (through load_array), `loc + '_words.pkl'`
    and `loc + '_idx.pkl'` (both through pickle).

    Returns a tuple with the loaded contents of those three files, in that order.
    """
    vectors = load_array(loc+'.dat')
    # NOTE(review): pickle.load can execute arbitrary code - only use on trusted files.
    # Context managers make sure both pickle files are closed after reading.
    with open(loc+'_words.pkl','rb') as words_file:
        word_list = pickle.load(words_file)
    with open(loc+'_idx.pkl','rb') as idx_file:
        word_index = pickle.load(idx_file)
    return (vectors, word_list, word_index)

In [92]:
glove_path = 'kaggle_data/glove/results/'
vecs, words, wordidx = load_vectors(glove_path + '6B.50d')

In [97]:
# so we've got the actual vectors in vecs, the words in words and a matching dictionary in wordidx
words[:10]

['the', ',', '.', 'of', 'to', 'and', 'in', 'a', '"', "'s"]

In [101]:
# we want to figure out how to remap our words and indexes into Glove's, and handle cases where words have no match
vecs.shape[0], vecs.shape[1] # 50 dimensions per word in Glove


(400000, 50)

In [102]:
def create_emb():
    """Build the initial embedding matrix for our vocabulary from the GloVe vectors.

    Reads the notebook globals vecs, wordidx, idx2word and vocab_size.

    Returns an array of shape (vocab_size, n_fact) with n_fact = vecs.shape[1].
    Row 0 stays all zeros (the loop starts at 1); rows of words found in GloVe hold
    the GloVe vector; every other row, and the last ("rare word") row, is drawn from
    normal(scale=0.6). The whole matrix is divided by 3 before it is returned.
    """
    # this grabs the number of dimensions per word in Glove (50)
    n_fact = vecs.shape[1]

    # zero-filled matrix of vocab_size by n_fact: one row per word id in our vocabulary,
    # each row to be filled with that word's vector
    emb = np.zeros((vocab_size, n_fact))

    for i in range(1,len(emb)):
        # here we grab each of our current words by its index
        word = idx2word[i]
        # Only plain alphanumeric/hyphen tokens are looked up, and only when Glove actually
        # knows the word - without the membership test a missing word raises KeyError.
        if word and re.match(r"^[a-zA-Z0-9\-]*$", word) and word in wordidx:
            # here we grab the index in Glove of that word
            src_idx = wordidx[word]
            # and we put the corresponding vector in our embedding matrix, at the index of that word
            emb[i] = vecs[src_idx]
        else:
            # If we can't find the word in glove, randomly initialize
            emb[i] = normal(scale=0.6, size=(n_fact,))

    # This is our "rare word" id - we want to randomly initialize
    emb[-1] = normal(scale=0.6, size=(n_fact,))
    emb/=3
    return emb

In [103]:
emb = create_emb()

In [106]:
emb.shape

(5000, 50)

In [111]:
emb[4999]

array([-0.0406, -0.0468,  0.0542, -0.0995,  0.1925, -0.177 ,  0.1945,  0.4075,  0.2944,  0.04  ,
        0.2115, -0.0569, -0.0217,  0.3187, -0.2585,  0.0222,  0.4323, -0.2497, -0.0955, -0.1574,
        0.0973,  0.0809, -0.1142, -0.0458,  0.0273, -0.1005, -0.1229, -0.1304,  0.2392, -0.0945,
       -0.2137, -0.0172, -0.3156,  0.2925, -0.0208,  0.259 , -0.0255, -0.0654, -0.1538,  0.1738,
        0.0362, -0.113 , -0.1628, -0.0823, -0.0871,  0.2895,  0.1051,  0.1764,  0.01  ,  0.5174])

In [112]:
# notice that row 0 stays all zeros: index 0 is the padding token (not "the", which is index 1), so padding carries no signal
emb[0]

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [114]:
# we can very easily pass our embedding matrix to the keras.layers.Embedding constructor (weights parameter)

glove_model = Sequential([
        Embedding(vocab_size, 50, input_length=pad_length, weights=[emb], dropout=0.2, trainable=False),
        # since we want to keep these embeddings we set them to be non-trainable
        Dropout(.25),
        Convolution1D(64, 5, activation="relu"),
        Dropout(.25),
        MaxPooling1D(),
        Flatten(),
        Dense(100, activation="relu"),
        Dropout(.7),
        Dense(1, activation="sigmoid"),
    ])

In [115]:
glove_model.compile(loss="binary_crossentropy", optimizer=Adam(), metrics=["accuracy"])

In [116]:
glove_model.fit(trn, labels_train, validation_data=(test, labels_test), batch_size=64, nb_epoch=5)

Train on 25000 samples, validate on 25000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fe3be7e6150>

In [117]:
# we can also try to finetune the original glove weights - especially since words that weren't in the set
# of the Glove tokens get random weights.
# we just need to set the first (Embedding) layer back to trainable

In [121]:
# Unfreeze the embedding layer of the model we actually fit below (glove_model, not the earlier `model`).
glove_model.layers[0].trainable = True
# A changed `trainable` flag and a new learning rate only take effect once the model is compiled again,
# so recompile with a lower-rate Adam; the weights learned so far are kept.
glove_model.compile(loss="binary_crossentropy", optimizer=Adam(lr=1e-4), metrics=["accuracy"])
glove_model.fit(trn, labels_train, validation_data=(test, labels_test), batch_size=64, nb_epoch=5)

Train on 25000 samples, validate on 25000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fe3bee25210>

### Multi-size CNN
Based on the thing Ben Bowles did at Quid. The idea here is to use the Functional API instead of the Sequential model: run several convolutional layers with different filter sizes over the same input, concatenate their results, and pass them as one vector to the Dense layer.

In [123]:
from keras.layers import Merge

In [126]:
# we're still using our Glove embeddings as input: the sub-graph receives the output of the
# Embedding layer, i.e. pad_length word positions with 50 factors each (not vocab_size positions)
graph_in = Input((pad_length, 50))
convs = []

# we'll try 3, 4 and 5 as filter sizes
for fsz in range(3, 6):
    x = Convolution1D(64, fsz, border_mode="same", activation="relu")(graph_in)
    x = MaxPooling1D()(x)
    x = Flatten()(x)
    convs.append(x)

# concatenate the flattened outputs of the three branches into a single vector
out = Merge(mode="concat")(convs)
graph = Model(graph_in, out)

In [127]:
emb = create_emb()

In [129]:
# replace the conv/maxpool layers in our original model with the multi-size convs
multi_conv_model = Sequential([
        Embedding(vocab_size, 50, input_length=pad_length, weights=[emb]),
        Dropout(.2),
        graph,
        Dropout(.5),
        Dense(100, activation="relu"),
        Dropout(.7),
        Dense(1, activation="sigmoid"),
    ])

In [130]:
multi_conv_model.compile(loss="binary_crossentropy", optimizer=Adam(), metrics=["accuracy"])

In [131]:
multi_conv_model.fit(trn, labels_train, validation_data=(test, labels_test), nb_epoch=5, batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fe3ba72c310>

### LSTM
Beyond the beyond.

In [132]:
# input_length is the padded review length (pad_length); the previous name was a typo and undefined
model = Sequential([
    Embedding(vocab_size, 32, input_length=pad_length, mask_zero=True,
              W_regularizer=l2(1e-6), dropout=0.2),
    LSTM(100, consume_less='gpu'),
    Dense(1, activation='sigmoid')])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_11 (Embedding)         (None, 500, 32)       160000      embedding_input_10[0][0]         
____________________________________________________________________________________________________
lstm_1 (LSTM)                    (None, 100)           53200       embedding_11[0][0]               
____________________________________________________________________________________________________
dense_20 (Dense)                 (None, 1)             101         lstm_1[0][0]                     
Total params: 213301
____________________________________________________________________________________________________


In [133]:
model.fit(trn, labels_train, validation_data=(test, labels_test), nb_epoch=2, batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fe3a988c910>