https://github.com/fastai/courses/blob/master/deeplearning1/nbs/lesson5.ipynb

In [10]:
from theano.sandbox import cuda

import pandas as pd
import numpy as np

import utils; reload(utils)
from utils import *

from IPython.display import display

# Setup Data

In [116]:
# Get data
path = get_file('imdb_full.pkl',
                origin='https://s3.amazonaws.com/text-datasets/imdb_full.pkl',
                md5_hash='d091312047c43cf9e4e38fef92437263')

f = open(path, 'rb')
(x_train, labels_train), (x_test, labels_test) = pickle.load(f)
    

In [118]:
len(x_train)

25000

In [102]:
labels_train[:10]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

## Get converter

In [119]:
from keras.datasets import imdb
idx = imdb.get_word_index()

In [120]:
idx2word = {v:k for k,v in idx.items()}
idx2word[0] = '??'

In [121]:
len(idx2word)

88585

In [122]:
def printReview(idxArray):
    string = [idx2word[w] for w in idxArray]
    print ' '.join(string)

printReview(x_train[2])

brilliant over acting by lesley ann warren best dramatic hobo lady i have ever seen and love scenes in clothes warehouse are second to none the corn on face is a classic as good as anything in blazing saddles the take on lawyers is also superb after being accused of being a turncoat selling out his boss and being dishonest the lawyer of pepto bolt shrugs indifferently i'm a lawyer he says three funny words jeffrey tambor a favorite from the later larry sanders show is fantastic here too as a mad millionaire who wants to crush the ghetto his character is more malevolent than usual the hospital scene and the scene where the homeless invade a demolition site are all time classics look for the legs scene and the two big diggers fighting one bleeds this movie gets better each time i see it which is quite often


## Limit to top words

In [141]:
vocab_size = 5000
def clipVocab(data, vocab_size = 5000-1):
    return [np.array([i if i<vocab_size else vocab_size for i in s]) for s in data]
trn = clipVocab(x_train)
test = clipVocab(x_test)
printReview(test[2])
idx2word[4999]

as a bergman bergman with some knowledge of the bergman history i was pleased with disney's bergman to the issues of class in bergman in the early bergman century the movie depicted well the psychological battles that harry bergman within himself from his childhood bergman of being bergman to his own bergman to break that glass bergman that bergman him from being accepted as an equal in english bergman society likewise the young goes through his own class struggles being a mere bergman in the eyes of the upper bergman americans who bergman at his attempts to rise above his standing br br what i loved best however is how this theme of class is bergman in the characters of parents his father is a working class bergman who sees the value of hard work but is bergman by the upper class his mother however bergman her bergman talent and desire and bergman him to bergman his dream of bergman against those who think he is inferior br br finally the bergman scenes are well photographed although 

'bergman'

## Pad Words

In [131]:
# lets look at the various lengths of reviews
lens = np.array(map(len, trn))
(lens.max(), lens.min(), lens.mean())



(2493, 10, 237.71364)

In [144]:
# Pad to double the mean length of the review
seqLen = 500
def padInput(inp, seqLen):
    return keras.preprocessing.sequence.pad_sequences(inp, maxlen=seqLen, dtype='int32', value=0)

trn = padInput(trn, seqLen)
test = padInput(test, seqLen)

In [126]:
readReview(trn[1])

"?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? bergman or bergman as george bergman stated has been an issue for years but never a plan to help those on the street that were once considered human who did everything from going to school work or vote for the matter most people think of the homeless as just a lost cause while bergman about things such as racism the war on iraq bergman kids to succeed technology the bergman bergman or bergman if they'll be next to end up on the streets br br but what if you were given a bet to live on the streets for a month without the bergman you once had from a home the entertainment sets a bathroom pictures on the wall a computer and everything you once treasure to see what it's like to be homeless that is bergman bergman lesson br br mel brooks who directs who stars as bergman plays a rich man who h

In [127]:
trn.shape

(25000, 500)

In [135]:
test.shape

(25000, 500)

In [136]:
seqLen

500

# Simple NN Model

In [147]:
embeddings = 32

nnModel = Sequential([
    Embedding(vocab_size, embeddings, input_length=seqLen),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    Dense(1, activation='sigmoid')
])
nnModel.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
nnModel.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_6 (Embedding)          (None, 500, 32)       160000      embedding_input_5[0][0]          
____________________________________________________________________________________________________
flatten_6 (Flatten)              (None, 16000)         0           embedding_6[0][0]                
____________________________________________________________________________________________________
dense_10 (Dense)                 (None, 100)           1600100     flatten_6[0][0]                  
____________________________________________________________________________________________________
dropout_6 (Dropout)              (None, 100)           0           dense_10[0][0]                   
___________________________________________________________________________________________

In [148]:
nnModel.fit(trn, labels_train, validation_data=(test, labels_test), nb_epoch=8, batch_size=256)

Train on 25000 samples, validate on 25000 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7fe345736690>


The stanford paper that this dataset is from cites a state of the art accuracy (without unlabelled data) of 0.883. So we're short of that, but on the right track.



# Convolutional NN Model

Dropout parameter on the embedding layer drops out from the latent factors

In [152]:
convModel = Sequential([
    Embedding(vocab_size, 32, input_length=seqLen, dropout=0.2),
    Dropout(0.2),
    Convolution1D(64, 5, border_mode='same', activation='relu'),
    Dropout(0.2),
    MaxPooling1D(),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    Dense(1, activation='sigmoid')
])
convModel.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
convModel.summary()


____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_9 (Embedding)          (None, 500, 32)       160000      embedding_input_8[0][0]          
____________________________________________________________________________________________________
dropout_13 (Dropout)             (None, 500, 32)       0           embedding_9[0][0]                
____________________________________________________________________________________________________
convolution1d_3 (Convolution1D)  (None, 500, 64)       10304       dropout_13[0][0]                 
____________________________________________________________________________________________________
dropout_14 (Dropout)             (None, 500, 64)       0           convolution1d_3[0][0]            
___________________________________________________________________________________________

In [153]:
convModel.fit(trn, labels_train, validation_data=(test, labels_test), nb_epoch=8, batch_size=256)

Train on 25000 samples, validate on 25000 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7fe31a189310>

In [154]:
convModel.fit(trn, labels_train, validation_data=(test, labels_test), nb_epoch=3, batch_size=256)

Train on 25000 samples, validate on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fe31a189c90>

# Using precomputed embeddings

In [166]:
def get_glove_dataset(dataset):
    """Download the requested glove dataset from files.fast.ai
    and return a location that can be passed to load_vectors.
    """
    # see wordvectors.ipynb for info on how these files were
    # generated from the original glove data.
    md5sums = {'6B.50d': '8e1557d1228decbda7db6dfd81cd9909',
               '6B.100d': 'c92dbbeacde2b0384a43014885a60b2c',
               '6B.200d': 'af271b46c04b0b2e41a84d8cd806178d',
               '6B.300d': '30290210376887dcc6d0a5a6374d8255'}
    glove_path = os.path.abspath('./data/imdbsentiment/glove')
    %mkdir -p $glove_path
    return get_file(dataset,
                    'http://files.fast.ai/models/glove/' + dataset + '.tgz',
                    cache_subdir=glove_path,
                    md5_hash=md5sums.get(dataset, None),
                    untar=True)

In [164]:
def load_vectors(loc):
    return (load_array(loc+'.dat'),
        pickle.load(open(loc+'_words.pkl','rb')),
        pickle.load(open(loc+'_idx.pkl','rb')))

In [167]:
vecs, words, wordidx = load_vectors(get_glove_dataset('6B.50d'))

Untaring file...


In [172]:
(vecs.shape)

(400000, 50)

In [177]:
words[:20]

['the',
 ',',
 '.',
 'of',
 'to',
 'and',
 'in',
 'a',
 '"',
 "'s",
 'for',
 '-',
 'that',
 'on',
 'is',
 'was',
 'said',
 'with',
 'he',
 'as']

In [182]:
wordidx

{'biennials': 130852,
 'verplank': 42458,
 'soestdijk': 274735,
 'woode': 311324,
 'mdbo': 212156,
 'sowell': 65544,
 'mdbu': 119490,
 'woods': 2507,
 'spiders': 19780,
 'mdbs': 285106,
 'mdbr': 131910,
 'woody': 10967,
 'trawling': 56782,
 'hwasung': 389348,
 'spidery': 126257,
 'regularize': 106404,
 'hennings': 85340,
 'canes': 34403,
 'canet': 110349,
 'caney': 97434,
 'yusaf': 223713,
 'chanthaburi': 154208,
 'igual': 120122,
 'hallucinate': 140964,
 'caned': 78728,
 'mirisch': 180868,
 'kalecik': 246349,
 'rickman': 54345,
 'jacquemod': 117004,
 'ioannidis': 142172,
 'canel': 299678,
 'canem': 357071,
 'afikoman': 354205,
 'dra\xc5\xbea': 228083,
 'heliothis': 265609,
 'replacer': 292996,
 'pigment': 29943,
 'bogyman': 343214,
 'transvestism': 191391,
 'nmu': 155319,
 '\xc4\x91\xc6\xb0\xe1\xbb\x9dng': 265245,
 '18f': 251230,
 'seamier': 168003,
 'illtyd': 307371,
 'wooded': 17054,
 'shipworms': 359396,
 'boorstein': 310641,
 'grueling': 17482,
 'persita': 353293,
 'wooden': 4836,

In [183]:
idx2word = {v:k for k,v in wordidx.items()}
idx2word

{0: 'the',
 1: ',',
 2: '.',
 3: 'of',
 4: 'to',
 5: 'and',
 6: 'in',
 7: 'a',
 8: '"',
 9: "'s",
 10: 'for',
 11: '-',
 12: 'that',
 13: 'on',
 14: 'is',
 15: 'was',
 16: 'said',
 17: 'with',
 18: 'he',
 19: 'as',
 20: 'it',
 21: 'by',
 22: 'at',
 23: '(',
 24: ')',
 25: 'from',
 26: 'his',
 27: "''",
 28: '``',
 29: 'an',
 30: 'be',
 31: 'has',
 32: 'are',
 33: 'have',
 34: 'but',
 35: 'were',
 36: 'not',
 37: 'this',
 38: 'who',
 39: 'they',
 40: 'had',
 41: 'i',
 42: 'which',
 43: 'will',
 44: 'their',
 45: ':',
 46: 'or',
 47: 'its',
 48: 'one',
 49: 'after',
 50: 'new',
 51: 'been',
 52: 'also',
 53: 'we',
 54: 'would',
 55: 'two',
 56: 'more',
 57: "'",
 58: 'first',
 59: 'about',
 60: 'up',
 61: 'when',
 62: 'year',
 63: 'there',
 64: 'all',
 65: '--',
 66: 'out',
 67: 'she',
 68: 'other',
 69: 'people',
 70: "n't",
 71: 'her',
 72: 'percent',
 73: 'than',
 74: 'over',
 75: 'into',
 76: 'last',
 77: 'some',
 78: 'government',
 79: 'time',
 80: '$',
 81: 'you',
 82: 'years',
 83

In [None]:
## Next step recreate below code
def create_emb():
    n_fact = vecs.shape[1]
    emb = np.zeros((vocab_size, n_fact))

    for i in range(1,len(emb)):
        word = idx2word[i]
        if word and re.match(r"^[a-zA-Z0-9\-]*$", word):
            src_idx = wordidx[word]
            emb[i] = vecs[src_idx]
        else:
            # If we can't find the word in glove, randomly initialize
            emb[i] = normal(scale=0.6, size=(n_fact,))

    # This is our "rare word" id - we want to randomly initialize
    emb[-1] = normal(scale=0.6, size=(n_fact,))
    emb/=3
    return emb