In [149]:
from theano.sandbox import cuda

In [150]:
%matplotlib inline
from __future__ import division, print_function

In [137]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.layers.core import Flatten, Dense, Dropout, Lambda
from keras.layers.normalization import BatchNormalization
from keras.optimizers import SGD, RMSprop, Adam
from keras.layers.convolutional import *
from keras.utils.np_utils import to_categorical

In [1]:
from keras.datasets import reuters

Using Theano backend.
Using gpu device 0: Tesla K80 (CNMeM is disabled)


In [29]:
# Load the word -> integer-id dictionary that accompanies the Reuters dataset.
# NOTE(review): the lookups further down ('the' -> 1) suggest the ids are
# frequency ranks — confirm against the Keras dataset documentation.
idx = reuters.get_word_index(path="reuters_word_index.pkl")

In [30]:
# Number of distinct words known to the index.
len(idx)

30979

In [34]:
# Words ordered by their integer id (smallest id first).
idx_arr = sorted(idx.keys(), key=lambda word: idx[word])
# Spot-check two ids and the ten lowest-id words.
(idx['the'], idx['dollar'], idx_arr[0:10])

(1, 110, ['the', 'of', 'to', 'in', 'said', 'and', 'a', 'mln', '3', 'for'])

In [32]:
# Reverse lookup: integer id -> word, used to decode samples back into text.
# `.items()` is used instead of the Python-2-only `.iteritems()` so that this cell
# runs unchanged on both Python 2 and Python 3.
idx2word = {word_id: word for word, word_id in idx.items()}

In [114]:
# Load the train/test splits of the Reuters newswire dataset.
# WARNING : reuters.load_data has a bug when oov_char is None, hence oov_char=0 here.
# source : https://raw.githubusercontent.com/fchollet/keras/master/keras/datasets/reuters.py
# No start marker (start_char=None) and no id offset (index_from=0) are requested,
# so the ids can be decoded directly with the word index loaded above.
(x_train, labels_train), (x_test, labels_test) = reuters.load_data(
    path="reuters.pkl",
    nb_words=None,
    skip_top=0,
    maxlen=None,
    test_split=0.2,
    seed=113,
    start_char=None,
    oov_char=0,
    index_from=0,
)

In [103]:
# Total number of documents across both splits.
sum(len(split) for split in (x_train, x_test))

11228

In [104]:
# Length (in tokens) of every training document, then shortest / mean / longest.
lens = list(map(len, x_train))
(min(lens), sum(lens) / len(lens), max(lens))

(12, 144.5398574927633, 2375)

In [127]:
from collections import Counter

# Class-label range and per-class frequency in the training split.
label_counts = Counter(labels_train)
print(min(label_counts), max(label_counts))
print(label_counts)

0 45
Counter({3: 3159, 4: 1949, 19: 549, 16: 444, 1: 432, 11: 390, 20: 269, 13: 172, 8: 139, 10: 124, 9: 101, 21: 100, 25: 92, 2: 74, 18: 66, 24: 62, 0: 55, 34: 50, 12: 49, 36: 49, 6: 48, 28: 48, 30: 45, 23: 41, 17: 39, 31: 39, 40: 36, 32: 32, 41: 30, 14: 26, 26: 24, 39: 24, 43: 21, 15: 20, 29: 19, 37: 19, 38: 19, 45: 18, 5: 17, 7: 16, 22: 15, 27: 15, 42: 13, 44: 12, 33: 11, 35: 10})


In [106]:
# Decode the first training document back into words.
wrds = []
for word_id in x_train[0]:
    wrds.append(idx2word[word_id])
' '.join(wrds)

'mcgrath rentcorp said as a result of its december acquisition of space co it expects earnings per share in 1987 of 1 15 to 1 30 dlrs per share up from 70 cts in 1986 the company said pretax net should rise to nine to 10 mln dlrs from six mln dlrs in 1986 and rental operation revenues to 19 to 22 mln dlrs from 12 5 mln dlrs it said cash flow per share this year should be 2 50 to three dlrs reuter 3'

In [153]:
# Decode training document #50 back into words.
wrds = []
for word_id in x_train[50]:
    wrds.append(idx2word[word_id])
' '.join(wrds)

"warner communications inc said its warner communications investors inc unit acquired stock and warrants representing 416 668 shares of berkey inc common stock or the equivalent of 8 2 pct of the company's common stock outstanding in a filing with the securities and exchange commission warner communications investors said it paid about one mln dlrs to berkey on september 23 to acquire 104 167 shares of berkey series b convertible preferred stock the preferred stock is convertible into 208 334 berkey common shares and warrants to buy another 208 334 shares warner communications and its subsidiary said the berkey stock is held as an investment which they will review and evaluate from time to time reuter 3"

### Model 1

In [117]:
# vocab_size : number of distinct ids fed to the Embedding layers
#              (paddedset clips larger ids to vocab_size-1)
# wvect_dims : size of each learned word vector
# maxlen     : fixed sequence length after padding / truncation
vocab_size, wvect_dims, maxlen = 10000, 32, 500

In [118]:
def paddedset(orig_set):
    """Turn variable-length id sequences into a fixed-width int32 matrix.

    Every id is capped at ``vocab_size - 1`` (so all ids at or above that value
    collapse onto it), then each sequence is brought to exactly ``maxlen``
    entries: shorter ones are padded with 0 at the front, longer ones are cut
    at the end.

    orig_set -- iterable of sequences of integer word ids.
    Returns the array produced by ``pad_sequences``.
    """
    top_id = vocab_size - 1
    capped = []
    for seq in orig_set:
        capped.append([n if n < top_id else top_id for n in seq])
    return pad_sequences(capped, maxlen=maxlen, dtype='int32',
                         padding='pre', truncating='post', value=0)

In [119]:
# Apply the same clipping / padding to both splits.
x_train_pad, x_test_pad = map(paddedset, (x_train, x_test))

In [120]:
# Sanity check: compare the first raw document with its padded/clipped version.
first_doc = x_train[0]
print(len(first_doc))
print(first_doc[:8])
# Drop the zero padding to line the padded row up with the raw ids.
first_doc_nonzero = [n for n in x_train_pad[0] if n > 0]
print(first_doc_nonzero[:8])

86
[27592, 28839, 5, 40, 7, 444, 2, 22]
[9999, 9999, 5, 40, 7, 444, 2, 22]


In [145]:
# Baseline: learned word embeddings flattened straight into one dense hidden layer.
model1 = Sequential()
model1.add(Embedding(vocab_size, wvect_dims, input_length=maxlen))
model1.add(Flatten())
model1.add(Dense(100, activation='relu'))
model1.add(Dropout(0.7))
model1.add(Dense(46, activation='softmax'))  # one unit per class label (0..45)
# WARNING : metrics=['accuracy'] fails, so only the loss is tracked.
model1.compile(loss='categorical_crossentropy', optimizer=Adam())

In [146]:
# One-hot encode both splits with an explicit class count (46, the width of the
# softmax output layer). Letting to_categorical infer the count from each array
# separately would produce label matrices of the wrong width whenever a split
# happens not to contain the highest class label.
y_train_1hot = to_categorical(labels_train, 46)
y_test_1hot = to_categorical(labels_test, 46)
model1.fit(x_train_pad, y_train_1hot, batch_size=100, nb_epoch=10,
           validation_data=(x_test_pad, y_test_1hot))

Train on 8982 samples, validate on 2246 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7feb4f103450>

### Model 2

In [147]:
# Convolutional variant: embeddings -> 1D convolution -> max-pooling -> dense head,
# with dropout applied between stages.
model2 = Sequential()
model2.add(Embedding(vocab_size, wvect_dims, input_length=maxlen, dropout=0.2))
model2.add(Dropout(0.2))
model2.add(Convolution1D(64, 5, border_mode='same', activation='relu'))
model2.add(Dropout(0.2))
model2.add(MaxPooling1D())
model2.add(Flatten())
model2.add(Dense(100, activation='relu'))
model2.add(Dropout(0.7))
model2.add(Dense(46, activation='softmax'))  # one unit per class label (0..45)
# WARNING : metrics=['accuracy'] fails, so only the loss is tracked.
model2.compile(loss='categorical_crossentropy', optimizer=Adam())

In [148]:
# One-hot encode both splits with an explicit class count (46, the width of the
# softmax output layer). Letting to_categorical infer the count from each array
# separately would produce label matrices of the wrong width whenever a split
# happens not to contain the highest class label.
y_train_1hot = to_categorical(labels_train, 46)
y_test_1hot = to_categorical(labels_test, 46)
model2.fit(x_train_pad, y_train_1hot,
           batch_size=100, nb_epoch=10,
           validation_data=(x_test_pad, y_test_1hot))

Train on 8982 samples, validate on 2246 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7feb4d944f90>