In [78]:
from theano.sandbox import cuda

In [2]:
%matplotlib inline
from __future__ import division, print_function

In [3]:
import math, os
import numpy as np

In [77]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, Activation, Reshape, Permute, merge
from keras.models import Sequential, Model
from keras.layers.merge import Concatenate
from keras.layers.core import Flatten, Dense, Dropout, Lambda
from keras.layers.normalization import BatchNormalization
from keras.optimizers import SGD, RMSprop, Adam
from keras.layers.convolutional import *
from keras.layers.pooling import MaxPooling1D
from keras.utils.np_utils import to_categorical

## Load Reuters dataset

In [5]:
model_path = '../data/colearn/models/'

In [11]:
from keras.datasets import imdb, reuters
idx = reuters.get_word_index()

In [12]:
idx_arr = sorted(idx, key=idx.get)
idx_arr[:10]

[u'the', u'of', u'to', u'in', u'said', u'and', u'a', u'mln', u'3', u'for']

In [13]:
len(idx_arr)

30979

In [14]:
idx2word = {v: k for k, v in idx.iteritems()}

In [15]:
n = 5
(idx2word[n], idx[idx2word[n]])

(u'said', 5)

In [16]:
from keras.datasets import reuters

# WARNING : this function has a bug when oov_char is None
# source : https://raw.githubusercontent.com/fchollet/keras/master/keras/datasets/reuters.py
(x_train, labels_train), (x_test, labels_test) = reuters.load_data(path="reuters.npz",
                                                         num_words=None,
                                                         skip_top=0,
                                                         maxlen=None,
                                                         test_split=0.2,
                                                         seed=113,
                                                         start_char=None,
                                                         oov_char=0,
                                                         index_from=0)

In [17]:
L = []
L.extend(labels_train)
L.extend(labels_test)
nb_categories = len(set(L))

In [18]:
# x_train contains lists of variable length were words are represented by indices
len(x_train[0]), len(x_train[1]), len(x_train[2])

(86, 55, 138)

In [19]:
wrds = [idx2word[n] for n in x_train[0]]
' '.join(wrds)

u'mcgrath rentcorp said as a result of its december acquisition of space co it expects earnings per share in 1987 of 1 15 to 1 30 dlrs per share up from 70 cts in 1986 the company said pretax net should rise to nine to 10 mln dlrs from six mln dlrs in 1986 and rental operation revenues to 19 to 22 mln dlrs from 12 5 mln dlrs it said cash flow per share this year should be 2 50 to three dlrs reuter 3'

In [20]:
vocab_size = 2500
maxlen = 500

In [21]:
def paddedset(orig_set):
    adj_set = [[min(n,vocab_size-1) for n in sq] for sq in orig_set]
    return pad_sequences(adj_set, maxlen=maxlen, dtype='int32', padding='pre', truncating='post', value=0)

In [22]:
x_train_pad = paddedset(x_train)
x_test_pad = paddedset(x_test)

In [23]:
print(len(x_train[0]))
print(x_train[0][0:8])
print([n for n in x_train_pad[0] if n>0][0:8])

86
[27592, 28839, 5, 40, 7, 444, 2, 22]
[2499, 2499, 5, 40, 7, 444, 2, 22]


## Turn Reuters dataset into CBOW 4-grams
### Remember that CBOW is an unsupervised semantic model

In [45]:
cbow_lag = 2 # (lag, _, lag) => t

def f(inputs=x_train, size=1000):
    outputs = []
    for sentence in x_train[0:size]:
        for i in range(cbow_lag, len(sentence)-cbow_lag):
            outputs.append( (sentence[i-cbow_lag:i] + sentence[i+1:i+cbow_lag+1], sentence[i]) )
    return outputs

In [47]:
cbow_train = f(x_train, size=5000)
len(cbow_train)

709702

In [49]:
cbow_test = f(x_test, size=2000)
len(cbow_test)

289222

In [74]:
np.random.shuffle(cbow_train)
cbow_x_train = [x for x, y in cbow_train if max(x)<vocab_size and y<vocab_size]
cbow_labels_train = [y for x, y in cbow_train if max(x)<vocab_size and y<vocab_size]
print(len(cbow_x_train))

np.random.shuffle(cbow_test)
cbow_x_test = [x for x, y in cbow_test if max(x)<vocab_size and y<vocab_size]
cbow_labels_test = [y for x, y in cbow_test if max(x)<vocab_size and y<vocab_size]

391752


## Load GloVe embedding pre-trained weights
#### Command to use :
wget http://nlp.stanford.edu/data/glove.6B.zip

In [52]:
GLOVE_DIR = '../data/glove6B/'
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [53]:
k = embeddings_index.keys()[0]
v = embeddings_index[k]
print(k, v.shape)

biennials (100,)


In [54]:
newwords = embeddings_index.keys()
count = 0
for i in range(1,vocab_size):
    if not(idx2word[i] in newwords):
        count+=1
        #print(idx2word[i])
count



41

In [55]:
EMBEDDING_DIM = 100
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
#for word, i in idx2word[i]:#word_index.items():
for i in range(1,vocab_size):
    word = idx2word[i]
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [101]:
pretrained = True # use Glove weights or not

In [168]:
# Turning words into embedded vectors
#    Documentation : https://keras.io/layers/embeddings/
#    The model will take as Input an integer matrix of size (batch_size, n_words).
#    The largest integer (i.e. word index) in the input should be no larger than vocab_size.
#    Now model.output_shape == (None, n_words, n_embed), where None is the batch dimension.
if pretrained:
    EMBEDDING = Embedding(vocab_size, EMBEDDING_DIM, weights=[embedding_matrix]) # WARNING : input_length ?
else:
    EMBEDDING = Embedding(vocab_size, EMBEDDING_DIM) # WARNING : input_length ?

### CBOW model to infer semantics

In [169]:
n_words = 2*cbow_lag # we use LAG words before and LAG words after as inputs
batch_size = 100

In [170]:
modelWRD = Sequential()

# First layer is a dummy-permutation = Identity to specify input shape
modelWRD.add( Permute((1,), input_shape=(n_words,)) ) # WARNING : axis 0 is the sample dim

modelWRD.add(EMBEDDING)

modelWRD.add(Lambda(lambda x : K.sum(x,axis=1), output_shape=(EMBEDDING_DIM,)))

modelWRD.add(Dense(vocab_size, input_shape=(EMBEDDING_DIM,), activation='softmax'))

In [171]:
modelWRD.predict(np.ones((5,n_words))).shape

(5, 2500)

In [172]:
modelWRD.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
permute_10 (Permute)         (None, 4)                 0         
_________________________________________________________________
embedding_7 (Embedding)      (None, 4, 100)            250000    
_________________________________________________________________
lambda_5 (Lambda)            (None, 100)               0         
_________________________________________________________________
dense_20 (Dense)             (None, 2500)              252500    
Total params: 502,500.0
Trainable params: 502,500.0
Non-trainable params: 0.0
_________________________________________________________________


#### First round of training : keeps embedding matrix fixed

In [173]:
EMBEDDING.trainable = False # WARNING : needs re-compiling to be effective
modelWRD.compile(optimizer=Adam(1e-3), loss='categorical_crossentropy', metrics=['accuracy'])

In [175]:
modelWRD.fit(cbow_x_train, to_categorical(cbow_labels_train,vocab_size),
             batch_size=64,
             epochs=3,
             shuffle=True)

Epoch 1/1


<keras.callbacks.History at 0x7f5e1f99e210>

#### Second round of training : tune the embedding matrix too

In [157]:
EMBEDDING.trainable = True # WARNING : needs re-compiling to be effective
modelWRD.compile(optimizer=Adam(1e-3), loss='categorical_crossentropy', metrics=['accuracy'])

In [133]:
#modelWRD.optimizer.lr = 1e-4
modelWRD.fit(cbow_x_train, to_categorical(cbow_labels_train,vocab_size),
             batch_size=64,
             epochs=1,
             shuffle=True)

Epoch 1/1


<keras.callbacks.History at 0x7f5e2583fb10>

### Prediction model of text category

In [134]:
EMBEDDING.trainable = False

### WARNING : fix the bug with MaxPooling

In [151]:
# Neural Network architecture defined here :
#     https://quid.com/feed/how-quid-uses-deep-learning-with-small-data
graph_in = Input(shape=(maxlen, EMBEDDING_DIM))

convs = []
for w in range(1,4): # the convolution window width
    conv = Conv1D(filters=300,
                  kernel_size=w,
                  padding='valid',
                  activation='relu',
                  strides=1)(graph_in)
    # keras.layers.pooling.MaxPooling1D(pool_size=2, strides=None, padding='valid')
    pool = conv#MaxPooling1D(pool_size=2)(conv)
    flatten = Flatten()(pool)
    convs.append(flatten)

out = Concatenate(axis=-1)(convs) # WARNING : check axis and dimension
graph = Model(inputs=graph_in, outputs=out)

In [152]:
modelTXT = Sequential()

# First layer is a dummy-permutation = Identity to specify input shape
modelTXT.add( Permute((1,), input_shape=(maxlen,)) ) # WARNING : axis 0 is the sample dim

if True:
    modelTXT.add(EMBEDDING)
else:
    # INFO : may not be necessary but cleared this way, otherwise shape in .summary is not clear
    EMBEDDING_TXT = Embedding(vocab_size, EMBEDDING_DIM, weights=EMBEDDING.get_weights())
    EMBEDDING_TXT.trainable = False
    modelTXT.add(EMBEDDING_TXT)

modelTXT.add(graph)

modelTXT.add(Dense(300))
modelTXT.add(Dropout(0.5))
modelTXT.add(Activation('relu'))
modelTXT.add(Dense(100))
modelTXT.add(Dropout(0.5))
modelTXT.add(Activation('relu'))
modelTXT.add(Dense(nb_categories))
modelTXT.add(Activation('softmax'))

In [153]:
modelTXT.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
permute_8 (Permute)          (None, 500)               0         
_________________________________________________________________
embedding_3 (Embedding)      multiple                  250000    
_________________________________________________________________
model_3 (Model)              (None, 449100)            180900    
_________________________________________________________________
dense_16 (Dense)             (None, 300)               134730300 
_________________________________________________________________
dropout_9 (Dropout)          (None, 300)               0         
_________________________________________________________________
activation_13 (Activation)   (None, 300)               0         
_________________________________________________________________
dense_17 (Dense)             (None, 100)               30100     
__________

In [180]:
modelTXT.compile(optimizer=Adam(), loss='categorical_crossentropy',metrics=['accuracy'])

In [181]:
modelTXT.fit(x_train_pad,
             to_categorical(labels_train, nb_categories),
             batch_size=100,
             #validation_data=(x_test_pad, labels_test),
             epochs=1)

Epoch 1/1


<keras.callbacks.History at 0x7f5e19c81950>

In [182]:
modelTXT.predict(x_train_pad[0:5]).shape

(5, 46)