In [1]:
from keras.models import Model
from keras.layers import LSTM, Input, Dense, Activation, Add, Reshape, Lambda, Concatenate, \
                         TimeDistributed, Bidirectional
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical, plot_model
from keras import backend as K
from keras.optimizers import Adam
from keras.callbacks import TensorBoard

Using TensorFlow backend.


In [5]:
from corpus import *
from sklearn.utils import shuffle

In [6]:
corpus = MTCorpus()

In [7]:
# X, Y = shuffle(corpus.documents, corpus.links, random_state=0)
# X_train, Y_train = X[:100], Y[:100]
# X_test, Y_test = X[100:], Y[100:]

# Y_train = [to_categorical(np.array(y)-1, num_classes=len(y)) for y in Y_train]
# Y_test = [to_categorical(np.array(y)-1, num_classes=len(y)) for y in Y_test]

In [8]:
X, Y = shuffle(corpus.documents, corpus.links, random_state=0)
X, Y = pad_sequences(X), pad_sequences(Y)
X_train, Y_train = X[:100], Y[:100]
X_test, Y_test = X[100:], Y[100:]

# TODO: fix categorical, padding and labels starting from 1
Y_train, Y_test = np.array([to_categorical(y, num_classes=X.shape[1]) for y in Y_train]), np.array([to_categorical(y, num_classes=Y.shape[1]) for y in Y_test])

print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)

(100, 10, 300) (100, 10, 10) (12, 10, 300) (12, 10, 10)


In [9]:
hidden_size = 512
seq_len = X.shape[1]
nb_epochs = 100
learning_rate = 0.1
batch_size = 5

In [39]:
from keras.models import Model

inp = Input(shape=(seq_len,300), name='input')
encoder = Bidirectional(LSTM(hidden_size//2,return_sequences=True, name='encoder'))(inp)
decoder = LSTM(hidden_size,return_sequences=True, name='decoder')(encoder)

E = TimeDistributed(Dense(hidden_size, use_bias=False), name='E')(encoder)
DD = Lambda(lambda x: K.repeat_elements(K.expand_dims(x, 2), seq_len, 2))(decoder)
DD = TimeDistributed(Dense(hidden_size, use_bias=False), name='D')(DD)

add = Add(name='W1E_W2Di')
tanh = Activation('tanh', name='tanh')
vt = Dense(1, use_bias=False, name='vT')
softmax = Activation('softmax', name='softmax')

attention = add([E,DD])
attention = tanh(attention)
attention = vt(attention)
attention = Lambda(lambda x: K.squeeze(x, -1))(attention)
attention = softmax(attention)


model = Model(inputs=inp, outputs=attention)

In [40]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input (InputLayer)              (None, 10, 300)      0                                            
__________________________________________________________________________________________________
bidirectional_9 (Bidirectional) (None, 10, 512)      1140736     input[0][0]                      
__________________________________________________________________________________________________
decoder (LSTM)                  (None, 10, 512)      2099200     bidirectional_9[0][0]            
__________________________________________________________________________________________________
lambda_15 (Lambda)              (None, 10, 10, 512)  0           decoder[0][0]                    
__________________________________________________________________________________________________
E (TimeDis

In [41]:
plot_model(model, to_file='model_withoutloop.png')

In [42]:
adam = Adam()
tensorboad = TensorBoard()

In [60]:
print("building model...")

model.compile(optimizer=adam,
              loss='categorical_crossentropy',
              metrics=['categorical_accuracy'])

history = model.fit(X_train, Y_train, validation_data=(X_test, Y_test),
                    epochs=50, batch_size=16, verbose=2,
                    callbacks=[tensorboad])

building model...
Train on 100 samples, validate on 12 samples
Epoch 1/50
 - 2s - loss: 1.7533 - categorical_accuracy: 0.4700 - val_loss: 1.4197 - val_categorical_accuracy: 0.5000
Epoch 2/50
 - 1s - loss: 1.3093 - categorical_accuracy: 0.4840 - val_loss: 1.3065 - val_categorical_accuracy: 0.5000
Epoch 3/50
 - 1s - loss: 1.3626 - categorical_accuracy: 0.4840 - val_loss: 1.1695 - val_categorical_accuracy: 0.5000
Epoch 4/50
 - 1s - loss: 1.2506 - categorical_accuracy: 0.4840 - val_loss: 1.0039 - val_categorical_accuracy: 0.5000
Epoch 5/50
 - 1s - loss: 1.1806 - categorical_accuracy: 0.4840 - val_loss: 1.0200 - val_categorical_accuracy: 0.5000
Epoch 6/50
 - 1s - loss: 1.1091 - categorical_accuracy: 0.4840 - val_loss: 0.9727 - val_categorical_accuracy: 0.5000
Epoch 7/50
 - 1s - loss: 1.0539 - categorical_accuracy: 0.4840 - val_loss: 0.9520 - val_categorical_accuracy: 0.5000
Epoch 8/50
 - 2s - loss: 1.0652 - categorical_accuracy: 0.4840 - val_loss: 0.9756 - val_categorical_accuracy: 0.5000
E

In [52]:
# categorical accuracy (this is a bit skewed since it looks at the each link. 
#                       Since most of them are 0 paddings, accuracy is much higher than it really is...)
model.evaluate(X_test, Y_test)



[0.72529667615890503, 0.73333334922790527]

In [51]:
# Real Accuracy... (i.e. ratio of test samples that are completely correct predicted)
sum([np.array_equal(p,y) for p,y in zip(np.argmax(model.predict(X_test),-1),Y_test.argmax(-1))]) / len(Y_test)

0.3333333333333333

In [50]:
# compare predictions on training set with training labels
list(zip(model.predict(X_train).argmax(-1), Y_train.argmax(-1)))

[(array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]),
  array([0, 0, 0, 0, 0, 1, 1, 1, 3, 1])),
 (array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]),
  array([0, 0, 0, 0, 0, 3, 3, 5, 5, 5])),
 (array([0, 0, 0, 0, 0, 0, 0, 1, 1, 1]),
  array([0, 0, 0, 0, 0, 0, 0, 1, 1, 1])),
 (array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]),
  array([0, 0, 0, 0, 0, 1, 1, 1, 1, 4])),
 (array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]),
  array([0, 0, 0, 0, 0, 3, 3, 3, 3, 3])),
 (array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]),
  array([0, 0, 0, 0, 0, 5, 1, 2, 2, 5])),
 (array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1]),
  array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1])),
 (array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]),
  array([0, 0, 0, 0, 0, 1, 1, 2, 1, 1])),
 (array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]),
  array([0, 0, 0, 0, 0, 1, 1, 1, 3, 1])),
 (array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]),
  array([0, 0, 0, 0, 0, 1, 1, 1, 1, 4])),
 (array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]),
  array([0, 0, 0, 0, 0, 3, 3, 3, 5, 3])),
 (array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]),
  array([0, 0, 0, 0, 0, 3, 3, 3, 3, 4])),
 (ar

In [58]:
list(zip(model.predict(X_test).argmax(-1), Y_test.argmax(-1)))

[(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
  array([0, 0, 0, 0, 0, 1, 1, 2, 1, 4])),
 (array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]),
  array([0, 0, 0, 0, 0, 3, 3, 3, 3, 3])),
 (array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]),
  array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])),
 (array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]),
  array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])),
 (array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]),
  array([0, 0, 0, 0, 0, 3, 3, 3, 2, 2])),
 (array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]),
  array([0, 0, 0, 0, 0, 1, 1, 1, 3, 1])),
 (array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]),
  array([0, 0, 0, 0, 0, 2, 3, 4, 4, 4])),
 (array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]),
  array([0, 0, 0, 0, 0, 5, 1, 5, 3, 5])),
 (array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]),
  array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])),
 (array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]),
  array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])),
 (array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]),
  array([0, 0, 0, 0, 0, 5, 3, 5, 5, 5])),
 (array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]),
  array([0, 0, 0, 0, 0, 2, 5, 5, 5, 5]))]

In [59]:
inp = Input(shape=(seq_len,300), name='input')
encoder = Bidirectional(LSTM(hidden_size//2,return_sequences=True, name='encoder'))(inp)
decoder = LSTM(hidden_size,return_sequences=True, name='decoder')(encoder)

E = TimeDistributed(Dense(hidden_size, use_bias=False), name='E')(encoder)
D = TimeDistributed(Dense(hidden_size, use_bias=False), name='D')(decoder)


add = Add(name='W1E_W2Di')
tanh = Activation('tanh', name='tanh')
vt = Dense(1, use_bias=False, name='vT')
softmax = Activation('softmax', name='softmax')
Di = Lambda(lambda x: K.repeat_elements(K.expand_dims(x, 2), seq_len, 2))(D)

attention = add([E,Di])
attention = tanh(attention)
attention = vt(attention)
attention = Lambda(lambda x: K.squeeze(x, -1))(attention)
attention = softmax(attention)


model = Model(inputs=inp, outputs=attention)