In [1]:
import numpy as np
import theano
import theano.tensor as T
import time

import lasagne as L

import sys
sys.path.insert(0, '../rnn_ex/')

from SimpleRNNLM import iterate_minibatches
from SampledSoftmaxLayer import SampledSoftmaxDenseLayer
from mt_load import load_mt, get_mt_voc, get_w2v_embs

from ShiftLayer import ShiftLayer
from L2PoolingLayer import L2PoolingLayer

 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: GeForce GTX 780 (CNMeM is enabled with initial size: 30.0% of memory, cuDNN 5105)


In [2]:
# Padding reuses the <utt_end> token instead of a dedicated padding symbol,
# so the embedding table needs no extra row for padding.

pad_value = -1 # index -1 addresses the last embedding row, which is <utt_end>'s vector

In [3]:
# Directory holding the movie-triples data; the commented-out line is an alternative local copy.
mt_path = "/pio/data/data/mtriples/"
# mt_path = "/home/maciek/Desktop/mgr/DATA/MovieTriples_Dataset/"

# Load the train / validation / test splits.
# NOTE(review): `trim=200` presumably limits the utterance length -- confirm in mt_load.load_mt.
train, valid, test = load_mt(path=mt_path, split=True, trim=200)
# Vocabulary data; judging by the names these are the index->word and word->index lookups,
# the vocabulary size and the word frequencies (confirm against mt_load.get_mt_voc).
idx_to_w, w_to_idx, voc_size, freqs = get_mt_voc(path=mt_path, train_len=len(train))

In [4]:
# Embedding matrix used to initialise the model's EmbeddingLayer, plus a companion mask array.
# NOTE(review): presumably pretrained word2vec vectors -- confirm in mt_load.get_w2v_embs.
word2vec_embs, word2vec_embs_mask = get_w2v_embs(path=mt_path)

# Indices along the first axis where column 0 of the mask equals 1.
# NOTE(review): presumably the rows that still have to be trained (no pretrained vector) -- confirm.
# This variable is not used anywhere else in the visible part of the notebook.
w2v_train_mask = np.where(word2vec_embs_mask[:,0] == 1)[0]

In [5]:
def build_hred(input_var, mask_input_var, voc_size, emb_size, lv1_rec_size, lv2_rec_size, out_emb_size,
               emb_init=None, train_emb=True):
    """Assemble the two-level (utterance / dialogue) recurrent language model.

    Pipeline, in construction order:
      1. token indices -> embeddings;
      2. a forward and a backward GRU over every utterance, each passed through
         L2PoolingLayer and then concatenated along axis 1;
      3. the utterance vectors regrouped three at a time (movie *triples*) and
         fed to a second-level GRU;
      4. the second-level states passed through ShiftLayer, flattened and mapped
         by a tanh dense layer to initial hidden states of the decoder GRU;
      5. decoder states and input embeddings linearly projected to
         `out_emb_size`, summed, and sent through a softmax over the vocabulary.

    Parameters
    ----------
    input_var : Theano integer matrix of token indices.
    mask_input_var : Theano matrix used as the GRU mask, or None for no masking.
    voc_size : number of rows of the embedding table and width of the softmax.
    emb_size : width of the input embeddings.
    lv1_rec_size : number of units of the utterance-level GRUs and of the decoder.
    lv2_rec_size : number of units of the dialogue-level GRU.
    out_emb_size : width of the projection that feeds the softmax.
    emb_init : optional initial value for the embedding matrix `W`.
    train_emb : when False *and* `emb_init` is given, the embedding matrix is
        excluded from the trainable parameters.
        NOTE(review): the flag is ignored when `emb_init` is None.

    Returns
    -------
    A Lasagne ReshapeLayer of shape
    (input_var.shape[0], input_var.shape[1], voc_size) holding the softmax output.

    NOTE(review): L2PoolingLayer and ShiftLayer are project-local layers; the
    reshape in step 3 assumes the pooled encoder output is 2-D
    (utterances x lv1_rec_size) and ShiftLayer presumably delays the context by
    one utterance -- confirm against their sources.
    """
    layers = L.layers

    def gru(incoming, num_units, **extra):
        # All recurrent layers of the model share the same gradient clipping value.
        return layers.GRULayer(incoming, num_units=num_units, grad_clipping=100, **extra)

    # NOTE: layers are created in a fixed order on purpose -- parameter
    # initialisation draws from the random generator as each layer is built.
    tokens = layers.InputLayer(shape=(None, None), input_var=input_var)

    mask = None
    if mask_input_var is not None:
        mask = layers.InputLayer(shape=(None, None), input_var=mask_input_var)

    # input_size is voc_size, not voc_size+1, because pad_value = <utt_end>
    emb_kwargs = dict(input_size=voc_size, output_size=emb_size)
    use_pretrained = emb_init is not None
    if use_pretrained:
        emb_kwargs['W'] = emb_init
    embedded = layers.EmbeddingLayer(tokens, **emb_kwargs)
    if use_pretrained and not train_emb:
        # Dropping the tag hides W from get_all_params(..., trainable=True).
        embedded.params[embedded.W].remove('trainable')

    # Utterance-level encoder: all utterances are processed in parallel, once
    # left-to-right and once right-to-left; full state sequences are returned.
    enc_forward = gru(embedded, lv1_rec_size, mask_input=mask)
    enc_backward = gru(embedded, lv1_rec_size, mask_input=mask, backwards=True)

    # Concatenation of the L2-pooled forward and backward states.
    utt_vectors = layers.ConcatLayer([L2PoolingLayer(enc_forward),
                                      L2PoolingLayer(enc_backward)],
                                     axis=1)

    # Group the utterance vectors three by three: the data are movie *triples*.
    triples = layers.ReshapeLayer(utt_vectors, shape=(-1, 3, 2*lv1_rec_size))

    # Dialogue-level encoder running over the three utterance vectors of a triple.
    context = gru(triples, lv2_rec_size)
    shifted_context = ShiftLayer(context)
    flat_context = layers.ReshapeLayer(shifted_context, shape=(-1, lv2_rec_size))

    # One initial decoder state per utterance, derived from the dialogue context.
    dec_init = layers.DenseLayer(flat_context,
                                 num_units=lv1_rec_size,
                                 nonlinearity=L.nonlinearities.tanh)

    decoder = gru(embedded, lv1_rec_size, mask_input=mask, hid_init=dec_init)

    # Flatten batch and time so the dense output layers act on every position.
    dec_flat = layers.ReshapeLayer(decoder, shape=(-1, lv1_rec_size))
    hid_proj = layers.DenseLayer(dec_flat, num_units=out_emb_size, nonlinearity=None)

    emb_flat = layers.ReshapeLayer(embedded, shape=(-1, emb_size))
    emb_proj = layers.DenseLayer(emb_flat, num_units=out_emb_size, b=None, nonlinearity=None)

    softmax_input = layers.ElemwiseSumLayer([hid_proj, emb_proj])
    word_probs = layers.DenseLayer(softmax_input,
                                   num_units=voc_size,
                                   nonlinearity=L.nonlinearities.softmax)

    # Restore the (batch, time, vocabulary) layout of the input matrix.
    n_rows, n_steps = input_var.shape[0], input_var.shape[1]
    return layers.ReshapeLayer(word_probs, shape=(n_rows, n_steps, voc_size))

In [6]:
# Layer widths passed to build_hred.
emb_size = 300       # input word embeddings
lv1_rec_size = 300   # utterance-level GRUs and decoder GRU
lv2_rec_size = 300   # dialogue-level GRU
out_emb_size = 300   # projection feeding the softmax


def update_fn(l, p):
    """Return the Adagrad update rules (learning rate 0.01).

    `l` is the scalar loss expression and `p` the list of parameters to update;
    both are forwarded unchanged to `lasagne.updates.adagrad`.
    """
    return L.updates.adagrad(l, p, learning_rate=.01)

In [7]:
# --- symbolic inputs ---------------------------------------------------------
input_var = T.imatrix('inputs')          # token indices fed to the embedding layer
target_var = T.imatrix('targets')        # these will be inputs shifted by 1
mask_input_var = T.matrix('input_mask')  # GRU mask; also selects the positions that count in the loss
mask_idx = mask_input_var.nonzero()

# --- network -----------------------------------------------------------------
net = build_hred(input_var, mask_input_var, voc_size,
                 emb_size, lv1_rec_size, lv2_rec_size, out_emb_size,
                 emb_init=word2vec_embs)

train_out = L.layers.get_output(net)
test_out = L.layers.get_output(net, deterministic=True)


def _mean_masked_xent(word_probs):
    """Mean categorical cross-entropy over the positions with a non-zero mask entry."""
    return L.objectives.categorical_crossentropy(word_probs[mask_idx], target_var[mask_idx]).mean()


# --- losses and parameter updates --------------------------------------------
train_loss = _mean_masked_xent(train_out)
test_loss = _mean_masked_xent(test_out)

params = L.layers.get_all_params(net, trainable=True)
updates = update_fn(train_loss, params)

print('Compiling theano functions...')

# Both compiled functions take (inputs, targets, mask) and return the mean loss;
# only train_fn applies the parameter updates.
fn_inputs = [input_var, target_var, mask_input_var]
train_fn = theano.function(fn_inputs, train_loss, updates=updates)
val_fn = theano.function(fn_inputs, test_loss)

Compiling theano functions...


In [8]:
def train_one_epoch(train_data, batch_size, log_interval=10):
    """Run one pass over `train_data`, updating the model after every mini-batch.

    Parameters
    ----------
    train_data : data handed to `iterate_minibatches`.
    batch_size : mini-batch size handed to `iterate_minibatches`.
    log_interval : print a progress line every `log_interval` batches;
        0 or None disables progress output.

    Returns
    -------
    The training loss averaged over all unmasked positions seen in the epoch
    (each batch's mean loss is weighted by the sum of its mask).

    Raises
    ------
    ZeroDivisionError if `iterate_minibatches` yields no batch at all.
    """
    train_err = 0.
    train_batches = 0
    num_training_words = 0
    start_time = time.time()

    for inputs, targets, mask in iterate_minibatches(train_data, batch_size, pad_value):
        num_batch_words = mask.sum()
        # train_fn returns the batch *mean*; weight it by the word count so the
        # running total stays a sum over words.
        train_err += train_fn(inputs, targets, mask) * num_batch_words
        train_batches += 1
        num_training_words += num_batch_words

        # A falsy interval switches logging off instead of failing on `% 0`.
        if log_interval and train_batches % log_interval == 0:
            print("Done {} batches in {:.2f}s\ttraining loss:\t{:.6f}".format(
                train_batches, time.time() - start_time, train_err / num_training_words))

    return train_err / num_training_words

In [None]:
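# Loss after one epoch (the values returned by train_one_epoch / validate) for each encoder variant.

# train, 1 dir (forward-only encoder), 1 epoch: 3.485554076321884
# val: 3.455356876018342

# train, 2 dir (forward + backward encoder), concat, 1 epoch: 3.4864403798772239
# val: 3.4579001751897063

# train, 2 dir (forward + backward encoder), L2 pooling + concat, 1 epoch: 3.4881669768474675
# val: 3.4584704095551695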

In [9]:
# One pass over the training split with batch_size=30; the cell's value is the returned epoch loss.
train_one_epoch(train, 30)

Done 10 batches in 2.43s	training loss:	7.072288
Done 20 batches in 4.58s	training loss:	6.057662
Done 30 batches in 7.15s	training loss:	5.588586
Done 40 batches in 9.45s	training loss:	5.327152
Done 50 batches in 11.78s	training loss:	5.137194
Done 60 batches in 14.89s	training loss:	4.986716
Done 70 batches in 17.27s	training loss:	4.883437
Done 80 batches in 19.66s	training loss:	4.797529
Done 90 batches in 22.26s	training loss:	4.736309
Done 100 batches in 24.72s	training loss:	4.673691
Done 110 batches in 26.60s	training loss:	4.621513
Done 120 batches in 29.19s	training loss:	4.592721
Done 130 batches in 31.45s	training loss:	4.552453
Done 140 batches in 33.92s	training loss:	4.515785
Done 150 batches in 36.29s	training loss:	4.485981
Done 160 batches in 38.91s	training loss:	4.461513
Done 170 batches in 40.84s	training loss:	4.430324
Done 180 batches in 43.21s	training loss:	4.409443
Done 190 batches in 45.54s	training loss:	4.389150
Done 200 batches in 47.78s	training loss:	4.

3.4881669768474675

In [10]:
def validate(val_data, batch_size, log_interval=100):
    """Evaluate the model on `val_data` without updating its parameters.

    Parameters
    ----------
    val_data : data handed to `iterate_minibatches`.
    batch_size : mini-batch size handed to `iterate_minibatches`.
    log_interval : print a progress line every `log_interval` batches
        (default 100); 0 or None disables progress output.

    Returns
    -------
    The loss averaged over all unmasked positions (each batch's mean loss is
    weighted by the sum of its mask).

    Raises
    ------
    ZeroDivisionError if `iterate_minibatches` yields no batch at all.
    """
    val_err = 0.
    val_batches = 0
    num_validate_words = 0
    start_time = time.time()

    for inputs, targets, mask in iterate_minibatches(val_data, batch_size, pad_value):
        num_batch_words = mask.sum()
        # val_fn returns the batch *mean*; weight it by the word count so the
        # running total stays a sum over words.
        val_err += val_fn(inputs, targets, mask) * num_batch_words
        val_batches += 1
        num_validate_words += num_batch_words

        # A falsy interval switches logging off instead of failing on `% 0`.
        if log_interval and val_batches % log_interval == 0:
            print("Done {} batches in {:.2f}s".format(val_batches, time.time() - start_time))

    return val_err / num_validate_words

In [11]:
# Evaluate on the validation split with batch_size=30; the cell's value is the returned loss.
validate(valid, 30)

Done 100 batches in 5.03s
Done 200 batches in 10.23s
Done 300 batches in 15.64s
Done 400 batches in 20.42s
Done 500 batches in 25.34s
Done 600 batches in 30.10s
Done 700 batches in 34.95s
Done 800 batches in 39.75s
Done 900 batches in 44.91s
Done 1000 batches in 50.30s
Done 1100 batches in 55.44s
Done 1200 batches in 60.77s
Done 1300 batches in 66.19s
Done 1400 batches in 71.12s
Done 1500 batches in 76.04s
Done 1600 batches in 80.98s
Done 1700 batches in 85.84s
Done 1800 batches in 91.27s
Done 1900 batches in 96.63s
Done 2000 batches in 101.43s
Done 2100 batches in 106.66s
Done 2200 batches in 111.88s
Done 2300 batches in 117.26s
Done 2400 batches in 122.27s


3.4584704095551695