In [1]:
import numpy as np
import theano
import theano.tensor as T
import time

import lasagne as L

import sys
sys.path.insert(0, '../rnn_ex/')

from SimpleRNNLM import iterate_minibatches
from SampledSoftmaxLayer import SampledSoftmaxDenseLayer
from mt_load import load_mt, get_mt_voc, get_w2v_embs

from ShiftLayer import ShiftLayer
from L2PoolingLayer import L2PoolingLayer

 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: GeForce GTX 780 (CNMeM is enabled with initial size: 30.0% of memory, cuDNN 5105)


In [2]:
# Remember: padding now reuses the <utt_end> token rather than a dedicated pad symbol,
# which is why the embedding layers in this notebook use input_size=voc_size (not voc_size+1).

pad_value = -1 # <utt_end>'s vector is the last one, so -1 acts as a negative index onto it

In [3]:
# Root directory of the MovieTriples data (machine-specific absolute path; edit for your setup).
mt_path = "/pio/data/data/mtriples/"
# mt_path = "/home/maciek/Desktop/mgr/DATA/MovieTriples_Dataset/"

# Load the three data splits. `trim=200` is forwarded to load_mt
# (presumably a length cut-off -- TODO confirm against mt_load).
train, valid, test = load_mt(path=mt_path, split=True, trim=200)
# Vocabulary lookups, vocabulary size and `freqs` (presumably word frequencies -- TODO confirm);
# `freqs` is later handed to build_hred as `ssoft_probs`.
idx_to_w, w_to_idx, voc_size, freqs = get_mt_voc(path=mt_path, train_len=len(train))

In [4]:
# Pre-trained embedding matrix and an accompanying mask array
# (presumably marking which rows have a word2vec vector -- TODO confirm in mt_load).
word2vec_embs, word2vec_embs_mask = get_w2v_embs(path=mt_path)

# Indices of the rows whose first mask column equals 1.
# NOTE(review): w2v_train_mask is not referenced anywhere else in the visible notebook.
w2v_train_mask = np.where(word2vec_embs_mask[:,0] == 1)[0]

In [13]:
def build_hred(input_var, mask_input_var, voc_size, emb_size, lv1_rec_size, lv2_rec_size, out_emb_size,
               num_sampled, ssoft_probs=None, emb_init=None, train_emb=True, target_var=None):
    """Wire up the hierarchical recurrent encoder-decoder and return its output layer.

    Pipeline: token embedding -> forward and backward utterance GRUs -> L2PoolingLayer on
    each direction -> concatenation -> context GRU over groups of 3 utterances ->
    ShiftLayer -> tanh projection used as the decoder's initial hidden state -> decoder
    GRU over the same embeddings -> sum of two linear projections (decoder states, and a
    bias-free projection of the embeddings) -> SampledSoftmaxDenseLayer.

    Parameters
    ----------
    input_var : symbolic 2-D tensor of token ids. Its first dimension has to be a
        multiple of 3, because the utterance encodings are regrouped into triples.
    mask_input_var : symbolic 2-D mask or None. When given, it masks both utterance
        encoders and the decoder.
    voc_size : number of embedding rows and of output classes.
    emb_size, lv1_rec_size, lv2_rec_size, out_emb_size : widths of the embedding,
        the utterance-level GRUs / decoder, the context GRU and the output embedding.
    num_sampled : forwarded to SampledSoftmaxDenseLayer.
    ssoft_probs : forwarded to SampledSoftmaxDenseLayer as `probs`.
    emb_init : optional initial value for the embedding matrix; the EmbeddingLayer
        default initialisation is used when None.
    train_emb : when False and `emb_init` is given, the embedding matrix is removed
        from the trainable parameters. Has no effect when `emb_init` is None.
    target_var : optional symbolic targets; flattened with ravel() and forwarded to
        SampledSoftmaxDenseLayer as `targets`.

    Returns
    -------
    A ReshapeLayer whose shape is (input_var.shape[0], input_var.shape[1]) when
    `target_var` is given and (input_var.shape[0], input_var.shape[1], voc_size) otherwise.
    """
    layers = L.layers

    tokens = layers.InputLayer(shape=(None, None), input_var=input_var)

    mask = None if mask_input_var is None else layers.InputLayer(shape=(None, None),
                                                                 input_var=mask_input_var)

    # input_size is voc_size, not voc_size+1, because pad_value = <utt_end>
    emb_kwargs = dict(input_size=voc_size, output_size=emb_size)
    if emb_init is not None:
        emb_kwargs['W'] = emb_init
    embedded = layers.EmbeddingLayer(tokens, **emb_kwargs)
    if emb_init is not None and not train_emb:
        # freeze the supplied embeddings
        embedded.params[embedded.W].remove('trainable')

    # Settings shared by the two utterance encoders and by the decoder.
    lv1_gru_kwargs = dict(num_units=lv1_rec_size, grad_clipping=100, mask_input=mask)

    # All utterances are processed in parallel; the full hidden sequences are kept
    # (no only_return_final) so that they can be pooled below.
    enc_forward = layers.GRULayer(embedded, **lv1_gru_kwargs)
    enc_backward = layers.GRULayer(embedded, backwards=True, **lv1_gru_kwargs)

    pooled_forward = L2PoolingLayer(enc_forward)
    pooled_backward = L2PoolingLayer(enc_backward)

    # concatenation of L2-pooled states
    utterance_enc = layers.ConcatLayer([pooled_forward, pooled_backward], axis=1)

    # 3 is because movie *triples*
    triples = layers.ReshapeLayer(utterance_enc, shape=(-1, 3, 2*lv1_rec_size))

    # out_shape is batch_size/3 x 3 x lv2_rec_size
    context = layers.GRULayer(triples, num_units=lv2_rec_size, grad_clipping=100)

    shifted_context = ShiftLayer(context)
    flat_context = layers.ReshapeLayer(shifted_context, shape=(-1, lv2_rec_size))

    # out_shape is batch_size x lv1_rec_size
    dec_init = layers.DenseLayer(flat_context,
                                 num_units=lv1_rec_size,
                                 nonlinearity=L.nonlinearities.tanh)

    # out_shape is batch_size x seq_len x lv1_rec_size
    decoder = layers.GRULayer(embedded, hid_init=dec_init, **lv1_gru_kwargs)

    flat_decoder = layers.ReshapeLayer(decoder, shape=(-1, lv1_rec_size))
    hidden_proj = layers.DenseLayer(flat_decoder, num_units=out_emb_size, nonlinearity=None)

    flat_embedded = layers.ReshapeLayer(embedded, shape=(-1, emb_size))
    emb_proj = layers.DenseLayer(flat_embedded, num_units=out_emb_size, b=None, nonlinearity=None)

    softmax_input = layers.ElemwiseSumLayer([hidden_proj, emb_proj])

    flat_targets = None if target_var is None else target_var.ravel()

    sampled_softmax = SampledSoftmaxDenseLayer(softmax_input, num_sampled, voc_size,
                                               targets=flat_targets,
                                               probs=ssoft_probs,
                                               sample_unique=False)

    # With targets the layer output has one value per position; without, one per vocabulary entry.
    out_shape = (input_var.shape[0], input_var.shape[1])
    if flat_targets is None:
        out_shape = out_shape + (voc_size,)

    return layers.ReshapeLayer(sampled_softmax, shape=out_shape)

In [14]:
# Layer widths: token embedding, utterance-level GRUs / decoder, context GRU, output embedding.
# NOTE(review): emb_size presumably has to match the width of word2vec_embs -- confirm.
emb_size = 300
lv1_rec_size = 300
lv2_rec_size = 300
out_emb_size = 300

# Update rule used by the training cells: Adagrad with learning rate 0.01,
# called as update_fn(loss, params).
update_fn = lambda l, p: L.updates.adagrad(l, p, learning_rate=.01)

In [16]:
# Training graph for the sampled-softmax variant of the model.

# Value forwarded to build_hred as `num_sampled`.
num_sampled = 200

input_var = T.imatrix('inputs')
target_var = T.imatrix('targets')  # these will be inputs shifted by 1
mask_input_var = T.matrix('input_mask')
# Indices of the non-zero mask entries, used to select the unpadded positions in the loss.
mask_idx = mask_input_var.nonzero()

net = build_hred(input_var, mask_input_var, voc_size, emb_size, lv1_rec_size, lv2_rec_size, out_emb_size,
                 num_sampled, ssoft_probs=freqs, emb_init=word2vec_embs, target_var=target_var)

train_out = L.layers.get_output(net)
test_out = L.layers.get_output(net, deterministic=True)

# Because target_var was given to build_hred, the network output is 2-D (one value per position,
# presumably the probability assigned to the target word -- confirm in SampledSoftmaxLayer).
# The loss is the mean negative log of that value over unmasked positions.
train_loss = -T.log(train_out[mask_idx]).mean()
test_loss = -T.log(test_out[mask_idx]).mean()

params = L.layers.get_all_params(net, trainable=True)
updates = update_fn(train_loss, params)

print 'Compiling theano functions...'

# train_fn applies the parameter updates; val_fn only evaluates the deterministic loss.
train_fn = theano.function([input_var, target_var, mask_input_var], train_loss, updates=updates)
val_fn = theano.function([input_var, target_var, mask_input_var], test_loss)

Compiling theano functions...


In [7]:
# Training graph for the variant where no targets are handed to the network: the output keeps
# a vocabulary axis and the loss is an explicit categorical cross-entropy.
# NOTE(review): running this cell rebinds net/train_fn/val_fn, replacing the sampled-softmax
# functions compiled in the previous cell.
input_var = T.imatrix('inputs')
target_var = T.imatrix('targets')  # these will be inputs shifted by 1
mask_input_var = T.matrix('input_mask')
# Indices of the non-zero mask entries, used to select the unpadded positions in the loss.
mask_idx = mask_input_var.nonzero()

# `num_sampled` is a required positional argument of build_hred; leaving it out raised a
# TypeError against the current signature. It is defined in the sampled-softmax cell above.
# NOTE(review): how SampledSoftmaxDenseLayer uses num_sampled/probs when targets is None
# should be confirmed in SampledSoftmaxLayer.
net = build_hred(input_var, mask_input_var, voc_size, emb_size, lv1_rec_size, lv2_rec_size, out_emb_size,
                 num_sampled, emb_init=word2vec_embs)

train_out = L.layers.get_output(net)
test_out = L.layers.get_output(net, deterministic=True)

# Cross-entropy over unpadded positions only.
train_loss = L.objectives.categorical_crossentropy(train_out[mask_idx], target_var[mask_idx]).mean()
test_loss = L.objectives.categorical_crossentropy(test_out[mask_idx], target_var[mask_idx]).mean()

params = L.layers.get_all_params(net, trainable=True)
updates = update_fn(train_loss, params)

print('Compiling theano functions...')

# train_fn applies the parameter updates; val_fn only evaluates the deterministic loss.
train_fn = theano.function([input_var, target_var, mask_input_var], train_loss, updates=updates)
val_fn = theano.function([input_var, target_var, mask_input_var], test_loss)

Compiling theano functions...


In [7]:
def train_one_epoch(train_data, batch_size, log_interval=10):
    """Run `train_fn` once over every minibatch of `train_data`.

    Batches come from iterate_minibatches(train_data, batch_size, pad_value) and are
    unpacked as (inputs, targets, mask). Each batch loss returned by `train_fn` is
    weighted by the batch's mask sum, so the result is the loss averaged over all
    unmasked positions seen during the epoch. A progress line with the running
    average is printed every `log_interval` batches.

    Returns the accumulated weighted loss divided by the total mask sum.
    """
    started = time.time()
    loss_sum = 0.
    words_seen = 0

    batches = iterate_minibatches(train_data, batch_size, pad_value)
    for batch_no, (inputs, targets, mask) in enumerate(batches, 1):
        words_in_batch = mask.sum()
        # re-weight the per-batch mean by the number of unmasked positions
        loss_sum += train_fn(inputs, targets, mask) * words_in_batch
        words_seen += words_in_batch

        if batch_no % log_interval == 0:
            print("Done {} batches in {:.2f}s\ttraining loss:\t{:.6f}".format(
                batch_no, time.time() - started, loss_sum / words_seen))

    return loss_sum / words_seen

In [8]:
# train, 1 dir, 1 epoch: 3.485554076321884
# val: 3.455356876018342

# train, 2 dir, concat, 1 epoch: 3.4864403798772239
# val: 3.4579001751897063

# train, 2 dir, L2 + concat, 1 epoch: 3.4881669768474675
# val: 3.4584704095551695
# training time: ~4700s

'''with sampled softmax'''
# train, 2 dir, L2 + concat, 1 epoch: 3.486180601246621
# val: 3.4811877499289308
# training time: ~2300s

In [17]:
# Train for one epoch on the training split; the second argument is the minibatch size.
train_one_epoch(train, 30)

Done 10 batches in 1.18s	training loss:	7.237148
Done 20 batches in 2.22s	training loss:	6.233935
Done 30 batches in 3.49s	training loss:	5.808175
Done 40 batches in 4.61s	training loss:	5.549267
Done 50 batches in 5.72s	training loss:	5.338073
Done 60 batches in 7.29s	training loss:	5.177397
Done 70 batches in 8.45s	training loss:	5.069123
Done 80 batches in 9.60s	training loss:	4.967996
Done 90 batches in 10.89s	training loss:	4.892198
Done 100 batches in 12.12s	training loss:	4.827950
Done 110 batches in 13.00s	training loss:	4.769821
Done 120 batches in 14.28s	training loss:	4.724219
Done 130 batches in 15.37s	training loss:	4.678354
Done 140 batches in 16.57s	training loss:	4.638821
Done 150 batches in 17.73s	training loss:	4.606170
Done 160 batches in 19.02s	training loss:	4.572973
Done 170 batches in 19.94s	training loss:	4.545257
Done 180 batches in 21.09s	training loss:	4.523121
Done 190 batches in 22.22s	training loss:	4.499238
Done 200 batches in 23.31s	training loss:	4.4779

3.486180601246621

In [18]:
def validate(val_data, batch_size, log_interval=100):
    """Evaluate `val_fn` over every minibatch of `val_data`.

    Batches come from iterate_minibatches(val_data, batch_size, pad_value) and are
    unpacked as (inputs, targets, mask). Each batch loss is weighted by the batch's
    mask sum, so the result is the loss averaged over all unmasked positions.

    Parameters
    ----------
    val_data : data passed on to iterate_minibatches.
    batch_size : minibatch size passed on to iterate_minibatches.
    log_interval : print a progress line every this many batches. Defaults to 100,
        the previously hard-coded value, and mirrors train_one_epoch's parameter.

    Returns the accumulated weighted loss divided by the total mask sum.
    """
    val_err = 0.
    num_validate_words = 0
    start_time = time.time()

    batches = iterate_minibatches(val_data, batch_size, pad_value)
    for val_batches, (inputs, targets, mask) in enumerate(batches, 1):
        num_batch_words = mask.sum()
        # re-weight the per-batch mean by the number of unmasked positions
        val_err += val_fn(inputs, targets, mask) * num_batch_words
        num_validate_words += num_batch_words

        if val_batches % log_interval == 0:
            print("Done {} batches in {:.2f}s".format(val_batches, time.time() - start_time))

    return val_err / num_validate_words

In [19]:
# Evaluate on the validation split; the second argument is the minibatch size.
validate(valid, 30)

Done 100 batches in 4.54s
Done 200 batches in 9.34s
Done 300 batches in 14.35s
Done 400 batches in 18.75s
Done 500 batches in 23.27s
Done 600 batches in 27.62s
Done 700 batches in 32.04s
Done 800 batches in 36.39s
Done 900 batches in 41.09s
Done 1000 batches in 46.03s
Done 1100 batches in 50.74s
Done 1200 batches in 55.62s
Done 1300 batches in 60.65s
Done 1400 batches in 65.14s
Done 1500 batches in 69.61s
Done 1600 batches in 74.18s
Done 1700 batches in 78.60s
Done 1800 batches in 83.59s
Done 1900 batches in 88.52s
Done 2000 batches in 92.90s
Done 2100 batches in 97.71s
Done 2200 batches in 102.53s
Done 2300 batches in 107.51s
Done 2400 batches in 112.12s


3.4811877499289308

In [10]:
def build_context_net(input_var, voc_size, emb_size, lv1_rec_size, lv2_rec_size, context_init, params):
    """Build the utterance-encoder + context-GRU stack for a single utterance.

    The input layer has shape (1, None): one sequence of token ids. The sequence is
    embedded, run through a forward and a backward GRU, each direction is passed
    through L2PoolingLayer, the two results are concatenated and reshaped to a
    one-step sequence of width 2*lv1_rec_size, and that single step is fed to the
    context GRU whose initial hidden state is `context_init`.

    `params` is accepted but not used by this function.

    Returns the context GRULayer (built with only_return_final=True).
    """
    layers = L.layers

    tokens = layers.InputLayer(shape=(1, None), input_var=input_var)

    # input_size is voc_size, not voc_size+1, because pad_value = <utt_end>
    embedded = layers.EmbeddingLayer(tokens, input_size=voc_size, output_size=emb_size)

    # Forward encoder first, then the backward one; full sequences are kept for pooling.
    encoders = [layers.GRULayer(embedded,
                                num_units=lv1_rec_size,
                                grad_clipping=100,
                                backwards=reverse)
                for reverse in (False, True)]

    # concatenation of L2-pooled states
    pooled = [L2PoolingLayer(encoder) for encoder in encoders]
    utterance_enc = layers.ConcatLayer(pooled)

    single_step = layers.ReshapeLayer(utterance_enc, shape=(1, 1, 2*lv1_rec_size))

    return layers.GRULayer(single_step,
                           num_units=lv2_rec_size,
                           hid_init=context_init,
                           grad_clipping=100,
                           only_return_final=True)

In [11]:
def build_decoder_net(input_var, voc_size, emb_size, lv1_rec_size, lv2_rec_size, out_emb_size, 
                      context_init, params):
    """Build the decoder: context state -> initial hidden state -> GRU -> full softmax.

    Design note: the decoder should read one word at a time rather than the whole
    sequence at once. Just like context_net, it will then be possible to initialise
    it with the previous state.

    The input layer has shape (1, None): one sequence of token ids. `context_init`
    enters through an InputLayer of shape (1, lv2_rec_size) and is projected with a
    tanh DenseLayer to form the decoder GRU's initial hidden state. The softmax input
    is the sum of a linear projection of the decoder states and a bias-free linear
    projection of the token embeddings.

    `params` is accepted but not used by this function.

    Returns a ReshapeLayer of shape (input_var.shape[0], input_var.shape[1], voc_size).
    """
    layers = L.layers

    tokens = layers.InputLayer(shape=(1, None), input_var=input_var)

    # input_size is voc_size, not voc_size+1, because pad_value = <utt_end>
    embedded = layers.EmbeddingLayer(tokens, input_size=voc_size, output_size=emb_size)

    context = layers.InputLayer(shape=(1, lv2_rec_size), input_var=context_init)

    dec_init = layers.DenseLayer(context,
                                 num_units=lv1_rec_size,
                                 nonlinearity=L.nonlinearities.tanh)

    decoder = layers.GRULayer(embedded,
                              num_units=lv1_rec_size,
                              grad_clipping=100,
                              hid_init=dec_init)

    # Flatten (batch, time) so the dense projections act on every position.
    flat_decoder = layers.ReshapeLayer(decoder, shape=(-1, lv1_rec_size))
    hidden_proj = layers.DenseLayer(flat_decoder, num_units=out_emb_size, nonlinearity=None)

    flat_embedded = layers.ReshapeLayer(embedded, shape=(-1, emb_size))
    emb_proj = layers.DenseLayer(flat_embedded, num_units=out_emb_size, b=None, nonlinearity=None)

    softmax_input = layers.ElemwiseSumLayer([hidden_proj, emb_proj])

    word_probs = layers.DenseLayer(softmax_input,
                                   num_units=voc_size,
                                   nonlinearity=L.nonlinearities.softmax)

    # Restore the (batch, time, vocabulary) layout.
    out_shape = (input_var.shape[0], input_var.shape[1], voc_size)
    return layers.ReshapeLayer(word_probs, shape=out_shape)

In [12]:
# Symbolic inputs for the step-wise (generation-time) networks.
# Note that `input_var` is rebound here, shadowing the training graph's variable of the same name.
input_var = T.imatrix('inputs')
context_init = T.matrix('context_init')

# NOTE(review): both builders receive params=None and do not use the argument, so from the
# visible code these networks carry freshly initialised weights rather than the trained ones.
context_net = build_context_net(input_var, voc_size, emb_size, lv1_rec_size, lv2_rec_size, 
                                context_init, params=None)

context_out = L.layers.get_output(context_net, deterministic=True)

dec_net = build_decoder_net(input_var, voc_size, emb_size, lv1_rec_size, lv2_rec_size, out_emb_size, 
                            context_init, params=None)

dec_out = L.layers.get_output(dec_net, deterministic=True)

print 'Compiling theano functions...'

# get_probs: (token ids, context state) -> decoder output over the vocabulary at each position.
get_probs = theano.function([input_var, context_init], dec_out)

# get_context_output: (token ids, previous context state) -> final state of the context GRU.
get_context_output = theano.function([input_var, context_init], context_out)

In [None]:
# Start from an all-zero context state of shape (1, lv2_rec_size).
con_init = np.zeros((1, lv2_rec_size), dtype=np.float32)

In [None]:
# Feed the first training sequence as a batch of one and take the argmax over the
# vocabulary axis (axis 2) at every position.
get_probs(np.array(train[0]).astype(np.int32)[np.newaxis, :], con_init).argmax(axis=2)

In [None]:
# Advance the context state by encoding the first training sequence.
# NOTE: this cell overwrites con_init with its own output, so re-running it keeps
# advancing the state; re-run the zero-initialisation cell to reset.
con_init = get_context_output(np.array(train[0]).astype(np.int32)[np.newaxis, :], con_init)
con_init