In [21]:
import numpy as np
import theano
import theano.tensor as T
import time

import lasagne as L
from itertools import chain

import sys
sys.path.insert(0, '../rnn_ex/')

from SimpleRNNLM import iterate_minibatches
from SampledSoftmaxLayer import SampledSoftmaxDenseLayer

In [2]:
# Padding reuses the <utt_end> token: both share one index, so the vocabulary
# has no separate padding symbol.

pad_value = -1 # index -1 picks the last embedding row, which is the <utt_end> vector

def split_utt(utt):
    """Cut one triple into its three utterances.

    ``utt`` has to contain the token id 1 exactly three times; any other count
    makes the unpacking below raise ``ValueError``. The second and third
    occurrence are the cut points, so everything before the second marker
    (including the first marker) ends up in the first piece.
    NOTE(review): id 1 is presumably the utterance-boundary token - confirm
    against the dataset dictionary.

    Returns a list with three slices of ``utt``.
    """
    marker_positions = [pos for pos, token in enumerate(utt) if token == 1]
    _first, second, third = marker_positions
    return [utt[:second], utt[second:third], utt[third:]]

In [5]:
# Directory prefix of the MovieTriples dataset files (trailing slash required,
# file names are appended by plain string concatenation).
# The second assignment overrides the first, so only the local path is used.
# NOTE(review): both are machine-specific absolute paths - adjust before running elsewhere.
mt_path = "/pio/data/data/mtriples/"
mt_path = "/home/maciek/Desktop/mgr/DATA/MovieTriples_Dataset/"

def load_mt(path=mt_path):
    """Load the three dataset splits and flatten each into single utterances.

    Parameters
    ----------
    path : str
        Directory prefix (including the trailing separator) that holds
        ``Training.triples.pkl``, ``Validation.triples.pkl`` and
        ``Test.triples.pkl``. Defaults to the module-level ``mt_path``.

    Returns
    -------
    tuple
        ``(train, valid, test)``. Each item is an ``itertools.chain`` yielding,
        in order, the three pieces ``split_utt`` produces for every triple.
        The chains are one-shot iterators; wrap them in ``list`` to reuse them.

    Raises
    ------
    ValueError
        Propagated from ``split_utt`` when a triple does not contain exactly
        three boundary markers.
    """
    def _load_split(file_name):
        # Read from the `path` argument rather than the global `mt_path`, so a
        # caller-supplied location is actually honoured.
        # NOTE(review): np.load on a .pkl file unpickles it - only use trusted files.
        triples = np.load(path + file_name)
        return chain(*map(split_utt, triples))

    tr = _load_split('Training.triples.pkl')
    vl = _load_split('Validation.triples.pkl')
    ts = _load_split('Test.triples.pkl')

    return tr, vl, ts

# Load the three splits and keep only utterances shorter than 200 tokens.
# The inner comprehension also turns each one-shot chain into a real list.
train, valid, test = [
    [utt for utt in split if len(utt) < 200]
    for split in load_mt()
]


def get_mt_voc(mt_path=mt_path, train_len=len(train)):
    """Build the vocabulary mappings and the unigram distribution.

    Reads ``Training.dict.pkl`` from ``mt_path``. Every entry is accessed as
    ``entry[0]`` (word), ``entry[1]`` (index) and ``entry[2]`` (count); entries
    are ordered by index before the counts are collected.

    Parameters
    ----------
    mt_path : str
        Directory prefix of the dataset files (defaults to the module-level
        ``mt_path`` at definition time).
    train_len : int
        Count assigned to the extra ``'<utt_end>'`` word; defaults to the
        number of training utterances at definition time.

    Returns
    -------
    tuple
        ``(idx_to_w, w_to_idx, voc_size, freqs)`` where ``w_to_idx`` also maps
        ``'<utt_end>'`` to ``pad_value``, ``idx_to_w`` is its inverse,
        ``voc_size`` is ``len(w_to_idx)`` and ``freqs`` holds the counts divided
        by their total, ordered by index with ``'<utt_end>'`` last.
    """
    # NOTE(review): np.load on a .pkl file unpickles it - only use trusted files.
    word_list = np.load(mt_path + 'Training.dict.pkl')
    word_list.sort(key=lambda entry: entry[1])

    # Comprehensions instead of `map(...) + [...]`: same result on Python 2,
    # and they keep working where `map` returns an iterator.
    freqs = np.array([entry[2] for entry in word_list] + [train_len])
    total_count = float(sum(freqs))

    w_to_idx = dict((entry[0], entry[1]) for entry in word_list)
    w_to_idx['<utt_end>'] = pad_value
    idx_to_w = dict((idx, word) for word, idx in w_to_idx.items())

    return idx_to_w, w_to_idx, len(w_to_idx), freqs / total_count

# Vocabulary lookups, vocabulary size (including <utt_end>) and normalised word frequencies.
idx_to_w, w_to_idx, voc_size, freqs = get_mt_voc()

In [7]:
# The pickle holds a pair: the pretrained embedding matrix and a mask of the same layout.
# NOTE(review): np.load on a .pkl file unpickles it - only use trusted files.
word2vec_embs, word2vec_embs_mask = np.load(mt_path + 'Word2Vec_WordEmb.pkl')

# Append one randomly initialised 300-d row as the last embedding (the <utt_end>
# vector, reachable through index -1) and a matching all-ones mask row.
word2vec_embs = np.vstack((
    word2vec_embs,
    L.init.GlorotUniform()((1, 300)),
)).astype(np.float32)
word2vec_embs_mask = np.vstack((word2vec_embs_mask, np.ones((1, 300))))

# Row indices whose first mask entry equals 1.
# NOTE(review): presumably the rows that should stay trainable - confirm with the dataset docs.
w2v_train_mask = np.flatnonzero(word2vec_embs_mask[:, 0] == 1)

In [22]:
def build_hred(input_var, mask_input_var, voc_size, emb_size, lv1_rec_size, lv2_rec_size, out_emb_size,
               emb_init=None, train_emb=True):
    """Assemble the hierarchical encoder-decoder network and return its output layer.

    The input is a 2-D batch of token ids (utterances x time steps). Rows are
    regrouped three at a time for the second-level encoder, so the batch has to
    consist of consecutive triples.

    Parameters
    ----------
    input_var : theano variable
        Token ids; its symbolic shape is reused for the final reshape.
    mask_input_var : theano variable or None
        Optional mask fed to the utterance encoder and to the decoder.
    voc_size : int
        Number of embedding rows and of softmax outputs.
    emb_size, lv1_rec_size, lv2_rec_size, out_emb_size : int
        Sizes of the embedding, utterance-level GRUs, context GRU and of the
        projection in front of the softmax.
    emb_init : array-like or None
        Initial embedding matrix; the layer default is used when ``None``.
    train_emb : bool
        When false, the embedding matrix is frozen. Only consulted when
        ``emb_init`` is given.

    Returns
    -------
    lasagne layer
        Softmax probabilities reshaped to (batch, time steps, ``voc_size``).
    """
    layers = L.layers

    tokens = layers.InputLayer(shape=(None, None), input_var=input_var)

    if mask_input_var is None:
        mask = None
    else:
        mask = layers.InputLayer(shape=(None, None), input_var=mask_input_var)

    # voc_size rows, not voc_size + 1: padding reuses the <utt_end> index.
    emb_kwargs = dict(input_size=voc_size, output_size=emb_size)
    if emb_init is not None:
        emb_kwargs['W'] = emb_init
    embedded = layers.EmbeddingLayer(tokens, **emb_kwargs)
    if emb_init is not None and not train_emb:
        # Dropping the 'trainable' tag keeps W out of get_all_params(trainable=True).
        embedded.params[embedded.W].remove('trainable')

    # Utterance encoder: all utterances run in parallel, output is batch x lv1_rec_size.
    # Only this forward pass is used; there is no backward encoder.
    utt_encoder = layers.GRULayer(embedded,
                                  num_units=lv1_rec_size,
                                  grad_clipping=100,
                                  mask_input=mask,
                                  only_return_final=True)

    # Regroup the utterance vectors per dialogue; 3 because the data are *triples*.
    utt_by_triple = layers.ReshapeLayer(utt_encoder, shape=(-1, 3, lv1_rec_size))

    # Context encoder: output is batch/3 x 3 x lv2_rec_size.
    # NOTE(review): the original remark here says the output "has to be shifted!".
    # As written, the decoder of utterance i starts from the context state
    # computed after utterance i itself - confirm whether a one-step shift is intended.
    context = layers.GRULayer(utt_by_triple,
                              num_units=lv2_rec_size,
                              grad_clipping=100)

    context_flat = layers.ReshapeLayer(context, shape=(-1, lv2_rec_size))

    # One decoder start state per utterance: batch x lv1_rec_size.
    decoder_init = layers.DenseLayer(context_flat,
                                     num_units=lv1_rec_size,
                                     nonlinearity=L.nonlinearities.tanh)

    # Decoder over the same embeddings: batch x seq_len x lv1_rec_size.
    decoder = layers.GRULayer(embedded,
                              num_units=lv1_rec_size,
                              grad_clipping=100,
                              mask_input=mask,
                              hid_init=decoder_init)

    # Flatten batch and time so the dense layers act on every position.
    decoder_flat = layers.ReshapeLayer(decoder, shape=(-1, lv1_rec_size))
    hidden_proj = layers.DenseLayer(decoder_flat,
                                    num_units=out_emb_size,
                                    nonlinearity=None)

    embedded_flat = layers.ReshapeLayer(embedded, shape=(-1, emb_size))
    input_proj = layers.DenseLayer(embedded_flat,
                                   num_units=out_emb_size,
                                   b=None,
                                   nonlinearity=None)

    # Sum of the projected decoder state and the projected input embedding feeds the softmax.
    softmax_in = layers.ElemwiseSumLayer([hidden_proj, input_proj])
    word_probs = layers.DenseLayer(softmax_in,
                                   num_units=voc_size,
                                   nonlinearity=L.nonlinearities.softmax)

    # Restore the (batch, seq_len, voc_size) layout of the input.
    return layers.ReshapeLayer(word_probs,
                               shape=(input_var.shape[0], input_var.shape[1], voc_size))

In [25]:
# Layer sizes handed to build_hred: embedding, utterance-level GRUs,
# context GRU and the projection in front of the softmax.
emb_size, lv1_rec_size, lv2_rec_size, out_emb_size = 300, 300, 300, 300


def update_fn(l, p):
    """Return Adagrad updates (learning rate 0.01) for loss ``l`` and parameters ``p``."""
    return L.updates.adagrad(l, p, learning_rate=.01)

In [27]:
# Symbolic inputs: token ids, next-token targets and the padding mask.
input_var = T.imatrix('inputs')
target_var = T.imatrix('targets')  # these will be inputs shifted by 1
mask_input_var = T.matrix('input_mask')
# Index arrays of the non-zero mask entries; they restrict the loss to those positions.
mask_idx = mask_input_var.nonzero()

net = build_hred(input_var, mask_input_var, voc_size, emb_size, lv1_rec_size, lv2_rec_size, out_emb_size,
                 emb_init=word2vec_embs)

# NOTE(review): build_hred adds no dropout/noise layers, so the deterministic
# output presumably coincides with the training output - confirm if layers are added.
train_out = L.layers.get_output(net)
test_out = L.layers.get_output(net, deterministic=True)

# Mean cross-entropy over the unmasked positions only.
train_loss = L.objectives.categorical_crossentropy(train_out[mask_idx], target_var[mask_idx]).mean()
test_loss = L.objectives.categorical_crossentropy(test_out[mask_idx], target_var[mask_idx]).mean()

params = L.layers.get_all_params(net, trainable=True)
updates = update_fn(train_loss, params)

# Parenthesised form: prints the same text on Python 2 and also parses on Python 3.
print('Compiling theano functions...')

# train_fn applies the parameter updates and returns the training loss;
# val_fn only evaluates the loss.
train_fn = theano.function([input_var, target_var, mask_input_var], train_loss, updates=updates)
val_fn = theano.function([input_var, target_var, mask_input_var], test_loss)

Compiling theano functions...
