In [1]:
import numpy as np
import theano
import theano.tensor as T
import time
import sys

import lasagne as L

sys.path.insert(0, '../HSoftmaxLayerLasagne/')

from HSoftmaxLayer import HierarchicalSoftmaxDenseLayer
from SampledSoftmaxLayer import SampledSoftmaxDenseLayer

Using gpu device 0: GeForce GTX 780 (CNMeM is enabled with initial size: 30.0% of memory, cuDNN 4007)
  "downsample module has been moved to the theano.tensor.signal.pool module.")


In [2]:
mt_path = "/pio/data/data/mtriples/"

def get_mt_voc(path=mt_path):
    """Load the vocabulary stored in ``Training.dict.pkl``.

    Parameters
    ----------
    path : str
        Directory prefix (including the trailing separator) that is
        concatenated with the file name. Defaults to ``mt_path``.

    Returns
    -------
    idx_to_w : dict
        Maps the second field of every entry to its first field.
    w_to_idx : dict
        Maps the first field of every entry to its second field.
    wc : int
        Number of entries in the file.
    """
    # NOTE(review): entries are presumably (word, index, ...) tuples; only the
    # first two fields are used here -- confirm against the dataset format.
    # NOTE(review): np.load unpickles .pkl files; only load trusted data.
    entries = np.load(path + 'Training.dict.pkl')

    # A list (not a lazy map object) so that len() works on Python 2 and 3.
    word_list = [entry[:2] for entry in entries]
    wc = len(word_list)

    w_to_idx = dict(word_list)
    idx_to_w = {v: k for (k, v) in w_to_idx.items()}

    return idx_to_w, w_to_idx, wc

# Module-level vocabulary lookups and size; voc_size is passed to the network builders below.
idx_to_w, w_to_idx, voc_size = get_mt_voc()


def load_mt(path=mt_path):
    """Load the training, validation and test triples.

    Parameters
    ----------
    path : str
        Directory prefix (including the trailing separator) that is
        concatenated with each file name. Defaults to ``mt_path``.

    Returns
    -------
    tuple
        ``(train, valid, test)`` as returned by ``np.load`` for
        ``Training.triples.pkl``, ``Validation.triples.pkl`` and
        ``Test.triples.pkl`` respectively.
    """
    # NOTE(review): np.load unpickles .pkl files; only load trusted data.
    tr = np.load(path + 'Training.triples.pkl')
    vl = np.load(path + 'Validation.triples.pkl')
    ts = np.load(path + 'Test.triples.pkl')

    return tr, vl, ts

# Module-level datasets; train and valid are consumed by the training loop further down.
train, valid, test = load_mt()

In [None]:
# Load the pretrained embedding matrix and its companion mask array.
# NOTE(review): np.load unpickles .pkl files; only load trusted data.
word2vec_embs, word2vec_embs_mask = np.load(mt_path + 'Word2Vec_WordEmb.pkl')
# Append one extra Glorot-uniform row of width 300 and cast to float32.
# NOTE(review): presumably this extra row backs the additional (voc_size+1)-th
# embedding entry used for the pad index -- confirm.
word2vec_embs = np.vstack([word2vec_embs, L.init.GlorotUniform()((1,300))]).astype(np.float32)
# Append a matching all-zero row to the mask array.
word2vec_embs_mask = np.vstack([word2vec_embs_mask, np.zeros((1,300))])

In [3]:
# Similar to Lasagne mnist.py example, added input mask and different sequence lengths

def iterate_minibatches(inputs, batchsize, pad=-1):
    """Yield padded ``(inputs, targets, mask)`` batches of token sequences.

    Parameters
    ----------
    inputs : sequence of sequences of int
        Token-id sequences of possibly different lengths.
    batchsize : int
        Number of sequences per batch. A trailing group of fewer than
        ``batchsize`` sequences is dropped.
    pad : int
        Value used to right-pad shorter sequences and to fill the last
        target column.

    Yields
    ------
    inp : np.ndarray, int32, shape (batchsize, max_len)
        Sequences padded to the longest sequence of the batch.
    tar : np.ndarray, int32, shape (batchsize, max_len)
        ``inp`` shifted left by one position, with ``pad`` in the last column.
    mask : np.ndarray, float32, shape (batchsize, max_len)
        1.0 where ``inp`` differs from ``pad``, 0.0 elsewhere.
    """
    for start_idx in range(0, len(inputs) - batchsize + 1, batchsize):
        batch = inputs[start_idx:start_idx + batchsize]

        max_len = max(len(seq) for seq in batch)
        # A list comprehension (not map) so np.asarray gets a real list on
        # both Python 2 and Python 3.
        padded = [list(seq) + [pad] * (max_len - len(seq)) for seq in batch]
        inp = np.asarray(padded, dtype=np.int32)

        pad_column = np.full((batchsize, 1), pad, dtype=np.int32)
        tar = np.hstack([inp[:, 1:], pad_column])

        # There is no separate end-of-utterance value right now, just pad.
        # NOTE(review): the mask follows the inputs, so the position of the last
        # real token is unmasked although its target is ``pad`` -- confirm that
        # this is intended by the loss.
        mask = (inp != pad).astype(np.float32)

        yield inp, tar, mask

In [9]:
def build_simple_rnnlm(input_var, mask_input_var, voc_size, emb_size, rec_size, emb_init=None):
    """Build a two-layer LSTM language model with a full softmax output.

    Parameters
    ----------
    input_var : symbolic matrix of token ids fed to the input layer.
    mask_input_var : symbolic mask matrix, or None to build without a mask.
    voc_size : number of softmax output units; the embedding table has
        ``voc_size + 1`` rows.
    emb_size : embedding width.
    rec_size : number of units in each LSTM layer.
    emb_init : optional initial value for the embedding matrix. When given,
        the embedding weights are excluded from the trainable parameters.

    Returns
    -------
    The output layer, reshaped to ``(batch_size, seq_len, voc_size)``.
    """
    net_in = L.layers.InputLayer(shape=(None, None), input_var=input_var)
    batch_size, seq_len = net_in.input_var.shape

    if mask_input_var is None:
        net_mask = None
    else:
        print('setting up input mask...')
        net_mask = L.layers.InputLayer(shape=(batch_size, seq_len), input_var=mask_input_var)

    # Only pass W when an initial value was supplied, so the layer's own
    # default initialiser is used otherwise.
    emb_kwargs = dict(input_size=voc_size+1, output_size=emb_size)
    if emb_init is not None:
        emb_kwargs['W'] = emb_init
    net_emb = L.layers.EmbeddingLayer(net_in, **emb_kwargs)
    if emb_init is not None:
        # Freeze the supplied embeddings.
        net_emb.params[net_emb.W].remove('trainable')

    # Two stacked LSTM layers with identical settings, sharing the mask.
    net_rec = net_emb
    for _ in range(2):
        net_rec = L.layers.LSTMLayer(net_rec,
                                     num_units=rec_size,
                                     nonlinearity=L.nonlinearities.tanh,
                                     grad_clipping=100,
                                     mask_input=net_mask)

    # Flatten batch and time so the dense softmax is applied per time step.
    net_flat = L.layers.ReshapeLayer(net_rec, shape=(-1, rec_size))
    net_soft = L.layers.DenseLayer(net_flat,
                                   num_units=voc_size,
                                   nonlinearity=L.nonlinearities.softmax)

    return L.layers.ReshapeLayer(net_soft, shape=(batch_size, seq_len, voc_size))

In [5]:
def build_hsoft_rnnlm(input_var, target_var, mask_input_var, voc_size, emb_size, rec_size):
    """Build a single-layer LSTM language model with a hierarchical softmax output.

    Parameters
    ----------
    input_var : symbolic matrix of token ids fed to the input layer.
    target_var : symbolic matrix of target ids, or None. When given, it is
        reshaped to a column and handed to the hierarchical softmax layer.
    mask_input_var : symbolic mask matrix, or None to build without a mask.
    voc_size : number of output units; the embedding table has
        ``voc_size + 1`` rows.
    emb_size : embedding width.
    rec_size : number of LSTM units.

    Returns
    -------
    The output layer, reshaped to ``(batch_size, seq_len)`` when targets are
    given and to ``(batch_size, seq_len, voc_size)`` otherwise.
    """
    tokens_in = L.layers.InputLayer(shape=(None, None), input_var=input_var)
    batch_size, seq_len = tokens_in.input_var.shape

    if mask_input_var is None:
        mask_in = None
    else:
        print('setting up input mask...')
        mask_in = L.layers.InputLayer(shape=(batch_size, seq_len), input_var=mask_input_var)

    embedded = L.layers.EmbeddingLayer(tokens_in,
                                       input_size=voc_size+1,
                                       output_size=emb_size)

    # Only one recurrent layer is used here; a second stacked LSTM with the
    # same settings is disabled in this variant.
    recurrent = L.layers.LSTMLayer(embedded,
                                   num_units=rec_size,
                                   nonlinearity=L.nonlinearities.tanh,
                                   grad_clipping=100,
                                   mask_input=mask_in)

    flat_states = L.layers.ReshapeLayer(recurrent, shape=(-1, rec_size))

    # hierarchical softmax
    with_targets = target_var is not None
    if with_targets:
        print('setting up targets for hsoftmax...')
        targets_in = L.layers.InputLayer(shape=(None, None), input_var=target_var)
        flat_targets = L.layers.ReshapeLayer(targets_in, shape=(-1, 1))
    else:
        flat_targets = None

    hsoft = HierarchicalSoftmaxDenseLayer(flat_states,
                                          num_units=voc_size,
                                          target=flat_targets)

    # NOTE(review): presumably the layer emits one value per position when
    # targets are given and a full distribution otherwise -- confirm in
    # HSoftmaxLayer.
    if with_targets:
        out_shape = (batch_size, seq_len)
    else:
        out_shape = (batch_size, seq_len, voc_size)

    return L.layers.ReshapeLayer(hsoft, shape=out_shape)

# 1 epoch on gpu with hsoft took about 700s, batch_size = 50

In [6]:
def build_sampledsoft_rnnlm(input_var, mask_input_var, num_sampled, voc_size, 
                            emb_size, rec_size, target_var=None, use_all_words=False):
    """Build a two-layer LSTM language model with a sampled softmax output.

    Parameters
    ----------
    input_var : symbolic matrix of token ids fed to the input layer.
    mask_input_var : symbolic mask matrix, or None to build without a mask.
    num_sampled : forwarded unchanged to ``SampledSoftmaxDenseLayer``.
    voc_size : vocabulary size forwarded to the output layer; the embedding
        table has ``voc_size + 1`` rows.
    emb_size : embedding width.
    rec_size : number of units in each LSTM layer.
    target_var : symbolic matrix of target ids, or None. When given, it is
        flattened with ``ravel()`` before being passed to the output layer.
    use_all_words : forwarded unchanged to ``SampledSoftmaxDenseLayer``.

    Returns
    -------
    The output layer, reshaped to ``(batch_size, seq_len)`` when targets are
    given and to ``(batch_size, seq_len, voc_size)`` otherwise.
    """
    l_in = L.layers.InputLayer(shape=(None, None), input_var=input_var)
    batch_size, seq_len = l_in.input_var.shape

    l_mask = None
    # Identity test, consistent with the other builders: `!= None` may be
    # overloaded (e.g. elementwise for arrays) and is not a reliable None check.
    if mask_input_var is not None:
        print('setting up input mask...')
        l_mask = L.layers.InputLayer(shape=(batch_size, seq_len), input_var=mask_input_var)

    l_emb = L.layers.EmbeddingLayer(l_in,
                                    input_size=voc_size+1, 
                                    output_size=emb_size)

    l_lstm1 = L.layers.LSTMLayer(l_emb,
                                 num_units=rec_size,
                                 nonlinearity=L.nonlinearities.tanh,
                                 grad_clipping=100,
                                 mask_input=l_mask)

    l_lstm2 = L.layers.LSTMLayer(l_lstm1,
                                 num_units=rec_size,
                                 nonlinearity=L.nonlinearities.tanh,
                                 grad_clipping=100,
                                 mask_input=l_mask)

    # Flatten batch and time so the output layer is applied per time step.
    l_resh = L.layers.ReshapeLayer(l_lstm2, shape=(-1, rec_size))

    if target_var is not None:
        print('setting up targets for sampled softmax...')
        target_var = target_var.ravel()

    l_ssoft = SampledSoftmaxDenseLayer(l_resh, num_sampled, voc_size, 
                                       targets=target_var, 
                                       use_all_words=use_all_words)

    # NOTE(review): presumably the layer emits one value per position when
    # targets are given and a full distribution otherwise -- confirm in
    # SampledSoftmaxLayer.
    if target_var is not None:
        l_out = L.layers.ReshapeLayer(l_ssoft, shape=(batch_size, seq_len))
    else:
        l_out = L.layers.ReshapeLayer(l_ssoft, shape=(batch_size, seq_len, voc_size))

    return l_out

In [11]:
# Sizes handed to the build_*_rnnlm functions.
emb_size = 300  # embedding width (EmbeddingLayer output_size)
rec_size = 300  # number of units per LSTM layer

In [8]:
def clone_param_values(net_from, net_to):
    """Copy all parameter values of ``net_from`` into ``net_to``."""
    source_values = L.layers.get_all_param_values(net_from)
    L.layers.set_all_param_values(net_to, source_values)

In [12]:
# full softmax test
# Builds the full-softmax network and compiles train_fn / val_fn, which the
# training loop cell calls. Running this cell replaces any train_fn / val_fn /
# net / test_net compiled by another setup cell.

input_var = T.imatrix('inputs')
targets = T.imatrix('targets') # these will be inputs shifted by 1
mask_input_var = T.matrix('input_mask')
# NOTE(review): voc_mask is not used anywhere else in this cell.
voc_mask = T.ivector('voc_mask')

# NOTE(review): word2vec_embs comes from the embedding-loading cell, which must
# have been run first.
net = build_simple_rnnlm(input_var, mask_input_var, voc_size, emb_size, rec_size, emb_init=word2vec_embs)
out = L.layers.get_output(net)

# Select only the positions where the mask is non-zero before computing the loss.
mask_idx = mask_input_var.nonzero()
loss = L.objectives.categorical_crossentropy(out[mask_idx], targets[mask_idx])
loss = loss.mean() # mean batch loss

params = L.layers.get_all_params(net, trainable=True)
updates = L.updates.adagrad(loss, params, learning_rate=.01)

train_fn = theano.function([input_var, targets, mask_input_var], loss, updates=updates)

### for validation

test_net = net # this line is just for compatibility later

# Same network and loss, evaluated with deterministic=True and without updates.
test_out = L.layers.get_output(net, deterministic=True)
test_loss = L.objectives.categorical_crossentropy(test_out[mask_idx], targets[mask_idx])
test_loss = test_loss.mean()
# test_acc = T.mean(T.eq(T.argmax(test_out, axis=1), targets), dtype=theano.config.floatX)

val_fn = theano.function([input_var, targets, mask_input_var], test_loss)

setting up input mask...


In [10]:
# sampled softmax test (with targets!)
# Builds the sampled-softmax network and compiles train_fn / val_fn, which the
# training loop cell calls. Running this cell replaces any train_fn / val_fn /
# net / test_net compiled by another setup cell.

# NOTE(review): None is forwarded as-is to SampledSoftmaxDenseLayer; its meaning
# is defined there -- confirm.
num_sampled = None

input_var = T.imatrix('inputs')
targets = T.imatrix('targets') # these will be inputs shifted by 1
mask_input_var = T.matrix('input_mask')

net = build_sampledsoft_rnnlm(input_var, mask_input_var, num_sampled, voc_size, 
                              emb_size, rec_size, target_var=targets)
out = L.layers.get_output(net)

# Negative log of the network output at unmasked positions, averaged over the
# number of unmasked positions.
mask_idx = mask_input_var.nonzero()
loss = -T.sum(T.log(out[mask_idx])) / T.sum(mask_input_var)

params = L.layers.get_all_params(net, trainable=True)
updates = L.updates.adagrad(loss, params, learning_rate=.01)
# updates = L.updates.rmsprop(loss, params, learning_rate=.001, rho=.9, epsilon=1e-06)

train_fn = theano.function([input_var, targets, mask_input_var], loss, updates=updates)

### for validation

# A separate network with its own symbolic inputs, built with use_all_words=True.
# It has its own parameters, so the trained values must be copied into it
# (the training loop does this with clone_param_values) before validating.
test_input_var = T.imatrix('inputs')
test_targets = T.imatrix('targets')
test_mask_input_var = T.matrix('input_mask')

test_net = build_sampledsoft_rnnlm(test_input_var, test_mask_input_var, num_sampled, voc_size, 
                                   emb_size, rec_size, target_var=test_targets, use_all_words=True)

test_mask_idx = test_mask_input_var.nonzero()

test_out = L.layers.get_output(test_net, deterministic=True)
test_loss = -T.sum(T.log(test_out[test_mask_idx])) / T.sum(test_mask_input_var)

# test_acc = T.mean(T.eq(T.argmax(test_out, axis=1), targets), dtype=theano.config.floatX)

val_fn = theano.function([test_input_var, test_targets, test_mask_input_var], test_loss)

setting up input mask...
setting up targets for sampled softmax...
setting up input mask...
setting up targets for sampled softmax...


In [20]:
# hierarchical softmax test
# Builds the hierarchical-softmax network and compiles train_fn / val_fn, which
# the training loop cell calls. Running this cell replaces any train_fn / val_fn /
# net / test_net compiled by another setup cell.

input_var = T.imatrix('inputs')
targets = T.imatrix('targets') # these will be inputs shifted by 1
mask_input_var = T.matrix('input_mask')

net = build_hsoft_rnnlm(input_var, targets, mask_input_var, voc_size, emb_size, rec_size)
out = L.layers.get_output(net)

# Negative log of the network output at unmasked positions, averaged over the
# number of unmasked positions.
mask_idx = mask_input_var.nonzero()
loss = -T.sum(T.log(out[mask_idx])) / T.sum(mask_input_var)

params = L.layers.get_all_params(net, trainable=True)
#updates = L.updates.rmsprop(loss, params, learning_rate=.001, rho=.9, epsilon=1e-06)
updates = L.updates.adagrad(loss, params, learning_rate=.01)

train_fn = theano.function([input_var, targets, mask_input_var], loss, updates=updates)

#### for validation

test_net = net # this line is just for compatibility later

# Same network and loss, evaluated with deterministic=True and without updates.
test_out = L.layers.get_output(net, deterministic=True)
test_loss = -T.sum(T.log(test_out[mask_idx])) / T.sum(mask_input_var)

#test_acc = T.mean(T.eq(T.argmax(test_out, axis=1), targets), dtype=theano.config.floatX)

val_fn = theano.function([input_var, targets, mask_input_var], test_loss)

setting up input mask...
setting up targets for hsoftmax...
setting up input mask...
setting up targets for hsoftmax...


In [13]:
# training, taken from mnist.py in lasagne examples
# Uses train_fn, val_fn, net and test_net from whichever setup cell was run last.

num_epochs = 1
batch_size = 5
val_batch_size = 5

for epoch in range(num_epochs):
    # In each epoch, we do a full pass over the training data:
    train_err = 0
    train_batches = 0
    start_time = time.time()

    # The batch arrays get their own names so that the symbolic `targets`
    # variable defined by the setup cells is not overwritten with a numpy array.
    for batch_inputs, batch_targets, batch_mask in iterate_minibatches(train, batch_size):
        batch_err = train_fn(batch_inputs, batch_targets, batch_mask)

        train_err += batch_err
        train_batches += 1

        if not train_batches % 10:
            print("Done {} batches in {:.2f} sec.    training loss:\t\t{}".format(
                train_batches, time.time() - start_time, train_err / train_batches))

    # And a full pass over the validation data:
    val_err = 0
    val_batches = 0
    start_time2 = time.time()

    # Copy the trained parameter values into the validation network
    # (test_net may be the same object as net, depending on the setup cell).
    clone_param_values(net_from=net, net_to=test_net)

    for batch_inputs, batch_targets, batch_mask in iterate_minibatches(valid, val_batch_size):
        err = val_fn(batch_inputs, batch_targets, batch_mask)
        val_err += err
        val_batches += 1
        if not val_batches % 100:
            print("Done {} batches in {:.2f} sec.".format(
                val_batches, time.time() - start_time2))

    # Then we print the results for this epoch.
    # max(..., 1) avoids a ZeroDivisionError when a dataset yields no full batch.
    print("Epoch {} of {} took {:.3f}s".format(
        epoch + 1, num_epochs, time.time() - start_time))
    print("  training loss:\t\t{:.6f}".format(train_err / max(train_batches, 1)))
    print("  validation loss:\t\t{:.6f}".format(val_err / max(val_batches, 1)))
    #print "  validation accuracy:\t\t{:.2f} %".format(
    #    val_acc / val_batches * 100)

# np.savez('test_1ep_params_20_sampled_unique_bs50.npz', *L.layers.get_all_param_values(net))

Done 10 batches in 1.51 sec.    training loss:		8.43167209625
Done 20 batches in 2.83 sec.    training loss:		7.2098290205
Done 30 batches in 4.47 sec.    training loss:		6.80911650658
Done 40 batches in 5.73 sec.    training loss:		6.51092954874
Done 50 batches in 7.50 sec.    training loss:		6.4106634903
Done 60 batches in 9.09 sec.    training loss:		6.26619983514
Done 70 batches in 10.56 sec.    training loss:		6.16145815168
Done 80 batches in 11.94 sec.    training loss:		6.09250831604
Done 90 batches in 13.37 sec.    training loss:		6.03046529558
Done 100 batches in 14.76 sec.    training loss:		5.97374654293
Done 110 batches in 16.47 sec.    training loss:		5.9317473585
Done 120 batches in 18.11 sec.    training loss:		5.89309691588
Done 130 batches in 19.50 sec.    training loss:		5.85920016582
Done 140 batches in 20.93 sec.    training loss:		5.82559099197
Done 150 batches in 22.43 sec.    training loss:		5.79228741646
Done 160 batches in 24.24 sec.    training loss:		5.774399

KeyboardInterrupt: 

In [None]:
# Save every parameter array of `net` as positional entries of an .npz archive.
np.savez('test_1ep_params_20_sampled_unique_bs50.npz', *L.layers.get_all_param_values(net))

In [29]:
# Restore parameter values into test_net from an .npz archive.
# Arrays are read back as arr_0, arr_1, ... -- the names np.savez gives to
# positional arguments -- so the order matches the order they were saved in.
with np.load('test_1ep_params.npz') as f:
    param_values = [f['arr_%d' % i] for i in range(len(f.files))]
    L.layers.set_all_param_values(test_net, param_values)

In [10]:
# Save every parameter array of `net` as positional entries of an .npz archive.
np.savez('fsoft_trained_singleLSTM.npz', *L.layers.get_all_param_values(net))

In [33]:
# Generation network (hierarchical softmax variant), built without targets and
# without a mask. Overwrites input_var, gen_net, probs and get_probs.
input_var = T.imatrix('inputs')
gen_net = build_hsoft_rnnlm(input_var, None, None, voc_size, emb_size, rec_size)
# Keep only the output at the last time step of every sequence.
probs = L.layers.get_output(gen_net)[:,-1,:]
get_probs = theano.function([input_var], probs)

In [11]:
# Generation network (sampled softmax variant), built without targets and
# without a mask. Overwrites input_var, gen_net, probs and get_probs.
input_var = T.imatrix('inputs')
# NOTE(review): num_sampled is passed as -1 here; its meaning is defined by
# SampledSoftmaxDenseLayer -- confirm.
gen_net = build_sampledsoft_rnnlm(input_var, None, -1, voc_size, emb_size, rec_size)
# Keep only the output at the last time step of every sequence.
probs = L.layers.get_output(gen_net)[:,-1,:]
get_probs = theano.function([input_var], probs)

In [11]:
# Generation network (full softmax variant), built without a mask and with the
# default embedding initialisation. Overwrites input_var, gen_net, probs and get_probs.
input_var = T.imatrix('inputs')
gen_net = build_simple_rnnlm(input_var, None, voc_size, emb_size, rec_size)
# Keep only the output at the last time step of every sequence.
probs = L.layers.get_output(gen_net)[:,-1,:]
get_probs = theano.function([input_var], probs)

In [13]:
# Restore parameter values into gen_net from an .npz archive.
# Arrays are read back as arr_0, arr_1, ... -- the names np.savez gives to
# positional arguments -- so gen_net must have the same parameter layout as
# the network that was saved.
with np.load('test_1ep_params_1000_sampled_unique_bs40.npz') as f:
    param_values = [f['arr_%d' % i] for i in range(len(f.files))]
    L.layers.set_all_param_values(gen_net, param_values)

In [13]:
def rnd_next_word(probs, size=1):
    """Draw ``size`` int32 indices at random, index ``k`` with probability ``probs[k]``."""
    vocab_ids = np.arange(probs.shape[0], dtype=np.int32)
    return np.random.choice(vocab_ids, size=size, p=probs)

def beam_search(get_probs_fun, beam=10, init_seq='', mode='rr'):
    """Generate one token sequence with a beam of ``beam`` candidates.

    Parameters
    ----------
    get_probs_fun : callable
        Takes an int32 array of shape (n_candidates, length) and returns an
        array of shape (n_candidates, vocabulary) with next-token probabilities.
    beam : int
        Number of candidates kept after every step.
    init_seq : str
        Whitespace-separated words that seed the sequence. Words missing from
        ``w_to_idx`` are replaced by the ``'<unk>'`` index.
    mode : str
        Two characters, each ``'s'`` or ``'r'``; further characters are ignored.
        ``mode[0]`` picks the first ``beam`` words: ``'s'`` takes the ``beam``
        most probable ones, ``'r'`` samples them with ``rnd_next_word``.
        ``mode[1]`` controls every later step: ``'s'`` scores all vocabulary
        entries for all candidates, ``'r'`` samples ``beam`` words per
        candidate; in both cases the ``beam`` extensions with the highest
        accumulated log-probability are kept.

    Returns
    -------
    np.ndarray
        1-D array of token ids. The search stops as soon as any candidate ends
        with token 0 or the candidates reach length 100. The first candidate
        ending with 0 is returned; if there is none, the best-scoring one.

    Raises
    ------
    ValueError
        If ``mode`` does not start with two characters from ``'s'``/``'r'``.
    """
    if len(mode) < 2 or mode[0] not in ('s', 'r') or mode[1] not in ('s', 'r'):
        raise ValueError("mode must start with two characters, each 's' or 'r'; got %r" % (mode,))

    # NOTE(review): 1 is presumably the start-of-sequence id and 0 the
    # end-of-sequence id of the vocabulary -- confirm.
    utt = [1] + [w_to_idx.get(w, w_to_idx['<unk>']) for w in init_seq.split()]
    utt = np.asarray(utt, dtype=np.int32)[np.newaxis]

    if mode[0] == 's':
        words = get_probs_fun(utt)[0].argpartition(-beam)[-beam:].astype(np.int32)
    else:
        words = rnd_next_word(get_probs_fun(utt)[0], beam)

    candidates = utt.repeat(beam, axis=0)
    candidates = np.hstack([candidates, words[np.newaxis].T])
    # The first words all start with score 0; only later steps add log-probs.
    scores = np.zeros(beam)

    while 0 not in candidates[:, -1] and candidates.shape[1] < 100:

        if mode[1] == 's':
            log_probs = np.log(get_probs_fun(candidates))
            tot_scores = log_probs + scores[np.newaxis].T

            # Flat indices of the `beam` best (candidate, word) pairs.
            idx = tot_scores.ravel().argpartition(-beam)[-beam:]
            # Floor division: the row index must stay an integer on Python 3 too.
            i = idx // tot_scores.shape[1]
            j = (idx % tot_scores.shape[1]).astype(np.int32)

            scores = tot_scores[i, j]

            candidates = np.hstack([candidates[i], j[np.newaxis].T])

        else:
            probs = get_probs_fun(candidates)
            # this doesn't have to be exactly 'beam' samples per candidate
            words = np.array([rnd_next_word(probs[k], beam) for k in range(beam)])
            rows = np.indices((beam, words.shape[1]))[0]
            tot_scores = scores[np.newaxis].T + np.log(probs)[rows, words]

            idx = tot_scores.ravel().argpartition(-beam)[-beam:]
            i = idx // tot_scores.shape[1]
            j = idx % tot_scores.shape[1]

            scores = tot_scores[i, j]

            candidates = np.hstack([candidates[i], words[i, j][np.newaxis].T])

    finished = candidates[candidates[:, -1] == 0]
    if finished.size > 0:
        return finished[0]
    return candidates[scores.argmax()]

In [21]:
# Generate one utterance with beam search, using the most recently compiled get_probs.
utt = beam_search(get_probs, init_seq='', beam=10, mode='rr')

# Map ids back to words; the first and last tokens are dropped from the shown text.
text = map(lambda i: idx_to_w[i], list(utt))
' '.join(text[1:-1])

"well , i ' m not going to be . i ' m not going to be . i ' m not going to be here ."

In [17]:
def rnd_next_word(probs, size=None):
    """Sample word indices from a probability vector.

    This definition replaces the earlier two-argument ``rnd_next_word`` that
    ``beam_search`` relies on, so it accepts both call forms:

    * ``rnd_next_word(probs_2d)`` -- samples one index from ``probs_2d[0]`` and
      returns a scalar, as the sampling loop below expects;
    * ``rnd_next_word(probs_1d, size)`` -- returns an array of ``size`` indices,
      as ``beam_search`` expects.

    Parameters
    ----------
    probs : array-like
        Either a 1-D probability vector or a 2-D array whose first row is used.
    size : int or None
        Number of samples; ``None`` returns a single scalar.

    Returns
    -------
    np.int32 scalar or np.ndarray of int32
    """
    probs = np.asarray(probs)
    if probs.ndim == 2:
        probs = probs[0]
    return np.random.choice(np.arange(probs.shape[0], dtype=np.int32), size=size, p=probs)

# Generate one utterance by sampling a word at a time from get_probs.
init_seq = ''
# Start from token 1 followed by the ids of the seed words ('<unk>' id for unknown words).
utt = [1] + map(lambda w: w_to_idx.get(w, w_to_idx['<unk>']), init_seq.split())
utt = np.asarray(utt, dtype=np.int32)[np.newaxis]

# Stop at '<utt_end>' or after 50 sampled words, whichever comes first.
i = 0
while idx_to_w[utt[0,-1]] != '<utt_end>' and i < 50:
    word_probs = get_probs(utt)
    next_idx = rnd_next_word(word_probs)
    # np.append flattens, so the batch axis is restored and the dtype reset to int32.
    utt = np.append(utt, next_idx)[np.newaxis].astype(np.int32)
    i += 1
    
# Map ids back to words; the first and last tokens are dropped from the shown text.
text = map(lambda i: idx_to_w[i], list(utt[0]))
' '.join(text[1:-1])

'and then far , you know ? <unk> , put that one little hot . i mean . then not !'