In [1]:
import numpy as np
import theano
import theano.tensor as T
import time
import sys

import lasagne as L

sys.path.insert(0, '../HSoftmaxLayerLasagne/')

from HSoftmaxLayer import HierarchicalSoftmaxDenseLayer
from SampledSoftmaxLayer import SampledSoftmaxDenseLayer

Using gpu device 0: GeForce GTX 780 (CNMeM is enabled with initial size: 30.0% of memory, cuDNN 4007)
  "downsample module has been moved to the theano.tensor.signal.pool module.")


In [2]:
# Dataset directory. File names are appended directly, so keep the trailing slash.
mt_path = "/pio/data/data/mtriples/"

# Separate tokens mark the beginning and the end of an utterance.
beg_token = '<utt_beg>'
end_token = '<utt_end>'

def get_mt_voc(path=mt_path):
    """Build the vocabulary lookup tables from ``WordsList.txt``.

    Index 0 is reserved for ``end_token`` and index 1 for ``beg_token``; the
    entries of the word list follow in file order (one per line) from index 2.

    Parameters
    ----------
    path : str
        Directory prefix, including the trailing separator, that holds
        ``WordsList.txt``.

    Returns
    -------
    (dict, dict, int)
        ``i_to_w`` (index -> word), ``w_to_i`` (word -> index) and the total
        number of entries, the two special tokens included.
    """
    # <utt_end> serves only as a target for the last word in the input sequence,
    # <utt_beg> will always be the first generated word.
    i_to_w = {0: end_token, 1: beg_token}
    w_to_i = {end_token: 0, beg_token: 1}
    wc = 2

    with open(path + "WordsList.txt", "r") as wl:
        for line in wl:
            # Strip only the line terminator: slicing off the last character
            # would truncate the final word of a file that lacks a trailing
            # newline and would leave a '\r' behind on CRLF files.
            word = line.rstrip('\r\n')
            i_to_w[wc] = word
            w_to_i[word] = wc
            wc += 1

    return i_to_w, w_to_i, wc

# Build the vocabulary tables once from the default dataset path: index -> word,
# word -> index, and the number of entries (special tokens included).
mt_i_to_w, mt_w_to_i, mt_voc_size = get_mt_voc()


def _load_mt_split(file_path, max_len):
    """Encode one dataset file as a list of word-index sequences, last line first."""
    unk_idx = mt_w_to_i['<unk>']  # looked up once instead of once per word
    sequences = []
    with open(file_path) as f:
        for line in f:
            words = line.split()
            if len(words) < max_len:
                # 1 is <utt_beg> and 0 is <utt_end> in the vocabulary tables.
                sequences.append([1] + [mt_w_to_i.get(w, unk_idx) for w in words] + [0])
    # The previous implementation prepended every sequence, so callers receive
    # the file in reverse line order. Appending and reversing once yields the
    # same list without the quadratic cost of repeated insert(0, ...).
    sequences.reverse()
    return sequences


def load_mt(path=mt_path, max_len=1000):
    """Load the training, validation and test splits as index sequences.

    Each line is split on whitespace, every word is mapped through
    ``mt_w_to_i`` (unknown words fall back to the index of ``'<unk>'``), and
    the result is wrapped as ``[1] + indices + [0]``. Lines with ``max_len``
    or more words are skipped.

    Parameters
    ----------
    path : str
        Directory prefix, including the trailing separator, that holds the
        three ``*_Shuffled_Dataset.txt`` files.
    max_len : int
        Exclusive upper bound on the number of words of a kept line.

    Returns
    -------
    (list, list, list)
        Training, validation and test sequences; each list is in reverse
        file order.

    Raises
    ------
    KeyError
        If the vocabulary has no ``'<unk>'`` entry.
    """
    tr = _load_mt_split(path + "Training_Shuffled_Dataset.txt", max_len)
    vl = _load_mt_split(path + "Validation_Shuffled_Dataset.txt", max_len)
    ts = _load_mt_split(path + "Test_Shuffled_Dataset.txt", max_len)
    return tr, vl, ts

# Read all three splits into memory as lists of word-index sequences.
mt_train, mt_val, mt_test = load_mt()

In [3]:
# Similar to Lasagne mnist.py example, added input mask and different sequence lengths

def iterate_minibatches(inputs, batchsize):
    """Yield padded mini-batches of index sequences together with targets and a mask.

    Consecutive groups of ``batchsize`` sequences are taken from ``inputs``;
    a trailing group smaller than ``batchsize`` is not yielded.

    Parameters
    ----------
    inputs : list of list of int
        Word-index sequences of arbitrary lengths.
    batchsize : int
        Number of sequences per batch.

    Yields
    ------
    inp : int32 array, shape (batchsize, longest sequence in the batch)
        The sequences, right-padded with -1.
    tar : int32 array, same shape as ``inp``
        ``inp`` shifted left by one position, with -1 in the last column.
    mask : float32 array, same shape as ``inp``
        1.0 where ``inp > 0`` and 0.0 elsewhere, so both the padding and
        index 0 (<utt_end>) are masked out.
    prep_time : float
        Seconds spent assembling this batch, for information only.
    """
    pad_value = -1
    for start_idx in range(0, len(inputs) - batchsize + 1, batchsize):
        t0 = time.time()  # time spent preparing data, just for the info

        batch = inputs[start_idx:start_idx + batchsize]
        max_len = max(len(seq) for seq in batch)

        # A list comprehension (rather than map) keeps this working on Python 3,
        # where np.asarray cannot consume a lazy map object.
        inp = np.asarray([seq + [pad_value] * (max_len - len(seq)) for seq in batch],
                         dtype=np.int32)
        last_column = np.full((batchsize, 1), pad_value, dtype=np.int32)
        tar = np.hstack((inp[:, 1:], last_column))

        # 0 in the vocabulary represents <utt_end>, we don't feed that into the
        # net. A vectorised comparison replaces the per-element Python callback
        # that used to be rebuilt with np.vectorize for every batch.
        mask = (inp > 0).astype(np.float32)

        yield inp, tar, mask, (time.time() - t0)

In [4]:
def build_simple_rnnlm(input_var, mask_input_var, voc_size, emb_size, rec_size):
    """Assemble a one-layer LSTM language model with a full softmax output.

    Layer stack: input -> embedding -> masked LSTM -> flatten -> dense softmax
    -> reshape to (batch, sequence length, voc_size).

    Parameters
    ----------
    input_var : Theano variable
        Word indices; its two shape entries are used as the symbolic batch
        size and sequence length.
    mask_input_var : Theano variable
        Sequence mask handed to the LSTM layer.
    voc_size : int
        Number of output units of the softmax.
    emb_size : int
        Width of the embedding.
    rec_size : int
        Number of LSTM units.

    Returns
    -------
    The final reshape layer of the network.
    """
    layers = L.layers

    net_input = layers.InputLayer(shape=(None, None), input_var=input_var)
    # Symbolic sizes, reused for the mask layer and for the final reshape.
    n_batch, n_steps = net_input.input_var.shape
    mask_layer = layers.InputLayer(shape=(n_batch, n_steps), input_var=mask_input_var)

    # One row more than the vocabulary; presumably the spare row absorbs the -1
    # padding index -- TODO confirm.
    embedding = layers.EmbeddingLayer(net_input, input_size=voc_size + 1, output_size=emb_size)

    recurrent = layers.LSTMLayer(
        embedding,
        num_units=rec_size,
        nonlinearity=L.nonlinearities.tanh,
        grad_clipping=100,
        mask_input=mask_layer,
    )

    # Collapse batch and time so the dense layer sees one row per position.
    flattened = layers.ReshapeLayer(recurrent, shape=(-1, rec_size))
    word_probs = layers.DenseLayer(
        flattened,
        num_units=voc_size,
        nonlinearity=L.nonlinearities.softmax,
    )

    return layers.ReshapeLayer(word_probs, shape=(n_batch, n_steps, voc_size))

In [5]:
def build_hsoft_rnnlm(input_var, target_var, mask_input_var, voc_size, emb_size, rec_size):
    """Build a one-layer LSTM language model topped with a hierarchical softmax.

    Parameters
    ----------
    input_var : Theano variable
        Word indices; its two shape entries are used as the symbolic batch
        size and sequence length.
    target_var : Theano variable or None
        Target word indices passed to the hierarchical softmax layer. With
        None no target layer is built.
    mask_input_var : Theano variable or None
        Sequence mask for the LSTM; None disables masking.
    voc_size : int
        Number of units of the hierarchical softmax layer.
    emb_size : int
        Width of the embedding.
    rec_size : int
        Number of LSTM units.

    Returns
    -------
    The final reshape layer: shaped (batch, seq_len) when targets are given,
    (batch, seq_len, voc_size) otherwise. Presumably the former holds the
    probability assigned to each target word -- confirm against HSoftmaxLayer.
    """
    l_in = L.layers.InputLayer(shape=(None, None), input_var=input_var)
    batch_size, seq_len = l_in.input_var.shape

    # Identity tests instead of `!= None`: comparison operators on symbolic
    # variables are not a reliable way to detect a missing argument.
    l_mask = None
    if mask_input_var is not None:
        print('setting up input mask...')
        l_mask = L.layers.InputLayer(shape=(batch_size, seq_len), input_var=mask_input_var)

    l_emb = L.layers.EmbeddingLayer(l_in,
                                    input_size=voc_size+1,
                                    output_size=emb_size)

    l_lstm1 = L.layers.LSTMLayer(l_emb,
                                 num_units=rec_size,
                                 nonlinearity=L.nonlinearities.tanh,
                                 grad_clipping=100,
                                 mask_input=l_mask)

    # Collapse batch and time so the output layer sees one row per position.
    l_resh = L.layers.ReshapeLayer(l_lstm1, shape=(-1, rec_size))

    # hierarchical softmax

    l_resh_tar = None
    if target_var is not None:
        print('setting up targets for hsoftmax...')
        l_tar = L.layers.InputLayer(shape=(None, None), input_var=target_var)
        l_resh_tar = L.layers.ReshapeLayer(l_tar, shape=(-1, 1))

    l_hsoft = HierarchicalSoftmaxDenseLayer(l_resh,
                                            num_units=voc_size,
                                            target=l_resh_tar)

    if target_var is not None:
        out_shape = (batch_size, seq_len)
    else:
        out_shape = (batch_size, seq_len, voc_size)

    return L.layers.ReshapeLayer(l_hsoft, shape=out_shape)

# 1 epoch on gpu with hsoft took about 700s, batch_size = 50

In [27]:
def build_sampledsoft_rnnlm(input_var, mask_input_var, voc_mask, voc_size, emb_size, rec_size):
    """Build a two-layer LSTM language model topped with a sampled softmax.

    Parameters
    ----------
    input_var : Theano variable
        Word indices; its two shape entries are used as the symbolic batch
        size and sequence length.
    mask_input_var : Theano variable or None
        Sequence mask shared by both LSTM layers; None disables masking.
    voc_mask : Theano variable
        Passed straight to ``SampledSoftmaxDenseLayer``; presumably selects
        the vocabulary entries taking part in the softmax -- confirm against
        SampledSoftmaxLayer.
    voc_size : int
        Size passed to the sampled softmax layer and used for the last axis
        of the output.
    emb_size : int
        Width of the embedding.
    rec_size : int
        Number of units of each LSTM layer.

    Returns
    -------
    The final reshape layer, shaped (batch, seq_len, voc_size).
    """
    l_in = L.layers.InputLayer(shape=(None, None), input_var=input_var)
    batch_size, seq_len = l_in.input_var.shape

    # Identity test instead of `!= None`: comparison operators on symbolic
    # variables are not a reliable way to detect a missing argument.
    l_mask = None
    if mask_input_var is not None:
        print('setting up input mask...')
        l_mask = L.layers.InputLayer(shape=(batch_size, seq_len), input_var=mask_input_var)

    l_emb = L.layers.EmbeddingLayer(l_in,
                                    input_size=voc_size+1,
                                    output_size=emb_size)

    l_lstm1 = L.layers.LSTMLayer(l_emb,
                                 num_units=rec_size,
                                 nonlinearity=L.nonlinearities.tanh,
                                 grad_clipping=100,
                                 mask_input=l_mask)

    l_lstm2 = L.layers.LSTMLayer(l_lstm1,
                                 num_units=rec_size,
                                 nonlinearity=L.nonlinearities.tanh,
                                 grad_clipping=100,
                                 mask_input=l_mask)

    # Collapse batch and time so the output layer sees one row per position.
    l_resh = L.layers.ReshapeLayer(l_lstm2, shape=(-1, rec_size))

    l_ssoft = SampledSoftmaxDenseLayer(l_resh, voc_mask, voc_size)

    l_out_final = L.layers.ReshapeLayer(l_ssoft, shape=(batch_size, seq_len, voc_size))

    return l_out_final

In [7]:
# Hyper-parameters shared by every model variant built in this notebook.
emb_size, rec_size = 100, 300  # embedding width, LSTM units
voc_size = mt_voc_size         # vocabulary entries, special tokens included

In [28]:
# sampled softmax test

input_var = T.imatrix('inputs')
targets = T.imatrix('targets') # these will be inputs shifted by 1
mask_input_var = T.matrix('input_mask')
voc_mask = T.vector('voc_mask')

net = build_sampledsoft_rnnlm(input_var, mask_input_var, voc_mask, voc_size, emb_size, rec_size)
out = L.layers.get_output(net)

# Indices of the unmasked positions; only these contribute to loss and accuracy.
mask_idx = mask_input_var.nonzero()
loss = L.objectives.categorical_crossentropy(out[mask_idx], targets[mask_idx])
loss = loss.mean() # mean batch loss

params = L.layers.get_all_params(net, trainable=True)
updates = L.updates.adagrad(loss, params, learning_rate=.01)

train_fn = theano.function([input_var, targets, mask_input_var, voc_mask], loss, updates=updates)

### for validation

test_out = L.layers.get_output(net, deterministic=True)
test_loss = L.objectives.categorical_crossentropy(test_out[mask_idx], targets[mask_idx])
test_loss = test_loss.mean()
# test_out is (batch, seq_len, voc_size), so the predicted word is the argmax
# over the LAST axis. Selecting the unmasked positions first leaves a
# (n_positions, voc_size) matrix whose axis 1 is the vocabulary, and keeps
# padded positions out of the accuracy. (The earlier axis=1 on the 3-D tensor
# reduced over time steps instead.)
test_acc = T.mean(T.eq(T.argmax(test_out[mask_idx], axis=1), targets[mask_idx]),
                  dtype=theano.config.floatX)

val_fn = theano.function([input_var, targets, mask_input_var, voc_mask], [test_loss, test_acc])

setting up input mask...


In [8]:
# full softmax test

input_var = T.imatrix('inputs')
targets = T.imatrix('targets') # these will be inputs shifted by 1
mask_input_var = T.matrix('input_mask')

net = build_simple_rnnlm(input_var, mask_input_var, voc_size, emb_size, rec_size)
out = L.layers.get_output(net)

# Indices of the unmasked positions; only these contribute to loss and accuracy.
mask_idx = mask_input_var.nonzero()
loss = L.objectives.categorical_crossentropy(out[mask_idx], targets[mask_idx])
loss = loss.mean() # mean batch loss

params = L.layers.get_all_params(net, trainable=True)
updates = L.updates.adagrad(loss, params, learning_rate=.01)

train_fn = theano.function([input_var, targets, mask_input_var], loss, updates=updates)

### for validation

test_out = L.layers.get_output(net, deterministic=True)
test_loss = L.objectives.categorical_crossentropy(test_out[mask_idx], targets[mask_idx])
test_loss = test_loss.mean()
# test_out is (batch, seq_len, voc_size), so the predicted word is the argmax
# over the LAST axis. Selecting the unmasked positions first leaves a
# (n_positions, voc_size) matrix whose axis 1 is the vocabulary, and keeps
# padded positions out of the accuracy. (The earlier axis=1 on the 3-D tensor
# reduced over time steps instead.)
test_acc = T.mean(T.eq(T.argmax(test_out[mask_idx], axis=1), targets[mask_idx]),
                  dtype=theano.config.floatX)

val_fn = theano.function([input_var, targets, mask_input_var], [test_loss, test_acc])

In [12]:
# hierarchical softmax test

# Symbolic inputs: word indices, their targets (the inputs shifted by 1) and
# the sequence mask.
input_var = T.imatrix('inputs')
targets = T.imatrix('targets')
mask_input_var = T.matrix('input_mask')

net = build_hsoft_rnnlm(input_var, targets, mask_input_var,
                        voc_size, emb_size, rec_size)
out = L.layers.get_output(net)

# Negative log of the network output at the unmasked positions, averaged over
# the number of unmasked positions.
mask_idx = mask_input_var.nonzero()
loss = -T.log(out[mask_idx]).sum() / mask_input_var.sum()

params = L.layers.get_all_params(net, trainable=True)
# alternative optimiser:
# updates = L.updates.rmsprop(loss, params, learning_rate=.001, rho=.9, epsilon=1e-06)
updates = L.updates.adagrad(loss, params, learning_rate=.01)

train_fn = theano.function([input_var, targets, mask_input_var], loss,
                           updates=updates)

#### for validation

test_out = L.layers.get_output(net, deterministic=True)
test_loss = -T.log(test_out[mask_idx]).sum() / mask_input_var.sum()

# accuracy is not computed for this variant:
# test_acc = T.mean(T.eq(T.argmax(test_out, axis=1), targets), dtype=theano.config.floatX)

val_fn = theano.function([input_var, targets, mask_input_var], test_loss)

### dump weights

# np.savez('model.npz', *L.layers.get_all_param_values(net))

setting up input mask...
setting up targets for hsoftmax...


In [12]:
# with np.load('model_trained.npz') as f:
#     param_values = [f['arr_%d' % i] for i in range(len(f.files))]
#     L.layers.set_all_param_values(net, param_values)

In [29]:
# training, taken from mnist.py in lasagne examples

num_epochs = 5
mt_batch_size = 25

# True when train_fn / val_fn come from the sampled-softmax cell and therefore
# take a vocabulary mask as their fourth argument; set to False for the
# full-softmax and hierarchical-softmax variants.
use_voc_mask = True

# NOTE(review): validation is scored with an all-ones vocabulary mask, i.e.
# against the whole vocabulary -- confirm this is what SampledSoftmaxDenseLayer
# expects at evaluation time.
full_voc_mask = np.ones((voc_size,), dtype=np.float32)

for epoch in range(num_epochs):
    # In each epoch, we do a full pass over the training data:
    train_err = 0
    train_batches = 0
    start_time = time.time()

    time_wasted = 0    # seconds spent assembling mini-batches
    training_time = 0  # seconds spent inside train_fn

    # The batch arrays get their own names so that the symbolic `targets` and
    # `voc_mask` variables of the model cells are not overwritten with numpy data.
    for batch_inputs, batch_targets, batch_mask, prep_time in iterate_minibatches(mt_train, mt_batch_size):
        train_args = [batch_inputs, batch_targets, batch_mask]
        if use_voc_mask:
            # Flag every word that occurs in this batch.
            batch_words = np.unique(batch_inputs)
            batch_words = batch_words[batch_words >= 0]  # drop the -1 padding value
            batch_voc_mask = np.zeros((voc_size,), dtype=np.float32)
            batch_voc_mask[batch_words] = 1
            train_args.append(batch_voc_mask)

        batch_training_time = time.time()
        batch_err = train_fn(*train_args)

        train_err += batch_err
        training_time += time.time() - batch_training_time
        train_batches += 1

        time_wasted += prep_time
        if not train_batches % 10:
            print("Done {} batches in {:.2f} sec.    training loss:\t\t{}".format(
                train_batches, time.time() - start_time, train_err / train_batches))

    # And a full pass over the validation data:
    val_err = 0
    val_acc = 0
    val_batches = 0
    val_has_acc = False
    for batch_inputs, batch_targets, batch_mask, prep_time in iterate_minibatches(mt_val, mt_batch_size):
        # val_fn must receive the same number of inputs it was compiled with;
        # previously the vocabulary mask was passed to train_fn only.
        val_args = [batch_inputs, batch_targets, batch_mask]
        if use_voc_mask:
            val_args.append(full_voc_mask)

        val_out = val_fn(*val_args)
        if isinstance(val_out, (list, tuple)):
            # The softmax cells compile val_fn with outputs [loss, accuracy].
            val_err += val_out[0]
            val_acc += val_out[1]
            val_has_acc = True
        else:
            # The hierarchical-softmax cell returns the loss alone.
            val_err += val_out
        val_batches += 1

    # Then we print the results for this epoch:
    print("Epoch {} of {} took {:.3f}s".format(
        epoch + 1, num_epochs, time.time() - start_time))
    print("  batch preparation: {:.3f}s, training calls: {:.3f}s".format(
        time_wasted, training_time))
    print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
    print("  validation loss:\t\t{:.6f}".format(val_err / val_batches))
    if val_has_acc:
        print("  validation accuracy:\t\t{:.2f} %".format(
            val_acc / val_batches * 100))

Done 10 batches in 4.08 sec.    training loss:		5.42535262108
Done 20 batches in 8.16 sec.    training loss:		5.24459784031
Done 30 batches in 11.73 sec.    training loss:		5.18125980695
Done 40 batches in 15.66 sec.    training loss:		5.13404990435
Done 50 batches in 19.02 sec.    training loss:		5.09522445679
Done 60 batches in 23.27 sec.    training loss:		5.07574270566
Done 70 batches in 27.35 sec.    training loss:		5.06155565126
Done 80 batches in 31.64 sec.    training loss:		5.05331097841
Done 90 batches in 36.14 sec.    training loss:		5.04104766316


KeyboardInterrupt: 

In [14]:
# Store the parameter arrays of the network currently bound to `net`; savez
# names positional arrays arr_0, arr_1, ... which is what the loading cell reads.
# NOTE(review): `net` is whichever model cell ran last -- make sure it matches
# the "singleLSTM" file name and the network the parameters are loaded into.
np.savez('model_trained_singleLSTM.npz', *L.layers.get_all_param_values(net))

In [20]:
# Generation network: built without targets and without a mask, so its output
# is reshaped to (batch, seq_len, voc_size).
input_var = T.imatrix('inputs')
gen_net = build_hsoft_rnnlm(input_var,
                            target_var=None,
                            mask_input_var=None,
                            voc_size=voc_size,
                            emb_size=emb_size,
                            rec_size=rec_size)

# Keep only the last time step of every sequence.
probs = L.layers.get_output(gen_net)[:, -1]
get_probs = theano.function([input_var], probs)

In [21]:
# Read the arrays back in the order they were saved (arr_0, arr_1, ...) and
# copy them into the generation network.
with np.load('model_trained_singleLSTM.npz') as f:
    param_values = []
    for i in range(len(f.files)):
        param_values.append(f['arr_%d' % i])
    L.layers.set_all_param_values(gen_net, param_values)

In [23]:
def rnd_next_word(probs):
    """Sample a word index from the first row of ``probs``.

    Parameters
    ----------
    probs : 2-D array-like
        Rows of word probabilities; only ``probs[0]`` is used.

    Returns
    -------
    int
        An index in ``range(len(probs[0]))`` drawn according to ``probs[0]``.
    """
    # Low-precision network outputs may not sum to 1 closely enough for
    # np.random.choice, which then raises ValueError. Work in float64 and
    # renormalise so that rounding drift cannot abort generation.
    p = np.asarray(probs[0], dtype=np.float64)
    p = p / p.sum()
    return np.random.choice(np.arange(len(p)), p=p)

init_seq = ''   # optional words to start the utterance with
max_words = 50  # upper bound on the number of sampled words

# Start from <utt_beg> (index 1) followed by the encoded seed words; words
# missing from the vocabulary map to '<unk>'.
unk_idx = mt_w_to_i['<unk>']
utt = [1] + [mt_w_to_i.get(w, unk_idx) for w in init_seq.split()]
utt = np.asarray(utt, dtype=np.int32)[np.newaxis]

# Sample one word at a time until <utt_end> is drawn or the cap is reached.
i = 0
while mt_i_to_w[utt[0,-1]] != '<utt_end>' and i < max_words:
    word_probs = get_probs(utt)
    next_idx = rnd_next_word(word_probs)
    utt = np.append(utt, next_idx)[np.newaxis].astype(np.int32)
    i += 1

text = [mt_i_to_w[idx] for idx in utt[0]]
text = text[1:]  # drop the leading <utt_beg>
# Remove the terminator only when it is really there: when the word cap stops
# the loop, the last token is a regular word and must be kept.
if text and text[-1] == '<utt_end>':
    text = text[:-1]
' '.join(text)

"i never had one because i ate it -- this isn ' t me ? i don ' t know ."