In [1]:
import numpy as np
import theano
import theano.tensor as T
import time

import lasagne as L

Vendor:  Continuum Analytics, Inc.
Package: mkl
Message: trial mode expires in 30 days


In [2]:
# Directory holding the vocabulary file and the three dataset splits.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
mt_path = "/pio/data/data/mtriples/"

# Special tokens that are not part of WordsList.txt.
beg_token = '<utt_beg>'
end_token = '<utt_end>'

def get_mt_voc(path=mt_path):
    """Build the vocabulary lookup tables from ``WordsList.txt``.

    Indices 0 and 1 are reserved for the utterance-end and utterance-begin
    tokens; the words of the file (one per line) are numbered from 2 upwards
    in file order.

    Parameters
    ----------
    path : str
        Directory prefix (including the trailing separator) that is
        concatenated with ``"WordsList.txt"``.

    Returns
    -------
    (dict, dict, int)
        ``i_to_w`` (index -> word), ``w_to_i`` (word -> index) and the total
        vocabulary size, counting the two reserved tokens.
    """
    # <utt_end> serves only as a target for the last word in the input
    # sequence; <utt_beg> will always be the first generated word.
    i_to_w = {0: end_token, 1: beg_token}
    w_to_i = {end_token: 0, beg_token: 1}
    wc = 2

    with open(path + "WordsList.txt", "r") as wl:
        for line in wl:
            # Strip only the line terminator.  Slicing off the last character
            # unconditionally would truncate the final word when the file
            # does not end with a newline.
            word = line.rstrip('\r\n')
            i_to_w[wc] = word
            w_to_i[word] = wc
            wc += 1

    return i_to_w, w_to_i, wc


# Number of utterances per minibatch; passed to iterate_minibatches by the training loop.
mt_batch_size = 50
# Vocabulary lookups (index -> word, word -> index) and the vocabulary size.
# Calling get_mt_voc() here reads WordsList.txt from the default path when the cell runs.
mt_i_to_w, mt_w_to_i, mt_voc_size = get_mt_voc()

def load_mt(path=mt_path):
    """Load the training, validation and test splits as lists of index lists.

    Every line of a split file is split on whitespace, each word is mapped to
    its vocabulary index (falling back to the index of ``'<unk>'`` for unknown
    words), and the result is wrapped as ``[1] + indices + [0]`` -- 1 and 0
    being the indices of the utterance-begin and utterance-end tokens.

    Parameters
    ----------
    path : str
        Directory prefix (including the trailing separator) that is
        concatenated with each dataset file name.

    Returns
    -------
    (list, list, list)
        Training, validation and test utterances.  Within each split the
        utterances appear in reverse file order.

    Raises
    ------
    KeyError
        If ``'<unk>'`` is missing from ``mt_w_to_i``.
    """
    # Resolve the fallback index once instead of on every word.
    unk_index = mt_w_to_i['<unk>']

    def _read_split(file_name):
        # Encode one split file; one utterance per line.
        with open(path + file_name) as f:
            utterances = [
                [1] + [mt_w_to_i.get(w, unk_index) for w in line.split()] + [0]
                for line in f
            ]
        # The original code prepended each utterance with insert(0, ...),
        # which is quadratic; a single reverse yields the same ordering.
        utterances.reverse()
        return utterances

    tr = _read_split("Training_Shuffled_Dataset.txt")
    vl = _read_split("Validation_Shuffled_Dataset.txt")
    ts = _read_split("Test_Shuffled_Dataset.txt")

    return tr, vl, ts

# Load all three splits when the cell runs; each is a list of token-index lists.
mt_train, mt_val, mt_test = load_mt()

In [3]:
# Similar to Lasagne mnist.py example, added input mask and different sequence lengths

def iterate_minibatches(inputs, batchsize, shuffle=False):
    """Yield padded minibatches of token sequences with targets and a mask.

    Parameters
    ----------
    inputs : sequence of sequences of int
        Token-index sequences of possibly different lengths.
    batchsize : int
        Number of sequences per batch.  A trailing group of fewer than
        ``batchsize`` sequences is dropped.
    shuffle : bool
        If true, the sequences are visited in a random order drawn with
        ``np.random.shuffle``.

    Yields
    ------
    (inp, tar, mask, elapsed)
        ``inp``  -- int32 array ``(batchsize, max_len)``, right-padded with -1.
        ``tar``  -- int32 array of the same shape: ``inp`` shifted left by one
        position, with -1 in the last column.
        ``mask`` -- float32 array of the same shape, 1.0 where ``inp > 0`` and
        0.0 elsewhere (padding and index 0).
        ``elapsed`` -- seconds spent preparing this batch.
    """
    if shuffle:
        indices = np.arange(len(inputs))
        np.random.shuffle(indices)
    for start_idx in range(0, len(inputs) - batchsize + 1, batchsize):
        t0 = time.time() # time wasted preparing data, just for the info
        if shuffle:
            # A plain Python list cannot be indexed with an index array, so
            # gather the selected sequences one by one.
            batch = [inputs[i] for i in indices[start_idx:start_idx + batchsize]]
        else:
            batch = inputs[start_idx:start_idx + batchsize]

        # Right-pad every sequence with -1 up to the longest one in the batch.
        inp_max_len = max(len(seq) for seq in batch)
        padded = [list(seq) + [-1] * (inp_max_len - len(seq)) for seq in batch]
        inp = np.asarray(padded, dtype=np.int32)

        # Targets are the inputs shifted by one step; the last step has no target.
        no_target = np.full((batchsize, 1), -1, dtype=np.int32)
        tar = np.hstack((inp[:, 1:], no_target))

        # 0 in vocabulary represents <utt_end>, we don't feed that into the net
        mask = (inp > 0).astype(np.float32)

        yield inp, tar, mask, (time.time() - t0)

In [4]:
def build_simple_rnnlm(input_var, mask_input_var, voc_size, emb_size, rec_size):
    """Assemble an embedding -> recurrent -> softmax Lasagne network.

    Parameters
    ----------
    input_var : Theano variable bound to the token-index input layer.
    mask_input_var : Theano variable bound to the mask input layer that is
        handed to the recurrent layer as ``mask_input``.
    voc_size : int
        Size of the embedding table and of the softmax output.
    emb_size : int
        Embedding dimensionality.
    rec_size : int
        Number of recurrent units.

    Returns
    -------
    The final Lasagne layer, whose output is reshaped to
    ``(batch_size, seq_len, voc_size)``.
    """
    tokens = L.layers.InputLayer(shape=(None, None), input_var=input_var)
    # Symbolic batch and sequence dimensions, reused for the mask layer and
    # for restoring the 3-D shape of the output.
    n_batch, n_steps = tokens.input_var.shape
    token_mask = L.layers.InputLayer(shape=(n_batch, n_steps),
                                     input_var=mask_input_var)

    embedded = L.layers.EmbeddingLayer(tokens,
                                       input_size=voc_size,
                                       output_size=emb_size)

    recurrent = L.layers.RecurrentLayer(embedded,
                                        num_units=rec_size,
                                        W_in_to_hid=L.init.Orthogonal(),
                                        W_hid_to_hid=L.init.Orthogonal(),
                                        mask_input=token_mask)

    # Collapse batch and time into one axis so that a single dense softmax
    # layer is applied to every time step.
    flat_hidden = L.layers.ReshapeLayer(recurrent, shape=(-1, rec_size))
    flat_probs = L.layers.DenseLayer(flat_hidden,
                                     num_units=voc_size,
                                     nonlinearity=L.nonlinearities.softmax)

    return L.layers.ReshapeLayer(flat_probs, shape=(n_batch, n_steps, voc_size))

In [9]:
voc_size = mt_voc_size
emb_size = 50
rec_size = 100

input_var = T.imatrix('inputs')
targets = T.imatrix('targets') # these will be inputs shifted by 1
mask_input_var = T.matrix('input_mask')

net = build_simple_rnnlm(input_var, mask_input_var, voc_size, emb_size, rec_size)
out = L.layers.get_output(net)

# iterate_minibatches pads the targets with -1, and the input mask is 1.0
# exactly at the positions that have a real target.  Averaging over every
# position would score the padding too (index -1 selects the last vocabulary
# word), so the loss and accuracy are weighted by the mask instead.
flat_mask = mask_input_var.ravel()
flat_targets = T.maximum(targets, 0).ravel()  # keep padded indices in range
n_valid = T.maximum(flat_mask.sum(), 1)       # avoid 0/0 on a fully masked batch

loss = L.objectives.categorical_crossentropy(out.reshape((-1, voc_size)), flat_targets)
loss = (loss * flat_mask).sum() / n_valid # mean loss per real (unmasked) token

params = L.layers.get_all_params(net, trainable=True)
updates = L.updates.rmsprop(loss, params)

train_fn = theano.function([input_var, targets, mask_input_var], loss, updates=updates)

#### for validation

test_out = L.layers.get_output(net, deterministic=True)
test_loss = L.objectives.categorical_crossentropy(test_out.reshape((-1, voc_size)), flat_targets)
test_loss = (test_loss * flat_mask).sum() / n_valid
# The network output is (batch, seq, voc): the predicted word is the argmax
# over the last axis, not over the time axis.
test_correct = T.eq(T.argmax(test_out, axis=2), targets)
test_acc = (test_correct * mask_input_var).sum() / n_valid

val_fn = theano.function([input_var, targets, mask_input_var], [test_loss, test_acc])

In [10]:
# training, taken from mnist.py in lasagne examples

num_epochs = 50

for epoch in range(num_epochs):
    # In each epoch, we do a full pass over the training data:
    train_err = 0
    train_batches = 0
    start_time = time.time()

    time_wasted = 0    # seconds spent preparing batches
    training_time = 0  # seconds spent inside train_fn

    # Batch arrays get their own names so that they do not overwrite the
    # symbolic `targets` variable used to build the Theano functions.
    for batch_inputs, batch_targets, batch_mask, prep_time in iterate_minibatches(mt_train, mt_batch_size):
        batch_training_time = time.time()
        train_err += train_fn(batch_inputs, batch_targets, batch_mask)
        training_time += time.time() - batch_training_time
        train_batches += 1

        time_wasted += prep_time
        if not train_batches % 5:
            # .format() is applied to the string, not to the result of print,
            # so this works both as a print statement and as a print function.
            print("Done {} batches in {} sec. Time wasted: {} sec. Training time: {} sec.".format(
                train_batches, time.time() - start_time, time_wasted, training_time))

    # And a full pass over the validation data:
    val_err = 0
    val_acc = 0
    val_batches = 0
    # iterate_minibatches yields four items; the preparation time is unused here.
    for batch_inputs, batch_targets, batch_mask, _prep_time in iterate_minibatches(mt_val, mt_batch_size):
        err, acc = val_fn(batch_inputs, batch_targets, batch_mask)
        val_err += err
        val_acc += acc
        val_batches += 1

    # Then we print the results for this epoch (guarding against a split that
    # is smaller than one batch and therefore produced no batches):
    print("Epoch {} of {} took {:.3f}s".format(
        epoch + 1, num_epochs, time.time() - start_time))
    print("  training loss:\t\t{:.6f}".format(train_err / max(train_batches, 1)))
    print("  validation loss:\t\t{:.6f}".format(val_err / max(val_batches, 1)))
    print("  validation accuracy:\t\t{:.2f} %".format(
        val_acc / max(val_batches, 1) * 100))

Done 5 batches in 9.14288902283 sec. Time wasted: 0.00947022438049 sec. Training time: 9.13328886032 sec.
Done 10 batches in 18.2482941151 sec. Time wasted: 0.0194792747498 sec. Training time: 18.2285096645 sec.
Done 15 batches in 25.5649681091 sec. Time wasted: 0.0280480384827 sec. Training time: 25.536460638 sec.


KeyboardInterrupt: 