In [1]:
import numpy as np
import theano
import theano.tensor as T
import time
import sys

import lasagne as L

sys.path.insert(0, '../HSoftmaxLayerLasagne/')

from HSoftmaxLayer import HierarchicalSoftmaxDenseLayer
from SampledSoftmaxLayer import SampledSoftmaxDenseLayer

Using gpu device 0: GeForce GTX 780 (CNMeM is enabled with initial size: 30.0% of memory, cuDNN 4007)
  "downsample module has been moved to the theano.tensor.signal.pool module.")


In [2]:
# Remember: the pad value is now the same index as the <utt_end> token.
# Index -1 addresses the last row of the embedding matrix, which is the row
# appended for <utt_end> when the embeddings are loaded.

pad_value = -1 # <utt_end>'s vector is the last one

In [3]:
mt_path = "/pio/data/data/mtriples/"

def load_mt(path=mt_path):
    """Load the training, validation and test triples.

    Parameters
    ----------
    path : str
        Directory prefix (including the trailing separator) that contains
        'Training.triples.pkl', 'Validation.triples.pkl' and
        'Test.triples.pkl'. Defaults to the module-level `mt_path`.

    Returns
    -------
    tuple
        (train, valid, test) exactly as returned by `np.load` for each file.
    """
    # The original body read from the global `mt_path`, so the `path`
    # argument was silently ignored; honour the argument instead.
    # NOTE: these are pickle files read through np.load -- only point this
    # at trusted data.
    tr = np.load(path + 'Training.triples.pkl')
    vl = np.load(path + 'Validation.triples.pkl')
    ts = np.load(path + 'Test.triples.pkl')

    return tr, vl, ts

# Read the three splits, then keep only the entries shorter than 200 tokens.
train, valid, test = load_mt()

train, valid, test = [
    [utt for utt in split if len(utt) < 200]
    for split in (train, valid, test)
]


def get_mt_voc(mt_path=mt_path, train_len=len(train)):
    """Build the vocabulary mappings and word frequencies.

    Reads 'Training.dict.pkl' from `mt_path`. Each entry is indexed as
    entry[0] (used as the word), entry[1] (used as its index and as the sort
    key) and entry[2] (used as its count).

    The '<utt_end>' token is added with index `pad_value`, and `train_len`
    is used as its count (appended last).

    Returns
    -------
    idx_to_w : dict
        Index -> word.
    w_to_idx : dict
        Word -> index.
    voc_size : int
        Number of entries in `w_to_idx` (including '<utt_end>').
    freqs : numpy array
        Counts divided by their total, ordered by word index with the
        '<utt_end>' entry last.
    """
    entries = np.load(mt_path + 'Training.dict.pkl')
    entries.sort(key=lambda entry: entry[1])

    counts = [entry[2] for entry in entries]
    counts.append(train_len)
    freqs = np.array(counts)
    total_count = float(sum(freqs))

    w_to_idx = dict([entry[:2] for entry in entries])
    w_to_idx['<utt_end>'] = pad_value
    idx_to_w = dict((idx, word) for (word, idx) in w_to_idx.items())

    return idx_to_w, w_to_idx, len(w_to_idx), freqs / total_count

idx_to_w, w_to_idx, voc_size, freqs = get_mt_voc()

In [4]:
# Load the word2vec embedding matrix together with its companion mask.
word2vec_embs, word2vec_embs_mask = np.load(mt_path + 'Word2Vec_WordEmb.pkl')

# Append one extra, Glorot-initialised row (reached through index -1, i.e.
# <utt_end>) and a matching all-ones row in the mask.
word2vec_embs = np.vstack(
    [word2vec_embs, L.init.GlorotUniform()((1, 300))]
).astype(np.float32)
word2vec_embs_mask = np.vstack(
    [word2vec_embs_mask, np.ones((1, 300))]
)

# Row indices whose first mask entry equals 1.
# NOTE(review): presumably these are the rows without a pre-trained vector
# that should stay trainable -- confirm against how the pickle was built.
w2v_train_mask = np.nonzero(word2vec_embs_mask[:, 0] == 1)[0]

In [5]:
# Similar to Lasagne mnist.py example, added input mask and different sequence lengths

def iterate_minibatches(inputs, batchsize, pad=pad_value):
    """Yield padded (input, target, mask) minibatches.

    `inputs` is sliced into consecutive chunks of `batchsize` sequences;
    trailing sequences that do not fill a whole batch are skipped. Each
    sequence must be a list, since padding is done by list concatenation.

    Yields
    ------
    inp : int32 array, (batchsize, longest sequence in the batch)
        Sequences right-padded with `pad`.
    tar : int32 array, same shape as `inp`
        `inp` shifted left by one position with a column of `pad` appended.
    mask : float32 array, same shape as `inp`
        1.0 where `inp` differs from `pad`, 0.0 elsewhere. There is no
        separate end-of-utterance value, so `pad` plays both roles.
    """
    stop = len(inputs) - batchsize + 1
    for start_idx in range(0, stop, batchsize):
        batch = inputs[start_idx:start_idx + batchsize]

        longest = max(len(seq) for seq in batch)
        padded = [seq + [pad] * (longest - len(seq)) for seq in batch]
        inp = np.asarray(padded, dtype=np.int32)

        pad_column = np.zeros((batchsize, 1), dtype=np.int32) + pad
        tar = np.hstack([inp[:, 1:], pad_column])

        mask = (inp != pad).astype(np.float32)

        yield inp, tar, mask

In [6]:
def build_simple_rnnlm(input_var, mask_input_var, voc_size, emb_size, rec_size, 
                       emb_init=None, train_emb=True):
    """Build a two-layer LSTM language model with a full softmax output.

    Parameters
    ----------
    input_var : Theano variable holding word indices, (batch, seq_len).
    mask_input_var : Theano variable or None; when given it is fed to both
        LSTM layers as their mask.
    voc_size : size of the embedding table and of the softmax layer.
    emb_size : embedding width.
    rec_size : number of units in each LSTM layer.
    emb_init : optional initial value for the embedding matrix.
    train_emb : when False (and `emb_init` is given) the embedding matrix
        is excluded from the trainable parameters.

    Returns
    -------
    Lasagne layer whose output is reshaped to (batch, seq_len, voc_size).
    """
    l_in = L.layers.InputLayer(shape=(None, None), input_var=input_var)
    batch_size, seq_len = l_in.input_var.shape

    if mask_input_var is None:
        l_mask = None
    else:
        print('setting up input mask...')
        l_mask = L.layers.InputLayer(shape=(batch_size, seq_len),
                                     input_var=mask_input_var)

    # input_size is voc_size, not voc_size+1, because pad_value = <utt_end>
    emb_kwargs = dict(input_size=voc_size, output_size=emb_size)
    if emb_init is not None:
        emb_kwargs['W'] = emb_init
    l_emb = L.layers.EmbeddingLayer(l_in, **emb_kwargs)
    if emb_init is not None and not train_emb:
        # freezing only applies when explicit initial embeddings were given
        l_emb.params[l_emb.W].remove('trainable')

    # two stacked LSTM layers sharing the same mask
    l_rec = l_emb
    for _ in range(2):
        l_rec = L.layers.LSTMLayer(l_rec,
                                   num_units=rec_size,
                                   nonlinearity=L.nonlinearities.tanh,
                                   grad_clipping=100,
                                   mask_input=l_mask)

    # flatten time into the batch axis for the dense softmax, then restore it
    l_flat = L.layers.ReshapeLayer(l_rec, shape=(-1, rec_size))
    l_soft = L.layers.DenseLayer(l_flat,
                                 num_units=voc_size,
                                 nonlinearity=L.nonlinearities.softmax)

    return L.layers.ReshapeLayer(l_soft, shape=(batch_size, seq_len, voc_size))

In [7]:
def build_hsoft_rnnlm(input_var, target_var, mask_input_var, voc_size, emb_size, rec_size):
    """Build a single-layer LSTM language model with a hierarchical softmax.

    Parameters
    ----------
    input_var : Theano variable holding word indices, (batch, seq_len).
    target_var : Theano variable with target indices or None. When given,
        it is reshaped to a column and handed to the hierarchical softmax
        layer, and the network output is reshaped to (batch, seq_len).
        When None the output is reshaped to (batch, seq_len, voc_size).
    mask_input_var : Theano variable or None; LSTM mask.
    voc_size, emb_size, rec_size : vocabulary size, embedding width and
        number of LSTM units.

    Returns
    -------
    The final Lasagne ReshapeLayer.
    """
    l_in = L.layers.InputLayer(shape=(None, None), input_var=input_var)
    batch_size, seq_len = l_in.input_var.shape

    if mask_input_var is None:
        l_mask = None
    else:
        print('setting up input mask...')
        l_mask = L.layers.InputLayer(shape=(batch_size, seq_len),
                                     input_var=mask_input_var)

    # NOTE(review): this builder still uses voc_size+1 rows, whereas the other
    # two builders use voc_size ("pad_value = <utt_end>") -- confirm whether
    # the extra row is intended here.
    l_emb = L.layers.EmbeddingLayer(l_in,
                                    input_size=voc_size+1,
                                    output_size=emb_size)

    l_lstm1 = L.layers.LSTMLayer(l_emb,
                                 num_units=rec_size,
                                 nonlinearity=L.nonlinearities.tanh,
                                 grad_clipping=100,
                                 mask_input=l_mask)

    # Only one recurrent layer is active in this variant; the second one is
    # kept commented out:
#     l_lstm2 = L.layers.LSTMLayer(l_lstm1,
#                                  num_units=rec_size,
#                                  nonlinearity=L.nonlinearities.tanh,
#                                  grad_clipping=100,
#                                  mask_input=l_mask)

    l_resh = L.layers.ReshapeLayer(l_lstm1, shape=(-1, rec_size))

    # hierarchical softmax
    if target_var is None:
        l_resh_tar = None
        out_shape = (batch_size, seq_len, voc_size)
    else:
        print('setting up targets for hsoftmax...')
        l_tar = L.layers.InputLayer(shape=(None, None), input_var=target_var)
        l_resh_tar = L.layers.ReshapeLayer(l_tar, shape=(-1, 1))
        out_shape = (batch_size, seq_len)

    l_hsoft = HierarchicalSoftmaxDenseLayer(l_resh,
                                            num_units=voc_size,
                                            target=l_resh_tar)

    return L.layers.ReshapeLayer(l_hsoft, shape=out_shape)

# 1 epoch on gpu with hsoft took about 700s, batch_size = 50

In [8]:
def build_sampledsoft_rnnlm(input_var, mask_input_var, num_sampled, voc_size, 
                            emb_size, rec_size, target_var=None, emb_init=None, 
                            train_emb=True, ssoft_probs=None, sample_unique=False):
    """Build a two-layer LSTM language model with a sampled softmax output.

    Parameters
    ----------
    input_var : Theano variable holding word indices, (batch, seq_len).
    mask_input_var : Theano variable or None; when given it is fed to both
        LSTM layers as their mask.
    num_sampled : forwarded positionally to SampledSoftmaxDenseLayer.
    voc_size : size of the embedding table; also forwarded to the sampled
        softmax layer.
    emb_size : embedding width.
    rec_size : number of units in each LSTM layer.
    target_var : Theano variable with target indices or None. When given it
        is flattened and passed to the sampled softmax layer as `targets`,
        and the network output is reshaped to (batch, seq_len). When None
        the output is reshaped to (batch, seq_len, voc_size).
    emb_init : optional initial value for the embedding matrix.
    train_emb : when False (and `emb_init` is given) the embedding matrix
        is excluded from the trainable parameters.
    ssoft_probs : forwarded to the sampled softmax layer as `probs`.
    sample_unique : forwarded to the sampled softmax layer.

    Returns
    -------
    The final Lasagne ReshapeLayer.
    """
    l_in = L.layers.InputLayer(shape=(None, None), input_var=input_var)
    batch_size, seq_len = l_in.input_var.shape

    l_mask = None
    # Identity test, like the sibling builders: `!= None` relies on the
    # operand's comparison operator and breaks for array-like masks.
    if mask_input_var is not None:
        print('setting up input mask...')
        l_mask = L.layers.InputLayer(shape=(batch_size, seq_len), input_var=mask_input_var)

    if emb_init is None:
        l_emb = L.layers.EmbeddingLayer(l_in,
                                        input_size=voc_size, # not voc_size+1, because pad_value = <utt_end>
                                        output_size=emb_size)
    else:
        l_emb = L.layers.EmbeddingLayer(l_in,
                                        input_size=voc_size,
                                        output_size=emb_size,
                                        W=emb_init)
        if not train_emb:
            # drop the tag so get_all_params(trainable=True) skips the matrix
            l_emb.params[l_emb.W].remove('trainable')

    l_lstm1 = L.layers.LSTMLayer(l_emb,
                                 num_units=rec_size,
                                 nonlinearity=L.nonlinearities.tanh,
                                 grad_clipping=100,
                                 mask_input=l_mask)

    l_lstm2 = L.layers.LSTMLayer(l_lstm1,
                                 num_units=rec_size,
                                 nonlinearity=L.nonlinearities.tanh,
                                 grad_clipping=100,
                                 mask_input=l_mask)

    # flatten time into the batch axis before the output layer
    l_resh = L.layers.ReshapeLayer(l_lstm2, shape=(-1, rec_size))

    if target_var is not None:
        print('setting up targets for sampled softmax...')
        target_var = target_var.ravel()

    l_ssoft = SampledSoftmaxDenseLayer(l_resh, num_sampled, voc_size,
                                       targets=target_var,
                                       probs=ssoft_probs,
                                       sample_unique=sample_unique)

    if target_var is not None:
        l_out = L.layers.ReshapeLayer(l_ssoft, shape=(batch_size, seq_len))
    else:
        l_out = L.layers.ReshapeLayer(l_ssoft, shape=(batch_size, seq_len, voc_size))

    return l_out

In [9]:
# Sizes handed to the network builders: embedding width (EmbeddingLayer
# output_size) and number of units per LSTM layer.
emb_size = 300
rec_size = 300

In [10]:
def clone_param_values(net_from, net_to):
    """Copy all parameter values of `net_from` into `net_to`."""
    source_values = L.layers.get_all_param_values(net_from)
    L.layers.set_all_param_values(net_to, source_values)

In [51]:
# full softmax test
#
# Builds the full-softmax language model with embeddings initialised from
# word2vec_embs and frozen (train_emb=False), then compiles train_fn / val_fn.
# NOTE: the sampled-softmax and hierarchical-softmax cells bind the same global
# names (net, train_fn, val_fn, ...); whichever cell ran last is what the
# training loop uses.

input_var = T.imatrix('inputs')
targets = T.imatrix('targets') # these will be inputs shifted by 1
mask_input_var = T.matrix('input_mask')

net = build_simple_rnnlm(input_var, mask_input_var, voc_size, emb_size, rec_size, 
                         emb_init=word2vec_embs, train_emb=False)
out = L.layers.get_output(net)

# Only positions where the mask is non-zero take part in the loss.
mask_idx = mask_input_var.nonzero()
loss = L.objectives.categorical_crossentropy(out[mask_idx], targets[mask_idx])
loss = loss.mean() # mean over the unmasked positions of the batch

params = L.layers.get_all_params(net, trainable=True)
updates = L.updates.adagrad(loss, params, learning_rate=.01)

# update modification to train only randomly initialized embeddings
# updates[params[0]] = T.set_subtensor(params[0][w2v_train_mask], updates[params[0]][w2v_train_mask])

train_fn = theano.function([input_var, targets, mask_input_var], loss, updates=updates)

### for validation

# Same loss on the deterministic output; compiled without updates.
test_out = L.layers.get_output(net, deterministic=True)
test_loss = L.objectives.categorical_crossentropy(test_out[mask_idx], targets[mask_idx])
test_loss = test_loss.mean()
# test_acc = T.mean(T.eq(T.argmax(test_out, axis=1), targets), dtype=theano.config.floatX)

val_fn = theano.function([input_var, targets, mask_input_var], test_loss)

setting up input mask...


In [11]:
# sampled softmax test (with targets!)
#
# Builds the sampled-softmax language model (embeddings initialised from
# word2vec_embs, sampling probabilities taken from freqs) and compiles
# train_fn / val_fn. Rebinds the same globals as the other two setup cells.

num_sampled = 200

input_var = T.imatrix('inputs')
targets = T.imatrix('targets') # these will be inputs shifted by 1
mask_input_var = T.matrix('input_mask')

net = build_sampledsoft_rnnlm(input_var, mask_input_var, num_sampled, voc_size, 
                              emb_size, rec_size, target_var=targets, emb_init=word2vec_embs,
                              ssoft_probs=freqs)
# With targets supplied the network output is reshaped to (batch, seq_len).
# NOTE(review): presumably one probability per position for the target word -- confirm
# against SampledSoftmaxDenseLayer.
out = L.layers.get_output(net)

# Negative log of the output, summed over unmasked positions and divided by
# the mask total.
mask_idx = mask_input_var.nonzero()
loss = -T.sum(T.log(out[mask_idx])) / T.sum(mask_input_var)

params = L.layers.get_all_params(net, trainable=True)
updates = L.updates.adagrad(loss, params, learning_rate=.01)
# updates = L.updates.rmsprop(loss, params, learning_rate=.001, rho=.9, epsilon=1e-06)

train_fn = theano.function([input_var, targets, mask_input_var], loss, updates=updates)

### for validation

# use_all_words=True is forwarded to the layers by get_output.
# NOTE(review): presumably it makes the sampled softmax layer use the whole
# vocabulary instead of a sample -- TODO confirm.
test_out = L.layers.get_output(net, deterministic=True, use_all_words=True)
test_loss = -T.sum(T.log(test_out[mask_idx])) / T.sum(mask_input_var)

val_fn = theano.function([input_var, targets, mask_input_var], test_loss)

setting up input mask...
setting up targets for sampled softmax...


In [20]:
# hierarchical softmax test
#
# Builds the hierarchical-softmax language model (randomly initialised
# embeddings, single LSTM layer) and compiles train_fn / val_fn. Rebinds the
# same globals as the other two setup cells.

input_var = T.imatrix('inputs')
targets = T.imatrix('targets') # these will be inputs shifted by 1
mask_input_var = T.matrix('input_mask')

# With targets supplied the network output is reshaped to (batch, seq_len).
net = build_hsoft_rnnlm(input_var, targets, mask_input_var, voc_size, emb_size, rec_size)
out = L.layers.get_output(net)

# Negative log of the output, summed over unmasked positions and divided by
# the mask total.
mask_idx = mask_input_var.nonzero()
loss = -T.sum(T.log(out[mask_idx])) / T.sum(mask_input_var)

params = L.layers.get_all_params(net, trainable=True)
#updates = L.updates.rmsprop(loss, params, learning_rate=.001, rho=.9, epsilon=1e-06)
updates = L.updates.adagrad(loss, params, learning_rate=.01)

train_fn = theano.function([input_var, targets, mask_input_var], loss, updates=updates)

#### for validation

# Same loss on the deterministic output; compiled without updates.
test_out = L.layers.get_output(net, deterministic=True)
test_loss = -T.sum(T.log(test_out[mask_idx])) / T.sum(mask_input_var)

#test_acc = T.mean(T.eq(T.argmax(test_out, axis=1), targets), dtype=theano.config.floatX)

val_fn = theano.function([input_var, targets, mask_input_var], test_loss)

setting up input mask...
setting up targets for hsoftmax...
setting up input mask...
setting up targets for hsoftmax...


In [12]:
# training, taken from mnist.py in lasagne examples
#
# Runs `num_epochs` passes over `train` with train_fn, followed each time by a
# pass over `valid` with val_fn, and finally saves all parameter values of
# `net` to an .npz file.

num_epochs = 5
batch_size = 50
val_batch_size = 25

for epoch in range(num_epochs):
    # In each epoch, we do a full pass over the training data:
    train_err = 0
    train_batches = 0
    start_time = time.time()

    # The batch arrays get their own names: unpacking into `targets` (as the
    # original did) overwrote the symbolic Theano `targets` variable defined
    # by the setup cells with a numpy array.
    for batch_inputs, batch_targets, batch_mask in iterate_minibatches(train, batch_size):
        batch_err = train_fn(batch_inputs, batch_targets, batch_mask)

        train_err += batch_err
        train_batches += 1

        # running average of the training loss, reported every 10 batches
        if not train_batches % 10:
            print("Done {} batches in {:.2f} sec.    training loss:\t\t{}".format(
                train_batches, time.time() - start_time, train_err / train_batches))

    # And a full pass over the validation data:
    val_err = 0
    val_batches = 0
    start_time2 = time.time()

    for batch_inputs, batch_targets, batch_mask in iterate_minibatches(valid, val_batch_size):
        err = val_fn(batch_inputs, batch_targets, batch_mask)
        val_err += err
        val_batches += 1
        if not val_batches % 100:
            print("Done {} batches in {:.2f} sec.".format(
                val_batches, time.time() - start_time2))

    # Then we print the results for this epoch:
    print("Epoch {} of {} took {:.3f}s".format(
        epoch + 1, num_epochs, time.time() - start_time))
    print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
    print("  validation loss:\t\t{:.6f}".format(val_err / val_batches))

np.savez('5ep_w2vInit_300_300_ssoft(uni,200,non-unique)_bs50_cut200.npz', *L.layers.get_all_param_values(net))

Done 10 batches in 2.45 sec.    training loss:		7.44353927834
Done 20 batches in 4.93 sec.    training loss:		6.66934120304
Done 30 batches in 7.24 sec.    training loss:		6.36463218175
Done 40 batches in 9.64 sec.    training loss:		6.17626197709
Done 50 batches in 12.34 sec.    training loss:		6.04152103334
Done 60 batches in 14.69 sec.    training loss:		5.9128881931
Done 70 batches in 17.10 sec.    training loss:		5.82142742549
Done 80 batches in 19.44 sec.    training loss:		5.73142549026
Done 90 batches in 22.07 sec.    training loss:		5.64433819338
Done 100 batches in 24.37 sec.    training loss:		5.56514480491
Done 110 batches in 26.87 sec.    training loss:		5.49900743206
Done 120 batches in 29.45 sec.    training loss:		5.43617925437
Done 130 batches in 31.88 sec.    training loss:		5.37734211024
Done 140 batches in 34.38 sec.    training loss:		5.32326670707
Done 150 batches in 36.63 sec.    training loss:		5.27438875182
Done 160 batches in 38.97 sec.    training loss:		5.22

In [None]:
# that was the test: non-unique, minusQ, unigram, num_sampled=200

In [54]:
# np.savez('5ep_w2vInit_300_300_ssoft(uni,200,non-unique)_bs50_cut200.npz', *L.layers.get_all_param_values(net))

In [49]:
# Restore parameter values saved with np.savez (stored under the keys
# arr_0, arr_1, ...) and assign them to `test_net`.
# NOTE(review): `test_net` is not defined in any cell visible in this notebook,
# so on a fresh kernel this cell raises NameError -- confirm which network
# was meant.
with np.load('1ep_w2vInit_300_300_fullsoft_bs25_cut200.npz') as f:
    param_values = [f['arr_%d' % i] for i in range(len(f.files))]
    L.layers.set_all_param_values(test_net, param_values)

In [33]:
# Generation network, hierarchical-softmax variant: built without targets and
# without a mask, so its output is reshaped to (batch, seq_len, voc_size).
# get_probs returns the slice at the last time step of every sequence.
input_var = T.imatrix('inputs')
gen_net = build_hsoft_rnnlm(input_var, None, None, voc_size, emb_size, rec_size)
probs = L.layers.get_output(gen_net)[:,-1,:]
get_probs = theano.function([input_var], probs)

In [13]:
# Generation network, sampled-softmax variant: built without targets and
# without a mask, so its output is reshaped to (batch, seq_len, voc_size).
# num_sampled is passed as -1.
# NOTE(review): presumably num_sampled is unused when no targets are given -- TODO confirm.
# get_probs returns the slice at the last time step of every sequence.
input_var = T.imatrix('inputs')
gen_net = build_sampledsoft_rnnlm(input_var, None, -1, voc_size, emb_size, rec_size)
probs = L.layers.get_output(gen_net)[:,-1,:]
get_probs = theano.function([input_var], probs)

In [43]:
# Generation network, full-softmax variant: built without a mask and with
# default (non-word2vec) embedding initialisation.
# get_probs returns the slice at the last time step of every sequence.
input_var = T.imatrix('inputs')
gen_net = build_simple_rnnlm(input_var, None, voc_size, emb_size, rec_size)
probs = L.layers.get_output(gen_net)[:,-1,:]
get_probs = theano.function([input_var], probs)

In [14]:
# Copy the parameter values of the trained `net` into `gen_net`.
clone_param_values(net_from=net, net_to=gen_net)

In [12]:
# Restore parameter values saved with np.savez (stored under the keys
# arr_0, arr_1, ...) and assign them to `gen_net`.
with np.load('1ep_w2vPseudoFixed_300_300_fullsoft_bs25_cut200.npz') as f:
    param_values = [f['arr_%d' % i] for i in range(len(f.files))]
    L.layers.set_all_param_values(gen_net, param_values)

In [107]:
def rnd_next_word(probs, size=1):
    """Sample `size` word indices according to `probs`.

    The candidate indices are 0 .. len(probs)-2 followed by -1, so the last
    entry of `probs` is the probability of drawing -1.

    Returns an int32 array of length `size`.
    """
    n_regular = probs.shape[0] - 1
    word_ids = np.concatenate([np.arange(n_regular), [-1]]).astype(np.int32)
    return np.random.choice(word_ids, size=size, p=probs)

def beam_search(get_probs_fun, beam=10, init_seq='', mode='rr'):
    """Generate one utterance by expanding `beam` candidates in parallel.

    Parameters
    ----------
    get_probs_fun : callable
        Takes an int32 array of shape (n_sequences, length) and returns an
        array of shape (n_sequences, voc_size) with next-word probabilities.
    beam : int
        Number of candidates kept after every step.
    init_seq : str
        Whitespace-separated words to start from. Unknown words map to
        '<unk>'; index 1 is prepended when the sequence does not already
        start with it.
    mode : str
        Two characters. mode[0] controls the first expansion and mode[1]
        every following one: 's' keeps the `beam` highest-scoring
        continuations, 'r' draws continuations with `rnd_next_word`.

    Returns
    -------
    1-D int32 array: the first candidate that ends with `pad_value`, or, if
    none does (length limit of 100 reached), the best-scoring candidate.

    Raises
    ------
    ValueError
        If `mode` does not start with two characters from 's' / 'r'.
    """
    # An unknown mode used to surface as a NameError (first character) or as
    # an endless loop (second character); reject it up front.
    if len(mode) < 2 or mode[0] not in ('s', 'r') or mode[1] not in ('s', 'r'):
        raise ValueError("mode must consist of two characters, each 's' or 'r', "
                         "got %r" % (mode,))

    utt = [w_to_idx.get(w, w_to_idx['<unk>']) for w in init_seq.split()]
    if len(utt) == 0 or utt[0] != 1:
        utt = [1] + utt
    utt = np.asarray(utt, dtype=np.int32)[np.newaxis]

    first_probs = get_probs_fun(utt)[0]
    if mode[0] == 's':
        words = first_probs.argpartition(-beam)[-beam:].astype(np.int32)
        # the last probability column belongs to the token stored as pad_value
        words[words == voc_size - 1] = pad_value
    else:
        words = rnd_next_word(first_probs, beam)

    candidates = utt.repeat(beam, axis=0)
    candidates = np.hstack([candidates, words[np.newaxis].T])
    scores = np.zeros(beam)

    # Expand until some candidate emits pad_value or the length limit is hit.
    while candidates.shape[1] < 100 and pad_value not in candidates[:, -1]:

        if mode[1] == 's':
            log_probs = np.log(get_probs_fun(candidates))
            tot_scores = log_probs + scores[np.newaxis].T

            # best `beam` (candidate, word) pairs over the flattened scores
            idx = tot_scores.ravel().argpartition(-beam)[-beam:]
            i, j = divmod(idx, tot_scores.shape[1])
            j[j == voc_size - 1] = pad_value

            # j == -1 still addresses the last column, so this lookup is valid
            scores = tot_scores[i, j]

            candidates = np.hstack([candidates[i], j[np.newaxis].T.astype(np.int32)])

        else:
            probs = get_probs_fun(candidates)
            # `beam` sampled continuations per candidate
            # (this doesn't have to be exactly 'beam')
            words = np.array([rnd_next_word(probs[k], beam) for k in range(beam)])
            row_idx = np.indices((beam, words.shape[1]))[0]
            tot_scores = scores[np.newaxis].T + np.log(probs)[row_idx, words]

            idx = tot_scores.ravel().argpartition(-beam)[-beam:]
            i, j = divmod(idx, tot_scores.shape[1])

            scores = tot_scores[i, j]

            candidates = np.hstack([candidates[i], words[i, j][np.newaxis].T])

    # The loop stops on pad_value, so finished candidates are those ending
    # with pad_value (the original compared against a literal 0).
    finished = candidates[candidates[:, -1] == pad_value]
    if finished.size > 0:
        return finished[0]
    return candidates[scores.argmax()]

In [88]:
# Sample one utterance word by word from get_probs, starting from init_seq
# (index 1 is always prepended), until the end token is drawn or 100 words
# have been generated. The last expression is the cell's displayed output.
init_seq = ''
utt = [1] + [w_to_idx.get(w, w_to_idx['<unk>']) for w in init_seq.split()]
utt = np.asarray(utt, dtype=np.int32)[np.newaxis]

i = 0
# compare against pad_value rather than a hard-coded -1
while utt[0,-1] != pad_value and i < 100:
    word_probs = get_probs(utt)[0]
    next_idx = rnd_next_word(word_probs)
    utt = np.append(utt, next_idx)[np.newaxis].astype(np.int32)
    i += 1

text = [idx_to_w[idx] for idx in utt[0]]
' '.join([t for t in text if t not in ['<s>', '</s>', '<utt_end>']])

"mine is court , it ' s nothing . but i ' m not a full . and you know it really . let me tell you something . why not ?"