In [1]:
import numpy as np
import theano
import theano.tensor as T
import time
from itertools import chain

import lasagne as L

from SimpleRNNLM import SimpleRNNLM, iterate_minibatches
from mt_load import load_mt, get_mt_voc, get_w2v_embs

Using gpu device 0: GeForce GTX 970 (CNMeM is enabled with initial size: 30.0% of memory, cuDNN 5005)


In [2]:
# Reminder: padding and the <utt_end> token now share a single index, so a
# padded position and an end-of-utterance position look the same.

pad_value = -1 # index -1 selects the last vector, which is <utt_end>'s

In [3]:
# Root directory of the MovieTriples dataset.
# Alternative dataset location, kept for reference (it used to be assigned
# first and was immediately overwritten): "/pio/data/data/mtriples/"
mt_path = "/home/maciek/Desktop/mgr/DATA/MovieTriples_Dataset/"

# Load the three dataset splits from mt_path.
# NOTE(review): the meaning of split=False / trim=200 is defined in mt_load -- confirm there.
train, valid, test = load_mt(path=mt_path, split=False, trim=200)

# Vocabulary lookups in both directions, the vocabulary size and `freqs`
# (passed to the model below as `ssoft_probs`).
idx_to_w, w_to_idx, voc_size, freqs = get_mt_voc(path=mt_path, train_len=len(train))

In [4]:
# Embedding matrix plus its companion mask, both read from the dataset directory.
word2vec_embs, word2vec_embs_mask = get_w2v_embs(mt_path)

# Row indices whose first mask column equals 1.
w2v_train_mask = np.nonzero(word2vec_embs_mask[:, 0] == 1)[0]

In [5]:
def update_fn(loss, params):
    """Return Lasagne Adagrad updates for `params` w.r.t. `loss` (learning rate 0.01)."""
    adagrad_updates = L.updates.adagrad(loss, params, learning_rate=.01)
    return adagrad_updates

net = SimpleRNNLM(
    # vocabulary and layer sizes
    voc_size=voc_size,
    emb_size=300,
    rec_size=300,
    # embeddings start from the word2vec matrix loaded earlier
    emb_init=word2vec_embs,
    # output-layer settings; `freqs` comes from get_mt_voc
    # NOTE(review): 'ssoft' presumably selects a sampled softmax -- confirm in SimpleRNNLM
    mode='ssoft',
    num_sampled=200,
    ssoft_probs=freqs,
)

Building the model...
Compiling theano functions...
Building a network for generation...
Done


In [6]:
# One training epoch; nothing is written to disk (no path, save_params off).
net.train_model(
    train_data=train,
    train_batch_size=50,
    val_data=valid,
    val_batch_size=25,
    num_epochs=1,
    path=None,
    save_params=False,
)

Done 10 batches in 2.39s	training loss:	7.032292
Done 20 batches in 4.65s	training loss:	6.403765
Done 30 batches in 6.76s	training loss:	6.178933
Done 40 batches in 8.95s	training loss:	5.998504
Done 50 batches in 11.42s	training loss:	5.856934
Done 60 batches in 13.57s	training loss:	5.741491
Done 70 batches in 15.78s	training loss:	5.646826
Done 80 batches in 17.95s	training loss:	5.562671
Done 90 batches in 20.47s	training loss:	5.482072
Done 100 batches in 22.62s	training loss:	5.412747
Done 110 batches in 24.92s	training loss:	5.348361
Done 120 batches in 27.33s	training loss:	5.295554
Done 130 batches in 29.56s	training loss:	5.243523
Done 140 batches in 31.85s	training loss:	5.197713
Done 150 batches in 33.92s	training loss:	5.153456
Done 160 batches in 36.07s	training loss:	5.114550
Done 170 batches in 38.24s	training loss:	5.075772
Done 180 batches in 40.52s	training loss:	5.039840
Done 190 batches in 42.70s	training loss:	5.006673
Done 200 batches in 45.29s	training loss:	4.

In [39]:
def rnd_next_word(probs, size=1):
    """Sample `size` word indices according to the distribution `probs`.

    The candidate indices are 0 .. len(probs)-2 followed by -1, so the last
    probability in `probs` is attached to index -1 instead of len(probs)-1.

    Returns an int32 array of `size` sampled indices.
    """
    regular_ids = np.arange(probs.shape[0] - 1)
    # the final slot is reported as -1 rather than by its positive position
    symbols = np.append(regular_ids, -1).astype(np.int32)
    return np.random.choice(symbols, size=size, p=probs)

def beam_search(get_probs_fun, beam=10, init_seq='', mode='rr', max_len=100):
    """Generate one utterance with a (optionally randomised) beam search.

    Parameters
    ----------
    get_probs_fun : callable
        Called with an int32 array of token ids, one hypothesis per row; the
        result is indexed as one row of next-word probabilities per hypothesis.
    beam : int
        Number of hypotheses kept after every step.
    init_seq : str
        Whitespace-separated words to start from. Unknown words map to
        '<unk>'; token id 1 is prepended when the sequence does not already
        start with it.
    mode : str
        Two characters, each 's' or 'r'. mode[0] controls the first generated
        word, mode[1] every later one:
        's' -- keep the `beam` most probable continuations,
        'r' -- draw continuations at random with `rnd_next_word`
               (`beam` draws per hypothesis), then keep the `beam` best.
    max_len : int
        Generation stops once hypotheses are this many tokens long.

    Returns
    -------
    1-D int32 array of token ids: the best-scoring hypothesis that ends with
    `pad_value` if there is one, otherwise the best-scoring hypothesis.

    Raises
    ------
    ValueError
        If `mode` is not made of two characters from {'s', 'r'}.
    """
    # An unknown mode used to surface as a NameError (first character) or as
    # an endless loop (second character); reject it up front instead.
    if len(mode) < 2 or mode[0] not in ('s', 'r') or mode[1] not in ('s', 'r'):
        raise ValueError("mode must be two characters from 's'/'r', got %r" % (mode,))

    utt = [w_to_idx.get(w, w_to_idx['<unk>']) for w in init_seq.split()]
    if len(utt) == 0 or utt[0] != 1:
        utt = [1] + utt
    utt = np.asarray(utt, dtype=np.int32)[np.newaxis]

    # ---- first generated word ------------------------------------------
    first_probs = get_probs_fun(utt)[0]
    if mode[0] == 's':
        top = first_probs.argpartition(-beam)[-beam:]
        first_scores = first_probs[top]
        words = top.astype(np.int32)
        # the last vocabulary entry is represented by pad_value in sequences
        words[words == voc_size - 1] = pad_value
    else:
        words = rnd_next_word(first_probs, beam)
        # rnd_next_word reports the last entry as -1, which indexes it correctly
        first_scores = first_probs[words]

    # Hypothesis scores are cumulative log-probabilities and must include the
    # first generated word as well.
    scores = np.log(np.asarray(first_scores, dtype=np.float64))

    candidates = np.hstack([utt.repeat(beam, axis=0), words[np.newaxis].T])

    # ---- remaining words -----------------------------------------------
    while candidates.shape[1] < max_len and pad_value not in candidates[:, -1]:
        probs = get_probs_fun(candidates)
        log_probs = np.log(probs)

        if mode[1] == 's':
            # score every (hypothesis, word) pair and keep the `beam` best
            tot_scores = log_probs + scores[np.newaxis].T
            flat_scores = tot_scores.ravel()
            idx = flat_scores.argpartition(-beam)[-beam:]
            i, j = divmod(idx, tot_scores.shape[1])

            # read the scores through the flat index, before the word ids
            # are rewritten to pad_value
            scores = flat_scores[idx]
            next_words = j.astype(np.int32)
            next_words[next_words == voc_size - 1] = pad_value
        else:
            # draw `beam` continuations per hypothesis
            # (this doesn't have to be exactly `beam`)
            words = np.array([rnd_next_word(probs[k], beam) for k in range(beam)])
            rows = np.arange(beam)[:, np.newaxis]
            tot_scores = scores[np.newaxis].T + log_probs[rows, words]

            flat_scores = tot_scores.ravel()
            idx = flat_scores.argpartition(-beam)[-beam:]
            i, j = divmod(idx, tot_scores.shape[1])

            scores = flat_scores[idx]
            next_words = words[i, j]

        candidates = np.hstack([candidates[i], next_words[np.newaxis].T])

    # Prefer hypotheses that actually ended; among those take the best one
    # (argpartition leaves the rows in arbitrary order).
    finished = candidates[:, -1] == pad_value
    if finished.any():
        finished_rows = np.where(finished)[0]
        return candidates[finished_rows[scores[finished].argmax()]]
    return candidates[scores.argmax()]

In [41]:
# Generate from an empty prompt with random choices at every step.
utt = beam_search(net.get_probs_fn, beam=10, init_seq='', mode='rr')

# Turn the ids back into words; the special tokens are deliberately kept
# in the displayed string.
text = map(idx_to_w.__getitem__, utt)
' '.join(text)

"<s> excuse me ? </s> <s> i ' m not going to be a lot of time . </s> <s> i ' m not going to be a <unk> . </s> <s> i ' m not going to be a long time . </s> <s> i ' m not . </s> <s> i ' m not going to be a lot of time . </s> <s> i ' m not going to be a lot of time . </s> <s> i ' m not going to be a lot of <unk> . </s> <s> i ' m not a <unk> . </s> <s> i ' m not going to be a lot of <unk> . </s> <s> i ' m not going to be a lot of time . </s> <s> i ' m not going to be a lot of <unk> . </s> <s> i ' m not going to be a lot of time . </s> <s> i ' m not going to be a man . </s> <s> i ' m not going to be a lot of time . </s> <s> i ' m not going to be a lot of time . </s> <s> i ' m not going to be a <unk> . </s> <s> i ' m not going to be a lot of time . </s> <s> i ' m not going to be a lot of time . </s> <s> i ' m not going to be a lot of time . </s> <s> i ' m not going to be a lot of <unk> . </s> <s> i ' m not a <unk> . </s> <s> i ' m not going to be a lot of time . </s> <s> i ' m not going to t

In [29]:
# Plain ancestral sampling: draw one word at a time until the end token (-1)
# appears or 100 words have been generated.
init_seq = ''
utt = [1]
utt.extend(w_to_idx.get(w, w_to_idx['<unk>']) for w in init_seq.split())
utt = np.asarray(utt, dtype=np.int32)[np.newaxis]

i = 0
while not (utt[0, -1] == -1 or i >= 100):
    word_probs = net.get_probs_fn(utt)[0]
    next_idx = rnd_next_word(word_probs)
    # np.append flattens both inputs, so restore the leading batch axis
    utt = np.append(utt, next_idx).astype(np.int32)[np.newaxis]
    i += 1

# Map ids to words and hide the structural tokens in the displayed string.
text = map(idx_to_w.__getitem__, utt[0])
' '.join(filter(lambda tok: tok not in ['<s>', '</s>', '<utt_end>'], text))

"so you do big bones . an instant spa news name ? what do you remember she seem good about your production assistant ? right with the bouquet after the street . these that ' s always a gentlemen , as it would not happen . one of the premium sessions . that was they is innocent by somebody ' s making ok , i can bust four hundred examiner hundred up , <number> authority -- it may sign of me too you called the f <unk> chamber , exactly what is that ?"