In [1]:
import numpy as np
import theano
import theano.tensor as T
import time
from itertools import chain

import lasagne as L

from SimpleRNNLM import SimpleRNNLM, iterate_minibatches

Using gpu device 0: GeForce GTX 970 (CNMeM is enabled with initial size: 30.0% of memory, cuDNN 5005)
  "downsample module has been moved to the theano.tensor.signal.pool module.")


In [2]:
# The padding value doubles as the id of the <utt_end> token, so padded
# positions and utterance ends share the same id.

pad_value = -1 # -1 so that it indexes the last row: <utt_end>'s vector is the last one

def split_utt(utt):
    """Cut a triple into its three utterances.

    The sequence must contain the token id 1 exactly three times (otherwise
    the unpacking below raises ValueError). The cuts are made at the second
    and third occurrence; everything before the second occurrence, including
    anything preceding the first one, forms the first utterance.

    Returns a list with the three slices of `utt`.
    """
    starts = []
    for pos, token in enumerate(utt):
        if token == 1:
            starts.append(pos)

    # The position of the first marker is not needed: the first piece always
    # begins at index 0.
    _, second, third = starts
    return [utt[:second], utt[second:third], utt[third:]]

In [3]:
# Directory holding the MovieTriples dataset files.
# NOTE: the second assignment overrides the first, so only the second path is
# actually used; swap or remove a line when running on another machine.
mt_path = "/pio/data/data/mtriples/"
mt_path = "/home/maciek/Desktop/mgr/DATA/MovieTriples_Dataset/"

def load_mt(path=mt_path):
    """Load the training, validation and test triples and split them into utterances.

    Parameters
    ----------
    path : str
        Directory prefix (with trailing separator) containing
        'Training.triples.pkl', 'Validation.triples.pkl' and
        'Test.triples.pkl'. Defaults to the module-level `mt_path`.

    Returns
    -------
    (tr, vl, ts) : tuple of itertools.chain
        One single-pass iterator per split, yielding the individual
        utterances produced by `split_utt` for every triple.
    """
    def _load_utts(fname):
        # NOTE(review): np.load on a .pkl file unpickles it, which can execute
        # arbitrary code -- only point `path` at trusted data.
        triples = np.load(path + fname)
        # Split eagerly so malformed triples fail here, not at iteration time.
        return chain.from_iterable([split_utt(triple) for triple in triples])

    tr = _load_utts('Training.triples.pkl')
    vl = _load_utts('Validation.triples.pkl')
    ts = _load_utts('Test.triples.pkl')

    return tr, vl, ts

# Load the three splits and keep only the utterances shorter than 200 tokens.
train, valid, test = [
    [utt for utt in split if len(utt) < 200]
    for split in load_mt()
]


def get_mt_voc(mt_path=mt_path, train_len=len(train)):
    """Build the vocabulary lookups and word frequencies.

    Reads 'Training.dict.pkl' from `mt_path`. The first two fields of every
    entry are used as a (word, index) pair and the third one as its count.
    An extra '<utt_end>' word is added with index `pad_value` and count
    `train_len` (by default the number of training utterances at definition
    time).

    Returns a tuple (idx_to_w, w_to_idx, vocabulary size, frequencies), where
    the frequencies are the counts ordered by word index, with the
    '<utt_end>' count last, divided by their total.
    """
    # Order the entries by word index so the counts line up with the indices.
    entries = sorted(np.load(mt_path + 'Training.dict.pkl'),
                     key=lambda entry: entry[1])

    counts = [entry[2] for entry in entries]
    counts.append(train_len)
    freqs = np.array(counts)
    total_count = float(sum(freqs))

    w_to_idx = dict([entry[:2] for entry in entries])
    w_to_idx['<utt_end>'] = pad_value
    idx_to_w = dict((idx, word) for (word, idx) in w_to_idx.items())

    return idx_to_w, w_to_idx, len(w_to_idx), freqs / total_count

# Build the vocabulary lookups once at module level; `freqs` is later handed
# to the model as `ssoft_probs`.
idx_to_w, w_to_idx, voc_size, freqs = get_mt_voc()

In [4]:
# Load the pretrained embeddings together with their mask.
# NOTE(review): assumes both arrays are (vocabulary size, 300) -- confirm
# against the contents of the pickle.
word2vec_embs, word2vec_embs_mask = np.load(mt_path + 'Word2Vec_WordEmb.pkl')
# Append one randomly initialised row for the extra <utt_end> token; as the
# last row it is the one addressed by pad_value == -1.
word2vec_embs = np.vstack([word2vec_embs, L.init.GlorotUniform()((1,300))]).astype(np.float32)
# The new row gets an all-ones mask.
# NOTE(review): presumably a mask value of 1 marks an embedding that should
# be trained -- confirm.
word2vec_embs_mask = np.vstack([word2vec_embs_mask, np.ones((1,300))])

# Row indices whose mask value (read from the first column only) equals 1.
w2v_train_mask = np.where(word2vec_embs_mask[:,0] == 1)[0]

In [5]:
# Build the language model. The vocabulary size, the initial embeddings and
# the `ssoft_probs` distribution all come from the cells above.
# NOTE(review): mode='ssoft' with num_sampled=200 presumably selects a sampled
# softmax drawing 200 samples according to `ssoft_probs` -- confirm in
# SimpleRNNLM.
net = SimpleRNNLM(voc_size=voc_size,
                  emb_size=300,
                  rec_size=300,
                  mode='ssoft',
                  num_sampled=200,
                  emb_init=word2vec_embs,
                  ssoft_probs=freqs)

Building the model...
Compiling theano functions...
Building a network for generation...
Done


In [13]:
# net.load_params(fname='5ep_w2vInit_300_300_ssoft(uni,200,non-unique)_bs50_cut200.npz')

In [6]:
# Train for 5 epochs on the length-filtered utterances.
# NOTE(review): with save_params=True the parameters are presumably written
# to `path` -- confirm in SimpleRNNLM.train_model.
net.train_model(num_epochs=5,
                path='sentences_5ep_w2vInit_300_300_ssoft(uni,200,non-unique)_bs50_cut200.npz',
                save_params=True,
                train_batch_size=50,
                train_data=train,
                val_batch_size=25,
                val_data=valid)

Done 10 batches in 1.41s	training loss:	7.494860
Done 20 batches in 2.98s	training loss:	6.691600
Done 30 batches in 4.30s	training loss:	6.356645
Done 40 batches in 5.98s	training loss:	6.137196
Done 50 batches in 7.51s	training loss:	5.986012
Done 60 batches in 9.01s	training loss:	5.854042
Done 70 batches in 10.15s	training loss:	5.732083
Done 80 batches in 11.49s	training loss:	5.626317
Done 90 batches in 13.05s	training loss:	5.543616
Done 100 batches in 14.36s	training loss:	5.464922
Done 110 batches in 15.65s	training loss:	5.397891
Done 120 batches in 17.06s	training loss:	5.333810
Done 130 batches in 18.37s	training loss:	5.279024
Done 140 batches in 19.90s	training loss:	5.230633
Done 150 batches in 21.53s	training loss:	5.188017
Done 160 batches in 23.08s	training loss:	5.147366
Done 170 batches in 24.40s	training loss:	5.105820
Done 180 batches in 26.07s	training loss:	5.068550
Done 190 batches in 27.38s	training loss:	5.032651
Done 200 batches in 29.06s	training loss:	5.00

In [7]:
def rnd_next_word(probs, size=1):
    """Sample `size` word ids according to the distribution `probs`.

    `probs` is a 1-D array of probabilities, one per vocabulary entry. The
    ids are 0 .. len(probs)-2 for all entries but the last, which is
    reported as -1.

    Returns an int32 array of `size` sampled ids (drawn with replacement).
    """
    n_words = probs.shape[0]
    regular_ids = np.arange(n_words - 1, dtype=np.int32)
    last_id = np.array([-1], dtype=np.int32)
    word_ids = np.concatenate([regular_ids, last_id])
    return np.random.choice(word_ids, size=size, p=probs)

def beam_search(get_probs_fun, beam=10, init_seq='', mode='rr', max_len=100):
    """Generate one utterance with a (optionally randomised) beam search.

    Parameters
    ----------
    get_probs_fun : callable
        Maps an int32 array of word ids with one row per hypothesis to an
        array holding, for each row, the probabilities of the next word.
        The last probability column is treated as the <utt_end> token.
    beam : int
        Number of hypotheses kept after every step.
    init_seq : str
        Whitespace separated words to start from. Unknown words map to
        '<unk>', and the id 1 is prepended when the sequence does not already
        start with it.
    mode : str
        Two characters, each 's' or 'r'. The first one controls how the first
        word is chosen, the second one how every later word is chosen:
        's' keeps the `beam` most probable continuations, 'r' samples the
        continuations with `rnd_next_word` and keeps the `beam` best of them.
    max_len : int
        Hypotheses stop growing once they are `max_len` ids long.

    Returns
    -------
    numpy.ndarray
        1-D int32 array of word ids: the best scoring hypothesis ending with
        `pad_value` (<utt_end>), or, if none finished within `max_len`, the
        best scoring hypothesis overall.

    Raises
    ------
    ValueError
        If `mode` does not start with two characters from 's'/'r'. Without
        this check an unknown first character left `words` undefined and an
        unknown second character made the search loop forever.
    """
    if len(mode) < 2 or mode[0] not in ('s', 'r') or mode[1] not in ('s', 'r'):
        raise ValueError("mode must start with two characters from 's'/'r', "
                         "got %r" % (mode,))

    utt = [w_to_idx.get(w, w_to_idx['<unk>']) for w in init_seq.split()]
    if len(utt) == 0 or utt[0] != 1:
        utt = [1] + utt
    utt = np.asarray(utt, dtype=np.int32)[np.newaxis]

    first_probs = get_probs_fun(utt)[0]
    if mode[0] == 's':
        words = first_probs.argpartition(-beam)[-beam:].astype(np.int32)
        # The last vocabulary slot is <utt_end>, whose id is pad_value.
        words[words == voc_size - 1] = pad_value
    else:
        words = rnd_next_word(first_probs, beam)

    candidates = utt.repeat(beam, axis=0)
    candidates = np.hstack([candidates, words[np.newaxis].T])
    # The first word contributes to the cumulative log-probability as well.
    # pad_value == -1 indexes the last column, i.e. the <utt_end> probability.
    scores = np.log(first_probs[words]).astype(np.float64)

    while candidates.shape[1] < max_len and pad_value not in candidates[:, -1]:
        probs = get_probs_fun(candidates)

        if mode[1] == 's':
            tot_scores = np.log(probs) + scores[np.newaxis].T

            # Pick the `beam` best (hypothesis, word) pairs of the flat table.
            idx = tot_scores.ravel().argpartition(-beam)[-beam:]
            i, j = divmod(idx, tot_scores.shape[1])
            scores = tot_scores[i, j]

            j[j == voc_size - 1] = pad_value
            candidates = np.hstack([candidates[i],
                                    j[np.newaxis].T.astype(np.int32)])
        else:
            # Sample continuations for every hypothesis (the number of samples
            # per hypothesis does not have to be exactly `beam`).
            words = np.array([rnd_next_word(probs[k], beam)
                              for k in range(beam)])
            rows = np.indices((beam, words.shape[1]))[0]
            tot_scores = scores[np.newaxis].T + np.log(probs)[rows, words]

            idx = tot_scores.ravel().argpartition(-beam)[-beam:]
            i, j = divmod(idx, tot_scores.shape[1])
            scores = tot_scores[i, j]

            candidates = np.hstack([candidates[i], words[i, j][np.newaxis].T])

    # Prefer hypotheses that actually ended with <utt_end>. The search stops
    # on pad_value, so that is the id to look for here.
    finished = candidates[:, -1] == pad_value
    if finished.any():
        best = np.flatnonzero(finished)[scores[finished].argmax()]
        return candidates[best]
    return candidates[scores.argmax()]

In [116]:
# Generate an utterance from an empty prompt and print it without the
# special tokens.
utt = beam_search(net.get_probs_fn, init_seq='', beam=2, mode='rr')

text = [idx_to_w[word_id] for word_id in utt]
' '.join(word for word in text if word not in ('<s>', '</s>', '<utt_end>'))

"that ' s what i ' m saying ."

In [100]:
# Plain ancestral sampling: draw one word at a time until <utt_end> (id -1,
# as emitted by rnd_next_word) appears or 100 words have been generated.
init_seq = ''
prompt_ids = [w_to_idx.get(w, w_to_idx['<unk>']) for w in init_seq.split()]
utt = np.asarray([1] + prompt_ids, dtype=np.int32)[np.newaxis]

i = 0
while i < 100 and utt[0,-1] != -1:
    word_probs = net.get_probs_fn(utt)[0]
    next_idx = rnd_next_word(word_probs)
    # Keep `utt` as a single-row int32 matrix, as expected by get_probs_fn.
    utt = np.concatenate([utt[0], next_idx]).astype(np.int32)[np.newaxis]
    i += 1

text = [idx_to_w[word_id] for word_id in utt[0]]
' '.join(word for word in text if word not in ('<s>', '</s>', '<utt_end>'))

'<person> <unk> ? got some help ! !'