Load model and embedding data.

In [22]:
import theano
import theano.tensor as T
import numpy as np
import cPickle
import random
from utils import *
from state import *
from title_model import TitleModel

# Make float64 Theano's default float type (the mask matrix built in the
# generation cell below is float64 as well).
theano.config.floatX = 'float64'

# Path of the trained title model weights.
model_name = 'model/title_emb256_h256_f32_tok10_model.npz'

# NOTE(review): title_state is presumably provided by `from state import *`;
# it is not defined in this notebook -- confirm.
state = title_state()
model = TitleModel(state, test_mode=True)
model.load(model_name)

# The pickle holds a 5-tuple; only the two vocabulary mappings are used here.
# NOTE(review): unpickling executes arbitrary code -- only load dictionary
# files from a trusted source.
# The file is opened with the same (default) mode as before, but inside a
# `with` block so the handle is closed deterministically.
with open('data/dict.pkl') as dict_file:
    (ind2word, word2ind, _, _, _) = cPickle.load(dict_file)

print('Data loaded.')

Data loaded.


Now we try to manually provide an input sentence (if a word is out of vocabulary, we replace it with a randomly chosen `<TOKn>` placeholder token).

In [24]:
def restoreW(ind_lst, ind2word, tmp_map):
    """Turn a sequence of word indices back into a space-separated string.

    ind_lst  -- iterable of word indices to decode.
    ind2word -- index -> word lookup used for every index not in tmp_map.
    tmp_map  -- per-sentence overrides (index -> original word); an index
                found here takes precedence over ind2word.

    Returns the decoded words joined by single spaces ('' for no indices).
    """
    # ind2word is only consulted when there is no override, so an index that
    # exists solely in tmp_map never triggers a failed ind2word lookup.
    decoded = (
        tmp_map[idx] if idx in tmp_map else ind2word[idx]
        for idx in ind_lst
    )
    return ' '.join(decoded)
    

# Sentences to run through the model, one generation pass per sentence.
test_sents = ['i would like to go hiking tomorrow', 'i want to sleep all day', 'i want to play video games',
              'i want to visit my dental', 'i will wash my car this afternoon', 'i want to play chess this afternoon']

NUM_PLACEHOLDERS = 10   # placeholder tokens <TOK0> .. <TOK9> available per sentence
MAX_RES_LEN = 10        # generation stops once the result holds more than this many indices

# The generation function takes no per-sentence arguments, so build it once
# instead of rebuilding it for every test sentence.
pred_fn = model.build_gen_function()

for test_sent in test_sents:
    print('Test sent: %s' % test_sent)
    words = test_sent.split()

    # Encode the sentence. Index 1 opens and index 0 closes the sequence
    # (they decode to <START> / <END> in the printed output).
    nat_coded = [1]
    tmp_map = {}        # placeholder index -> original out-of-vocab word
    tok_set = list(range(NUM_PLACEHOLDERS))
    for w in words:
        if w in word2ind:
            nat_coded.append(word2ind[w])
        elif tok_set:
            # Out-of-vocab word: substitute a random, not yet used placeholder.
            tok_ind = random.choice(tok_set)
            tok_s = '<TOK%d>' % tok_ind
            tok_set.remove(tok_ind)
            nat_coded.append(word2ind[tok_s])
            tmp_map[word2ind[tok_s]] = w
            print('  out of vocab: %s, replaced with %s' % (w, tok_s))
        else:
            # All placeholders are taken; drop the word instead of crashing
            # in random.choice() on an empty list.
            print('  out of vocab: %s, dropped (no placeholder tokens left)' % w)
    nat_coded.append(0)
    print('Coded input: %s' % (nat_coded,))
    print(restoreW(nat_coded, ind2word, tmp_map))

    # Zero-padded input and mask matrices with seq_len_in rows. The same
    # sentence fills both columns; only column 0 of the outputs is read below.
    # NOTE(review): presumably the compiled function expects a batch of 2 -- confirm.
    m = state['seq_len_in']
    sent_len = len(nat_coded)
    nat_coded_mat = np.zeros((m, 2), dtype='int32')
    nat_mask = np.zeros((m, 2), dtype='float64')
    for col in range(2):
        nat_coded_mat[:sent_len, col] = nat_coded
        nat_mask[:sent_len, col] = 1

    # Greedy decoding: feed the previously generated index back in until the
    # end index (0) is produced or the length limit is hit.
    res = [1]
    abs_in = 1
    model.gen_reset()
    while True:
        # Explicit dtype keeps the input int32 regardless of the type of abs_in.
        abs_in_mat = np.full((2, ), abs_in, dtype='int32')
        [p_t, o_t, alpha_t] = pred_fn(nat_coded_mat, nat_mask, abs_in_mat)

        # Normalise the scores of the first batch entry with one vectorised
        # division (the previous per-element Python sum() was quadratic in
        # the number of scores) and pick the best index.
        pt_col = np.asarray(p_t[0])
        pt_norm = 1.0 * pt_col / pt_col.sum()
        ind = int(np.argmax(pt_norm))
        abs_in = ind
        res.append(ind)
        if ind == 0 or len(res) > MAX_RES_LEN:
            break

        # Rank only the rows that belong to real input tokens, so a padded
        # row can never yield an index beyond the end of nat_coded.
        alpha_col = alpha_t[:sent_len, 0]
        alpha_order = alpha_col.argsort()[::-1]
        print('Explanation of: %s' % restoreW([ind], ind2word, tmp_map))
        for pos in alpha_order:
            print("    %s: %.4f" % (restoreW([nat_coded[pos]], ind2word, tmp_map), alpha_col[pos]))
    print('')
    print(restoreW(res, ind2word, tmp_map))
    print('')

Test sent: i would like to go hiking tomorrow
  out of vocab: hiking, replaced with <TOK1>
  out of vocab: tomorrow, replaced with <TOK4>
Coded input: [1, 2148, 106, 2186, 1155, 1737, 3, 6, 0]
<START> i would like to go hiking tomorrow <END>
Explanation of: go
    hiking: 0.8719
    tomorrow: 0.1056
    <START>: 0.0081
    would: 0.0046
    <END>: 0.0044
    i: 0.0027
    like: 0.0015
    go: 0.0012
    to: 0.0001
Explanation of: hiking
    hiking: 0.8719
    tomorrow: 0.1056
    <START>: 0.0081
    would: 0.0046
    <END>: 0.0044
    i: 0.0027
    like: 0.0015
    go: 0.0012
    to: 0.0001

<START> go hiking <END>

Test sent: i want to sleep all day
  out of vocab: sleep, replaced with <TOK9>
Coded input: [1, 2148, 148, 1155, 11, 1538, 1892, 0]
<START> i want to sleep all day <END>
Explanation of: sleep
    sleep: 0.7934
    all: 0.0841
    <START>: 0.0477
    want: 0.0363
    i: 0.0160
    day: 0.0107
    <END>: 0.0106
    to: 0.0012
Explanation of: <TOK7>
    sleep: 0.7934
    all: 