In [114]:
# HRED (https://arxiv.org/pdf/1507.02221.pdf [*])

import theano
import theano.tensor as T
import numpy as np

In [115]:
# MODEL PARAMETERS
# Notation follows [*].

V = 10                   # vocabulary size
d_h, d_s, d_e = 5, 5, 5  # sizes used for the encoder/decoder weights (d_h),
                         # the session-context weights (d_s) and the output
                         # layer weights (d_e) in the sections below


# Every shared variable created by the sections below is appended to this list.
model_params = list()

# Training sample (symbolic input)

Utts = T.tensor3('Utts', dtype='float32') # ut_count x ut_len x V


# Encoder
# All weights start at zero. The I* matrices multiply the GRU input, the H*
# matrices multiply the previous hidden state; the r/u suffix marks the two
# gates, no suffix marks the candidate-state weights (see gru()).

# Input weights, each d_h x V.
Ir_enc, Iu_enc, I_enc = (
    theano.shared(np.zeros((d_h, V), dtype='float32'), name=label)
    for label in ('Ir_enc', 'Iu_enc', 'I_enc')
)

# Recurrent weights, each d_h x d_h.
Hr_enc, Hu_enc, H_enc = (
    theano.shared(np.zeros((d_h, d_h), dtype='float32'), name=label)
    for label in ('Hr_enc', 'Hu_enc', 'H_enc')
)

model_params.extend([Ir_enc, Iu_enc, I_enc, Hr_enc, Hu_enc, H_enc])


# Session context
# Same layout as the encoder weights, but the GRU input here has size d_h
# and the recurrent state has size d_s. All weights start at zero.

# Input weights, each d_s x d_h.
Ir_ses, Iu_ses, I_ses = (
    theano.shared(np.zeros((d_s, d_h), dtype='float32'), name=label)
    for label in ('Ir_ses', 'Iu_ses', 'I_ses')
)

# Recurrent weights, each d_s x d_s.
Hr_ses, Hu_ses, H_ses = (
    theano.shared(np.zeros((d_s, d_s), dtype='float32'), name=label)
    for label in ('Hr_ses', 'Hu_ses', 'H_ses')
)

model_params.extend([Ir_ses, Iu_ses, I_ses, Hr_ses, Hu_ses, H_ses])


# Decoder
# All weights start at zero.

# GRU input weights, each d_h x V.
Ir_dec, Iu_dec, I_dec = (
    theano.shared(np.zeros((d_h, V), dtype='float32'), name=label)
    for label in ('Ir_dec', 'Iu_dec', 'I_dec')
)

# GRU recurrent weights, each d_h x d_h.
Hr_dec, Hu_dec, H_dec = (
    theano.shared(np.zeros((d_h, d_h), dtype='float32'), name=label)
    for label in ('Hr_dec', 'Hu_dec', 'H_dec')
)

# Weight (d_h x d_s) and bias (d_h) used by dec() to build the initial
# decoder state from a session context.
D = theano.shared(np.zeros((d_h, d_s), dtype='float32'), name='D')
b = theano.shared(np.zeros((d_h,), dtype='float32'), name='b')

# Weights and bias consumed by omega(): Ho is d_e x d_h, Eo is d_e x V,
# bo has length d_e.
Ho = theano.shared(np.zeros((d_e, d_h), dtype='float32'), name='Ho')
Eo = theano.shared(np.zeros((d_e, V), dtype='float32'), name='Eo')
bo = theano.shared(np.zeros((d_e,), dtype='float32'), name='bo')

# Output matrix (V x d_e); step_dec() multiplies its transpose by the current word.
O = theano.shared(np.zeros((V, d_e), dtype='float32'), name='O')

model_params.extend([Ir_dec, Iu_dec, I_dec, Hr_dec, Hu_dec, H_dec,
                     D, b, Ho, Eo, bo, O])

In [116]:
# GRU gate

def gru(w, h, Ir, Iu, I, Hr, Hu, H): # w = current input, h = current hidden state
    """Build the symbolic expression for one GRU update.

    Parameters
    ----------
    w : current input.
    h : current hidden state.
    Ir, Iu, I : matrices applied to `w` for the reset gate, the update gate
        and the candidate state respectively.
    Hr, Hu, H : matrices applied to `h` (or to the gated `h`) for the same
        three quantities.

    Returns
    -------
    The next hidden state: `h` and the candidate state blended element-wise
    by the update gate.
    """
    reset_gate = T.nnet.sigmoid(T.dot(Ir, w) + T.dot(Hr, h))
    update_gate = T.nnet.sigmoid(T.dot(Iu, w) + T.dot(Hu, h))
    # The reset gate scales the old state before it enters the candidate.
    candidate = T.tanh(T.dot(I, w) + T.dot(H, reset_gate * h))
    return (1 - update_gate) * h + update_gate * candidate


# Networks' steps

def step_enc(w, h_prev, Ir_enc, Iu_enc, I_enc, Hr_enc, Hu_enc, H_enc):
    """One encoder step: apply gru() to input `w` and previous state `h_prev`.

    The argument order (sequence element, previous output, then the weights)
    is the order in which theano.scan passes them in h_final().
    """
    h_next = gru(w=w, h=h_prev,
                 Ir=Ir_enc, Iu=Iu_enc, I=I_enc,
                 Hr=Hr_enc, Hu=Hu_enc, H=H_enc)
    return h_next

def step_ses(q, s_prev, Ir_ses, Iu_ses, I_ses, Hr_ses, Hu_ses, H_ses):
    """One session-level step: apply gru() to input `q` and previous context `s_prev`.

    The argument order (sequence element, previous output, then the weights)
    is the order in which theano.scan passes them in contexts().
    """
    s_next = gru(w=q, h=s_prev,
                 Ir=Ir_ses, Iu=Iu_ses, I=I_ses,
                 Hr=Hr_ses, Hu=Hu_ses, H=H_ses)
    return s_next

def omega(d, w, Ho, Eo, bo):
    """Return Ho.d + Eo.w + bo, combining a decoder state `d` and a word `w`."""
    state_term = T.dot(Ho, d)
    word_term = T.dot(Eo, w)
    return state_term + word_term + bo

def step_dec(w, w_prev, d_prev, Ir_dec, Iu_dec, I_dec, Hr_dec, Hu_dec, H_dec, Ho, Eo, bo, O):
    """One decoder step.

    Parameters
    ----------
    w : current word (presumably a length-V vector -- TODO confirm).
    w_prev : previous word, same form as `w`.
    d_prev : previous decoder state.
    Ir_dec .. H_dec : decoder GRU weights, forwarded to gru().
    Ho, Eo, bo : weights and bias forwarded to omega().
    O : output matrix; its transpose is multiplied by `w`.

    Returns
    -------
    [d_next, out] : the updated decoder state and the dot product of
        `O.T . w` with `omega(d_prev, w_prev)`. No normalisation is applied
        to `out` here.
    """
    # `out` depends only on the previous state and previous word ...
    word_vec = T.dot(O.T, w)
    features = omega(d_prev, w_prev, Ho, Eo, bo)
    out = T.dot(word_vec, features)

    # ... while the state itself is advanced with the current word.
    d_next = gru(w, d_prev, Ir_dec, Iu_dec, I_dec, Hr_dec, Hu_dec, H_dec)
    return [d_next, out]

In [117]:
def h_final(U):
    """Run the encoder GRU over the rows of `U` and return its last hidden state.

    `U` is scanned along its first axis (presumably one row per word of an
    utterance -- TODO confirm). The recurrence starts from a zero vector of
    length d_h.
    """
    enc_weights = [Ir_enc, Iu_enc, I_enc, Hr_enc, Hu_enc, H_enc]
    initial_state = T.zeros((d_h,), dtype='float32')
    states, _updates = theano.scan(
        fn=step_enc,
        sequences=U,
        outputs_info=initial_state,
        non_sequences=enc_weights,
        strict=True,
    )
    # Only the state after the final step is of interest.
    return states[-1]

In [118]:
def contexts(qs):
    """Run the session-level GRU over `qs` and return the state after every step.

    `qs` is scanned along its first axis. The recurrence starts from a zero
    vector of length d_s, which is not included in the returned sequence.
    """
    ses_weights = [Ir_ses, Iu_ses, I_ses, Hr_ses, Hu_ses, H_ses]
    initial_context = T.zeros((d_s,), dtype='float32')
    all_contexts, _updates = theano.scan(
        fn=step_ses,
        sequences=qs,
        outputs_info=initial_context,
        non_sequences=ses_weights,
        strict=True,
    )
    return all_contexts # doesn't contain s0

In [120]:
def dec(U, s, D, b): # s = context
    """Run the decoder over one utterance, conditioned on a session context.

    Parameters
    ----------
    U : symbolic matrix scanned row by row (presumably ut_len x V, one row
        per word -- TODO confirm against the caller).
    s : session context vector; it is multiplied by `D`.
    D, b : weight matrix and bias that produce the initial decoder state.

    Returns
    -------
    [ds, outs] : the decoder state after every step and the `out` value
        produced by step_dec() at every step, each stacked along the first axis.
    """
    # Initial decoder state derived from the session context.
    d0 = T.tanh(T.dot(D, s) + b)

    # "Previous word" sequence: a zero row, then every row of U but the last,
    # so that row n holds the word preceding U[n].
    # The zero row is built as zeros_like(U[:1]) so it has shape (1, V) and
    # can be concatenated with U along axis 0. The earlier
    # zeros_like(U[0]).dimshuffle(0, 'x') produced a (V, 1) column, whose
    # second dimension does not match U's.
    # Dropping the last row of U keeps both sequences the same length instead
    # of relying on scan truncating to the shortest one.
    prev_words = T.concatenate([T.zeros_like(U[:1]), U[:-1]])

    dec_weights = [Ir_dec, Iu_dec, I_dec, Hr_dec, Hu_dec, H_dec, Ho, Eo, bo, O]
    [ds, outs], _updates = theano.scan(
        fn=step_dec,
        sequences=[U, prev_words],
        # Only the decoder state is recurrent; `out` is not fed back.
        outputs_info=[d0, None],
        non_sequences=dec_weights,
        strict=True,
    )
    return [ds, outs]

In [8]:
# Scratch: abandoned loop-based prototype below (commented out, not executed)
#
#nll = 0
#
#s = T.zeros((d_s,), dtype='float32')
#for U in [U1,U2,U3]:
#    h = T.zeros((d_h,), dtype='float32')
#    d = T.tanh(T.dot(D,s) + b)
#    w_prev = None
#    for w in U.eval():
#        nll -= lprob(w, d, w_prev)
#        h = gru_w(Ir_enc, Iu_enc, I_enc, w, Hr_enc, Hu_enc, H_enc, h)
#        d = gru_w(Ir_dec, Iu_dec, I_dec, w, Hr_dec, Hu_dec, H_dec, d)        
#        w_prev = w        
#    s = gru_h(Ir_ses, Iu_ses, I_ses, h, Hr_ses, Hu_ses, H_ses, s)
#    
#
#cost = theano.function([], nll)
#
#def gru_w(Ir, Iu, I, w, Hr, Hu, H, h):
#    r = T.nnet.sigmoid(Ir[:,w] + T.dot(Hr,h))
#    u = T.nnet.sigmoid(Iu[:,w] + T.dot(Hu,h))
#    h_ = T.tanh(I[:,w] + T.dot(H,r*h))
#    h_next = (1-u)*h + u*h_
#    return h_next