In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
# import phrase2matrix, tokens, token2idx, UNK_ix, EOS_ix
from mymodule.data_stuff import *
from collections import deque
import pickle
import tqdm

# %env THEANO_FLAGS=device=gpu5,floatX=float32,exception_verbosity=high,lib.cnmem=0.95,mode=FAST_RUN
from warnings import warn
import numpy as np
import theano
import theano.tensor as T
from collections import OrderedDict
import lasagne

from lasagne.layers import *

env: THEANO_FLAGS=device=gpu5,floatX=float32,exception_verbosity=high,lib.cnmem=0.95


Using gpu device 5: GeForce GTX 1080 (CNMeM is enabled with initial size: 95.0% of memory, cuDNN 5105)


In [2]:
def read_all_contexts(context_size=2, verbose=100000, path='./open_subtitles_en_raw'):
    """Build (context, answer) training pairs from a line-per-utterance corpus.

    Every line that has at least ``context_size`` predecessors becomes an
    answer; the ``context_size`` lines immediately before it are its context.

    Parameters
    ----------
    context_size : int
        Number of preceding lines kept as context for each answer.
    verbose : int
        Print the running pair index every ``verbose`` pairs. A falsy value
        (0 / None) disables progress output.
    path : str
        Location of the corpus file, one utterance per line.

    Returns
    -------
    list of dict
        Items of the form ``{'context': [line, ...], 'answer': line}`` in file
        order. Empty if the file has ``context_size`` lines or fewer.
    """
    with open(path) as fin:
        lines = [l.strip() for l in fin]

    contexts = []
    # Not enough lines to form even a single (context, answer) pair.
    if len(lines) <= context_size:
        return contexts

    # Bounded deque: appending the newest line automatically drops the oldest.
    window = deque(lines[:context_size], context_size)
    for t, answer in enumerate(lines[context_size:]):
        if verbose and t % verbose == 0:
            print(t)
        # Iterating over every remaining line (instead of lagging one behind)
        # ensures the final pair of the file is emitted too.
        contexts.append({'context': list(window), 'answer': answer})
        window.append(answer)
    return contexts

In [3]:
# Load the pre-built list of contexts from disk.
# Presumably this is a cached result of read_all_contexts() — TODO confirm.
# NOTE(security): pickle.load can execute arbitrary code; only load a
# contexts.pkl that you produced yourself.
with open('contexts.pkl', 'rb') as fin:
    contexts = pickle.load(fin)

# Actor-Critic Dialogue Model 

**! (symbolic expressions start from here)**

In [4]:
# Gradient clipping threshold: passed as grad_clipping to the encoder LSTM and
# used as the maximum total gradient norm in ActorTrainer.
GRAD_CLIP = 5
# Size of the LSTM hidden/cell state in both the encoder and the decoder.
N_LSTM_UNITS = 1024
# Dimensionality of the word embeddings (shared by encoder and decoder).
EMB_SIZE = 512
# Width of the dense layer between the decoder LSTM and the vocabulary softmax.
BOTTLENECK_UNITS = 256


# Softmax temperature: decoder logits are divided by this shared scalar before
# the softmax, so it can be changed at runtime with TEMPERATURE.set_value(...).
TEMPERATURE = theano.shared(np.float32(1.), name='temperature')

## Encoder

* Encodes the sequence of input words into the final LSTM hidden vector (so it outputs `[batch_size, N_LSTM_UNITS]`).

In [5]:
from pretrained_network.wrong_lstm_layer import WrongLSTMLayer
class Enc:
    """Encoder: embeds the context tokens and runs a masked LSTM over them.

    The class body is used as a namespace: it is executed once at definition
    time and exposes the symbolic input (``input_phrase``), the lasagne layers,
    the trainable parameters (``weights``) and the final layer (``output``).
    """
    ### THEANO GRAPH INPUT ###
    # Integer matrix of context token ids.
    # Presumably shaped [batch_size, seq_len] and produced by phrase2matrix — TODO confirm.
    input_phrase = T.imatrix("encoder phrase tokens")
    ##########################
    
    l_in = InputLayer((None, None), input_phrase, name='context input')
    # Mask is non-zero wherever the token differs from PAD_ix; it is handed to
    # the LSTM below as mask_input.
    l_mask = InputLayer((None, None), T.neq(input_phrase, PAD_ix), name='context mask')
    
    # Embedding matrix; the decoder re-uses it via Enc.l_emb.W.
    l_emb = EmbeddingLayer(l_in, N_TOKENS, EMB_SIZE, name="context embedding")
    
    
    # NOTE(review): the original banner here read "LSTMLayer with incorrect
    # outputgate" and WrongLSTMLayer is imported just above this class, but the
    # layer actually built is the stock lasagne LSTMLayer. Confirm which one the
    # pretrained weights expect.
    
    l_lstm = LSTMLayer(
                        l_emb,
                        N_LSTM_UNITS,
                        name='encoder_lstm',
                        grad_clipping=GRAD_CLIP,
                        mask_input=l_mask,
                        only_return_final=True,
                        peepholes=False)
    
    # Trainable parameters of the whole encoder stack (embedding + LSTM).
    weights = get_all_params(l_lstm, trainable=True)
    
    # Layer whose output is fed to the decoder at every tick.
    output = l_lstm

## This defines one step of decoder.

* At each tick the decoder takes the following inputs: ``(prev_cell, prev_hid, inp_word, encoder_output)``
* It computes and outputs ``(next_cell, next_hid, next_word)``, which are passed as inputs at the next tick.

In [6]:
from pretrained_network.wrong_lstm_cell import WrongLSTMCell
from agentnet import Recurrence
from agentnet.resolver import  ProbabilisticResolver
from agentnet.memory import LSTMCell

class Dec:
    """One tick of the decoder, expressed as lasagne layers.

    The input layers below are placeholders; Gen and GenTrain wire them to real
    tensors through agentnet's Recurrence. The class body is used as a
    namespace and exposes the layers as class attributes.
    """
    # Define inputs of decoder at each time step.
    prev_cell = InputLayer((None, N_LSTM_UNITS), name='cell')
    prev_hid = InputLayer((None, N_LSTM_UNITS), name='hid')
    # One token id per batch element (the word fed in at this tick).
    input_word = InputLayer((None,))
    # Encoder output, supplied unchanged at every tick.
    encoder_lstm = InputLayer((None, N_LSTM_UNITS), name='encoder')

    
    # Embed input word and use the same embeddings as in the encoder.
    word_embedding = EmbeddingLayer(input_word, N_TOKENS, EMB_SIZE,
                                    W=Enc.l_emb.W, name='emb')
    
    
    # This is not WrongLSTMLayer! *Cell is used for one-tick networks.
    # The cell consumes both the word embedding and the encoder output.
    new_cell, new_hid = LSTMCell(prev_cell, prev_hid,
                                      input_or_inputs=[word_embedding, encoder_lstm],
                                      name='decoder_lstm',
                                      peepholes=False)
    
    # Define parts for new word prediction. Bottleneck is a hack for reducing time complexity.
    bottleneck = DenseLayer(new_hid, BOTTLENECK_UNITS, nonlinearity=T.tanh, name='decoder intermediate')

    
    # Distribution over the vocabulary; logits are divided by the shared
    # TEMPERATURE scalar before the softmax.
    next_word_probs = DenseLayer(bottleneck, N_TOKENS,
                                 nonlinearity=lambda probs: T.nnet.softmax(probs/TEMPERATURE),
                                 name='decoder next word probas')

    # Picks the next word from next_word_probs.
    # Presumably by sampling from the distribution — see agentnet ProbabilisticResolver docs.
    next_words = ProbabilisticResolver(next_word_probs, assume_normalized=True)

## Generator unrolls decoder for ``n_steps``

* It uses ``Recurrence`` class from ``agentnet``.

* ``theano.scan`` fn takes arguments in order: ``seq1, seq2,..,output1,output2,..,nonseq1,nonseq2,...``
* ``Recurrence`` unrolls ``Dec`` for `n_steps`. 

In [7]:
class Gen:
    """Free-running generator: unrolls Dec for ``n_steps`` ticks, feeding each
    produced word back in as the next input.

    The class body is used as a namespace; it builds the recurrence, exposes
    the unrolled symbolic sequences and compiles ``generate``.
    """
    # Number of decoding ticks; a shared variable so it can be changed at
    # runtime (see reply()).
    n_steps = theano.shared(25)
    # This theano tensor is used as first input word for decoder.
    bos_input_var = T.zeros((Enc.input_phrase.shape[0],), 'int32')+BOS_ix
    
    bos_input_layer = InputLayer((None,), bos_input_var, name="first input")

    recurrence = Recurrence(
        # This means that encoder.output passed to decoder.encoder_lstm input at each tick.
        input_nonsequences={Dec.encoder_lstm: Enc.output},
        
        # This defines how outputs moves to inputs at each tick in decoder. 
        # These corresponds to outputs in theano scan function.
        state_variables=OrderedDict([(Dec.new_cell, Dec.prev_cell),
                                     (Dec.new_hid, Dec.prev_hid),
                                     (Dec.next_words, Dec.input_word)]),
        # We will need these probabilities for Actor-Critic algorithm.
        tracked_outputs=[Dec.next_word_probs],
        state_init={Dec.next_words: bos_input_layer},
        n_steps=n_steps,
        unroll_scan=False)
    
    recurrence_outputs = get_output(recurrence)
        
    ##### DECODER UNROLLED #####
    # Theano tensor which represents sequence of generated words and their probabilities.
    words_seq = recurrence_outputs[Dec.next_words]
    words_probs_seq = recurrence_outputs[Dec.next_word_probs]
    
    # Theano tensor which represents decoder hidden states.
    dec_cell_seq = recurrence_outputs[Dec.new_cell]
    dec_hid_seq = recurrence_outputs[Dec.new_hid]
    ############################
                                                 
    # Compiled function: context token matrix -> [words, cell states, hidden states].
    generate = theano.function([Enc.input_phrase], [words_seq, dec_cell_seq, dec_hid_seq],
                               updates=recurrence.get_automatic_updates())
    
    # Trainable parameters reachable from the recurrence.
    weights = get_all_params(recurrence, trainable=True)
    
    @staticmethod
    def reply(phrase, max_len=25, **kwargs):
        """Generate a textual reply to a single phrase.

        Parameters
        ----------
        phrase : passed to ``phrase2matrix`` as a one-element batch.
        max_len : int
            Number of decoding ticks used for this call only.
        **kwargs : forwarded to ``phrase2matrix``.

        Returns
        -------
        str
            Generated tokens joined by spaces, cut just before the first
            EOS_ix if one was produced.
        """
        old_value = Gen.n_steps.get_value()
        Gen.n_steps.set_value(max_len)
        try:
            phrase_ix = phrase2matrix([phrase], **kwargs)
            # [0] selects words_seq, the second [0] the only batch element.
            answer_ix = list(Gen.generate(phrase_ix)[0][0])
        finally:
            # Always restore the shared step count, even if encoding or
            # generation raised, so later calls are not silently affected.
            Gen.n_steps.set_value(old_value)

        if EOS_ix in answer_ix:
            answer_ix = answer_ix[:answer_ix.index(EOS_ix)]
        return ' '.join(map(tokens.__getitem__, answer_ix))
        
class GenTrain:
    """Teacher-forced decoder used for log-likelihood training.

    Contains a recurrent loop where the network is fed the reference answers
    instead of its own outputs, plus compiled functions that train/evaluate the
    network in that mode. The class body is used as a namespace.
    """
    
    ### THEANO GRAPH INPUT. ###
    reference_answers = T.imatrix("decoder reference answers") # shape [batch_size, max_len]
    ###########################
    
    bos_column = T.zeros((reference_answers.shape[0], 1), 'int32')+BOS_ix
    reference_answers_bos = T.concatenate((bos_column, reference_answers), axis=1)  #prepend BOS
    
    # NOTE(review): these layer names repeat the encoder's 'context input' /
    # 'context mask' although they wrap the reference answers.
    l_ref_answers = InputLayer((None, None), reference_answers_bos, name='context input')
    l_ref_mask = InputLayer((None, None), T.neq(reference_answers_bos, PAD_ix), name='context mask')
    
    # Same Dec step as in Gen, but the input word comes from the reference
    # sequence rather than from the previously generated word.
    recurrence = Recurrence(
        input_nonsequences=OrderedDict([(Dec.encoder_lstm, Enc.output)]),
        input_sequences=OrderedDict([(Dec.input_word, l_ref_answers)]),
        state_variables=OrderedDict([(Dec.new_cell, Dec.prev_cell),
                                     (Dec.new_hid, Dec.prev_hid)]),
        tracked_outputs=[Dec.next_word_probs, Dec.next_words],
        mask_input=l_ref_mask,
        unroll_scan=False)
    
    recurrence_outputs = get_output(recurrence)
    
    # Per-tick vocabulary distributions for the BOS-prefixed reference sequence.
    P_seq = recurrence_outputs[Dec.next_word_probs]
    
    
    ############################
    ###loglikelihood training###
    ############################
    # Drop the last tick (its prediction has no target) and flatten to
    # [batch*max_len, N_TOKENS]; the 1e-6 keeps probabilities away from zero.
    predicted_probas = P_seq[:, :-1].reshape((-1, N_TOKENS))+1e-6
    target_labels = reference_answers.ravel()
    
    # NOTE(review): the mean runs over every position, padded ones included —
    # no mask is applied to the loss here.
    llh_loss = lasagne.objectives.categorical_crossentropy(predicted_probas, target_labels).mean()
    
    # Adam over Gen.weights with learning rate 0.001.
    llh_updates = lasagne.updates.adam(llh_loss, Gen.weights, 0.001)
    
    # One supervised step: (context tokens, reference answers) -> loss, applying updates.
    train_step = theano.function([Enc.input_phrase, reference_answers], llh_loss,
                                 updates=llh_updates+recurrence.get_automatic_updates())
    # Loss evaluation only; no parameter updates.
    get_llh = theano.function([Enc.input_phrase, reference_answers], llh_loss, no_default_updates=True)

(A) If you wanted to get two outputs from recurrence, use NOT
>>>out1 = get_output(rec[layer1])
>>>out2 = get_output(rec[layer2])
but instead:
>>>out1,out2 = get_output((rec[layer1],rec[layer2])) #or rec[layer1,layer2].
(C) If you want to get rid of old updates, use get_output(...,accumulate_updates=False)

  default_warn("[Verbose>=%s] %s"%(verbosity_level,message),**kwargs)
  default_warn("[Verbose>=%s] %s"%(verbosity_level,message),**kwargs)
Make sure it is always above <unspecified>(n_steps) you specified for recurrence
  default_warn("[Verbose>=%s] %s"%(verbosity_level,message),**kwargs)
(A) If you wanted to get two outputs from recurrence, use NOT
>>>out1 = get_output(rec[layer1])
>>>out2 = get_output(rec[layer2])
but instead:
>>>out1,out2 = get_output((rec[layer1],rec[layer2])) #or rec[layer1,layer2].
(C) If you want to get rid of old updates, use get_output(...,accumulate_updates=False)

  default_warn("[Verbose>=%s] %s"%(verbosity_level,message),**kwargs)
  default_warn("[Verb

----
# Actor-Critic part

__Load the list of target words__

In [8]:
# Build the set of target (obscene) words: keep only the lines of the word
# list that consist of exactly one whitespace-separated token.
with open('./obscene_words.txt') as fin:
    target_words = set()
    for line in fin:
        words_in_line = line.strip().split()
        if len(words_in_line) == 1:
            target_words.update(words_in_line)

# target_words = list('.!,?')

# Manual additions to the list.
target_words.add('suck')
target_words.add('goddamn')
target_words.add('motherfucker')
target_words.add('nigger')
target_words.add('nigga')
target_words.add('ass')
target_words.add('crap')
# Manual exclusions. NOTE: set.remove raises KeyError if the word is not in
# the file, so these two lines depend on the contents of obscene_words.txt.
target_words.remove('fucking')
target_words.remove('no')
print("Some examples %s" % list(target_words)[:10])

# Map words to vocabulary indices and drop those that map to UNK_ix.
# Presumably token2idx returns UNK_ix for out-of-vocabulary words (a plain
# dict would raise KeyError here) — TODO confirm.
target_idxs = set(filter(lambda x: x != UNK_ix, [token2idx[w] for w in target_words]))

# Shared copy of the indices, used symbolically by _calc_rewards.
target_idxs_shared = theano.shared(np.array(list(target_idxs)))
print("Number of target words %d" % len(target_idxs))

Some examples ['asswhole', 'moan', 'doggin', 'fistfucks', 'sadism', 'motherfucked', 'Paddy', 'porchmonkey', 'negress', 'boner']
Number of target words 237


**Hack for symbolic computation of rewards**:
1. Suppose that we have a list of tokens = [25,90,102].
2. We also have a generated batch of word indices with shape [batch_size, n_steps].
3. For each word we want to check whether it is equal to any of the tokens in our list.

We can do this using theano (same as numpy) broadcasting:

``T.eq(tokens[None,None,:], batch[:,:,None]).any(-1)``

In [9]:
def _calc_rewards(symbolic_batch):
    """Symbolically compute per-word rewards and the alive mask for a batch.

    symbolic_batch: 2-d symbolic tensor of word indices.

    Returns (rewards, is_alive), both 2-d:
      * rewards  - int32, 1 where the word is one of target_idxs_shared, else 0;
      * is_alive - uint8, 1 up to and including the first EOS_ix of each row,
                   0 for every position after it.
    """
    assert symbolic_batch.ndim == 2

    # Broadcast-compare every word against every target index, then reduce
    # over the target axis.
    is_target_word = T.eq(target_idxs_shared[None, None, :],
                          symbolic_batch[:, :, None]).any(-1)
    rewards = T.cast(is_target_word, 'int32')
    assert rewards.ndim == 2

    # Mark EOS positions and shift the marks one step to the right, so that
    # the EOS token itself still counts as alive.
    eos_hits = T.eq(symbolic_batch, EOS_ix)
    leading_zeros = T.zeros_like(eos_hits[:, :1])
    eos_seen_before = T.concatenate([leading_zeros, eos_hits[:, :-1]], axis=1)

    # A position is alive while no EOS has been seen strictly before it.
    n_eos_so_far = T.cumsum(eos_seen_before, axis=1)
    is_alive = T.eq(n_eos_so_far, 0).astype('uint8')
    return rewards, is_alive

## Critic

* Critic uses decoder hidden+cell vectors as state.
* Critic evaluates state values for `[hid_t, cell_t]` before predicting `word_t`.
* As a reward for being in state `[hid_t, cell_t]` and generating word `word_t` we take 1 if word `word_t` is obscene, 0 otherwise.

* During training, we can compute these rewards trivially (we just need to collect a dictionary of obscene words).
* We use TD updates $V(s) \leftarrow V(s) + \alpha \cdot (R + V(s') - V(s))$; for function approximation, it's equivalent to ``new_weights = old_weights - alpha * grad(MSE(targetV, approxV), old_weights)``

In [20]:
from agentnet.learning.generic import get_n_step_value_reference
from theano.gradient import disconnected_grad

class Critic:
    """State-value estimator on top of the generator's decoder states.

    The class body is used as a namespace: it builds the reward/alive tensors
    for the generated words, a dense network applied at every tick, and two
    compiled prediction functions.
    """
    rewards, is_alive = _calc_rewards(Gen.words_seq)  # both with shape [batch_size, n_steps]
    # Negate: each target (obscene) word yields reward -1, so producing one is penalised.
    rewards = -1 * rewards ## PREVENT FROM BAD WORDS
    
    # Critic state at each tick = decoder cell and hidden vectors concatenated
    # along the feature axis (axis 2).
    critic_input_var = T.concatenate([Gen.dec_cell_seq, Gen.dec_hid_seq], axis=2)
    
    l_dec_cell_seq = InputLayer((None, None, 2*N_LSTM_UNITS), input_var=critic_input_var, name='l_decoder_sequence')
    
    # num_leading_axes=2 keeps the first two axes and applies each dense layer
    # over the last one only.
    l_critic_values = DenseLayer(l_dec_cell_seq, num_units=2048, num_leading_axes=2, name="critic_dense1")
    l_critic_values = DenseLayer(l_critic_values, num_units=1024, num_leading_axes=2, name="critic_dense2")
    l_critic_values = DenseLayer(l_critic_values, num_units=512, num_leading_axes=2, name="critic_dense3")
    # Linear output: one scalar value per tick.
    l_critic_values = DenseLayer(l_critic_values, num_units=1, num_leading_axes=2, 
                                 nonlinearity=None, name='critic_values')
    
    critic_values_seq = get_output(l_critic_values)
    # Now its shape [batch, n_steps, 1]. Reshape it to [batch, n_steps]
    _old_shape = critic_values_seq.shape
    critic_values_seq = critic_values_seq.reshape((_old_shape[0], _old_shape[1]))
    
    # Context tokens -> [generated words, critic values, rewards, is_alive].
    predict = theano.function([Enc.input_phrase], [Gen.words_seq, critic_values_seq, rewards, is_alive],
                              allow_input_downcast=True, no_default_updates=True)
    
    # Evaluates the critic on explicitly supplied decoder cell/hidden
    # sequences, which are used directly as the function inputs.
    predict_values_from_decoder = theano.function([Gen.dec_cell_seq, Gen.dec_hid_seq], critic_values_seq,
                                           allow_input_downcast=True, no_default_updates=True)
    
    # Trainable parameters of the critic's dense layers.
    weights = get_all_params(l_critic_values, trainable=True)

class CriticTrainer:
    """TD training of the critic on sequences sampled from Gen.

    The class body is used as a namespace: it builds the TD target, the
    squared-error loss and a compiled ``train_step``.
    """
    # Number of steps used for the n-step value reference.
    td_n_steps = 1
    
    V_predicted = Critic.critic_values_seq
    
    # Target values built by agentnet from the predicted values, the rewards
    # and the alive mask.
    V_reference = get_n_step_value_reference(state_values=V_predicted,
                                             rewards=Critic.rewards,
                                             is_alive=Critic.is_alive,
                                             n_steps=td_n_steps)
    
    # We must not propagate grads through target value (semi-gradient method).
    V_reference = disconnected_grad(V_reference)
    
    # Squared TD error, summed over ticks and averaged over the batch.
    # NOTE(review): is_alive is only passed to the reference computation; the
    # error itself is summed over all ticks without an explicit mask.
    td_loss = lasagne.objectives.squared_error(V_predicted, V_reference).sum(axis=1).mean()
    # Only the critic's own weights are updated here.
    td_updates = lasagne.updates.adam(td_loss, Critic.weights)
    
    # Context tokens -> [td_loss, V_predicted, V_reference, rewards, is_alive],
    # applying the critic updates.
    train_step = theano.function([Enc.input_phrase], [td_loss, V_predicted, V_reference, Critic.rewards, Critic.is_alive],
                                 updates=td_updates+Gen.recurrence.get_automatic_updates(),
                                 allow_input_downcast=True)
    
    
# Short alias used in the training instructions.
CT = CriticTrainer

## Actor

The actor needs to update the policy (which is defined by `Gen`) using the policy gradient: $\nabla\log\pi(a|s)\cdot A(s,a)$, where $A(s,a)$ is the advantage of being in state $s$ and taking action $a$. In our case, $A(s,a) = R_1+\gamma R_2+\dots+\gamma^nV(s')-V(s)$

In code, the scheme looks like this:
* Take __generated batch__ (actions, needed for computing rewards), corresponding __`dec_cell_seq, dec_hid_seq`__ (states, needed for computing advantage function) and __probabilities of words__ on each tick (policy, we will update it).
* Calculate `rewards` and `is_alive` mask for this batch.
* Calculate state-values using the critic and the target values (n-step TD estimate of the Q-function, i.e. `CriticTrainer.V_reference`). Compute the advantage function and disconnect the gradient through it.

In [21]:
class ActorTrainer:
    """Policy-gradient training of the generator, regularised by the
    teacher-forced log-likelihood loss.

    The class body is used as a namespace: it builds the combined loss
    ``pg_loss + LLH_ALPHA * llh_loss`` and a compiled ``train_step``.
    """
    # Weight of the log-likelihood term in the combined loss.
    # NOTE(review): created from a Python float (unlike TEMPERATURE, which is
    # wrapped in np.float32) — confirm the resulting dtype is intended.
    LLH_ALPHA = theano.shared(5.0)
    
    actions = Gen.words_seq  # shape [batch_size, n_steps]
    _actions_ravel = actions.ravel()  # shape [batch_size*n_steps,]
    
    _word_probs = Gen.words_probs_seq
    _old_shape = _word_probs.shape
    _word_probs = _word_probs.reshape((-1, _old_shape[-1]))  # shape [batch_size*n_steps, vocab_size]
    
    # Probability the policy assigned to each word it actually generated.
    _policy = _word_probs[T.arange(_word_probs.shape[0]), _actions_ravel]  # shape [batch_size*n_steps,]
    
    policy = _policy.reshape((_old_shape[0], _old_shape[1])) # shape [batch_size, n_steps]
    # Advantage = TD target minus the critic's current estimate.
    advantage = CriticTrainer.V_reference-CriticTrainer.V_predicted
    
    # The advantage is treated as a constant so gradients reach only the policy.
    # NOTE(review): log(policy) has no epsilon guard, unlike the +1e-6 used for
    # GenTrain's predicted probabilities.
    pg_loss = (-T.log(policy) * disconnected_grad(advantage)).sum(axis=1).mean()
    llh_loss = GenTrain.llh_loss
    
    loss = pg_loss + LLH_ALPHA * llh_loss
    
    # All generator weights are trained. Alternative that was left disabled:
    #     actor_weights = [param for param in Gen.weights if param not in Enc.weights]
    actor_weights = list(Gen.weights)
    
    grads = T.grad(loss, actor_weights)
    # Rescale gradients so that their total norm does not exceed GRAD_CLIP.
    grads = lasagne.updates.total_norm_constraint(grads, GRAD_CLIP)
    
    policy_updates = lasagne.updates.adam(grads, actor_weights)
    
    # (context tokens, reference answers) ->
    # [pg_loss, llh_loss, policy, actions, advantage, rewards, is_alive],
    # applying the actor updates.
    train_step = theano.function([Enc.input_phrase, GenTrain.reference_answers],
                                 [pg_loss, llh_loss, policy, actions, advantage, Critic.rewards, Critic.is_alive],
                                 updates=policy_updates+GenTrain.recurrence.get_automatic_updates()+Gen.recurrence.get_automatic_updates())
    
# Short alias used in the training instructions.
AT = ActorTrainer

---
## How to train Critic

__How do we train critic?__
* Iterating over batches of data (it will be `Enc.input_phrase`)
* Call `CT.train_step(batch)`.

That's it! Simple.


## How to train Actor-Critic

**Training both actor-critic with policy gradient**:

* Iterating over batches of data.
* Call `pg_loss, llh_loss, policy, actions, advantage, rewards, is_alive = AT.train_step(batch_context, batch_answers)`
* Call `td_loss, _, _, _, _ = CT.train_step(batch_context)`.

## Training Actor-Critic [look at bepolite-experiments.ipynb]