In [1]:
#%env THEANO_FLAGS=device=gpu
%load_ext autoreload
%autoreload 2
import sys
sys.path.append("..")

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline



In [2]:
from utils.southpark import get_conversations

# Read the conversation corpus; the path is relative to the notebook's working directory.
corpus_path = "../../sp.json"
conversations = get_conversations(corpus_path)

In [3]:
!pip install tqdm
!pip install regex

Collecting tqdm
  Downloading tqdm-4.9.0-py2.py3-none-any.whl (42kB)
[K    100% |████████████████████████████████| 51kB 828kB/s 
[?25hInstalling collected packages: tqdm
Successfully installed tqdm-4.9.0
[33mYou are using pip version 8.1.2, however version 9.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [8]:
from utils.preprocessor import Preprocessor

# A preprocessor bundles a tokenizer with a token -> index mapping.
# Here it is built automatically from the corpus
# (max_tokens presumably caps the vocabulary size -- confirm in utils/preprocessor).
preproc = Preprocessor.from_conversations(
    conversations,
    max_tokens=30000,
    verbose=True,
)


#or create manually from tokens + tokenizer


100%|██████████| 7285/7285 [00:09<00:00, 801.24it/s]


22247 out of 22247 tokens, coverage=1.00000)


In [8]:
print len(conversations)

for speaker,phrase in conversations[42]:
    print speaker,'\t',preproc.preprocess_phrase(phrase)

7285
Garrison:
	son of a bitch ! 
Randy:
	what ?  to take one area of town that was rappy and gentrify it for the local people to enjoy ?  i thought we could keep it contained . 
Victoria:
	it doesn ' t contain .  what ' s happened to south park is happening everywhere .   thirty miles south of here in the town of fairplay ,  they ' ve changed the area north od downtown into nodofopa .  a rundown area south of the capital in cheyenne ,  wyoming ,  is now historic socacheywo .  channel street in mid - chicago is being revitalized into chimichanga . 
Randy:
	oh my god . 
Victoria:
	lodo ,  sobro ,  rivmo ,  all happening at the same time .  and it isn ' t just in the u . s .  in cairo ,  the area northwest of the third pyramid is nowe3pi .  three miles north of auschwitz is nomoauchie .  it goes on and on ! 
Randy:
	what does it mean ? 
Garrison:
	in our town it all started when pc principal arrived .  he ' s part of a mjuch larger conspiracy ,  and you ' re his lackey . 
Randy:
	not me 

# model
See models/twoline_model.py and the other model files for more elaborate (and harder to read) model definitions.

In [12]:
#max phrase length
max_len=15

#dictionary size
n_tokens = len(preproc.tokens)

#n lines in context NOT USED IN THIS EXAMPLE
context_size=3


In [13]:
# Network hyper-parameters
word_embedding_size = 128   # width of the token embedding vectors
gru0_units = 512            # number of GRU units (shared size of encoder and decoder state)
grad_clip = 5               # gradient clipping value passed to the recurrent layers

# IMPORTANT
# When True, compilation takes longer but the network runs ~20% faster on most GPUs
# (the recurrent loop is unrolled and optimized).
# Drawback: the training recurrence is then built with a fixed n_steps = max_len,
# so presumably the graph has to be recompiled whenever the sequence length changes.
unroll_scan = False

In [14]:
import theano
import theano.tensor as T

In [15]:
#input phrase goes here
# Index of the PAD token; both masks below are built from it
pad_ix = preproc.token_to_ix["PAD"]

# Symbolic int matrix that receives the input phrase
prev_phrase = T.imatrix('user input line[batch,token_ix]')

# Symbolic size of the first axis of the input
batch_size = prev_phrase.shape[0]

# Non-zero wherever the input holds a real token rather than PAD
prev_phrase_mask = T.neq(prev_phrase, pad_ix)

# Symbolic int matrix that receives the reference answer, and its PAD mask
reference_answer = T.imatrix('reference answer[batch,token_ix]')
reference_mask = T.neq(reference_answer, pad_ix)

# Sampling temperature, kept in a shared variable.
# NOTE(review): not referenced by any other code visible in this notebook.
greed = theano.shared(np.float32(1.), name='decoder greed')


In [19]:
import lasagne
from lasagne.layers import InputLayer,DenseLayer,GRULayer,EmbeddingLayer
from hsoftmaxplayer import HierarchicalSoftmaxDenseLayer

In [17]:
###ENCODER

#class used as a dictionary, not actual class
class encoder:
    """Encoder network: embeds the input phrase and runs a masked GRU over it.

    The class is only a namespace for the layers below; `output` is what the
    decoder recurrences use as their initial GRU state.
    """
    # token indices of the input phrase (symbolic variable prev_phrase)
    l_in = InputLayer((None,None),prev_phrase,name='prev phrase input')
    # token index -> embedding vector of size word_embedding_size
    l_emb = EmbeddingLayer(l_in,n_tokens,word_embedding_size, name="prev phrase embedding")
    # mask built from prev_phrase_mask so that PAD positions are ignored by the GRU
    l_mask = InputLayer((None,None),prev_phrase_mask,'prev phrase mask')
    # recurrent layer over the embedded phrase; only_return_final=True keeps
    # just the last output instead of the whole sequence
    l_gru0 = GRULayer(l_emb,
                      gru0_units,
                      name='gru0',
                      grad_clipping=grad_clip,
                      mask_input = l_mask,
                      only_return_final=True)
    # alias under which the rest of the notebook refers to the encoder result
    output=l_gru0
    

In [20]:
from agentnet.memory import GRUCell
from agentnet.resolver import ProbabilisticResolver
from agentnet.agent import Recurrence

###one-step update of decoder recurrence. Used for both training and generation.
class decoder_step:
    """One step of the decoder: previous word + previous GRU state -> next word.

    The class is only a namespace for layers. The same step is wired into two
    recurrences: `training` (fed with reference tokens) and `generative`
    (fed with its own previous output).
    """
    

    # index of the previously emitted word, one per batch element
    prev_word = InputLayer((None,),name='decoder prev output inp')

    # embedding of the previous word; the weight matrix is shared with the
    # encoder embedding (W=encoder.l_emb.W)
    prev_output_emb = EmbeddingLayer(prev_word,
                                     n_tokens,
                                     word_embedding_size,
                                     W=encoder.l_emb.W,
                                     name='decoder prev output emb')

    #previous GRU state goes here
    l_prev_gru0 = InputLayer([None,gru0_units],name='decoder prev gru0')

    #gru update
    l_gru0 = GRUCell(l_prev_gru0,prev_output_emb,name='decoder gru0',grad_clipping=grad_clip)


    #predicted probabilities of next word
    #NOTE(review): no temperature is applied here -- the shared variable `greed`
    #is not referenced by this layer
    next_word_probas = HierarchicalSoftmaxDenseLayer(l_gru0,n_tokens)
    

    #actual next word picked with output probabilities
    next_word = ProbabilisticResolver(next_word_probas,
                                       assume_normalized=False,
                                       name='decoder next letter picker')
        


In [21]:
from hsoftmaxplayer import HierarchicalSoftmaxDenseLayer

In [22]:

###Here we use encoder and decoder_step in the training mode, i.e. feed it with REFERENCE tokens
#and ask to predict next reference tokens
class training:
    """Training graph: runs decoder_step over the reference answer.

    The decoder is fed the reference answer shifted right by one position and
    is asked to predict the token at each position. The class is only a
    namespace; `step` is the compiled function called by the training loop.
    """
    
    #get previous reference_answers by shifting tensor to the right by 1 tick (e.g. at t=5, prev is t=4)
    #and prepending an EOS token as the first input (one EOS index per batch row)
    padding = T.repeat(T.constant(preproc.token_to_ix["EOS"],dtype='int32'),
                       reference_answer.shape[0])

    # EOS column first, then the reference answer without its last column
    prev_reference_answer = T.concatenate([padding[:,None],reference_answer[:,:-1]],axis=1)


    l_prev_answers_reference = InputLayer((None,None),prev_reference_answer)

    # decoder GRU state is carried between steps and initialised from the encoder;
    # the previous word at each step comes from the shifted reference answer
    recurrence = Recurrence(state_variables={decoder_step.l_gru0 : decoder_step.l_prev_gru0},
                            input_sequences={decoder_step.prev_word : l_prev_answers_reference}, 
                            state_init = {decoder_step.l_gru0 : encoder.output}, #<- encoder added here
                            tracked_outputs=(decoder_step.next_word_probas,decoder_step.next_word), 
                            unroll_scan = unroll_scan,
                            n_steps = max_len if unroll_scan else None,
                            )

    # sequence layers for the state and for both tracked outputs;
    # only probas_seq is used for the loss below
    state_seqs, (probas_seq,output_tokens_seq) = recurrence.get_sequence_layers()


    #symbolic output sequences
    next_token_probas = lasagne.layers.get_output(probas_seq)

    # flatten predictions to [-1, n_tokens] and targets to a vector, compute
    # crossentropy per token, then restore the shape of reference_answer
    elementwise_ce = lasagne.objectives.categorical_crossentropy(next_token_probas.reshape([-1,n_tokens]),
                                                                 reference_answer.ravel()
                                                                ).reshape(reference_answer.shape)


    #mean crossentropy over non-PAD positions only (reference_mask zeroes out PAD)
    loss = (elementwise_ce * reference_mask).sum() / reference_mask.sum()


    #all network weights
    params = lasagne.layers.get_all_params(recurrence,trainable=True)

    #adam one-step weight updates
    updates = lasagne.updates.adam(loss,params)


    # compiled train step: (input phrases, reference answers) -> loss;
    # applies the adam updates together with the recurrence's automatic updates
    step = theano.function([prev_phrase,reference_answer],loss,
                                updates=theano.OrderedUpdates(updates)+recurrence.get_automatic_updates()
                                )



Make sure it is always above <unspecified>(n_steps) you specified for recurrence
  "Make sure it is always above {}(n_steps) you specified for recurrence".format(n_steps or "<unspecified>"))


In [23]:
### Use the same decoder to generate tokens.
# Do this by feeding its output back as its input
from collections import OrderedDict

class generative:
    """Generation graph: runs decoder_step on its own output for max_len steps.

    The class is only a namespace. `apply_fun` is the compiled function that
    maps a batch of input phrases to the generated token indices.
    """

    # Both the GRU state and the emitted word are fed back into the next step.
    # The OrderedDict is built from a list of pairs on purpose: passing a dict
    # literal would hand OrderedDict an already unordered mapping in Python 2,
    # so the declared order would not be guaranteed.
    recurrent_states = OrderedDict([
        (decoder_step.l_gru0, decoder_step.l_prev_gru0),
        (decoder_step.next_word, decoder_step.prev_word),
    ])

    # batch_size is taken directly from the symbolic input rather than from the
    # global name `batch_size`: that name is rebound to a plain integer by the
    # training configuration cell, so re-running this cell afterwards would
    # otherwise compile the generator for a fixed batch size.
    recurrence = Recurrence(state_variables=recurrent_states,
                            state_init = {decoder_step.l_gru0 : encoder.output},
                            tracked_outputs=(decoder_step.next_word_probas,decoder_step.next_word),
                            batch_size=prev_phrase.shape[0],
                            n_steps = max_len,
                            unroll_scan=unroll_scan,
                            )

    # sequence layers for the recurrent states and for both tracked outputs
    state_seqs, (probas_seq,output_tokens_seq) = recurrence.get_sequence_layers()

    # symbolic sequence of generated token indices
    reply_tokens = lasagne.layers.get_output(output_tokens_seq)

    # compiled generator: input phrases -> generated token indices;
    # the recurrence's automatic updates are applied on every call
    apply_fun = theano.function([prev_phrase],reply_tokens,
                                 updates=recurrence.get_automatic_updates())



In [67]:
#If you use separate outputs for context, don't forget about them here
from warnings import warn
def reply(input_phrase,max_len=None):
    """Generate the model's answer to a single phrase.

    input_phrase -- the phrase to answer, handed to preproc.phrase_to_ix
    max_len      -- forwarded unchanged to preproc.phrase_to_ix

    Returns whatever preproc.ix_to_phrase produces for the generated indices.
    """
    token_ids = preproc.phrase_to_ix(input_phrase, max_len=max_len)

    # the compiled generator works on batches: wrap the single phrase in a
    # one-element batch and keep only the first generated row
    generated_batch = generative.apply_fun([token_ids])
    first_reply = generated_batch[0]

    return preproc.ix_to_phrase(first_reply)


In [73]:
#untrained reply dummy (aka interactive mode): ask the model a question and print its answer
#NOTE(review): the execution counters show this cell was last run after the training cell,
#so the saved output below may not come from an untrained model
print reply("What do you think about bidirectional networks?")

can , hey ! - re !


# training

In [69]:
### Get all tuples (context,input phrase, reference answer)
from utils.generate import get_samples_with_context,iterate_minibatches

#preprocess and tokenize all phrases;
convs_ix = preproc.preprocess_conversations(conversations,verbose=True,max_len=max_len)

#get all batches of [phrase context,previous phrase, next phrase]
contexts,prev_phrases,reference_answers = get_samples_with_context(convs_ix,
                                                                   context_window_size=context_size,
                                                                   padder=preproc.token_to_ix["PAD"],
                                                                   speaker_filter = lambda s1,s2: True).swapaxes(0,1)
#cast to int32
contexts,prev_phrases,reference_answers = map(lambda a: np.stack(a).astype('int32'),
                                              (contexts,prev_phrases,reference_answers))


print "Tuple shapes:"
print contexts.shape,prev_phrases.shape,reference_answers.shape

100%|██████████| 7285/7285 [00:09<00:00, 730.05it/s]


Tuple shapes:
(64711, 3, 15) (64711, 15) (64711, 15)


In [70]:
# Number of samples per minibatch.
# NOTE: this rebinds `batch_size`, which an earlier cell bound to the symbolic
# prev_phrase.shape[0]; from here on the name is a plain integer.
batch_size = 32

# Number of full passes over the training samples
n_epochs = 100

In [71]:
# One loss value is appended per training minibatch
loss_history = []

# Progress counter used by the training loop. Despite its name it is
# incremented once per minibatch, not once per epoch. Starts at 1.
epoch_counter = 1

In [72]:
for _ in range(n_epochs):
    
    for b_x,b_y in iterate_minibatches([prev_phrases,reference_answers],batch_size,shuffle=True):
    
        loss_history.append(training.step(b_x,b_y))
        
        epoch_counter +=1
        
        if epoch_counter %25==0:
            print epoch_counter,'iterations...'
            
        if epoch_counter %100==0:
            plt.plot(loss_history);plt.show()
            ph =preproc.ix_to_phrase(b_x[0])
            print 'A:', ph
            print 'B:', reply(ph)
            print 'B true:',preproc.ix_to_phrase(b_y[0])

    print "beginning new loop..."
        

25 iterations...
50 iterations...


KeyboardInterrupt: 