# Bi-directional Recurrent Neural Networks


In [4]:
import sys, os
from numpy import *
from matplotlib.pyplot import *
%matplotlib inline
matplotlib.rcParams['savefig.dpi'] = 100

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
from brnnlm import BRNNLM
# Run the model's gradient check on a small random problem
# (10 x 50 dummy word vectors) so that it finishes quickly.
random.seed(10)  # fixed seed: the dummy vectors are the same on every run
wv_dummy = random.randn(10, 50)
model = BRNNLM(L0=wv_dummy, U0=wv_dummy, alpha=0.005)
check_x = array([1, 2, 3, 4])
check_y = array([2, 3, 4])
model.grad_check(check_x, check_y)

grad_check: dJ/dU error norm = 7.462e-10 [ok]
    U dims: [10, 100] = 1000 elem
grad_check: dJ/dRH error norm = 3.401e-10 [ok]
    RH dims: [50, 50] = 2500 elem
grad_check: dJ/dLH error norm = 3.58e-10 [ok]
    LH dims: [50, 50] = 2500 elem
grad_check: dJ/dRL[3] error norm = 3.151e-10 [ok]
    RL[3] dims: [50] = 50 elem
grad_check: dJ/dRL[4] error norm = 4.649e-10 [ok]
    RL[4] dims: [50] = 50 elem
grad_check: dJ/dLL[2] error norm = 3.063e-10 [ok]
    LL[2] dims: [50] = 50 elem
grad_check: dJ/dLL[1] error norm = 4.79e-10 [ok]
    LL[1] dims: [50] = 50 elem


## Prepare Vocabulary and Load PTB Data

We've pre-prepared a list of the vocabulary in the Penn Treebank, along with their absolute counts and unigram frequencies. The document loader code below will "canonicalize" words and replace any unknowns with a `"UUUNKKK"` token, then convert the data to lists of indices.

In [6]:
from data_utils import utils as du
import pandas as pd

# Load the vocabulary table: the word is the row index, and each row carries
# the word's absolute count and its unigram frequency.
# The separator is written as a raw string so the backslash in the regex is
# not parsed as a (deprecated, invalid) string escape; the pattern passed to
# pandas is unchanged.
vocab = pd.read_table("data/lm/vocab.ptb.txt", header=None, sep=r"\s+",
                      index_col=0, names=['count', 'freq'])

# Choose how many top words to keep
vocabsize = 8000
# index -> word for the first `vocabsize` rows of the table ...
num_to_word = dict(enumerate(vocab.index[:vocabsize]))
# ... and the reverse lookup, word -> index
word_to_num = du.invert_dict(num_to_word)
##
# Below needed for 'adj_loss': DO NOT CHANGE
# Numerator: total count of the words that are neither in the retained
# vocabulary (word_to_num) nor the "UUUNKKK" placeholder itself.
fraction_lost = float(sum([vocab['count'][word] for word in vocab.index
                           if (not word in word_to_num) 
                               and (not word == "UUUNKKK")]))
# Denominator: total count of every word except "UUUNKKK", so fraction_lost
# is the share of those counts that falls outside the retained vocabulary.
fraction_lost /= sum([vocab['count'][word] for word in vocab.index
                      if (not word == "UUUNKKK")])
# Report the retained share as a percentage: 100 * (1 - fraction_lost).
print "Retained %d words from %d (%.02f%% of all tokens)" % (vocabsize, len(vocab),
                                                             100*(1-fraction_lost))

Retained 8000 words from 38444 (94.54% of all tokens)


In [7]:
# Spot check: show the index that word_to_num assigns to the word 'birth'.
word_to_num['birth']

6737

Load the datasets, using the vocabulary in `word_to_num`. Our starter code handles this for you, and also generates lists of lists X and Y, corresponding to input words and target words*. 

*(Of course, the target words are just the input words, shifted by one position, but it can be cleaner and less error-prone to keep them separate.)*

In [8]:
def _load_split(path):
    """Load one dataset file and return (docs, index sequences, X, Y).

    The documents are converted to index sequences with `word_to_num`, and
    those sequences are then split into input/target lists.
    """
    split_docs = du.load_dataset(path)
    split_seqs = du.docs_to_indices(split_docs, word_to_num)
    split_X, split_Y = du.seqs_to_lmXY(split_seqs)
    return split_docs, split_seqs, split_X, split_Y

# Training set
docs, S_train, X_train, Y_train = _load_split('data/lm/ptb-train.txt')

# Dev set (for tuning hyperparameters)
docs, S_dev, X_dev, Y_dev = _load_split('data/lm/ptb-dev.txt')

# Test set (final evaluation only)
docs, S_test, X_test, Y_test = _load_split('data/lm/ptb-test.txt')

# Display some sample data. `docs` was last assigned from the test file,
# so this prints a sentence from the test set.
print(" ".join(d[0] for d in docs[7]))

Big investment banks refused to step up to the plate to support the beleaguered floor traders by buying big blocks of stock , traders say .


In [9]:
# Show training example 10 as words: first the inputs, then the targets.
for index_seq in (X_train[10], Y_train[10]):
    print([num_to_word[idx] for idx in index_seq])

['<s>', 'we', "'re", 'talking', 'about', 'years', 'ago', 'before', 'anyone', 'heard', 'of', 'asbestos', 'having', 'any', 'questionable', 'properties', '.']
['we', "'re", 'talking', 'about', 'years', 'ago', 'before', 'anyone', 'heard', 'of', 'asbestos', 'having', 'any', 'questionable', 'properties', '.', '</s>']


## Train and evaluate your model

When you're able to pass the gradient check, let's run our model on some real language!

You should randomly initialize the word vectors as Gaussian noise, i.e. $W_{ij} \sim \mathit{N}(0,0.1)$; the function `random.randn` may be helpful here.

In [10]:
hdim = 100 # dimension of hidden layer = dimension of word vectors
# All-zero array of shape (vocabsize, hdim), passed to the model constructor below.
# NOTE(review): the instructions above ask for Gaussian-noise word vectors.
# With this cell left as zeros, that only happens if the model's __init__
# performs the random initialisation itself — confirm in brnnlm.py.
L0 = zeros((vocabsize, hdim)) # replace with random init, 
                              # or do in RNNLM.__init__()

In [11]:
#### YOUR CODE HERE ####

##
# Optionally work on a prefix of the training data, for speed: lower `ntrain`
# to keep only the first `ntrain` examples (as written, everything is kept).
ntrain = len(Y_train)
X, Y, S = (split[:ntrain] for split in (X_train, Y_train, S_train))

def randomSchedule(n):
    i = 0
    while i < n:
        i += 1
        yield random.randint(0, len(Y))
        
def randomMinibatchSchedule(n, k, num_examples=None):
    """Yield floor(n / k) minibatches, each a list of `k` random example indices.

    n: total number of examples to cover across all minibatches.
    k: minibatch size (must be non-zero).
    num_examples: exclusive upper bound for the indices. When omitted, the
        length of the notebook-level list `Y` is used, as before.
    """
    if num_examples is None:
        # Backward-compatible default: index into the current training slice.
        num_examples = len(Y)
    # Explicit floor division keeps the batch count identical on Python 2
    # and Python 3 (where `/` would produce a float).
    num_batches = n // k
    made = 0
    while made < num_batches:
        made += 1
        # numpy's randint excludes the upper bound, so indices stay valid.
        yield [random.randint(0, num_examples) for _ in range(k)]
        
def annealingAlpha(n, alpha, tau):
    """Yield `n` learning rates following alpha * tau / max(i, tau), i = 0..n-1.

    While i <= tau the denominator is tau, so the rate stays at `alpha`
    (up to floating-point rounding); after that it decays as alpha * tau / i.

    n: number of rates to produce.
    alpha: initial learning rate.
    tau: step index after which the rate starts to decay (must be > 0 so the
        denominator is never zero at i = 0).
    """
    # Force float arithmetic so that integer `alpha` and `tau` do not truncate
    # the rate to 0 under Python 2 integer division.
    base = float(alpha)
    for step in range(n):
        yield base * tau / max(step, tau)


#### END YOUR CODE ####

In [12]:
# Number of training examples kept after the optional pare-down above.
print(ntrain)

56522


In [13]:
## Train the model with SGD on randomly chosen training sequences.
## (The dev-set loss is computed in the next cell and converted to
## perplexity for your writeup further down.)

# n is the number of random indices handed to SGD
# (use n = ntrain to draw as many indices as there are training examples).
n = 40000
# presumably a minibatch size for randomMinibatchSchedule; not used by the
# randomSchedule run below
k = 5
model = BRNNLM(L0, U0=L0, alpha=0.1)
# train on S not X, Y is ignored
sgd_schedule = randomSchedule(n)
model.train_sgd(X=S, y=Y, idxiter=sgd_schedule, costevery=2000)

Begin SGD...
  Seen 0 in 0.00 s
SGD Interrupted: saw 0 examples in 48.25 seconds.


[]

In [14]:
# Mean loss on the dev set. As with training, the index sequences S_dev are
# passed as the input (compute loss on S not X, Y is ignored).
dev_loss = model.compute_mean_loss(S_dev, Y_dev)
print(dev_loss)

KeyboardInterrupt: 

### Save Model Parameters

In [None]:
# Save the trained model's parameters. No destination is passed here, so the
# location/name is whatever save_parameters chooses by default.
model.save_parameters()

The performance of the model is skewed somewhat by the large number of `UUUNKKK` tokens; if these are 1/6 of the dataset, then that's a sizeable fraction that we're just waving our hands at. Naively, our model gets credit for these that's not really deserved; the formula below roughly removes this contribution from the average loss. Don't worry about how it's derived, but do report both scores - it helps us compare across models with different vocabulary sizes.

In [None]:
## DO NOT CHANGE THIS CELL ##
# Report your numbers, after computing dev_loss above.
def adjust_loss(loss, funk):
    """Return (loss + funk * log(funk)) / (1 - funk).

    loss: the mean loss to adjust (called below with the dev-set loss).
    funk: a fraction strictly between 0 and 1 (called below with
        `fraction_lost`); 0 or 1 would make log(funk) or the division
        ill-defined.
    """
    return (loss + funk * log(funk))/(1 - funk)
# Both scores are reported as exp(loss): once for the raw dev loss, once for
# the loss adjusted with fraction_lost.
print "Unadjusted: %.03f" % exp(dev_loss)
print "Adjusted for missing vocab: %.03f" % exp(adjust_loss(dev_loss, fraction_lost))

## Generating Data

Once you've trained your model to satisfaction, let's use it to generate some sentences!

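Implement the `generate_sequence` function in `rnnlm.py`, and call it below. (The cells below use the bidirectional model from `brnnlm.py` instead: its `generate_rank_missing` method takes the words before and after a gap and returns candidate completed sentences together with their scores.)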

In [15]:
def seq_to_words(seq):
    """Translate a sequence of word indices into the list of matching words.

    Each index is looked up in the notebook-level `num_to_word` mapping; an
    index that is not a key of that mapping raises KeyError.
    """
    lookup = num_to_word.__getitem__
    return list(map(lookup, seq))


In [16]:
from brnnlm import BRNNLM
# Build a second model from L0 and load previously saved parameters into it.
# NOTE(review): the saved-parameter name appears to encode hdim=100, an
# 8000-word vocabulary and alpha=0.1 — confirm it matches the settings used
# in this notebook before relying on the results.
model2 = BRNNLM(L0)
model2.load_parameters('hdim_100_vdim_8000_alpha_01')


In [33]:
# Words on either side of the gap that the model is asked to fill.
left_context = ["<s>", "he"]
right_context = ["the", "president", "of", "united", "states", ".", "</s>"]
before = [word_to_num[w] for w in left_context]
after = [word_to_num[w] for w in right_context]
seqs, Js, Ps = model2.generate_rank_missing(before, after, nres=5)
# One returned sequence per line, as words, followed by its entry in Ps.
for rank, candidate in enumerate(seqs):
    print("%s %s" % (" ".join(seq_to_words(candidate)), Ps[rank]))

<s> he UUUNKKK the president of united states . </s> 0.222886916972
<s> he is the president of united states . </s> 0.0587065569618
<s> he had the president of united states . </s> 0.0483964430696
<s> he said the president of united states . </s> 0.0477455269321
<s> he has the president of united states . </s> 0.0437658615277
<s> he was the president of united states . </s> 0.0391759877154
<s> he believes the president of united states . </s> 0.027727599905
<s> he says the president of united states . </s> 0.0145754680211
<s> he includes the president of united states . </s> 0.0141082533531
<s> he expects the president of united states . </s> 0.012693762056


**BONUS:** Use the unigram distribution given in the `vocab` table to fill in any `UUUNKKK` tokens in your generated sequences with words that we omitted from the vocabulary. You'll want to use `list(vocab.index)` to get a list of words, and `vocab.freq` to get a list of corresponding frequencies.

In [None]:
# Replace UUUNKKK with a random unigram,
# drawn from vocab that we skipped
from nn.math import MultinomialSampler, multinomial_sample
def fill_unknowns(words):
    """Replace every 'UUUNKKK' entry of `words` with a sampled skipped word.

    Replacements are drawn only from the words that were left out of the
    retained vocabulary (not in `word_to_num`, and not "UUUNKKK" itself),
    with probability proportional to their `vocab.freq` value.

    words: list of word strings; it is modified in place.
    Returns the same list. If there is no skipped word with positive total
    frequency to draw from, the list is returned unchanged.
    """
    #### YOUR CODE HERE ####
    # Row positions of the skipped words, using the same membership test that
    # the notebook applies when computing `fraction_lost`.
    skipped_rows = [row for row, word in enumerate(vocab.index)
                    if word not in word_to_num and word != "UUUNKKK"]
    if not skipped_rows:
        return words
    weights = vocab.freq.values[skipped_rows]
    total = float(weights.sum())
    if total <= 0:
        return words
    # Renormalise so the sampler receives a distribution over the skipped
    # words only (the raw frequencies of this subset do not sum to 1).
    probs = weights / total
    for i in range(len(words)):
        if 'UUUNKKK' == words[i]:
            row = skipped_rows[multinomial_sample(probs)]
            words[i] = vocab.index[row]
    #### END YOUR CODE ####
    return words
    
# `seq` is never defined in this notebook, so the original line fails with a
# NameError on a fresh kernel. Use the first sequence returned by
# generate_rank_missing in the cell above instead.
print(" ".join(fill_unknowns(seq_to_words(seqs[0]))))
