# Bi-directional Recurrent Neural Networks


In [1]:
import sys, os
from numpy import *
from matplotlib.pyplot import *

%matplotlib inline
matplotlib.rcParams['savefig.dpi'] = 100

%load_ext autoreload
%autoreload 2

In [2]:
from brnnlm import BRNNLM
# Gradient check on toy data, for speed
# Fixed seed so the dummy vectors (and the reported error norms) are repeatable.
random.seed(10)
# Toy 10 x 50 matrix, passed below as both the L0 and the U0 argument.
wv_dummy = random.randn(10,50)
model = BRNNLM(L0 = wv_dummy, U0 = wv_dummy,
              alpha=0.005)
# NOTE(review): the two arrays are presumably word indices into the 10 toy rows;
# the second is one element shorter than the first -- confirm this is what
# grad_check expects.
model.grad_check(array([1,2,3,4]), array([2,3,4]))

grad_check: dJ/dU error norm = 9.6e-10 [ok]
    U dims: [10, 100] = 1000 elem
grad_check: dJ/dRH error norm = 4.689e-10 [ok]
    RH dims: [50, 50] = 2500 elem
grad_check: dJ/dLH error norm = 7.747e-10 [ok]
    LH dims: [50, 50] = 2500 elem


## Prepare Vocabulary and Load PTB Data

We've pre-prepared a list of the vocabulary in the Penn Treebank, along with their absolute counts and unigram frequencies. The document loader code below will "canonicalize" words and replace any unknowns with a `"UUUNKKK"` token, then convert the data to lists of indices.

In [3]:
import data_utils.ner as ner
# Load two sets of pretrained word vectors (Twitter and Wiki). Each result is
# unpacked into a vector matrix plus word->index and index->word lookups
# (roles inferred from how the names are used later in the notebook --
# confirm against ner.load_wv).
wvT, word_to_numT, num_to_wordT = ner.load_wv('data/vocabTwitter.txt',
                                           'data/wordVectorTwitter.txt')
wvW, word_to_numW, num_to_wordW = ner.load_wv('data/vocabWiki.txt',
                                              'data/wordVectorWiki.txt')

In [4]:
def wordvector_neighbors(idxs, wordVecs, num_to_word, n=10):
    """For each index in idxs, list the words whose vectors lie closest to it.

    Closeness is the squared Euclidean distance between rows of wordVecs.
    Each result holds up to n+1 words ordered nearest-first, so the query
    word itself (distance zero) is part of its own result.

    idxs        -- iterable of row indices into wordVecs
    wordVecs    -- 2-D array with one word vector per row
    num_to_word -- mapping from row index to word
    n           -- number of neighbours to return in addition to the query

    Returns a list with one list of words per entry of idxs.
    """
    def nearest_words(query_idx):
        offsets = wordVecs - wordVecs[query_idx]
        sq_dists = square(offsets).sum(axis=1)
        ranked = sq_dists.argsort()[:n+1]
        return [num_to_word[j] for j in ranked]

    return [nearest_words(idx) for idx in idxs]

In [5]:
print wordvector_neighbors([word_to_numT['obama']], wvT, num_to_wordT)
print wordvector_neighbors([word_to_numW['obama']], wvW, num_to_wordW)

[['obama', 'romney', 'barack', 'president', 'clinton', 'hillary', 'potus', 'biden', 'says', 'bill', 'bush']]
[['obama', 'barack', 'bush', 'clinton', 'mccain', 'dole', 'gore', 'hillary', 'rodham', 'kerry', 'biden']]


In [6]:
from data_utils import utils as du
import pandas as pd

# Load the vocabulary
vocab = pd.read_table("data/lm/vocab.ptb.txt", header=None, sep="\s+",
                     index_col=0, names=['count', 'freq'], )

# Choose how many top words to keep
vocabsize = 8000
        
# Keep only those of the first `vocabsize` PTB vocab entries that also have a
# Wiki word vector.
# NOTE(review): the list is produced from a set, so the word order (and hence
# every index assigned below) follows set iteration order. Saved model
# parameters are tied to that order -- confirm it is stable across environments.
words = list(set(vocab.index[:vocabsize]).intersection(set(word_to_numW.keys())))
# The special tokens are appended last, so they occupy the final three indices.
words.append('UUUNKKK')
words.append('<s>')
words.append('</s>')
# vocabsize is rebound from the requested cutoff to the actual vocabulary size.
vocabsize = len(words)
num_to_word = dict(enumerate(words))
word_to_num = du.invert_dict(num_to_word)
##
# Below needed for 'adj_loss': DO NOT CHANGE
fraction_lost = float(sum([vocab['count'][word] for word in vocab.index
                           if (not word in word_to_num) 
                               and (not word == "UUUNKKK")]))
fraction_lost /= sum([vocab['count'][word] for word in vocab.index
                      if (not word == "UUUNKKK")])
print "Retained %d words from %d (%.02f%% of all tokens)" % (vocabsize, len(vocab),
                                                             100*(1-fraction_lost))

Retained 7861 words from 38444 (91.76% of all tokens)


Load the datasets, using the vocabulary in `word_to_num`. Our starter code handles this for you, and also generates lists of lists X and Y, corresponding to input words and target words*. 

*(Of course, the target words are just the input words, shifted by one position, but it can be cleaner and less error-prone to keep them separate.)*

In [7]:
# Load the training set
# Each split follows the same three steps: read the documents, convert them to
# index sequences with word_to_num, then derive the X / Y sequences from those.
docs = du.load_dataset('data/lm/ptb-train.txt')
S_train = du.docs_to_indices(docs, word_to_num)
X_train, Y_train = du.seqs_to_lmXY(S_train)

# Load the dev set (for tuning hyperparameters)
docs = du.load_dataset('data/lm/ptb-dev.txt')
S_dev = du.docs_to_indices(docs, word_to_num)
X_dev, Y_dev = du.seqs_to_lmXY(S_dev)

# Load the test set (final evaluation only)
docs = du.load_dataset('data/lm/ptb-test.txt')
S_test = du.docs_to_indices(docs, word_to_num)
X_test, Y_test = du.seqs_to_lmXY(S_test)

# Display some sample data
# `docs` was last rebound to the test split, so this prints a test-set
# sentence; d[0] is the first field of each entry of docs[7].
print " ".join(d[0] for d in docs[7])

Big investment banks refused to step up to the plate to support the beleaguered floor traders by buying big blocks of stock , traders say .


In [8]:
print [num_to_word[x] for x in X_train[10]]
print [num_to_word[y] for y in Y_train[10]]

['<s>', 'we', "'re", 'talking', 'about', 'years', 'ago', 'before', 'anyone', 'heard', 'of', 'asbestos', 'having', 'any', 'questionable', 'properties', '.']
['we', "'re", 'talking', 'about', 'years', 'ago', 'before', 'anyone', 'heard', 'of', 'asbestos', 'having', 'any', 'questionable', 'properties', '.', '</s>']


## Train and evaluate your model

Once your model passes the gradient check, let's run it on some real language!

You should randomly initialize the word vectors as Gaussian noise, i.e. $W_{ij} \sim \mathcal{N}(0, 0.1)$ (mean 0, variance 0.1); the function `random.randn` may be helpful here (it draws unit-variance samples, so scale its output by $\sqrt{0.1}$).

In [9]:
# Seed the global RNG so the three randomly initialised rows below are repeatable.
random.seed(10)
hdim = 50 # dimension of hidden layer = dimension of word vectors
L0 = zeros((vocabsize, hdim))
# Every row except the last three is copied from the pretrained Wiki vector of
# the same word (index i -> word -> row of wvW).
for i in xrange(vocabsize-3):
    L0[i] = wvW[word_to_numW[num_to_word[i]]]
# The last three rows (the special tokens appended at the end of the vocabulary)
# are filled with Gaussian noise scaled by sqrt(0.1), i.e. variance 0.1,
# rather than taken from wvW.
L0[-3] = random.randn(hdim) * sqrt(0.1) # UUUNKKK
L0[-2] = random.randn(hdim) * sqrt(0.1) # <s>
L0[-1] = random.randn(hdim) * sqrt(0.1) # </s>

In [10]:
#### YOUR CODE HERE ####

##
# Pare down to a smaller dataset, for speed (optional)
# ntrain is currently the full training-set size, so the slices below keep
# everything; lower ntrain to train on a prefix of the data.
# Y is also read as a module-level global by the schedule generators defined
# later in this cell.
ntrain = len(Y_train)
X = X_train[:ntrain]
Y = Y_train[:ntrain]
S = S_train[:ntrain]

def randomSchedule(n):
    """Generate n random training-example indices, one per iteration.

    Every index is obtained from random.randint(0, len(Y)), so the range is
    taken from the module-level Y at the moment each value is drawn
    (`random` here is the one brought in by the numpy star import, so the
    upper bound is presumably exclusive).
    """
    remaining = n
    while remaining > 0:
        remaining -= 1
        yield random.randint(0, len(Y))
        
def randomMinibatchSchedule(n, k):
    """Generate minibatches of k random training-example indices.

    The number of batches is governed by n / k, evaluated with the
    interpreter's default division. Each batch is a list of k values from
    random.randint(0, len(Y)), where Y is the module-level target list.
    """
    batches_left = n / k
    while batches_left > 0:
        batches_left -= 1
        batch = []
        for _ in xrange(k):
            batch.append(random.randint(0, len(Y)))
        yield batch
        
def annealingAlpha(n, alpha, tau):
    """Generate n learning rates following an annealing schedule.

    The i-th value (i = 0 .. n-1) is alpha * tau / max(i, tau): the rate stays
    at alpha for the first tau steps and then decays proportionally to 1/i.

    n     -- number of rates to generate
    alpha -- initial learning rate
    tau   -- step from which the decay begins

    A ZeroDivisionError is raised on the first value when tau is 0, because
    max(0, 0) is then the divisor.
    """
    i = 0
    while i < n:
        # Divide by a float so integer alpha/tau are not truncated by
        # Python 2 integer division (which would collapse the rate to 0).
        yield alpha * tau / float(max(i, tau))
        i += 1


#### END YOUR CODE ####

In [11]:
print ntrain

56522


In [42]:
## Train the model with SGD; the dev-set cross-entropy loss (to be converted
## to perplexity for your writeup) is computed in the following cell.

#n = ntrain
# Number of SGD examples to draw from the random schedule.
n = 40000
# NOTE(review): k is not used in this cell -- presumably the batch size
# intended for randomMinibatchSchedule.
k = 5
# The same matrix L0 is supplied for both the L0 and U0 arguments.
model = BRNNLM(L0, U0 = L0, alpha=0.1)
# train on S not X, Y is ignored
model.train_sgd(X=S, y=Y, idxiter=randomSchedule(n),costevery=2000)

Begin SGD...
  Seen 0 in 0.00 s
  [0]: mean loss 9.85084
  [2000]: mean loss 5.62363
  [4000]: mean loss 5.44422
  [6000]: mean loss 5.35979
  [8000]: mean loss 5.45166
  Seen 10000 in 4901.68 s
  [10000]: mean loss 5.26666
  [12000]: mean loss 5.21326
  [14000]: mean loss 5.23927
  [16000]: mean loss 5.24895
  [18000]: mean loss 5.12286
  Seen 20000 in 9606.30 s
  [20000]: mean loss 5.16385
  [22000]: mean loss 5.08791
  [24000]: mean loss 5.10743
  [26000]: mean loss 5.09146
  [28000]: mean loss 5.06347
  Seen 30000 in 14313.09 s
  [30000]: mean loss 5.13498
  [32000]: mean loss 5.00622
  [34000]: mean loss 5.01644
  [36000]: mean loss 4.99323
  [38000]: mean loss 5.00177
  [40000]: mean loss 5.02402
SGD complete: 40000 examples in 19845.38 seconds.


[(0, 9.8508386909235757),
 (2000, 5.6236279208727389),
 (4000, 5.4442216623425219),
 (6000, 5.3597933901338584),
 (8000, 5.4516570695163953),
 (10000, 5.2666633991930523),
 (12000, 5.2132622515440685),
 (14000, 5.2392709906749495),
 (16000, 5.248954177129594),
 (18000, 5.1228569123401702),
 (20000, 5.1638487274959477),
 (22000, 5.0879067970156058),
 (24000, 5.1074259545429213),
 (26000, 5.0914607249647537),
 (28000, 5.0634699288345901),
 (30000, 5.134978915883905),
 (32000, 5.0062231888372137),
 (34000, 5.0164439650648598),
 (36000, 4.9932292603385955),
 (38000, 5.0017743556265231),
 (40000, 5.0240160779608969)]

In [43]:
# compute loss on S not X, Y is ignored
dev_loss = model.compute_mean_loss(S_dev, Y_dev)
print dev_loss

5.03987517448


### Save Model Parameters

In [12]:
model.save_parameters()

TypeError: save_parameters() takes exactly 1 argument (2 given)

The performance of the model is skewed somewhat by the large number of `UUUNKKK` tokens; if these are 1/6 of the dataset, then that's a sizeable fraction that we're just waving our hands at. Naively, our model gets credit for these that's not really deserved; the formula below roughly removes this contribution from the average loss. Don't worry about how it's derived, but do report both scores - it helps us compare across models with different vocabulary sizes.

In [None]:
## DO NOT CHANGE THIS CELL ##
# Report your numbers, after computing dev_loss above.
def adjust_loss(loss, funk):
    """Return (loss + funk * log(funk)) / (1 - funk).

    loss -- mean loss to adjust (called below with dev_loss)
    funk -- called below with fraction_lost, the share of token counts that
            belongs to words dropped from the vocabulary

    NOTE(review): funk == 1 divides by zero and funk == 0 evaluates log(0);
    presumably neither occurs with real data -- confirm.
    """
    return (loss + funk * log(funk))/(1 - funk)
print "Unadjusted: %.03f" % exp(dev_loss)
print "Adjusted for missing vocab: %.03f" % exp(adjust_loss(dev_loss, fraction_lost))

## Generating Data

Once you've trained your model to satisfaction, let's use it to generate some sentences!

Implement the `generate_sequence` function in `rnnlm.py`, and call it below.

In [14]:
model = BRNNLM(L0, U0 = L0, alpha=0.1)
model.load_parameters('hdim_50_vdim_7861_alpha_01')

In [49]:
model2 = BRNNLM(zeros((8000,100)))
model2.load_parameters('hdim_100_vdim_8000_alpha_01')

In [15]:
def seq_to_words(seq):
    """Translate a sequence of word indices into words via num_to_word_all."""
    decoded = []
    for token_id in seq:
        decoded.append(num_to_word_all[token_id])
    return decoded


In [16]:
# Build an embedding matrix and index maps covering the whole Wiki vocabulary:
# the first L0.shape[0] rows reuse the model vocabulary (same indices as
# num_to_word), and every remaining Wiki word is appended after them.
wv_all = zeros((wvW.shape[0]+3, wvW.shape[1]))
wv_all[:L0.shape[0]] = L0
num_to_word_all = dict(num_to_word)
# Membership is tested once per Wiki word, so use a set: scanning the `words`
# list for each of the Wiki entries is quadratic and very slow.
retained_words = set(words)
i = L0.shape[0]
for k, v in word_to_numW.items():
    if k not in retained_words:
        wv_all[i] = wvW[v]
        num_to_word_all[i] = k
        i += 1
word_to_num_all = du.invert_dict(num_to_word_all)
   



400003


In [17]:
print len(word_to_num_all)

400003


In [44]:
def fill_missing(before, after, n, word_to_num, wv):
    before_l = []
    for x in before.split():
        if x in word_to_num:
            before_l.append(word_to_num[x])
        else:
            before_l.append(word_to_num['UUUNKKK'])
    after_l = []
    for x in after.split():
        if x in word_to_num:
            after_l.append(word_to_num[x])
        else:
            after_l.append(word_to_num['UUUNKKK'])
    seqs = model.generate_missing_seqs(before_l, after_l, n, wv, nres=10)
    for i in xrange(len(seqs)):
        print " ".join(seq_to_words(seqs[i][0])), seqs[i][1]   

In [57]:
fill_missing("<s> barack obama", "profession politician . </s>", 1, word_to_num_all, wv_all)

<s> barack obama , profession politician . </s> 26.6944394325
<s> barack obama that profession politician . </s> 27.128427522
<s> barack obama UUUNKKK profession politician . </s> 27.3278067863
<s> barack obama is profession politician . </s> 27.3630504739
<s> barack obama a profession politician . </s> 27.516876334
<s> barack obama the profession politician . </s> 27.5978415799
<s> barack obama of profession politician . </s> 27.9931943526
<s> barack obama to profession politician . </s> 28.0022687714
<s> barack obama an profession politician . </s> 28.1298082475
<s> barack obama are profession politician . </s> 28.1502471482


In [50]:
before = [word_to_num[x] for x in ["<s>", "he"]]
after = [word_to_num[x] for x in [  "date", "of", "birth", "DGDG", ".", "</s>"]]
seqs, Js, Ps = model2.generate_rank_missing(before, after, nres=10)
for i in xrange(len(seqs)):
    print " ".join(seq_to_words(seqs[i])), Ps[i]

KeyError: 'DGDG'

**BONUS:** Use the unigram distribution given in the `vocab` table to fill in any `UUUNKKK` tokens in your generated sequences with words that we omitted from the vocabulary. You'll want to use `list(vocab.index)` to get a list of words, and `vocab.freq` to get a list of corresponding frequencies.

In [None]:
# Replace UUUNKKK with a random unigram,
# drawn from vocab that we skipped
from nn.math import MultinomialSampler, multinomial_sample
def fill_unknowns(words):
    """Replace each 'UUUNKKK' entry of `words` with a sampled omitted word.

    Candidates are the entries of the `vocab` table that are not keys of
    `word_to_num` (the 'UUUNKKK' row itself is excluded). Each candidate is
    drawn with probability proportional to its unigram frequency in
    `vocab.freq`, renormalised over the candidates.

    words -- list of word strings; modified in place

    Returns the same list object. It is returned unchanged when it holds no
    'UUUNKKK' entry or when no vocab entry was omitted.
    """
    #### YOUR CODE HERE ####
    if 'UUUNKKK' not in words:
        return words
    # Boolean mask, aligned with vocab.index, of the words we skipped.
    omitted = array([(w not in word_to_num) and (w != 'UUUNKKK')
                     for w in vocab.index], dtype=bool)
    if not omitted.any():
        return words
    candidates = vocab.index[omitted]
    probs = vocab.freq.values[omitted]
    # Renormalise so the restricted frequencies form a distribution again.
    probs = probs / probs.sum()
    for i in range(len(words)):
        if 'UUUNKKK' == words[i]:
            # multinomial_sample is assumed to return an index drawn according
            # to the given probability vector, as the previous code used it.
            idx = multinomial_sample(probs)
            words[i] = candidates[idx]
    #### END YOUR CODE ####
    return words
    
print " ".join(fill_unknowns(seq_to_words(seq)))
