# Bi-directional Recurrent Neural Networks


In [174]:
import sys, os
from numpy import *
from matplotlib.pyplot import *

%matplotlib inline
matplotlib.rcParams['savefig.dpi'] = 100

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [175]:
from brnnlm import BRNNLM
# Gradient check on toy data, for speed
# `random` is numpy.random here (it provides randn); the fixed seed keeps the
# dummy matrix identical from run to run.
random.seed(10)
# 10x50 matrix of standard-normal values used as stand-in parameters.
wv_dummy = random.randn(10,50)
# The same dummy matrix is passed for both L0 and U0.
model = BRNNLM(L0 = wv_dummy, U0 = wv_dummy,
              alpha=0.005)
# Two short index arrays, presumably input and target word indices --
# TODO confirm against BRNNLM.grad_check.
model.grad_check(array([1,2,3,4]), array([2,3,4]))

grad_check: dJ/dU error norm = 9.6e-10 [ok]
    U dims: [10, 100] = 1000 elem
grad_check: dJ/dRH error norm = 4.689e-10 [ok]
    RH dims: [50, 50] = 2500 elem
grad_check: dJ/dLH error norm = 7.747e-10 [ok]
    LH dims: [50, 50] = 2500 elem
grad_check: dJ/dRL[3] error norm = 2.484e-10 [ok]
    RL[3] dims: [50] = 50 elem
grad_check: dJ/dRL[4] error norm = 3.516e-10 [ok]
    RL[4] dims: [50] = 50 elem
grad_check: dJ/dLL[2] error norm = 3.688e-10 [ok]
    LL[2] dims: [50] = 50 elem
grad_check: dJ/dLL[1] error norm = 4.436e-10 [ok]
    LL[1] dims: [50] = 50 elem


## Prepare Vocabulary and Load PTB Data

We've pre-prepared a list of the vocabulary in the Penn Treebank, along with their absolute counts and unigram frequencies. The document loader code below will "canonicalize" words and replace any unknowns with a `"UUUNKKK"` token, then convert the data to lists of indices.

In [176]:
import data_utils.ner as ner
# Load two pre-trained embedding sets (Twitter- and Wiki-derived, judging by the
# file names). Each call yields the vector matrix plus the word->index and
# index->word lookups that the cells below index it with.
wvT, word_to_numT, num_to_wordT = ner.load_wv('data/vocabTwitter.txt',
                                           'data/wordVectorTwitter.txt')
wvW, word_to_numW, num_to_wordW = ner.load_wv('data/vocabWiki.txt',
                                              'data/wordVectorWiki.txt')

In [177]:
def wordvector_neighbors(idxs, wordVecs, num_to_word, n=10):
    """Look up the nearest rows of ``wordVecs`` for each query row index.

    Distance is squared Euclidean. For every index in ``idxs`` the result
    holds the words of the ``n + 1`` closest rows, nearest first; the query
    row itself is at distance zero and is therefore part of that list.

    Returns a list with one list of words per entry of ``idxs``.
    """
    def nearest_words(query_idx):
        offsets = wordVecs - wordVecs[query_idx]
        sq_dists = sum(square(offsets), axis=1)
        closest_rows = argsort(sq_dists)[:n+1]
        return [num_to_word[row] for row in closest_rows]

    return [nearest_words(query_idx) for query_idx in idxs]

In [178]:
# Sanity check of both embedding sets: nearest neighbours of 'obama' in the
# Twitter vectors, then in the Wiki vectors (each list starts with the query word).
print wordvector_neighbors([word_to_numT['obama']], wvT, num_to_wordT)
print wordvector_neighbors([word_to_numW['obama']], wvW, num_to_wordW)

[['obama', 'romney', 'barack', 'president', 'clinton', 'hillary', 'potus', 'biden', 'says', 'bill', 'bush']]
[['obama', 'barack', 'bush', 'clinton', 'mccain', 'dole', 'gore', 'hillary', 'rodham', 'kerry', 'biden']]


In [180]:
from data_utils import utils as du
import pandas as pd

# Load the vocabulary
# Load the vocabulary: a whitespace-separated table indexed by word, with
# 'count' and 'freq' columns.
vocab = pd.read_table("data/lm/vocab.ptb.txt", header=None, sep="\s+",
                     index_col=0, names=['count', 'freq'], )

# Choose how many top words to keep
vocabsize = 8000

# Keep only those top words that also have a Wiki word vector. Word indices
# follow the iteration order of the resulting set.
# NOTE(review): saved model parameters presumably depend on this ordering --
# confirm before changing how `words` is built.
words = list(set(vocab.index[:vocabsize]).intersection(set(word_to_numW.keys())))
# The three special tokens are appended last, so they occupy the final three indices.
words.append('UUUNKKK')
words.append('<s>')
words.append('</s>')
# From here on vocabsize is the number of words actually retained.
vocabsize = len(words)
num_to_word = dict(enumerate(words))
word_to_num = du.invert_dict(num_to_word)
##
# Below needed for 'adj_loss': DO NOT CHANGE
# fraction_lost = count mass of words missing from word_to_num, divided by the
# total count mass; "UUUNKKK" is excluded from both sums.
fraction_lost = float(sum([vocab['count'][word] for word in vocab.index
                           if (not word in word_to_num) 
                               and (not word == "UUUNKKK")]))
fraction_lost /= sum([vocab['count'][word] for word in vocab.index
                      if (not word == "UUUNKKK")])
print "Retained %d words from %d (%.02f%% of all tokens)" % (vocabsize, len(vocab),
                                                             100*(1-fraction_lost))

Retained 7861 words from 38444 (91.76% of all tokens)


Load the datasets, using the vocabulary in `word_to_num`. Our starter code handles this for you, and also generates lists of lists X and Y, corresponding to input words and target words*. 

*(Of course, the target words are just the input words, shifted by one position, but it can be cleaner and less error-prone to keep them separate.)*

In [181]:
# Each split goes through the same three steps: read the documents, map words
# to indices with word_to_num, then derive the X (input) / Y (target) sequences.

# Load the training set
docs = du.load_dataset('data/lm/ptb-train.txt')
S_train = du.docs_to_indices(docs, word_to_num)
X_train, Y_train = du.seqs_to_lmXY(S_train)

# Load the dev set (for tuning hyperparameters)
docs = du.load_dataset('data/lm/ptb-dev.txt')
S_dev = du.docs_to_indices(docs, word_to_num)
X_dev, Y_dev = du.seqs_to_lmXY(S_dev)

# Load the test set (final evaluation only)
docs = du.load_dataset('data/lm/ptb-test.txt')
S_test = du.docs_to_indices(docs, word_to_num)
X_test, Y_test = du.seqs_to_lmXY(S_test)

# Display some sample data
# `docs` was last bound to the test split, so this is a test-set sentence;
# d[0] is the first field of each token entry.
print " ".join(d[0] for d in docs[7])

Big investment banks refused to step up to the plate to support the beleaguered floor traders by buying big blocks of stock , traders say .


In [182]:
# Show one training example as words: the input sequence, then the target
# sequence built from the same sentence.
print [num_to_word[x] for x in X_train[10]]
print [num_to_word[y] for y in Y_train[10]]

['<s>', 'we', "'re", 'talking', 'about', 'years', 'ago', 'before', 'anyone', 'heard', 'of', 'asbestos', 'having', 'any', 'questionable', 'properties', '.']
['we', "'re", 'talking', 'about', 'years', 'ago', 'before', 'anyone', 'heard', 'of', 'asbestos', 'having', 'any', 'questionable', 'properties', '.', '</s>']


## Train and evaluate your model

When you're able to pass the gradient check, let's run our model on some real language!

You should randomly initialize the word vectors as Gaussian noise, i.e. $W_{ij} \sim \mathcal{N}(0, 0.1)$ (mean 0, variance 0.1); the function `random.randn` may be helpful here.

In [9]:
# Initial embedding matrix, one row per vocabulary entry. Ordinary words start
# from their Wiki vectors; the three special tokens that were appended last to
# the vocabulary get Gaussian noise with variance 0.1 instead.
random.seed(10)
hdim = 50  # hidden-layer width, which is also the word-vector width
L0 = zeros((vocabsize, hdim))
wiki_rows = [word_to_numW[num_to_word[i]] for i in xrange(vocabsize - 3)]
L0[:vocabsize - 3] = wvW[wiki_rows]
for special_row in (-3, -2, -1):  # UUUNKKK, <s>, </s> -- drawn in this order
    L0[special_row] = random.randn(hdim) * sqrt(0.1)

In [10]:
#### YOUR CODE HERE ####

##
# Optionally train on only the first `ntrain` sentences, for speed.
# With ntrain equal to the full length, the slices below keep everything.
ntrain = len(Y_train)
X, Y, S = (split[:ntrain] for split in (X_train, Y_train, S_train))

def randomSchedule(n, num_examples=None):
    """Yield ``n`` example indices drawn uniformly at random, with replacement.

    Parameters
    ----------
    n : number of indices to produce.
    num_examples : size of the dataset to sample from; indices fall in
        ``[0, num_examples)``. Defaults to ``len(Y)``, the notebook's current
        training targets, which is what this schedule always used before the
        parameter existed.

    This is a generator, so the default is resolved when iteration starts,
    not when the schedule is created.
    """
    if num_examples is None:
        num_examples = len(Y)
    produced = 0
    while produced < n:
        produced += 1
        # numpy's randint excludes the upper bound, so this never indexes past the end.
        yield random.randint(0, num_examples)
        
def randomMinibatchSchedule(n, k, num_examples=None):
    """Yield ``n // k`` minibatches of ``k`` random example indices each.

    Parameters
    ----------
    n : total number of examples to visit; a trailing partial batch is dropped.
    k : minibatch size.
    num_examples : size of the dataset to sample from; indices fall in
        ``[0, num_examples)`` and are drawn with replacement. Defaults to
        ``len(Y)``, the notebook's current training targets.

    The batch count uses explicit floor division so that it does not depend on
    which Python version's ``/`` semantics are in effect.
    """
    if num_examples is None:
        num_examples = len(Y)
    num_batches = n // k
    batches_done = 0
    while batches_done < num_batches:
        batches_done += 1
        # One randint call per index, so a seeded run draws the same stream as before.
        yield [random.randint(0, num_examples) for _ in range(k)]
        
def annealingAlpha(n, alpha, tau):
    """Yield ``n`` learning rates following a constant-then-1/t decay.

    The rate stays at ``alpha`` for the first ``tau`` steps and then decays as
    ``alpha * tau / step``.

    Parameters
    ----------
    n : number of learning rates to produce.
    alpha : initial learning rate.
    tau : step at which the decay starts; must be positive.

    Raises
    ------
    ValueError : if ``tau`` is not positive (the very first step would divide
        by zero). As this is a generator, the error surfaces on first iteration.
    """
    if tau <= 0:
        raise ValueError("tau must be positive, got %r" % (tau,))
    # Force float arithmetic: with integer alpha and tau, Python 2 division
    # would truncate every decayed rate to 0.
    scaled = alpha * float(tau)
    step = 0
    while step < n:
        yield scaled / max(step, tau)
        step += 1


#### END YOUR CODE ####

In [11]:
# Number of training sentences available to the schedules.
print ntrain

56522


In [42]:
## Train the bidirectional RNN language model with SGD.
## (Dev-set loss and perplexity are computed in later cells.)

#n = ntrain
n = 40000  # number of randomly drawn training examples to process
k = 5  # not used in this cell; presumably a minibatch size for randomMinibatchSchedule
# L0 is passed for both the embedding matrix and U0.
model = BRNNLM(L0, U0 = L0, alpha=0.1)
# train on S not X, Y is ignored
# Mean loss is reported every `costevery` examples.
model.train_sgd(X=S, y=Y, idxiter=randomSchedule(n),costevery=2000)

Begin SGD...
  Seen 0 in 0.00 s
  [0]: mean loss 9.85084
  [2000]: mean loss 5.62363
  [4000]: mean loss 5.44422
  [6000]: mean loss 5.35979
  [8000]: mean loss 5.45166
  Seen 10000 in 4901.68 s
  [10000]: mean loss 5.26666
  [12000]: mean loss 5.21326
  [14000]: mean loss 5.23927
  [16000]: mean loss 5.24895
  [18000]: mean loss 5.12286
  Seen 20000 in 9606.30 s
  [20000]: mean loss 5.16385
  [22000]: mean loss 5.08791
  [24000]: mean loss 5.10743
  [26000]: mean loss 5.09146
  [28000]: mean loss 5.06347
  Seen 30000 in 14313.09 s
  [30000]: mean loss 5.13498
  [32000]: mean loss 5.00622
  [34000]: mean loss 5.01644
  [36000]: mean loss 4.99323
  [38000]: mean loss 5.00177
  [40000]: mean loss 5.02402
SGD complete: 40000 examples in 19845.38 seconds.


[(0, 9.8508386909235757),
 (2000, 5.6236279208727389),
 (4000, 5.4442216623425219),
 (6000, 5.3597933901338584),
 (8000, 5.4516570695163953),
 (10000, 5.2666633991930523),
 (12000, 5.2132622515440685),
 (14000, 5.2392709906749495),
 (16000, 5.248954177129594),
 (18000, 5.1228569123401702),
 (20000, 5.1638487274959477),
 (22000, 5.0879067970156058),
 (24000, 5.1074259545429213),
 (26000, 5.0914607249647537),
 (28000, 5.0634699288345901),
 (30000, 5.134978915883905),
 (32000, 5.0062231888372137),
 (34000, 5.0164439650648598),
 (36000, 4.9932292603385955),
 (38000, 5.0017743556265231),
 (40000, 5.0240160779608969)]

In [59]:
# compute loss on S not X, Y is ignored
# dev_loss is the mean loss over the dev set; the reporting cell further down
# turns it into perplexity.
dev_loss = model.compute_mean_loss(S_dev, Y_dev)
print dev_loss

5.03987517448


### Save Model Parameters

In [12]:
# NOTE(review): this cell sits under the "Save Model Parameters" heading and its
# recorded output is a TypeError raised by save_parameters(), yet the code calls
# load_parameters() with no argument (other cells pass a checkpoint name).
# Confirm which call is intended before re-running.
model.load_parameters()

TypeError: save_parameters() takes exactly 1 argument (2 given)

The performance of the model is skewed somewhat by the large number of `UUUNKKK` tokens; if these are 1/6 of the dataset, then that's a sizeable fraction that we're just waving our hands at. Naively, our model gets credit for these that's not really deserved; the formula below roughly removes this contribution from the average loss. Don't worry about how it's derived, but do report both scores - it helps us compare across models with different vocabulary sizes.

In [58]:
## DO NOT CHANGE THIS CELL ##
# Report your numbers, after computing dev_loss above.
# Needs `dev_loss` (dev-set evaluation cell) and `fraction_lost` (vocabulary
# cell) to exist in the kernel; otherwise the prints raise NameError.
def adjust_loss(loss, funk):
    # funk is the fraction of tokens outside the vocabulary (fraction_lost below).
    # The result is undefined for funk == 0 (log of zero) and funk == 1 (division by zero).
    return (loss + funk * log(funk))/(1 - funk)
# exp(mean loss) is the perplexity; reported raw and after the vocabulary adjustment.
print "Unadjusted: %.03f" % exp(dev_loss)
print "Adjusted for missing vocab: %.03f" % exp(adjust_loss(dev_loss, fraction_lost))

NameError: name 'dev_loss' is not defined

## Generating Data

Once you've trained your model to satisfaction, let's use it to generate some sentences!

Implement the `generate_sequence` function in `rnnlm.py`, and call it below.

## Model 1

In [185]:
# Build a model around L0, then load saved parameters by checkpoint name.
# The name suggests hdim=50 and a 7861-word vocabulary -- presumably the
# checkpoint matching L0's shape; TODO confirm.
model = BRNNLM(L0, alpha=0.1)
model.load_parameters('hdim_50_vdim_7861_alpha_01')

In [186]:
# Extended lookup tables covering the model vocabulary plus every remaining
# Wiki word. Rows/indices below L0.shape[0] are the model's own vocabulary;
# the other Wiki words are appended after them.
# The "+3" leaves room for the three special tokens (UUUNKKK, <s>, </s>) that
# were appended to `words` in addition to the words shared with the Wiki set.
wv_all = zeros((wvW.shape[0]+3, wvW.shape[1]))
wv_all[:L0.shape[0]] = L0
num_to_word_all = dict(num_to_word)
# Membership is tested once per Wiki word; `words` is a list, so use a set to
# keep each test constant-time instead of a linear scan.
known_words = set(words)
i = L0.shape[0]
for k, v in word_to_numW.items():
    if k not in known_words:
        wv_all[i] = wvW[v]
        num_to_word_all[i] = k
        i += 1
word_to_num_all = du.invert_dict(num_to_word_all)
   



In [187]:
# Size of the extended vocabulary (model words plus the remaining Wiki words).
print len(word_to_num_all)

400003


In [188]:
def seq_to_words(seq, num_to_word):
    """Translate a sequence of word indices into the list of matching words.

    ``num_to_word`` maps index -> word; an index missing from it raises KeyError.
    """
    decoded = []
    for index in seq:
        decoded.append(num_to_word[index])
    return decoded

def break_str(st, word_to_num):
    """Convert a whitespace-separated string into a list of word indices.

    Each token is first passed through ``du.canonicalize_word`` together with
    the known vocabulary, and the result is looked up in ``word_to_num``.
    Presumably the canonicalizer maps unknown tokens onto a token that is in
    the vocabulary -- TODO confirm; otherwise the lookup raises KeyError.

    Returns an empty list for an empty or all-whitespace string.
    """
    # Take the vocabulary once: under Python 2, keys() builds a fresh list on
    # every call, which is costly for a large vocabulary and identical each time.
    known_words = word_to_num.keys()
    return [word_to_num[du.canonicalize_word(token, known_words)]
            for token in st.split()]

In [215]:
def fill_all_missing(model, arg1, relations, arg2, ln, rn, word_to_num, num_to_word, wv):
    k = 10
    arg1 = [word_to_num['<s>']] + break_str(arg1, word_to_num)
    arg2 = break_str(arg2, word_to_num) + [word_to_num['.'], word_to_num['</s>']]
    seqs = []
    for relation in relations:
        relation = break_str(relation, word_to_num)
        # first fill left
        l_missings = model.generate_missing_multiple(arg1, relation + arg2, ln, wv, k)
        #print [([num_to_word[y] for y in x], loss) for x,loss in l_missings[:20]]
        l_missings = l_missings[:k]
        for l_missing,_ in l_missings:
            # then fill right
            r_missings = model.generate_missing_multiple(arg1 + list(l_missing) + relation, arg2, rn, wv, k)[:k]
            for r_missing, loss in r_missings:
                seqs.append((arg1 + list(l_missing) + relation + list(r_missing) + arg2, loss))
    seqs = sorted(seqs, key=lambda x:x[1])
    for i in xrange(len(seqs)):
        print " ".join(seq_to_words(seqs[i][0], num_to_word)), seqs[i][1] 
    return seqs

In [198]:
def fill_middle_missing(model, before, after, n, word_to_num, num_to_word, wv):
    before_l = []
    for x in before.split():
        if x in word_to_num:
            before_l.append(word_to_num[x])
        else:
            before_l.append(word_to_num['UUUNKKK'])
    after_l = []
    for x in after.split():
        if x in word_to_num:
            after_l.append(word_to_num[x])
        else:
            after_l.append(word_to_num['UUUNKKK'])
    seqs = model.generate_missing_seq(before_l, after_l, n, wv, nres=10)
    for i in xrange(len(seqs)):
        print " ".join(seq_to_words(seqs[i][0], num_to_word)), seqs[i][1]   

In [144]:
# Ask model 1 for a single missing word between the two contexts, searching
# over the extended (model + Wiki) vocabulary tables.
fill_middle_missing(model, "<s> he is board member ", " google  . </s>", 1, word_to_num_all, num_to_word_all, wv_all)

10
<s> he is board member eligible google . </s> 1.08677725393
<s> he is board member yellow google . </s> 1.19912798148
<s> he is board member del. google . </s> 1.9966108979
<s> he is board member lord google . </s> 2.38513189275
<s> he is board member woods google . </s> 2.9653172539
<s> he is board member electricity google . </s> 3.37483192138
<s> he is board member looking google . </s> 4.27188708251
<s> he is board member four google . </s> 4.2800082285
<s> he is board member mirage google . </s> 4.30820036877
<s> he is board member refunding google . </s> 5.05993353938


## Model 2

In [190]:
# Model 2: the zero matrix only fixes the shape (8000 words x 100 dims); the
# values are presumably replaced by the checkpoint loaded next -- TODO confirm.
model_2 = BRNNLM(zeros((8000,100)))
model_2.load_parameters('hdim_100_vdim_8000_alpha_01')
# This model's vocabulary is simply the first 8000 entries of the vocab table,
# without the Wiki-vector filtering used for model 1.
num_to_word_2 = dict(enumerate(vocab.index[:8000]))
word_to_num_2 = du.invert_dict(num_to_word_2)

In [191]:
# Spot-check model 2's vocabulary: is the 'DGDG' token present, and which
# index does the unknown-word token have?
print 'DGDG' in word_to_num_2
print word_to_num_2['UUUNKKK']

True
3


In [199]:
# The notebook defines no `fill_missing`; the helper with this exact signature
# is `fill_middle_missing`, so call that instead (a fresh-kernel run would
# otherwise stop here with a NameError). No word-vector table is passed for
# model 2 (wv=None).
fill_middle_missing(model_2, "<s> mike 's sex", " male  . </s>", 1, word_to_num_2, num_to_word_2, None)

10
<s> mike 's sex UUUNKKK male . </s> 36.3311222003
<s> mike 's sex and male . </s> 38.2982990656
<s> mike 's sex trading male . </s> 38.5440032319
<s> mike 's sex , male . </s> 38.6791087853
<s> mike 's sex market male . </s> 38.8292844798
<s> mike 's sex is male . </s> 38.9581396348
<s> mike 's sex were male . </s> 39.0348792584
<s> mike 's sex 's male . </s> 39.2252363724
<s> mike 's sex of male . </s> 40.2067953462
<s> mike 's sex to male . </s> 40.3826557568


In [219]:
# For each relation phrase, fill two words to its left (ln=2) and one to its
# right (rn=1), using model 2's own vocabulary and no extra word-vector table.
fill_all_missing(model_2, "mike", ["board member", "member"], "china", 2, 1, word_to_num_2, num_to_word_2, None)

<s> mike a big board member of china . </s> 2.91367675336
<s> mike UUUNKKK a member of china . </s> 2.99800406396
<s> mike 's big board member of china . </s> 3.05746428863
<s> mike the big board member of china . </s> 3.05999587453
<s> mike an big board member of china . </s> 3.06569615678
<s> mike , a member of china . </s> 3.07849822462
<s> mike : a member of china . </s> 3.28823292675
<s> mike will be member of china . </s> 3.3009778277
<s> mike a big board member in china . </s> 3.30166948928
<s> mike 's big board member to china . </s> 3.31092694462
<s> mike a big board member to china . </s> 3.33028121748
<s> mike 's big board member in china . </s> 3.34637006963
<s> mike 's big board member by china . </s> 3.35395727484
<s> mike 's big board member for china . </s> 3.35532026058
<s> mike a big board member by china . </s> 3.35863194961
<s> mike a big board member for china . </s> 3.35887618947
<s> mike the big board member in china . </s> 3.36367740837
<s> mike the big board me

[([4, 5147, 8, 147, 161, 1123, 6, 665, 2, 5], 2.9136767533560239),
 ([4, 5147, 3, 8, 1123, 6, 665, 2, 5], 2.9980040639607757),
 ([4, 5147, 11, 147, 161, 1123, 6, 665, 2, 5], 3.0574642886286671),
 ([4, 5147, 1, 147, 161, 1123, 6, 665, 2, 5], 3.0599958745276825),
 ([4, 5147, 36, 147, 161, 1123, 6, 665, 2, 5], 3.0656961567828351),
 ([4, 5147, 0, 8, 1123, 6, 665, 2, 5], 3.0784982246217529),
 ([4, 5147, 75, 8, 1123, 6, 665, 2, 5], 3.2882329267546577),
 ([4, 5147, 40, 31, 1123, 6, 665, 2, 5], 3.3009778276964763),
 ([4, 5147, 8, 147, 161, 1123, 9, 665, 2, 5], 3.301669489278205),
 ([4, 5147, 11, 147, 161, 1123, 7, 665, 2, 5], 3.3109269446196641),
 ([4, 5147, 8, 147, 161, 1123, 7, 665, 2, 5], 3.3302812174818981),
 ([4, 5147, 11, 147, 161, 1123, 9, 665, 2, 5], 3.3463700696282133),
 ([4, 5147, 11, 147, 161, 1123, 24, 665, 2, 5], 3.3539572748370263),
 ([4, 5147, 11, 147, 161, 1123, 13, 665, 2, 5], 3.355320260579771),
 ([4, 5147, 8, 147, 161, 1123, 24, 665, 2, 5], 3.358631949608319),
 ([4, 5147, 8,

In [158]:
from freebase_ie import *
# Build the paraphrase database from the PPDB file; the recorded run reports
# about 7 million paraphrases loaded.
ppdb = construct_ppdb_from_file("ppdb-1.0-s-all")

Loading paraphrases from ppdb-1.0-s-all
6977679 paraphrases added to PPDB


In [None]:
# Extract tuples with the paraphrase database enabled, then print them.
# NOTE(review): this cell has no execution count or output, so it presumably
# was not run in the saved session.
tuples =extract_freebase_tuples(query=None, used_ppdb=True, ppdb=ppdb)
print_tuples(tuples)

## Model 3

In [221]:
# Model 3: the zero matrix only fixes the shape (8000 words x 50 dims); the
# values are presumably replaced by the checkpoint loaded next -- TODO confirm.
model_3 = BRNNLM(zeros((8000,50)), alpha=0.1)
model_3.load_parameters('hdim_50_vdim_8000_alpha_01')
# Vocabulary: the first 8000 entries of the vocab table.
num_to_word_3 = dict(enumerate(vocab.index[:8000]))
word_to_num_3 = du.invert_dict(num_to_word_3)

In [223]:
# Extended lookup tables for model 3: its own vocabulary keeps its indices and
# every Wiki word not already present is appended after them.
# Only the appended rows of wv_all_3 receive vectors here; the rows belonging
# to the model's own vocabulary stay zero.
words_3 = set(word_to_num_3)
vocab_size = len(words_3 | set(word_to_numW))
wv_all_3 = zeros((vocab_size, 50))
num_to_word_all_3 = dict(num_to_word_3)
i = len(num_to_word_all_3)
for k, v in word_to_numW.items():
    if k in words_3:
        continue
    wv_all_3[i] = wvW[v]
    num_to_word_all_3[i] = k
    i += 1
word_to_num_all_3 = du.invert_dict(num_to_word_all_3)
   

In [229]:
# Same query shape as for model 2 (two words left of the relation, one to its
# right), but searching over model 3's extended vocabulary and vector table.
fill_all_missing(model_3, "he", ["board member", "member"], "google", 2, 1, word_to_num_all_3, num_to_word_all_3, wv_all_3)

<s> he said the board member of google . </s> 1.89340548848
<s> he says the board member of google . </s> 1.91075369679
<s> he 's big board member of google . </s> 1.95646407038
<s> he said its board member of google . </s> 2.04674302819
<s> he UUUNKKK the board member of google . </s> 2.1693525883
<s> he says its board member of google . </s> 2.22608473676
<s> he said the board member for google . </s> 2.24257550004
<s> he says the board member for google . </s> 2.263218429
<s> he said the board member and google . </s> 2.26952457958
<s> he says the board member and google . </s> 2.2869624332
<s> he 's big board member for google . </s> 2.29579814941
<s> he 's big board member and google . </s> 2.31138228815
<s> he the big board member of google . </s> 2.31750028891
<s> he 's the board member of google . </s> 2.33902203321
<s> he said the board member UUUNKKK google . </s> 2.34569804215
<s> he said its board member for google . </s> 2.35456744709
<s> he said the board member on google

[([4, 34, 19, 1, 161, 1123, 6, 296616, 2, 5], 1.8934054884805083),
 ([4, 34, 56, 1, 161, 1123, 6, 296616, 2, 5], 1.9107536967883494),
 ([4, 34, 11, 147, 161, 1123, 6, 296616, 2, 5], 1.9564640703775436),
 ([4, 34, 19, 32, 161, 1123, 6, 296616, 2, 5], 2.0467430281942769),
 ([4, 34, 3, 1, 161, 1123, 6, 296616, 2, 5], 2.1693525883031453),
 ([4, 34, 56, 32, 161, 1123, 6, 296616, 2, 5], 2.2260847367649372),
 ([4, 34, 19, 1, 161, 1123, 13, 296616, 2, 5], 2.2425755000355383),
 ([4, 34, 56, 1, 161, 1123, 13, 296616, 2, 5], 2.2632184290006645),
 ([4, 34, 19, 1, 161, 1123, 10, 296616, 2, 5], 2.2695245795818182),
 ([4, 34, 56, 1, 161, 1123, 10, 296616, 2, 5], 2.2869624332029921),
 ([4, 34, 11, 147, 161, 1123, 13, 296616, 2, 5], 2.2957981494131561),
 ([4, 34, 11, 147, 161, 1123, 10, 296616, 2, 5], 2.3113822881458099),
 ([4, 34, 1, 147, 161, 1123, 6, 296616, 2, 5], 2.3175002889128824),
 ([4, 34, 11, 1, 161, 1123, 6, 296616, 2, 5], 2.3390220332109615),
 ([4, 34, 19, 1, 161, 1123, 3, 296616, 2, 5], 2.

**BONUS:** Use the unigram distribution given in the `vocab` table to fill in any `UUUNKKK` tokens in your generated sequences with words that we omitted from the vocabulary. You'll want to use `list(vocab.index)` to get a list of words, and `vocab.freq` to get a list of corresponding frequencies.

In [None]:
# Replace UUUNKKK with a random unigram,
# drawn from vocab that we skipped
from nn.math import MultinomialSampler, multinomial_sample
def fill_unknowns(words):
    """Replace each 'UUUNKKK' in ``words`` with a randomly sampled skipped word.

    Replacements are drawn from the vocabulary entries that are absent from
    ``word_to_num`` (and are not the unknown token itself), in proportion to
    their unigram frequency in ``vocab.freq``. Words the model already knows
    are never used as replacements.

    The list is modified in place and also returned. If it contains no
    unknown token, or no vocabulary word was skipped, it is returned unchanged.
    """
    #### YOUR CODE HERE ####
    if 'UUUNKKK' not in words:
        return words
    # Candidates: vocabulary entries left out of the model vocabulary.
    skipped = array([(w not in word_to_num) and (w != 'UUUNKKK')
                     for w in vocab.index], dtype=bool)
    if not skipped.any():
        return words
    candidates = vocab.index[skipped]
    probs = vocab.freq.values[skipped]
    # Renormalise so the restricted frequencies form a proper distribution.
    probs = probs / probs.sum()
    for i in range(len(words)):
        if 'UUUNKKK' == words[i]:
            idx = multinomial_sample(probs)
            words[i] = candidates[idx]
    #### END YOUR CODE ####
    return words
    
print " ".join(fill_unknowns(seq_to_words(seq)))
