In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from theano import *
from lasagne.layers import EmbeddingLayer, InputLayer, get_output
import lasagne
import lasagne.layers
import theano.tensor as T
import theano
import numpy as np
from helpers import SimpleMaxingLayer

Couldn't import dot_parser, loading of dot files will not be possible.


In [3]:
import json

In [4]:
# Load the entity-linking queries; only the 'queries' section of the file is kept.
with open('../external-wiki1.json') as queries_file:
    queries = json.load(queries_file)['queries']

In [5]:
# Number of top-level entries in the loaded `queries` mapping.
len(queries)

9915

In [6]:
# Count the top-level query entries in which at least one nested entry has a truthy 'gold' value.
sum(
    any(entry['gold'] for entry in surface_entries.values())
    for surface_entries in queries.values()
)

8917

In [7]:
# Fraction of top-level query entries that have at least one gold answer.
# Computed from `queries` directly rather than retyping the two counts shown
# above, so the ratio cannot go stale when the input file changes.
sum(
    any(entry['gold'] for entry in surface_entries.values())
    for surface_entries in queries.values()
) / float(len(queries))

0.8993444276348966

In [8]:
from wordvecs import WordVectors, EmbeddingLayer

# Word-vector store backed by the GoogleNews binary file; `negvectors` is switched off.
wordvectors = WordVectors(fname="../GoogleNews-vectors-negative300.bin", negvectors=False)

In [9]:
# Load the page-redirect table (looked up by page title in EntityVectorLinkExp).
with open('../enwiki-20141208-pages-articles-multistream-redirects4.json') as redirects_file:
    page_redirects = json.load(redirects_file)

In [10]:
from wikireader import WikiRegexes, WikipediaReader

In [57]:
class EntityVectorLinkExp(object):
    """Neural entity-linking experiment built with Theano and Lasagne.

    Three sub-networks share a single word-embedding matrix:

    * a *document* network (convolution, max pooling, two dense layers),
    * a *surface text* network (convolution, two dense layers), and
    * a *target* network (convolution, two dense layers).

    The document output (re-indexed per surface text) and the surface output
    are concatenated and passed through two further dense layers to form the
    "source" representation.  Every target row is scored against one source
    row by cosine similarity, and the whole model is trained with Adadelta on
    a binary cross-entropy loss over the clipped scores.
    """

    def __init__(self, wikipedia_dump_fname, wordvec=wordvectors, queries=queries, redirects=page_redirects):
        """Store the experiment inputs and compile the Theano functions.

        Parameters
        ----------
        wikipedia_dump_fname : path of the Wikipedia dump read by
            ``_process_queries``.
        wordvec : word-vector store; must provide ``sentence_length``,
            ``vector_size``, ``tokenize`` and ``get_numpy_matrix``.
        queries : nested query mapping as loaded from the queries JSON file.
        redirects : mapping consulted by page title to find redirect targets.

        Note that the defaults are bound to the notebook globals at the time
        the class statement is executed.
        """
        self.wordvecs = wordvec
        self.queries = queries
        self.sentence_length = self.wordvecs.sentence_length
        # Height (in words) of every convolution filter.
        self.num_words_to_use_conv = 3
        self.redirects = redirects
        # Filled by _process_queries(); stays empty while that call is disabled.
        self.page_content = {}
        self.wikipedia_dump_fname = wikipedia_dump_fname

        #self._process_queries()

        self._setup()

    def _process_queries(self):
        """Tokenize all query strings and collect the text of referenced pages.

        Every document, surface form and candidate link is passed through
        ``self.wordvecs.tokenize`` (the results are discarded here, so the call
        is presumably made to register the words -- TODO confirm).  Candidate
        link titles, plus their redirect targets, are then looked up in the
        Wikipedia dump and their tokenized text is stored in
        ``self.page_content`` keyed by page title.
        """
        queried_pages = set()
        for docs, q in self.queries.iteritems():
            self.wordvecs.tokenize(docs)
            for sur, v in q.iteritems():
                self.wordvecs.tokenize(sur)
                # The candidate links live under 'vals' (as compute_batch also
                # assumes); iterating `v` itself would only yield the field
                # names 'gold', 'training' and 'vals'.
                for link in v['vals']:
                    self.wordvecs.tokenize(link)
                    queried_pages.add(WikiRegexes.convertToTitle(link))

        # Also fetch the pages that the queried titles redirect to.
        added_pages = set()
        for title in queried_pages:
            if title in self.redirects:
                redirect_target = self.redirects[title]
                self.wordvecs.tokenize(redirect_target)
                added_pages.add(redirect_target)
        queried_pages |= added_pages

        class GetWikipediaWords(WikipediaReader, WikiRegexes):
            """Dump reader that keeps the text of the queried pages only."""

            def readPage(reader, title, content):
                # `reader` is the dump reader instance; `self` (closed over) is
                # the enclosing experiment that owns page_content.
                if title in queried_pages:
                    cnt = reader._wikiToText(content)
                    self.page_content[title] = self.wordvecs.tokenize(cnt)

        GetWikipediaWords(self.wikipedia_dump_fname).read()

    @staticmethod
    def _cosine_similarity(left, right):
        """Row-wise cosine similarity between two equally shaped matrices."""
        row_dots = (left * right).sum(axis=1)
        return row_dots / (left.norm(2, axis=1) * right.norm(2, axis=1))

    def _loss_for(self, scores):
        """Per-row binary cross-entropy of the clipped scores against y_score."""
        return T.nnet.binary_crossentropy(T.clip(scores, .001, .999), self.y_score)

    def _setup(self):
        """Build the symbolic graph and compile ``train_func`` / ``test_func``.

        Both compiled functions take, in order: document tokens, surface-text
        tokens, the document row index for each surface text, target tokens,
        the source row index for each target, and the expected scores.  Both
        return ``[scores, summed loss, per-row loss]``; ``train_func``
        additionally applies the Adadelta updates, and ``test_func`` is built
        with dropout disabled.
        """
        self.x_document_input = T.imatrix('x_sent')
        self.x_surface_text_input = T.imatrix('x_surface')
        self.x_target_input = T.imatrix('x_target')
        self.x_document_id = T.ivector('x_sent_id')
        self.x_link_id = T.ivector('x_link_id')
        self.y_score = T.vector('y')

        # One shared embedding matrix feeds all three sub-networks.
        self.embedding_W = theano.shared(self.wordvecs.get_numpy_matrix())

        conv_filter_size = (self.num_words_to_use_conv, self.wordvecs.vector_size)

        # ---- document network -------------------------------------------
        self.document_l = lasagne.layers.InputLayer((None,self.sentence_length), input_var=self.x_document_input)

        self.document_embedding_l = EmbeddingLayer(
            self.document_l,
            W=self.embedding_W,
            add_word_params=False,
        )

        self.document_conv1_l = lasagne.layers.Conv2DLayer(
            self.document_embedding_l,
            num_filters=350,
            filter_size=conv_filter_size,
            name='document_conv1',
            nonlinearity=lasagne.nonlinearities.rectify,
        )

        # NOTE(review): if the convolution above yields
        # sentence_length - num_words_to_use_conv + 1 positions, this window is
        # one row short and the last position never takes part in the max --
        # confirm against the EmbeddingLayer output shape before changing it.
        self.document_max_l = lasagne.layers.Pool2DLayer(
            self.document_conv1_l,
            name='document_pool1',
            pool_size=(self.sentence_length - self.num_words_to_use_conv, 1),
            mode='max',
        )

        # The misspelled layer name is kept as-is because it is part of the
        # parameter names ('doucment_dens1.W', 'doucment_dens1.b').
        self.document_dens1 = lasagne.layers.DenseLayer(
            self.document_max_l,
            num_units=300,
            name='doucment_dens1',
            nonlinearity=lasagne.nonlinearities.tanh,
        )

        self.document_drop1 = lasagne.layers.DropoutLayer(
            self.document_dens1,
            p=.25,
        )

        document_output_length = 250

        self.document_dens2 = lasagne.layers.DenseLayer(
            self.document_drop1,
            num_units=document_output_length,
            name='document_dens2',
            nonlinearity=lasagne.nonlinearities.tanh,
        )

        self.document_output = lasagne.layers.get_output(self.document_dens2)

        # ---- surface-text network ---------------------------------------
        self.surface_input_l = lasagne.layers.InputLayer(
            (None, self.sentence_length),
            input_var=self.x_surface_text_input
        )

        self.surface_embedding_l = EmbeddingLayer(
            self.surface_input_l,
            W=self.embedding_W,
            add_word_params=False,
        )

        self.surface_conv1_l = lasagne.layers.Conv2DLayer(
            self.surface_embedding_l,
            num_filters=350,
            filter_size=conv_filter_size,
            name='surface_conv1',
            nonlinearity=lasagne.nonlinearities.tanh,
        )

        self.surface_dens1 = lasagne.layers.DenseLayer(
            self.surface_conv1_l,
            name='surface_dens1',
            num_units=300,
            nonlinearity=lasagne.nonlinearities.tanh,
        )

        self.surface_drop1 = lasagne.layers.DropoutLayer(
            self.surface_dens1,
            p=.25,
        )

        self.surface_dens2 = lasagne.layers.DenseLayer(
            self.surface_drop1,
            name='surface_dens2',
            num_units=250,
            nonlinearity=lasagne.nonlinearities.tanh,
        )

        # ---- source network: document + surface -------------------------
        # Re-index the document outputs so that row i belongs to the document
        # of surface text i, then expose them to Lasagne as an input layer.
        self.document_aligned_l = InputLayer(
            (None, document_output_length),
            input_var=self.document_output[self.x_document_id,:]
        )

        self.source_l = lasagne.layers.ConcatLayer(
            [self.document_aligned_l, self.surface_dens2]
        )

        self.source_dens1 = lasagne.layers.DenseLayer(
            self.source_l,
            num_units=300,
            name='source_dens1',
            nonlinearity=lasagne.nonlinearities.tanh,
        )

        self.source_drop1 = lasagne.layers.DropoutLayer(
            self.source_dens1,
            p=.25,
        )

        self.source_dens2 = lasagne.layers.DenseLayer(
            self.source_drop1,
            num_units=300,
            name='source_dens2',
            nonlinearity=lasagne.nonlinearities.tanh,
        )

        self.source_out = lasagne.layers.get_output(self.source_dens2)

        # ---- target network ---------------------------------------------
        self.target_input_l = lasagne.layers.InputLayer(
            (None,self.sentence_length),
            input_var=self.x_target_input
        )

        self.target_embedding_l = EmbeddingLayer(
            self.target_input_l,
            W=self.embedding_W,
            add_word_params=False,
        )

        self.target_conv1_l = lasagne.layers.Conv2DLayer(
            self.target_embedding_l,
            name='target_conv1',
            filter_size=conv_filter_size,
            num_filters=300,
            nonlinearity=lasagne.nonlinearities.tanh,
        )

        self.target_dens1 = lasagne.layers.DenseLayer(
            self.target_conv1_l,
            name='target_dens1',
            num_units=300,
            nonlinearity=lasagne.nonlinearities.tanh,
        )

        self.target_drop1 = lasagne.layers.DropoutLayer(
            self.target_dens1,
            p=.25,
        )

        self.target_dens2 = lasagne.layers.DenseLayer(
            self.target_drop1,
            name='target_dens2',
            num_units=300,
            nonlinearity=lasagne.nonlinearities.tanh,
        )

        self.target_out = lasagne.layers.get_output(self.target_dens2)

        # ---- scoring ----------------------------------------------------
        # Pair every target row with the source row it links to and take the
        # cosine similarity of each pair.  The dot product must be taken per
        # row (tensordot with axes=2 would collapse both matrices into one
        # scalar), and the norms must come from the *aligned* source rows so
        # numerator and denominator have one entry per target.
        self.source_aligned_l = self.source_out[self.x_link_id, :]

        self.res_l = self._cosine_similarity(self.target_out, self.source_aligned_l)

        # The document network is only reachable through an InputLayer's
        # input_var, so its parameters have to be collected separately.
        self.all_params = (
            lasagne.layers.get_all_params(self.target_dens2) +
            lasagne.layers.get_all_params(self.source_dens2) +
            lasagne.layers.get_all_params(self.document_dens2)
        )

        self.loss_vec = self._loss_for(self.res_l)

        self.updates = lasagne.updates.adadelta(self.loss_vec.mean(), self.all_params)

        # ---- evaluation graph (dropout disabled) ------------------------
        # The same layers are evaluated again with deterministic=True so that
        # test scores are not perturbed by dropout.  The aligned document
        # input is substituted explicitly because its input_var was built from
        # the stochastic document output.
        # NOTE(review): assumes the project's EmbeddingLayer.get_output_for
        # accepts **kwargs, as Lasagne layers conventionally do -- confirm.
        test_document_output = lasagne.layers.get_output(
            self.document_dens2, deterministic=True
        )
        test_source_out = lasagne.layers.get_output(
            self.source_dens2,
            inputs={self.document_aligned_l: test_document_output[self.x_document_id, :]},
            deterministic=True,
        )
        test_target_out = lasagne.layers.get_output(
            self.target_dens2, deterministic=True
        )

        self.test_res_l = self._cosine_similarity(
            test_target_out, test_source_out[self.x_link_id, :]
        )
        self.test_loss_vec = self._loss_for(self.test_res_l)

        function_inputs = [
            self.x_document_input,
            self.x_surface_text_input, self.x_document_id,
            self.x_target_input, self.x_link_id, self.y_score,
        ]

        self.train_func = theano.function(
            function_inputs,
            [self.res_l, self.loss_vec.sum(), self.loss_vec],
            updates=self.updates
        )

        self.test_func = theano.function(
            function_inputs,
            [self.test_res_l, self.test_loss_vec.sum(), self.test_loss_vec],
        )

    def compute_batch(self, isTraining=True):
        """Collect the tokenized inputs for the training or the test split.

        Documents whose first query entry has a 'training' flag different from
        ``isTraining`` are skipped.  The collected lists are stored on the
        instance (``current_documents``, ``current_surface_text``,
        ``current_link_id``, ``current_target_input``).

        NOTE(review): this method is unfinished -- the selected function is
        never called, ``current_target_input`` is never filled, and nothing is
        returned.
        """
        if isTraining:
            func = self.train_func
        else:
            func = self.test_func
        self.current_documents = []
        self.current_surface_text = []
        self.current_link_id = []
        self.current_target_input = []

        for doc, queries in self.queries.iteritems():
            if queries.values()[0]['training'] is not isTraining:
                continue
            docid = len(self.current_documents)
            self.current_documents.append(self.wordvecs.tokenize(doc))
            for surtxt, targets in queries.iteritems():
                # NOTE(review): one document index is recorded per surface
                # text here; in _setup that role is played by x_document_id,
                # while x_link_id indexes source rows per target -- confirm
                # which list this value belongs in.
                self.current_link_id.append(docid)
                surid = len(self.current_surface_text)
                self.current_surface_text.append(self.wordvecs.tokenize(surtxt))
                for target in targets['vals'].keys():
                    # TODO: isGold and cnt are computed but not used yet.
                    isGold = target == targets['gold']
                    cnt = self.page_content.get(WikiRegexes.convertToTitle(target))
                    
            

# Build the experiment against the small test dump; all other inputs use the class defaults.
queries_exp = EntityVectorLinkExp(wikipedia_dump_fname='../enwiki-test-small.xml')

In [78]:
# NOTE(review): the recorded output below (`True`) cannot come from this
# expression, which returns the values of the queries mapping -- the output
# looks stale; re-run the cell.
queries_exp.queries.values()

True

In [74]:
# Peek at a single innermost entry: the first value of the first top-level query dict.
queries.values()[0].values()[0]

{u'gold': u'Fred Rutherford',
 u'training': True,
 u'vals': {u'-NIL-': 0,
  u'Alexander Cameron Rutherford': 0,
  u'Ernest Rutherford': 0,
  u'Fred Rutherford': 0,
  u'Jock Rutherford': 0,
  u'John Rutherford (rugby union)': 0,
  u'Johnny Rutherford': 0,
  u'Rutherford': 0,
  u'Rutherford (NJT station)': 0,
  u'Rutherford AVA': 0,
  u'Rutherford County, North Carolina': 0,
  u'Rutherford County, Tennessee': 0,
  u'Rutherford GO Station': 0,
  u'Rutherford, California': 0,
  u'Rutherford, Edmonton': 0,
  u'Rutherford, New Jersey': 0,
  u'Rutherford, New South Wales': 0,
  u'Rutherford, Pennsylvania': 0,
  u'Rutherford, Tennessee': 0,
  u'XXNILXX': 0}}

In [64]:
# Shape of the first column of the embedding matrix, i.e. how many rows the matrix has.
wordvectors.get_numpy_matrix()[:,0].shape

(43,)

In [53]:
# Inspect the Adadelta update rules; the attribute set in _setup is `updates`
# (plural), so `queries_exp.update` raises AttributeError on a fresh run.
queries_exp.updates

OrderedDict([(<TensorType(float64, 4D)>, Elemwise{add,no_inplace}.0),
             (target_conv1.W, Elemwise{sub,no_inplace}.0),
             (<TensorType(float64, 4D)>, Elemwise{add,no_inplace}.0),
             (<TensorType(float64, vector)>, Elemwise{add,no_inplace}.0),
             (target_conv1.b, Elemwise{sub,no_inplace}.0),
             (<TensorType(float64, vector)>, Elemwise{add,no_inplace}.0),
             (<TensorType(float64, matrix)>, Elemwise{add,no_inplace}.0),
             (target_dens1.W, Elemwise{sub,no_inplace}.0),
             (<TensorType(float64, matrix)>, Elemwise{add,no_inplace}.0),
             (<TensorType(float64, vector)>, Elemwise{add,no_inplace}.0),
             (target_dens1.b, Elemwise{sub,no_inplace}.0),
             (<TensorType(float64, vector)>, Elemwise{add,no_inplace}.0),
             (<TensorType(float64, matrix)>, Elemwise{add,no_inplace}.0),
             (target_dens2.W, Elemwise{sub,no_inplace}.0),
             (<TensorType(float64, matrix)>, El

In [46]:
# Parameters handed to the optimizer, concatenated in the order target, source, document networks.
queries_exp.all_params

[target_conv1.W,
 target_conv1.b,
 target_dens1.W,
 target_dens1.b,
 target_dens2.W,
 target_dens2.b,
 surface_conv1.W,
 surface_conv1.b,
 surface_dens1.W,
 surface_dens1.b,
 surface_dens2.W,
 surface_dens2.b,
 source_dens1.W,
 source_dens1.b,
 source_dens2.W,
 source_dens2.b,
 document_conv1.W,
 document_conv1.b,
 doucment_dens1.W,
 doucment_dens1.b,
 document_dens2.W,
 document_dens2.b]