In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from theano import *
from lasagne.layers import InputLayer, get_output
import lasagne
import lasagne.layers
import theano.tensor as T
import theano
import numpy as np

import re
import random

Couldn't import dot_parser, loading of dot files will not be possible.


In [3]:
class WikipediaReader(object):
    
    title_rg = re.compile('.*<title>(.*)</title>.*')
    link_rg = re.compile('\[\[([^\]]*)\]\]')
    redirect_rg = re.compile('.*<redirect title="(.*)" />')
    
    def __init__(self, fname):
        self.fname = fname
        
    def read(self):
        current_page = None
        look_for_next_page = True
        page_text = None

        title_rg = self.title_rg
      
        with open(self.fname) as f:
            try:
                while True:
                    line = f.next()
                    if look_for_next_page:
                        if '<page>' not in line:
                            continue
                        else:
                            look_for_next_page = False
                    if '<title>' in line:
                        current_page = title_rg.match(line).group(1)
                    elif '<redirect' in line:
                        redirect_page = self.redirect_rg.match(line).group(1)
                        self.readRedirect(current_page, redirect_page)
                        look_for_next_page = True
                    elif '<text' in line:
                        lines = [ line[line.index('>')+2:] ]
                        while True:
                            line = f.next()
                            if '</text>' in line:
                                lines.append(line[:line.index('</text>')])
                                look_for_next_page = True
                                page_text = '\n'.join(lines)
                                self.readPage(current_page, page_text)
                                break
                            else:
                                lines.append(line)
            except StopIteration as e:
                pass
            
    def getLinkTargets(self, content):
        ret = self.link_rg.findall(content)
        def s(v):
            a = v.split('|')
            pg = a[0].replace(' ', '_').lower()
            txt = a[-1]
            return pg, txt
        return [s(r) for r in ret]
            
    def readPage(self, title, content):
        pass
    
    def readRedirect(self, title, target):
        pass
        


In [4]:
from wordvecs import WordVectors, EmbeddingLayer

#wordvectors = WordVectors(fname='../GoogleNews-vectors-negative300.bin', negvectors=False)
wordvectors = WordVectors(fname='../enwiki-20141208-pages-articles-multistream-links-output5.bin', negvectors=True, sentence_length=200)

In [5]:
len(wordvectors.vectors)

4850513

In [6]:
from wordvecs import EmbeddingLayer

In [16]:
class WikiLinkingExp(WikipediaReader):
    
    wiki_re = [
        (re.compile('&amp;'), '&'),
        (re.compile('&lt;'), '<'),
        (re.compile('&gt;'), '>'),
        (re.compile('<ref[^<]*<\/ref>'), ''),
        (re.compile('<.*?>'), ''),
        (re.compile('\[http:[^\] ]*'), ''),
        (re.compile('\|(thumb|left|right|\d+px)', re.IGNORECASE), ''),
        (re.compile('\[\[image:[^\[\]]*\|', re.IGNORECASE), ''),
        (re.compile('\[\[category:([^|\]]*)[^]]*\]\]', re.IGNORECASE), '\\1'),
        (re.compile('\[\[[a-z\-]*:[^\]]\]\]'), ''),
        (re.compile('\[\[[^\|\]]*\|'), '[['),
        (re.compile('{{[^}]*}}'), ''),
        (re.compile('{[^}]*}'), ''),
        (re.compile('[\[|\]]'), ''),
        (re.compile('&[^;]*;'), ' '),
        (re.compile('[^a-zA-Z0-9 ]'), ''),
        (re.compile('\n+'), ' ')
        # TODO: clean up some remaining issues with parsing the wiki text
    ]
    
    @classmethod
    def _wikiToText(cls, txt):
        txt = txt.lower()
        for r in cls.wiki_re:
            txt = r[0].sub(r[1], txt)
        return txt
    
    run_training = False
    num_words_to_use = 200  # set by the value set in the word vectors
    batch_size = 2000
    num_negative_samples = 1
    num_words_per_conv = 3
    
    def __init__(self, fname, wordvecs=wordvectors):
        super(WikiLinkingExp, self).__init__(fname)
        self.wordvecs = wordvecs
        self.current_batch = []
        self.page_titles = set()
        self.num_words_to_use = self.wordvecs.sentence_length
        
        # do an inital load of the data
        self.read()
        
        self._setup()
        
        self.train_cnt = 0
        self.train_res = []
        
    def _setup(self):
        self.y_batch = T.ivector('y_labels')
        self.x_words_batch = T.imatrix('x_words')
        self.x_links_batch = T.imatrix('y_links')
        
        self.sentence_l = InputLayer((None, self.num_words_to_use), input_var=self.x_words_batch)
        self.link_l = InputLayer((None,1), input_var=self.x_links_batch)
        
        self.embedding_W = theano.shared(self.wordvecs.get_numpy_matrix())
        
        self.sentence_emb_l = EmbeddingLayer(
            self.sentence_l, 
            W=self.embedding_W,
            add_word_params=False,
        )
        
        self.link_emb_l = EmbeddingLayer(
            self.link_l,
            W=self.embedding_W,
            add_word_params=False,
        )
        
        self.sentence_conv_l = lasagne.layers.Conv2DLayer(
            self.sentence_emb_l,
            num_filters=150,
            filter_size=(self.num_words_per_conv, self.wordvecs.vector_size),
            name='conv_sent1',
            nonlinearity=lasagne.nonlinearities.rectify,
        )
        
        self.sentence_pool_l = lasagne.layers.MaxPool2DLayer(
            self.sentence_conv_l,
            name='maxing_sent1',
            pool_size=(self.num_words_to_use - self.num_words_per_conv, 1),
        )
        
        self.combined_l = lasagne.layers.ConcatLayer(
            (self.link_emb_l, self.sentence_pool_l,)
        )
        
        self.dropped_l = lasagne.layers.DropoutLayer(
            self.combined_l,
            p=.25,
        )
        
        self.dense1_l = lasagne.layers.DenseLayer(
            self.dropped_l,
            num_units=100,
            name='dens1',
            nonlinearity=lasagne.nonlinearities.rectify,
        )
        
        self.dropped2_l = lasagne.layers.DropoutLayer(
            self.dense1_l,
            p=.25
        )
        
        self.out_l = lasagne.layers.DenseLayer(
            self.dropped2_l,
            num_units=2,
            name='dens2',
            nonlinearity=lasagne.nonlinearities.rectify,
        )
        
        self.output_vec = lasagne.layers.get_output(self.out_l)
        self.result_vec = self.output_vec[:,0] - self.output_vec[:,1]
        self.loss_vec = T.nnet.binary_crossentropy(T.clip(self.result_vec + .5, .001, .999), self.y_batch)
        self.output_diff = T.neq(self.result_vec > 0, self.y_batch > .5)
        
        self.all_params = lasagne.layers.get_all_params(self.out_l)
        self.updates = lasagne.updates.adagrad(self.loss_vec.mean(), self.all_params, .01)  # TODO: variable learning rate??
        
        self.train_func = theano.function(
            [self.x_words_batch, self.x_links_batch, self.y_batch],
            [self.loss_vec.sum(), self.output_diff.sum(), self.loss_vec.mean(), self.loss_vec],
            updates=self.updates
        )
        
        self.loss_func = theano.function(
            [self.x_words_batch, self.x_links_batch, self.y_batch],
            [self.loss_vec.sum(), self.loss_vec, self.output_diff.sum()]
        )
        
        
    def train(self):
        self.run_training = True
        self.loss_sum = 0.0
        self.diff_sum = 0
        self.sample_cnt = 0
        self.current_batch = []
        
        self.read()
        
        r = self.train_cnt, float(self.loss_sum) / self.sample_cnt, float(self.diff_sum) / self.sample_cnt
        self.train_cnt += 1
        self.train_res.append(r)
        return r
        tr
        
    def readPage(self, title, content):
        # would be nice to use tf-idf here for the words from the document that should look at, but then won't have that much meanning....
        links = self.getLinkTargets(content)
        words = self._wikiToText(content).split()[:self.num_words_to_use]
        wordsv = self.wordvecs.tokenize(words)
        titlev = self.wordvecs.get_location(title)
        linksv = self.wordvecs.tokenize(links)
        if self.run_training:
            for l in linksv:
                self.current_batch.append((titlev, wordsv, l, 1))
            for l in random.sample(self.page_titles, len(linksv)*self.num_negative_samples):
                self.current_batch.append((titlev, wordsv, l, 0))
            
            if len(self.current_batch) >= self.batch_size:
                pass  # TODO:
        else:
            self.page_titles.add(titlev)
        
    def train_batch(self):
        labels = np.array([r[3] for r in self.current_batch]).astype('int32')
        targets = np.array([[r[2]] for r in self.current_batch]).astype('int32')
        words = np.array([r[1] for r in self.current_batch]).astype('int32')
        
        loss_sum, diff_sum, _, _ = self.train_func(words, targets, labels)
        self.loss_sum += loss_sum
        self.diff_sum += diff_sum
        self.sample_cnt += len(self.current_batch)
        self.current_batch = []

        
wikiexp = WikiLinkingExp('../enwiki-test-small.xml')

In [17]:
len(wikiexp.page_titles)

276

In [18]:
wikiexp.train()

ZeroDivisionError: float division by zero

In [19]:
len(wikiexp.current_batch)

110400

In [None]:
np.array([[r[2]] for r in wikiexp.current_batch]).shape

In [None]:
wikiexp.train_batch()