In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from theano import *
from lasagne.layers import InputLayer, get_output
import lasagne
import lasagne.layers
import theano.tensor as T
import theano
import numpy as np

import re
import random

Couldn't import dot_parser, loading of dot files will not be possible.


In [3]:
from wikireader import WikipediaReader, WikiRegexes

In [4]:
from wordvecs import WordVectors, EmbeddingLayer

# Load the word vectors and the redirect table from the enwiki-20141208 files.
# sentence_length=200 is read back by WikiLinkingExp.__init__ as the number of
# words taken from each page.
# NOTE(review): the absolute /data/matthew/... paths tie this cell to one machine.
#wordvectors = WordVectors(fname='../GoogleNews-vectors-negative300.bin', negvectors=False)
wordvectors = WordVectors(
    fname='/data/matthew/enwiki-20141208-pages-articles-multistream-links4-output1.bin',
    redir_fname='/data/matthew/enwiki-20141208-pages-articles-multistream-redirects4.json',
    negvectors=True,
    sentence_length=200
)

In [5]:
# Vocabulary size right after loading, before any WikiLinkingExp is constructed.
len(wordvectors.vectors)

5007247

In [6]:
# Snapshot the current keys so later cells can tell which entries were added afterwards.
org_wvectors = set(wordvectors.vectors.keys())

In [7]:
# Size of the redirect table exposed by the loaded WordVectors object.
len(wordvectors.redirects)

6307754

In [8]:
class WikiLinkingExp(WikipediaReader, WikiRegexes):
    """Train a small CNN that scores (page text, link target) pairs.

    Every page handed to :meth:`readPage` is turned into token ids for its
    leading words and for its link targets.  Real links become positive
    samples (label 1); page titles drawn at random from the titles collected
    during the initial pass become negative samples (label 0).  Samples are
    accumulated in ``current_batch`` and pushed through ``train_func`` once
    ``batch_size`` of them are available.

    ``read()`` and ``getLinkTargets()`` / ``_wikiToText()`` come from the base
    classes; ``read()`` is presumably what invokes ``readPage`` for each page
    of the dump -- TODO confirm against WikipediaReader.
    """

    run_training = False
    num_words_to_use = 200  # overwritten in __init__ with wordvecs.sentence_length
    batch_size = 20000
    num_negative_samples = 1
    num_words_per_conv = 3

    # Cached sequence copy of ``page_titles`` used for negative sampling
    # (rebuilt lazily by _get_negative_pool whenever page_titles has grown).
    _negative_pool = ()

    def __init__(self, fname, wordvecs=wordvectors):
        """Read the dump once to collect page titles, then build the network.

        :param fname: path of the Wikipedia dump, passed to the base reader.
        :param wordvecs: object providing ``redirects``, ``sentence_length``,
            ``vector_size``, ``tokenize``, ``get_location`` and
            ``get_numpy_matrix``.  The default is the module-level
            ``wordvectors`` as it existed when this class was defined.
        """
        super(WikiLinkingExp, self).__init__(fname)

        self.redirects = wordvecs.redirects
        #self.page_titles = set(wordvecs.vectors.keys())

        self.wordvecs = wordvecs
        self.current_batch = []
        self.page_titles = set()
        self.num_words_to_use = self.wordvecs.sentence_length

        # do an initial load of the data (run_training is False here, so this
        # pass only fills page_titles)
        self.read()

        self._setup()

        self.train_cnt = 0
        self.train_res = []

    def _loss_expressions(self, output_vec):
        """Return ``(result_vec, loss_vec, output_diff)`` for a network output.

        ``result_vec`` is the difference of the two output units.  It is shifted
        by .5 and clipped into (0, 1) before being used as the probability for
        the binary cross-entropy, so ``result_vec > 0`` is the matching decision
        threshold used for ``output_diff`` (1 where the prediction disagrees
        with the label).
        """
        result_vec = output_vec[:, 0] - output_vec[:, 1]
        loss_vec = T.nnet.binary_crossentropy(
            T.clip(result_vec + .5, .001, .999), self.y_batch)
        output_diff = T.neq(result_vec > 0, self.y_batch > .5)
        return result_vec, loss_vec, output_diff

    def _setup(self):
        """Build the Lasagne graph and compile ``train_func`` and ``loss_func``.

        ``train_func(words, links, labels)`` applies an Adagrad update and
        returns ``[loss sum, error count, mean loss, per-sample loss]``.
        ``loss_func(words, links, labels)`` returns
        ``[loss sum, per-sample loss, error count]`` without updating weights
        and with dropout disabled.
        """
        self.y_batch = T.ivector('y_labels')
        self.x_words_batch = T.imatrix('x_words')
        self.x_links_batch = T.imatrix('y_links')

        self.sentence_l = InputLayer((None, self.num_words_to_use), input_var=self.x_words_batch)
        self.link_l = InputLayer((None, 1), input_var=self.x_links_batch)

        # One shared embedding matrix serves both the sentence and link inputs.
        self.embedding_W = theano.shared(self.wordvecs.get_numpy_matrix())

        self.sentence_emb_l = EmbeddingLayer(
            self.sentence_l,
            W=self.embedding_W,
            add_word_params=False,
        )

        self.link_emb_l = EmbeddingLayer(
            self.link_l,
            W=self.embedding_W,
            add_word_params=False,
        )

        # The filter spans the full vector width, so it slides over words only.
        # (assumes EmbeddingLayer emits a 4D (batch, 1, words, vector_size)
        # tensor -- TODO confirm against wordvecs.EmbeddingLayer)
        self.sentence_conv_l = lasagne.layers.Conv2DLayer(
            self.sentence_emb_l,
            num_filters=150,
            filter_size=(self.num_words_per_conv, self.wordvecs.vector_size),
            name='conv_sent1',
            nonlinearity=lasagne.nonlinearities.rectify,
        )

        # An unpadded convolution over num_words_to_use rows with a filter of
        # num_words_per_conv rows produces this many positions.  Pooling over
        # all of them (the original window was one short, which dropped the
        # last position) gives a max-over-time feature per filter.
        num_conv_positions = self.num_words_to_use - self.num_words_per_conv + 1

        self.sentence_pool_l = lasagne.layers.MaxPool2DLayer(
            self.sentence_conv_l,
            name='maxing_sent1',
            pool_size=(num_conv_positions, 1),
        )

        self.combined_l = lasagne.layers.ConcatLayer(
            (lasagne.layers.FlattenLayer(self.link_emb_l), lasagne.layers.FlattenLayer(self.sentence_pool_l),)
        )

        self.dropped_l = lasagne.layers.DropoutLayer(
            self.combined_l,
            p=.25,
        )

        self.dense1_l = lasagne.layers.DenseLayer(
            self.dropped_l,
            num_units=100,
            name='dens1',
            nonlinearity=lasagne.nonlinearities.rectify,
        )

        self.dropped2_l = lasagne.layers.DropoutLayer(
            self.dense1_l,
            p=.25
        )

        # NOTE(review): a rectified 2-unit output combined with the clip in
        # _loss_expressions can give zero gradients (dead units / saturated
        # clip); left unchanged here because altering it changes the model.
        self.out_l = lasagne.layers.DenseLayer(
            self.dropped2_l,
            num_units=2,
            name='dens2',
            nonlinearity=lasagne.nonlinearities.rectify,
        )

        # Stochastic graph (dropout active) used for training.
        self.output_vec = lasagne.layers.get_output(self.out_l)
        self.result_vec, self.loss_vec, self.output_diff = self._loss_expressions(self.output_vec)

        # Deterministic graph (dropout disabled) used for evaluation.
        self.eval_output_vec = lasagne.layers.get_output(self.out_l, deterministic=True)
        self.eval_result_vec, self.eval_loss_vec, self.eval_output_diff = \
            self._loss_expressions(self.eval_output_vec)

        self.all_params = lasagne.layers.get_all_params(self.out_l)
        self.updates = lasagne.updates.adagrad(self.loss_vec.mean(), self.all_params, .01)  # TODO: variable learning rate??

        self.train_func = theano.function(
            [self.x_words_batch, self.x_links_batch, self.y_batch],
            [self.loss_vec.sum(), self.output_diff.sum(), self.loss_vec.mean(), self.loss_vec],
            updates=self.updates
        )

        self.loss_func = theano.function(
            [self.x_words_batch, self.x_links_batch, self.y_batch],
            [self.eval_loss_vec.sum(), self.eval_loss_vec, self.eval_output_diff.sum()]
        )

    def train(self):
        """Run one full training pass over the dump.

        :returns: ``(pass index, mean loss per sample, error rate)``; the two
            averages are ``nan`` when the pass produced no samples.  The tuple
            is also appended to ``train_res``.
        """
        self.run_training = True
        self.loss_sum = 0.0
        self.diff_sum = 0
        self.sample_cnt = 0
        self.current_batch = []

        self.read()
        # Flush whatever is left over from the last, partially filled batch.
        if len(self.current_batch) > 0:
            self.train_batch()

        if self.sample_cnt > 0:
            mean_loss = float(self.loss_sum) / self.sample_cnt
            error_rate = float(self.diff_sum) / self.sample_cnt
        else:
            # Nothing was trained on; avoid a ZeroDivisionError.
            mean_loss = error_rate = float('nan')

        r = self.train_cnt, mean_loss, error_rate
        self.train_cnt += 1
        self.train_res.append(r)
        return r

    def _get_negative_pool(self):
        """Return ``page_titles`` as a sequence, rebuilding the cache if it grew.

        ``random.sample`` needs a sequence; copying the set once instead of on
        every page keeps negative sampling cheap.  ``page_titles`` is only ever
        added to, so a length mismatch is enough to detect a stale cache.
        """
        if len(self._negative_pool) != len(self.page_titles):
            self._negative_pool = list(self.page_titles)
        return self._negative_pool

    def readPage(self, title, content):
        """Handle one page: collect its title, or queue training samples for it.

        Outside training the page's title location is added to ``page_titles``.
        During training one positive sample per link and
        ``num_negative_samples`` random negatives per link are queued, and a
        batch is trained as soon as ``batch_size`` samples are waiting.
        """
        # would be nice to use tf-idf here for the words from the document that should look at, but then won't have that much meaning....
        links = [r[0] for r in self.getLinkTargets(content)]
        words = self._wikiToText(content).split()[:self.num_words_to_use]
        wordsv = self.wordvecs.tokenize(words)
        titlev = self.wordvecs.get_location(title)
        linksv = self.wordvecs.tokenize(links)

        # Last-seen values, kept as attributes for interactive inspection.
        self.d_words = words
        self.d_links = links
        self.d_wordsv = wordsv
        self.d_titlev = titlev

        if self.run_training:
            for l in linksv:
                self.current_batch.append((titlev, wordsv, l, 1))

            # Never ask for more negatives than there are known titles;
            # random.sample would raise ValueError and abort the pass.
            pool = self._get_negative_pool()
            num_negatives = min(len(linksv) * self.num_negative_samples, len(pool))
            for l in random.sample(pool, num_negatives):
                self.current_batch.append((titlev, wordsv, l, 0))

            if len(self.current_batch) >= self.batch_size:
                self.train_batch()
        else:
            self.page_titles.add(titlev)

    def train_batch(self):
        """Train on ``current_batch``, update the running totals, and clear it."""
        if not self.current_batch:
            return

        # Each queued sample is (title id, word ids, link id, label).
        labels = np.array([r[3] for r in self.current_batch]).astype('int32')
        targets = np.array([[r[2]] for r in self.current_batch]).astype('int32')
        words = np.array([r[1] for r in self.current_batch]).astype('int32')

        loss_sum, diff_sum, _, _ = self.train_func(words, targets, labels)
        self.loss_sum += loss_sum
        self.diff_sum += diff_sum
        self.sample_cnt += len(self.current_batch)
        self.current_batch = []


In [9]:
# Build the experiment: the constructor does a first read() pass over this dump
# (collecting page titles) and then compiles the Theano functions.
wikiexp = WikiLinkingExp('/data/matthew/enwiki-1e7_lines.xml')



In [10]:
# Number of distinct title locations collected during the initial read() pass.
len(wikiexp.page_titles)

3921

In [11]:
# Net number of keys the vocabulary has gained since the org_wvectors snapshot.
len(wordvectors.vectors) - len(org_wvectors)

61033

In [12]:
# Length of wordvectors.reverse_word_location (its first entries are shown in a later cell).
len(wordvectors.reverse_word_location)

282419

In [13]:
# Keys present now that were not in the org_wvectors snapshot (order is arbitrary: it comes from a set).
new_words = list(set(wordvectors.vectors) - org_wvectors)

In [14]:
# Peek at up to 200 of the newly added keys.
new_words[:200]

['michael_gross__lrb_actor_rrb_',
 'rhys_fawr_ap_maredudd',
 'imagenm142_x_3jpg',
 'categorykings_of_rome',
 'filealtair_8800_computerjpg',
 'httpbabylon5warnerbroscom',
 'filesc_oak_oti601_mozartjpg',
 'common_lawcontrasts_between_common_law_and_civil_law_systems',
 'Anatoly Karpov',
 u'afi039s_100_years100_movies',
 'filered_wine_glassjpg',
 'fileleredoutablephotojpg',
 'robert_walker__lrb_comics_rrb_',
 'parabolic_arc',
 'spartak_stadium__lrb_kyrgyzstan_rrb_',
 'categorytreaties_of_seychelles',
 'Convention (norm)',
 'terrace__lrb_stadium_rrb_',
 'certified_credit_professional',
 'fileelectron_dotsvg',
 'categoryantiradiation_missiles_of_the_united_states',
 'filebaburnama_illustrationjpg',
 'filea_chronicle_of_england__page_050__alfred_in_the_neatherds_cottagejpg',
 'khomska',
 u'college_of_cambridge',
 'possession_island__lrb_queensland_rrb_',
 u'burnside039s_lemma',
 u'boot_money',
 'filelancaster_617_sqn_raf_dropping_grand_slam_bomb_on_arnsberg_viaduct_1945jpg',
 'filehkljpg',
 

In [None]:
# Look up the vector of one of the newly added keys; the recorded output shows only zeros.
wordvectors['michael_gross__lrb_actor_rrb_']

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0

In [None]:
# Run one training pass over the dump; returns (pass index, mean loss, error rate).
wikiexp.train()

In [None]:
# Run 15 more passes; each completed pass appends its summary tuple to wikiexp.train_res.
for i in xrange(15):
    wikiexp.train()

In [19]:
# Samples trained on so far in the current/last train() pass (reset to 0 when train() starts).
# NOTE(review): this cell appears twice with execution count 19 and different
# outputs, so the notebook was run out of order; re-run top to bottom before trusting it.
wikiexp.sample_cnt

1540000

In [18]:
# Per-pass summaries. A tuple is appended only when train() runs to completion,
# so the recorded [] means no pass had finished when this cell was run.
wikiexp.train_res

[]

In [19]:
# Samples trained on so far in the current/last train() pass.
wikiexp.sample_cnt

620000

In [20]:
# Mean per-sample loss of the pass in progress (same quantity train() reports on completion).
# NOTE(review): with one negative per positive the task is balanced, and the
# recorded ~0.704 is close to ln 2 (~0.693), i.e. roughly chance level.
wikiexp.loss_sum / wikiexp.sample_cnt

0.70413663586989894

In [21]:
# Fraction of samples whose predicted sign disagreed with the label in the pass in progress.
# NOTE(review): the recorded ~0.52 is about chance for a balanced binary task.
float(wikiexp.diff_sum) / wikiexp.sample_cnt

0.5227766233766233

In [21]:
# Vocabulary growth relative to a hard-coded baseline.
# NOTE(review): the origin of 4850513 is not shown in this notebook (it is not
# the size recorded right after loading); consider len(org_wvectors) instead.
len(wordvectors.vectors) - 4850513

2010488

In [23]:
# First 200 entries of the index-to-word list; index 0 holds None in the recorded output.
wordvectors.reverse_word_location[:200]

[None,
 'anarchism',
 'is',
 'a',
 'political',
 'philosophy',
 'that',
 'advocates',
 'stateless',
 'societies',
 'often',
 'defined',
 'as',
 'selfgoverned',
 'voluntary',
 'institutions',
 'but',
 'several',
 'authors',
 'have',
 'more',
 'specific',
 'based',
 'on',
 'nonhierarchical',
 'free',
 'associations',
 'holds',
 'the',
 'state',
 'to',
 'be',
 'undesirable',
 'unnecessary',
 'or',
 'harmfulthe',
 'following',
 'sources',
 'cite',
 'while',
 'antistatism',
 'central',
 'entails',
 'opposing',
 'authority',
 'hierarchical',
 'organisation',
 'in',
 'conduct',
 'of',
 'human',
 'relations',
 'including',
 'not',
 'limited',
 'systemas',
 'subtle',
 'and',
 'antidogmatic',
 'draws',
 'many',
 'currents',
 'thought',
 'strategy',
 'does',
 'offer',
 'fixed',
 'body',
 'doctrine',
 'from',
 'single',
 'particular',
 'world',
 'view',
 'instead',
 'fluxing',
 'flowing',
 'there',
 'are',
 'types',
 'traditions',
 'all',
 'which',
 'mutually',
 'exclusive',
 'anarchist',
 'school