In [72]:
import re

class WikipediaReader(object):
    
    title_rg = re.compile('.*<title>(.*)</title>.*')
    link_rg = re.compile('\[\[([^\]]*)\]\]')
    redirect_rg = re.compile('.*<redirect title="(.*)" />')
    
    def __init__(self, fname):
        self.fname = fname
        
    def read(self):
        current_page = None
        look_for_next_page = True
        page_text = None

        title_rg = self.title_rg
      
        with open(self.fname) as f:
            try:
                while True:
                    line = f.next()
                    if look_for_next_page:
                        if '<page>' not in line:
                            continue
                        else:
                            look_for_next_page = False
                    if '<title>' in line:
                        current_page = title_rg.match(line).group(1)
                    elif '<redirect' in line:
                        redirect_page = self.redirect_rg.match(line).group(1)
                        self.readRedirect(current_page, redirect_page)
                        look_for_next_page = True
                    elif '<text' in line:
                        lines = [ line[line.index('>')+2:] ]
                        while True:
                            line = f.next()
                            if '</text>' in line:
                                lines.append(line[:line.index('</text>')])
                                look_for_next_page = True
                                page_text = '\n'.join(lines)
                                self.readPage(current_page, page_text)
                                break
                            else:
                                lines.append(line)
            except StopIteration as e:
                pass
            
    def getLinkTargets(self, content):
        ret = self.link_rg.findall(content)
        def s(v):
            a = v.split('|')
            pg = a[0].replace(' ', '_').lower()
            txt = a[-1]
            return pg, txt
        return [s(r) for r in ret]
            
    def readPage(self, title, content):
        pass
    
    def readRedirect(self, title, target):
        pass
        


In [2]:
from wordvecs import WordVectors, EmbeddingLayer

#wordvectors = WordVectors(fname='../GoogleNews-vectors-negative300.bin', negvectors=False)
wordvectors = WordVectors(fname='../enwiki-20141208-pages-articles-multistream-links-output5.bin', negvectors=True)

Couldn't import dot_parser, loading of dot files will not be possible.


In [3]:
len(wordvectors.vectors)

4850513

In [77]:
import random

class WikiLinkingExp(WikipediaReader):
    
    wiki_re = [
        (re.compile('&amp;'), '&'),
        (re.compile('&lt;'), '<'),
        (re.compile('&gt;'), '>'),
        (re.compile('<ref[^<]*<\/ref>'), ''),
        (re.compile('<.*?>'), ''),
        (re.compile('\[http:[^\] ]*'), ''),
        (re.compile('\|(thumb|left|right|\d+px)', re.IGNORECASE), ''),
        (re.compile('\[\[image:[^\[\]]*\|', re.IGNORECASE), ''),
        (re.compile('\[\[category:([^|\]]*)[^]]*\]\]', re.IGNORECASE), '\\1'),
        (re.compile('\[\[[a-z\-]*:[^\]]\]\]'), ''),
        (re.compile('\[\[[^\|\]]*\|'), '[['),
        (re.compile('{{[^}]*}}'), ''),
        (re.compile('{[^}]*}'), ''),
        (re.compile('[\[|\]]'), ''),
        (re.compile('&[^;]*;'), ' '),
        (re.compile('[^a-zA-Z0-9 ]'), ''),
        (re.compile('\n+'), ' ')
        # TODO: clean up some remaining issues with parsing the wiki text
    ]
    
    @classmethod
    def _wikiToText(cls, txt):
        txt = txt.lower()
        for r in cls.wiki_re:
            txt = r[0].sub(r[1], txt)
        return txt
    
    run_training = False
    num_words_to_use = 200
    batch_size = 500
    num_negative_samples = 1
    
    def __init__(self, fname, wordvecs=wordvectors):
        super(WikiLinkingExp, self).__init__(fname)
        self.wordvecs = wordvecs
        self.current_batch = []
        self.page_titles = set()
        
        # do an inital load of the data
        self.read()
        
    
    def train(self):
        self.run_training = True
        self.read()
        
    t_content = []
        
    def readPage(self, title, content):
        # would be nice to use tf-idf here for the words from the document that should look at, but then won't have that much meanning....
        links = self.getLinkTargets(content)
        words = self._wikiToText(content).split()[:self.num_words_to_use]
        self.t_words = words
        self.t_content.append(content)
        wordsv = self.wordvecs.tokenize(words)
        self.t_wordsv = wordsv
        titlev = self.wordvecs.get_location(title)
        linksv = self.wordvecs.tokenize(links)
        if self.run_training:
            for l in linksv:
                self.current_batch.append((titlev, wordsv, l, 1))
            for l in random.sample(self.page_titles, len(linksv)*self.num_negative_samples):
                self.current_batch.append((titlev, wordsv, l, 0))
            
            if len(self.current_batch) >= self.batch_size:
                pass  # TODO:
        else:
            self.page_titles.add(titlev)
        
    def train_batch(self):
        print self.current_batch[0]
        self.current_batch = []
            

In [78]:
wikiexp = WikiLinkingExp('../enwiki-test-small.xml')

In [79]:
wikiexp.train()

In [70]:
len(wikiexp.current_batch)

5100

In [83]:
wikiexp.current_batch[0]

(49,
 [10,
  127,
  13,
  584,
  1423,
  364,
  1424,
  1425,
  1426,
  1427,
  1172,
  448,
  1428,
  1429,
  1430,
  222,
  364,
  1222,
  1431,
  19,
  1172,
  448,
  132,
  1432,
  1430,
  950,
  22,
  1433,
  1434,
  1435,
  10,
  1436,
  23,
  445,
  217,
  516,
  1437,
  1438,
  42,
  1439,
  1440,
  1441,
  1442,
  10,
  448,
  13,
  584,
  1423,
  45,
  1443],
 50,
 1)

In [82]:
WikiLinkingExp._wikiToText(wikiexp.t_content[0]).split()[:200]

['anarchism',
 'is',
 'a',
 'political',
 'philosophy',
 'that',
 'advocates',
 'stateless',
 'societies',
 'often',
 'defined',
 'as',
 'selfgoverned',
 'voluntary',
 'institutions',
 'but',
 'that',
 'several',
 'authors',
 'have',
 'defined',
 'as',
 'more',
 'specific',
 'institutions',
 'based',
 'on',
 'nonhierarchical',
 'free',
 'associations',
 'anarchism',
 'holds',
 'the',
 'state',
 'to',
 'be',
 'undesirable',
 'unnecessary',
 'or',
 'harmfulthe',
 'following',
 'sources',
 'cite',
 'anarchism',
 'as',
 'a',
 'political',
 'philosophy',
 'while',
 'antistatism',
 'is',
 'central',
 'anarchism',
 'entails',
 'opposing',
 'authority',
 'or',
 'hierarchical',
 'organisation',
 'in',
 'the',
 'conduct',
 'of',
 'human',
 'relations',
 'including',
 'but',
 'not',
 'limited',
 'to',
 'the',
 'state',
 'systemas',
 'a',
 'subtle',
 'and',
 'antidogmatic',
 'philosophy',
 'anarchism',
 'draws',
 'on',
 'many',
 'currents',
 'of',
 'thought',
 'and',
 'strategy',
 'anarchism',
 'd