**Labels pool preparation**

The labels pool is created from a raw Wikipedia dump: it consists of the titles of
Wikipedia portals (pages whose titles start with `Portal:`).

In [2]:
# Download the raw English Wikipedia dump (2018-03-01 snapshot, pages-articles, bz2-compressed).
# --continue lets wget resume a partially downloaded file instead of starting from scratch.
!wget --continue https://dumps.wikimedia.org/enwiki/20180301/enwiki-20180301-pages-articles.xml.bz2

--2018-04-02 04:55:31--  https://dumps.wikimedia.org/enwiki/20180301/enwiki-20180301-pages-articles.xml.bz2
Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 208.80.154.11, 2620:0:861:1:208:80:154:11
Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|208.80.154.11|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14833155655 (14G) [application/octet-stream]
Saving to: ‘enwiki-20180301-pages-articles.xml.bz2’


2018-04-02 06:56:46 (1.94 MB/s) - ‘enwiki-20180301-pages-articles.xml.bz2’ saved [14833155655/14833155655]



In [None]:
# Create the labels pool from Wikipedia Portal titles
# and store it in labels.txt, one label per line.
import bz2
import gensim

# title prefix that identifies a portal page
ns = 'Portal:'

labels = set()

# The context manager closes the (14 GB) compressed dump even if parsing fails half-way.
with bz2.BZ2File('enwiki-20180301-pages-articles.xml.bz2') as f:

    # create iterator traversing the pages
    # NOTE(review): ('0',) is passed as the namespace filter; presumably it only blanks the
    # text of other namespaces and portal titles are still yielded - confirm for the
    # installed gensim version.
    pages = gensim.corpora.wikicorpus.extract_pages(f, ('0',))

    for page in pages:
        # Depending on the gensim version a page is a (title, text) or a
        # (title, text, page id) tuple, so index the title instead of unpacking
        # a fixed number of fields.
        title = page[0]
        if not title or not title.startswith(ns):
            continue

        # prune the title: drop the prefix, keep the part before the first '/' (sub-pages),
        # lower-case it and trim surrounding whitespace
        label = title[len(ns):].split('/')[0].lower().strip()
        if label:
            labels.add(label)

# duplicates were removed by the set; sort so the output file is reproducible between runs
labels = sorted(labels)

# save labels to a file
with open('labels.txt', 'w', encoding='utf-8') as labels_file:
    for label in labels:
        labels_file.write(label + '\n')

**Build special word embeddings: a unified embedding space for words and for labels from the label pool**

Labels from the label pool are converted to phrases, i.e. single tokens whose words are joined by the `_` delimiter.

In [None]:
# Load the preprocessed English Wikipedia texts ("wiki-english-20171001") through the
# gensim downloader; they are the training corpus for the word embeddings built below.
import gensim.downloader
# preprocess_string is used later by LabelsToPhrases to clean up the article texts
from gensim.parsing.preprocessing import preprocess_string

# Iterating `data` yields articles; later cells read their 'title', 'section_titles'
# and 'section_texts' fields.
data = gensim.downloader.load("wiki-english-20171001")

In [None]:
# Prepare two temporary lists from labels.txt (one label per line):
# - long_labels:  multi-word labels (words separated by single blanks),
#                 used to find and replace these labels in the texts
# - labels_vocab: every label with its words joined by the delimiter (_),
#                 used to search the closest embeddings within this list
long_labels = []
labels_vocab = []

# labels.txt was written as UTF-8, so read it back with the same encoding;
# in Python 3 the lines are already str and must not be .decode()d
with open('labels.txt', encoding='utf-8') as labels_file:
    for line in labels_file:
        # split() without arguments drops the line ending and collapses repeated blanks
        words = line.split()

        # skip blank lines instead of producing empty labels
        if not words:
            continue

        if len(words) > 1:
            long_labels.append(' '.join(words))

        labels_vocab.append('_'.join(words))

In [None]:
# Iterator replacing labels in articles with corresponding phrases
#
# traverse articles
#     in each article, find multi-word labels from the label pool and emit each of them
#     as a single phrase token (words joined by '_')
#     clean up the text between the labels with preprocess_string
#     yield the token list of the processed article
import re


class LabelsToPhrases(object):
    """Re-iterable stream of tokenized articles in which multi-word labels are single tokens.

    Parameters
    ----------
    corpus : iterable of dict
        Articles providing the keys 'title', 'section_titles' and 'section_texts'.
    labels : iterable of str, optional
        Multi-word labels to turn into phrases. Defaults to the notebook-level
        `long_labels` list, so `LabelsToPhrases(corpus)` keeps working as before.

    Yields
    ------
    list of str
        Tokens of one article: `preprocess_string` output for the ordinary text,
        plus one '_'-joined token per label occurrence.
    """

    def __init__(self, corpus, labels=None):
        self.corpus = corpus

        if labels is None:
            labels = long_labels

        # the article text is lower-cased and whitespace-normalized, so do the same to labels
        normalized = {' '.join(label.lower().split()) for label in labels}
        normalized.discard('')

        # Longest label first: regex alternation takes the first alternative that matches,
        # so 'new york city' must be tried before 'new york'.
        ordered = sorted(normalized, key=lambda label: (-len(label), label))

        if ordered:
            alternatives = '|'.join(re.escape(label) for label in ordered)
            # The look-arounds keep a label from matching inside a longer word
            # (e.g. 'new york' inside 'anew yorkshire'). One compiled pattern also means
            # one scan of the text instead of one str.replace() scan per label.
            self._label_re = re.compile(r'(?<!\w)(' + alternatives + r')(?!\w)')
        else:
            # an empty alternation would match everywhere, so disable matching instead
            self._label_re = None

    def __iter__(self):
        for article in self.corpus:

            # concatenate all texts related to an article - article title, section titles
            # and section texts; every part is separated by a blank so that the last word
            # of a section title is not fused with the first word of its text
            parts = [article['title']]
            for section_title, section_text in zip(article['section_titles'], article['section_texts']):
                parts.append(section_title)
                parts.append(section_text)

            # clean-up: remove multiple blanks, lower()
            text = ' '.join(' '.join(parts).lower().split())

            yield self._tokenize(text)

    def _tokenize(self, text):
        """Split normalized `text` into tokens, keeping each label occurrence as one phrase."""
        if self._label_re is None:
            return preprocess_string(text)

        tokens = []
        # With a single capturing group, split() returns plain text at even positions
        # and the matched labels at odd positions.
        for position, piece in enumerate(self._label_re.split(text)):
            if position % 2:
                # Phrase tokens bypass preprocess_string on purpose: its default filters
                # strip punctuation (which includes '_') and stem, undoing the phrase.
                tokens.append('_'.join(piece.split()))
            elif piece:
                tokens.extend(preprocess_string(piece))
        return tokens

In [None]:
# callback to save w2v model on end of each training iteration
from gensim.models.callbacks import CallbackAny2Vec

class EpochSaver(CallbackAny2Vec):
    """Training callback that stores a checkpoint of the model whenever an epoch finishes.

    Checkpoints are written to 'epoch<N>.model', where N is the number of epochs this
    callback instance has already seen (starting at 0).
    """

    def __init__(self):
        # epochs completed so far; also the suffix of the next checkpoint file
        self.epoch = 0

    def on_epoch_end(self, model):
        """Save `model` under an epoch-numbered file name and advance the epoch counter."""
        checkpoint_path = 'epoch{}.model'.format(self.epoch)
        print("Save model to {}".format(checkpoint_path))
        model.save(checkpoint_path)
        self.epoch = self.epoch + 1


# one callback instance, passed to the Word2Vec models built in the cells below
epoch_saver = EpochSaver()

In [None]:
# Build the w2v model including labels.
# Approach #1 - straightforward: train directly on the phrase-annotated articles.
# TAKES TOO LONG, NEVER ENDS
from gensim.models import Word2Vec, KeyedVectors
from multiprocessing import cpu_count

# Passing the corpus to the constructor builds the vocabulary and trains in one go;
# one worker thread per CPU core, a checkpoint is saved after every epoch.
embedding = Word2Vec(
    sentences=LabelsToPhrases(data),
    min_count=1,
    workers=cpu_count(),
    callbacks=[epoch_saver],
)

# only the word vectors are needed from here on
word_vectors = embedding.wv

# save the embeddings
embedding.wv.save('embeddings')

In [None]:
# build w2v model including labels
# Approach #2 - take the vocabulary from a pre-trained w2v model instead of scanning the corpus
#
# Word2Vec.reset_from() expects another Word2Vec model, but the downloader returns bare
# KeyedVectors, which is why the original call failed with
#     'Word2VecKeyedVectors' object has no attribute 'vocabulary'
# The vocabulary is therefore built from the reference word list with build_vocab_from_freq().
from gensim.models import Word2Vec, KeyedVectors
from multiprocessing import cpu_count

ref_w2v = gensim.downloader.load("word2vec-google-news-300")

# word list of the reference model (the attribute was renamed index2word -> index_to_key in gensim 4)
ref_words = getattr(ref_w2v, 'index_to_key', None) or ref_w2v.index2word

# Vectors loaded from the word2vec format carry no real corpus counts, so use rank-based
# pseudo-frequencies (earlier in the list = higher count).
word_freq = {word: len(ref_words) - rank for rank, word in enumerate(ref_words)}

# make sure every label phrase is part of the vocabulary
for label in labels_vocab:
    word_freq.setdefault(label, 1)

# train() needs the number of training examples; count the articles once (no preprocessing involved)
total_articles = sum(1 for _ in data)

embedding = Word2Vec(min_count=1, workers=cpu_count(), callbacks=[epoch_saver])
embedding.build_vocab_from_freq(word_freq, corpus_count=total_articles)

# NOTE(review): the reference vocabulary is presumably case-sensitive and unstemmed, while
# LabelsToPhrases yields lower-cased tokens cleaned by preprocess_string; tokens missing from
# the vocabulary are ignored during training - verify the coverage before relying on this.
embedding.train(
    LabelsToPhrases(data),
    total_examples=total_articles,   # required by train()
    epochs=embedding.epochs,         # required by train(); use the model's configured value
    callbacks=[epoch_saver],         # passed again so the explicit train() call also checkpoints
)

word_vectors = embedding.wv

# save Embedding model
word_vectors.save('embeddings')

**Test the model**

In [None]:
import numpy as np

def phrases(sents):
    """Yield every token sequence in `sents` as one phrase token joined by '_'."""
    for sent in sents:
        yield '_'.join(sent)

topn = 10
topic_words = ['venezuela', 'ecuador', 'colombia', 'brazil']

# topic vector = mean of the topic word vectors
# (np.mean replaces reduce(), which is not a builtin in Python 3)
topic_vector = np.mean([word_vectors.get_vector(word) for word in topic_words], axis=0)

# Keep only the labels whose phrase token has an embedding; looking up a missing one
# would raise a KeyError.
# NOTE(review): tokens produced by preprocess_string are presumably stemmed, so some
# single-word labels may be missing from the vocabulary in their unstemmed form.
candidate_labels = [label for label in labels if '_'.join(label.split()) in word_vectors]
if not candidate_labels:
    # an empty list must not reach distances(): the result could no longer be mapped to labels
    raise ValueError('none of the labels has an embedding in word_vectors')

# phrases() expects token sequences, so split each label into its words first
candidate_phrases = list(phrases(label.split() for label in candidate_labels))

# distances from the topic vector to every candidate label
dists = word_vectors.distances(topic_vector, other_words=candidate_phrases)

# print the topn closest labels, nearest first
for index in np.argsort(dists)[:topn]:
    print(candidate_labels[index])