# Document embeddings

## Some boilerplate code to kick things off

In [2]:
from executeCell import executeCell
import logging
# Set the root logger's threshold to INFO so that logging.info(...) records
# are not filtered out by the root logger.
logging.getLogger().setLevel(logging.INFO)

## Load labels and data
Instantiating this class creates an iterable that yields the content of each document as a plain Python object.

In [3]:
import json
import os
from os.path import isfile, join
from scandir import scandir


class loadItems(object):
    """Iterable over the JSON documents stored in one directory.

    Every iteration pass scans the directory again and yields the parsed
    content of each regular file whose name ends in ``.json``. Entries are
    produced in whatever order ``scandir`` reports them.
    """

    def __init__(self, wd = 'samples'):
        """Remember *wd*, the directory that will be scanned for documents."""
        self.wd = wd

    def __iter__(self):
        """Yield the ``json.load`` result of every ``*.json`` file in ``wd``."""
        for entry in scandir(self.wd):
            # Skip sub-directories and other non-file entries.
            if not entry.is_file():
                continue
            # Only files with a .json suffix are treated as documents.
            if not entry.name.endswith('.json'):
                continue
            with open(join(self.wd, entry.name), 'r') as handle:
                yield json.load(handle)

## _normalize_ cleans the text
- removes punctuation
- removes stopwords
- removes words that are too long or too short
- singularizes nouns

In [4]:
import re
from textblob import TextBlob
from nltk.corpus import stopwords

def normalize(text, lang='en'):
    """Clean *text* and return it as one space-separated string.

    The steps run in this order:

    1. The characters ``, . ; :`` are replaced by spaces.
    2. NLTK stopwords are dropped when *lang* is ``'de'``, ``'en'``,
       ``'es'`` or ``'fr'``; any other value skips this step. The
       comparison is case-sensitive.
    3. Only words with ``4 <= len(word) < 20`` are kept.
    4. The remaining words are singularized with TextBlob. This step is
       best effort: if it fails, the failure is logged and the text from
       step 3 is returned unchanged.

    Args:
        text: The raw document text.
        lang: Language code that selects the stopword list.

    Returns:
        The cleaned text.
    """
    stopwordCorpora = {
        'de': 'german',
        'en': 'english',
        'es': 'spanish',
        'fr': 'french',
    }
    minLength = 4
    maxLength = 20

    # remove some symbols, then work on the token list from here on
    words = re.sub(r'[,.;:]', r' ', text).split()

    # stopword removal
    corpusName = stopwordCorpora.get(lang)
    if corpusName is not None:
        # Fetch the stopword list once and keep it in a set; looking it up
        # per token and scanning it as a list made this step needlessly
        # slow on long documents.
        stopwordSet = set(stopwords.words(corpusName))
        words = [w for w in words if w not in stopwordSet]

    # drop too long and too short words
    words = [w for w in words if minLength <= len(w) < maxLength]
    text = ' '.join(words)

    # singularize
    try:
        text = ' '.join(TextBlob(text).words.singularize())
    except Exception:
        # Keep the un-singularized text rather than losing the document,
        # but leave a trace instead of failing silently.
        logging.warning('Singularization failed; keeping text as is',
                        exc_info=True)

    return text

## Class that yields _LabeledSentence_ objects
This class produces iterable objects that generate _LabeledSentence_ objects, which can be consumed by the Doc2Vec modelling process. When `normalizeText` is `True`, each document is cleaned with `normalize`. Since this is computationally intensive, it is done in parallel. The number of worker processes is set by the `workers` parameter.

In [5]:
from gensim.models.doc2vec import LabeledSentence
from concurrent.futures import ProcessPoolExecutor
import multiprocessing
from collections import OrderedDict
import os

class LabeledDocument(object):
    """Iterable that turns documents into ``LabeledSentence`` objects.

    Documents come from ``docGenerator``. A document is used only if it has
    the keys ``'lang'``, ``'filename'`` and ``'plaintext'`` and its
    ``'lang'`` equals ``lang``; every other document is skipped.

    Each yielded ``LabeledSentence`` carries the whitespace-split text as
    ``words`` and ``[filename]`` as ``tags``.

    With ``normalizeText`` enabled, the text is passed through ``normalize``
    in a process pool. Results are stored as JSON files under
    ``cache/normalizedDocs/<filename>`` and read back from there on later
    passes. Cached documents are yielded first, in generator order; freshly
    normalized documents follow, in submission order.
    """

    def __init__(self, docGenerator, lang='en', normalizeText=True, workers=1):
        """Store the configuration.

        Args:
            docGenerator: Iterable of documents supporting ``in`` and
                ``[]`` lookups for the keys named in the class docstring.
            lang: Only documents whose ``'lang'`` equals this are used.
            normalizeText: Whether to run ``normalize`` on each text.
            workers: Number of processes in the normalization pool.
        """
        self.docGenerator = docGenerator
        self.lang = lang
        self.normalizeText = normalizeText
        self.workers = workers

    def mkDir(self, dir):
        """Create directory *dir*, including parents, if it is missing."""
        if not os.path.exists(dir):
            os.makedirs(dir)
            logging.debug(dir + ' has been created.')

    def _usableDocs(self):
        """Yield ``(filename, plaintext)`` for each document that qualifies."""
        requiredKeys = ('lang', 'filename', 'plaintext')
        for doc in self.docGenerator:
            incomplete = any(key not in doc for key in requiredKeys)
            if incomplete or doc['lang'] != self.lang:
                logging.debug('Omitting document. Important parameters missing')
                continue
            yield doc['filename'], doc['plaintext']

    def _openCache(self, cacheDir):
        """Return the set of entry names in *cacheDir*, creating it if absent."""
        if os.path.exists(cacheDir):
            logging.debug('Read keySet from cache directory')
            return set(os.listdir(cacheDir))
        self.mkDir(cacheDir)
        logging.debug('Cache is empty. Begin with empty keySet')
        return set()

    def __iter__(self):
        """Yield one ``LabeledSentence`` per usable document."""
        if not self.normalizeText:
            # Same shape as the normalized branches: a list of tokens and a
            # list of tags, not two bare strings.
            for filename, text in self._usableDocs():
                yield LabeledSentence(words=text.split(), tags=[filename])
            return

        cacheDir = os.path.join('cache', 'normalizedDocs')
        keySet = self._openCache(cacheDir)
        futures = OrderedDict()
        pool = ProcessPoolExecutor(self.workers)
        try:
            for filename, text in self._usableDocs():
                cachePath = os.path.join(cacheDir, filename)
                if filename in keySet and os.path.exists(cachePath):
                    # the file has already been normalized, let's
                    # read the cache
                    with open(cachePath) as fh:
                        cached = json.load(fh)
                    logging.debug('Yielded from Cache')
                    yield LabeledSentence(words=cached.split(), tags=[filename])
                else:
                    futures[filename] = pool.submit(normalize, text,
                                                    lang=self.lang)

            for filename, future in futures.items():
                normalized = future.result()
                # Close the cache file before yielding so the entry is
                # fully written even if the consumer stops iterating.
                with open(os.path.join(cacheDir, filename), 'w') as fh:
                    json.dump(normalized, fh)
                logging.debug('Yielded from Calculation')
                yield LabeledSentence(words=normalized.split(), tags=[filename])
        finally:
            # Runs on normal completion, on errors and when the consumer
            # abandons the generator: drop work that has not started yet
            # and release the worker processes.
            for future in futures.values():
                future.cancel()
            pool.shutdown()

INFO:summa.preprocessing.cleaner:'pattern' package not found; tag filters are not available for English


## Document embedding
After building the vocabulary, the model is trained over ten iterations.

In [None]:
import gensim

logging.info('')

# Corpus iterable: documents from loadItems(), normalized with as many
# worker processes as multiprocessing reports CPUs.
it = LabeledDocument(loadItems(), workers=multiprocessing.cpu_count())

# alpha and min_alpha start out equal, i.e. a fixed learning rate.
model = gensim.models.Doc2Vec(
    size=300,
    window=10,
    min_count=3,
    workers=8,
    alpha=0.025,
    min_alpha=0.025,
)

logging.info('Building vocabulary...')
model.build_vocab(it)

# Ten training passes; the learning rate is lowered by hand after each pass
# and min_alpha is pinned to it so there is no decay within a pass.
for i, epoch in enumerate(range(10)):
    logging.info('beginning interation #' + str(i) + '\n')
    model.train(it)
    model.alpha -= 0.002
    model.min_alpha = model.alpha

logging.info('Persisting model')
modelPath = os.path.join('cache', 'doc2vec.model')
model.save(modelPath)
logging.info('done')

INFO:root:
INFO:root:Building vocabulary...
INFO:gensim.models.doc2vec:collecting all words and their counts
DEBUG:root:Read keySet from cache directory
DEBUG:root:Yielded from Cache
INFO:gensim.models.doc2vec:PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
DEBUG:root:Yielded from Cache
DEBUG:root:The following document is missing some parameters: 
{"identifier_url": ["http://www.econstor.eu/bitstream/10419/113944/1/NDL2015-048.pdf"], "identifier_repec": "RePEc:fem:femwpa:2015.48", "person": ["Jussi Lintunen", "Lintunen, Jussi", "Olli-Pekka Kuusela", "Kuusela, Olli-Pekka"], "citedBy": null, "title": ["Optimal management of markets for bankable emission permits"], "cites": null, "date": ["2015"], "id": "10011307281"}
DEBUG:root:Yielded from Cache
DEBUG:root:The following document is missing some parameters: 
{"identifier_url": ["http://www.econstor.eu/bitstream/10419/107754/1/NDL2015-003.pdf"], "identifier_repec": "RePEc:fem:femwpa:2015.03", "person": ["Claire Gav