# based on https://medium.com/@klintcho/doc2vec-tutorial-using-gensim-ab3ac03d3a1#.9pqcww5sb

In [None]:
import json
import os
import requests
import IPython
import ipykernel as kernel
# Ask ipykernel for the connection file of the kernel running this notebook.
connection_file_path = kernel.get_connection_file()
# Keep only the file name component of that path.
connection_file = os.path.split(connection_file_path)[1]
# The kernel id is the text after the first '-' and before the next '.'
# (presumably the file is named 'kernel-<id>.json' -- verify for your setup).
kernel_id = connection_file.split('-', 1)[1].partition('.')[0]

def executeCell(x=0):
    """Execute the code in cell no 'x' (zero-based indexing) of this notebook.

    The notebook file is located by asking the local Jupyter server
    (http://127.0.0.1:8888) for its sessions and picking the one whose
    kernel id equals the module-level ``kernel_id``. Only the last path
    component of the session's notebook path is used, so the file is
    opened relative to the current working directory.

    Parameters:
        x: index into the notebook's ``cells`` list.

    Raises:
        RuntimeError: no running session belongs to this kernel.
        TypeError: the notebook file does not contain a JSON object, or the
            requested cell is not a code cell.
        IndexError: the notebook has no cell with index ``x``.
    """
    sessions = requests.get('http://127.0.0.1:8888/api/sessions').json()
    ipynbFileName = ""
    for sess in sessions:
        if sess['kernel']['id'] == kernel_id:
            ipynbFileName = sess['notebook'][u'path'].split(os.sep)[-1]
            break

    # Fail with a clear message instead of running into an unbound local
    # further down when the session could not be found.
    if not ipynbFileName:
        raise RuntimeError('No notebook session found for kernel ' + str(kernel_id))

    # read this notebook's file
    with open(ipynbFileName) as f:
        nb = json.load(f)

    if not isinstance(nb, dict):
        raise TypeError('Unexpected notebook format in ' + ipynbFileName)

    # locate cell's code
    try:
        cell = nb[u'cells'][x]
    except IndexError:
        raise IndexError('No cell #' + str(x))

    if cell[u'cell_type'] != u'code':
        raise TypeError("The cell you request is not of type 'code'")

    # 'source' is stored as a sequence of text fragments; glue them together
    code = "".join(cell['source'])

    # execute
    get_ipython().run_cell(code)

## Define a logger

In [None]:
import logging
# Set the root logger's level to INFO: logging.info(...) records pass the
# logger's level check, logging.debug(...) records do not.
logging.getLogger().setLevel(logging.INFO)

# Load labels and data

In [None]:
import json
import os
from os.path import isfile, join
from scandir import scandir


class loadItems(object):
    """Iterable over the parsed content of the ``*.json`` files in a directory.

    Only regular files located directly inside ``wd`` whose name ends in
    '.json' are read; sub-directories are not descended into. Every call to
    ``iter()`` scans the directory again, so the object can be iterated
    repeatedly.
    """

    def __init__(self, wd = 'samples'):
        # directory that holds the JSON documents
        self.wd = wd

    def __iter__(self):
        for entry in scandir(self.wd):
            if not entry.is_file():
                continue
            if not entry.name.endswith('.json'):
                continue
            # entry.path is the scanned directory joined with the entry name
            with open(entry.path, 'r') as handle:
                parsed = json.load(handle)
            yield parsed

# Class that yields _TaggedDocument_ objects

In [None]:
import re
from textblob import TextBlob
from nltk.corpus import stopwords

def normalize(text, lang='en'):
    """Return a cleaned-up, space-separated version of *text*.

    Steps, in order:
      1. strip the characters ``, . ; :``
      2. drop stop words if *lang* is one of 'de', 'en', 'es', 'fr'
         (any other language code skips this step)
      3. keep only words with 4 <= length < 20
      4. singularize the remaining words with TextBlob (best effort)

    Parameters:
        text: the raw document text.
        lang: two-letter language code used to pick the stop word list.

    Returns:
        The normalized text as a single string.
    """
    # remove some symbols
    text = re.sub(r'[,.;:]', r'', text)
    words = text.split()

    # stopword removal
    langMapping = {'de': 'german', 'en': 'english', 'es': 'spanish', 'fr': 'french'}
    langFound = langMapping.get(lang)
    if langFound is not None:
        # Load the stop word list once and use a set: the lookup used to be
        # re-evaluated (and scanned linearly) for every single word.
        # NOTE: the comparison is case-sensitive, exactly as before.
        stopSet = set(stopwords.words(langFound))
        words = [w for w in words if w not in stopSet]

    # drop too long and too short words
    lower = 4
    upper = 20
    words = [w for w in words if lower <= len(w) < upper]
    text = " ".join(words)

    # singularize
    try:
        text = ' '.join(TextBlob(text).words.singularize())
    except Exception:
        # Deliberately best effort: if singularization fails, the text is
        # returned without it. Catching Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit propagate.
        pass

    return text

In [None]:
from gensim.models.doc2vec import TaggedDocument
from concurrent.futures import ProcessPoolExecutor
import multiprocessing
from collections import OrderedDict
import os

class LabeledDocument(object):
    """Iterable that turns document dicts into ``TaggedDocument`` objects.

    Every document produced by ``docGenerator`` must be a dict with the keys
    'lang', 'filename' and 'plaintext'; documents that miss one of them or
    whose 'lang' differs from ``lang`` are skipped (and reported on DEBUG
    level). The file name is used as the single tag of the document.

    With ``normalizeText`` enabled the text is passed through ``normalize``
    in a pool of ``workers`` processes, and the result is cached as JSON in
    'cache/normalizedDocs/<filename>' so later iterations can reuse it.
    Cached documents are yielded as soon as they are encountered; freshly
    normalized ones are yielded afterwards, in submission order.
    """

    def __init__(self, docGenerator, lang='en', normalizeText=True, workers=1):
        self.docGenerator = docGenerator
        self.lang = lang
        self.normalizeText = normalizeText
        self.workers = workers

    def mkDir(self, dir):
        """Create directory *dir* (including parents) unless it already exists."""
        if not os.path.exists(dir):
            os.makedirs(dir)
            logging.debug(dir + ' has been created.')

    def _validDocs(self):
        """Yield ``(filename, text, lang)`` for each usable document; log the rest."""
        for doc in self.docGenerator:
            if 'lang' not in doc or 'filename' not in doc \
                    or doc['lang'] != self.lang or 'plaintext' not in doc:
                if 'filename' in doc:
                    logging.debug('Skipped ' + doc['filename'] + '. Missing parameters.')
                else:
                    logging.debug('The following document is missing some parameters: \n' + json.dumps(doc))
                continue
            yield doc['filename'], doc['plaintext'], doc['lang']

    def __iter__(self):
        cacheDir = os.path.join('cache', 'normalizedDocs')

        if os.path.exists(cacheDir):
            keySet = set(os.listdir(cacheDir))
            logging.debug('Read keySet from cache directory')
        else:
            keySet = set()
            self.mkDir(cacheDir)
            logging.debug('Cache is empty. Begin with empty keySet')

        if not self.normalizeText:
            for filename, text, _ in self._validDocs():
                # tags must be a list; a bare string would not match the
                # [filename] tags produced by the normalizing branches
                yield TaggedDocument(words=text.split(), tags=[filename])
            return

        futures = OrderedDict()
        # The pool is created per iteration; the with-block makes sure its
        # worker processes are shut down again once the iteration ends.
        with ProcessPoolExecutor(self.workers) as pool:
            for filename, text, lang in self._validDocs():
                cachePath = os.path.join(cacheDir, filename)
                if filename in keySet and os.path.exists(cachePath):
                    # the file has already been normalized, let's
                    # read the cache
                    with open(cachePath) as fh:
                        cached = json.load(fh)
                    logging.debug('Yielded from Cache')
                    yield TaggedDocument(words=cached.split(), tags=[filename])
                else:
                    futures[filename] = pool.submit(normalize, text, lang=lang)

            for filename, future in futures.items():
                normalized = future.result()
                keySet.add(filename)
                # close the cache file before yielding so it is fully written
                # even if the consumer stops iterating here
                with open(os.path.join(cacheDir, filename), 'w') as fh:
                    json.dump(normalized, fh)
                logging.debug('Yielded from Calculation')
                yield TaggedDocument(words=normalized.split(), tags=[filename])

In [None]:
import gensim

# Corpus iterator: one normalization worker process per CPU core.
it = LabeledDocument(loadItems(), workers=multiprocessing.cpu_count())

# alpha == min_alpha, i.e. use a fixed learning rate within one training pass.
# NOTE(review): newer gensim releases name the first argument `vector_size`
# and expect `total_examples`/`epochs` in train() -- confirm against the
# installed gensim version.
model = gensim.models.Doc2Vec(
    size=600,
    window=10,
    min_count=3,
    workers=4,
    alpha=0.025,
    min_alpha=0.025,
)
logging.info('Building vocabulary...')
model.build_vocab(it)

for i, epoch in enumerate(range(10)):
    print('beginning interation #%d' % i)
    model.train(it)
    # decrease the learning rate by hand after every pass ...
    model.alpha = model.alpha - 0.002
    # ... and pin min_alpha to it, so there is no decay inside the next pass
    model.min_alpha = model.alpha

print('done')

In [None]:
# Save the trained model to cache/allDocs600D.model.
model.save(os.path.join('cache', 'allDocs600D.model'))

In [None]:
# Query the model for the entries most similar to the word 'tax'.
model.most_similar(positive=['tax'])

In [None]:
# Look up the document vector stored under the tag '00195034.pdf'.
model.docvecs['00195034.pdf']

In [None]:
# Query the model with the vector of document '00095.pdf'. `positive` is a
# collection of keys/vectors, so the vector is wrapped in a list; passing the
# bare array would have it treated as a sequence of individual numbers.
model.most_similar(positive=[model.docvecs['00095.pdf']])

In [6]:
# NOTE(review): this binds the name `gensim` to numpy, so the value displayed
# is numpy's version, and any later use of `gensim` in this session refers to
# numpy -- confirm whether `import gensim` was intended.
import numpy as gensim
gensim.__version__

'1.10.2'