# based on https://medium.com/@klintcho/doc2vec-tutorial-using-gensim-ab3ac03d3a1#.9pqcww5sb

In [1]:
import json
import os
import requests
import IPython
import ipykernel as kernel
# Derive this kernel's id from the name of its connection file: the id is the
# part between the first '-' and the following '.' of the file's base name.
connection_file_path = kernel.get_connection_file()
connection_file = os.path.split(connection_file_path)[1]
kernel_id = connection_file.split('-', 1)[1].partition('.')[0]

def executeCell(x=0):
    ''' executes the code in cell no 'x' (zero-based indexing)

    The notebook that belongs to this kernel is looked up through the
    sessions endpoint of the Jupyter server at 127.0.0.1:8888, its file is
    read from the current working directory, and the source of cell 'x' is
    run in the active IPython shell.

    Raises:
        RuntimeError: no session matches this kernel's id, or the notebook
            file does not contain a JSON object.
        IndexError: the notebook has no cell with index 'x'.
        TypeError: cell 'x' exists but is not a code cell.
    '''
    sessions = requests.get('http://127.0.0.1:8888/api/sessions').json()
    ipynbFileName = ""
    for sess in sessions:
        if sess['kernel']['id'] == kernel_id:
            # only the file name is kept: the notebook is opened relative
            # to the current working directory
            ipynbFileName = sess['notebook'][u'path'].split(os.sep)[-1]
            break

    # fail loudly instead of hitting an unbound variable further down
    if not ipynbFileName:
        raise RuntimeError('No notebook session found for kernel ' + str(kernel_id))

    # read this notebook's file
    with open(ipynbFileName, encoding='utf-8') as f:
        nb = json.load(f)
    if not isinstance(nb, dict):
        raise RuntimeError('Unexpected notebook format in ' + ipynbFileName)

    # locate cell's code
    try:
        cell = nb[u'cells'][x]
    except IndexError:
        raise IndexError('No cell #' + str(x))
    if cell[u'cell_type'] != u'code':
        raise TypeError("The cell you request is not of type 'code'")
    # 'source' is joined piece by piece (works for a list of lines or a string)
    code = ''.join(cell['source'])

    # execute
    get_ipython().run_cell(code)

# Load labels and data

In [None]:
'''
import json
import os
from os.path import isfile, join

wd = 'samples'

# lables
docLabels = []
docLabels = [f for f in os.listdir(wd) if f.endswith('.json')]

# data
data = []
for doc in docLabels:
    with open(os.path.join(wd, doc), 'r') as f:
        data.append(json.load(f))
'''

In [2]:
import json
import os
from os.path import isfile, join


def loadItems(wd='samples'):
    '''Yield the parsed content of every '*.json' file found directly in 'wd'.

    Files are visited in sorted file-name order so that every pass over the
    directory sees the documents in the same sequence (the order returned by
    os.listdir is arbitrary). Files are decoded as UTF-8 rather than with
    the platform default encoding.

    Args:
        wd: directory to scan; sub-directories are not descended into.

    Yields:
        Whatever json.load returns for each file, one file at a time.
    '''
    docLabels = sorted(f for f in os.listdir(wd) if f.endswith('.json'))

    for doc in docLabels:
        with open(os.path.join(wd, doc), 'r', encoding='utf-8') as f:
            yield json.load(f)

# Class that yields _LabeledSentence_ objects

In [3]:
import re
from textblob import TextBlob
from nltk.corpus import stopwords

def normalize(text, lang='en'):
    '''Return a cleaned-up, space-separated version of 'text'.

    Steps, in order:
      1. strip the characters , . ; :
      2. drop stopwords (exact, case-sensitive match) when 'lang' is one of
         'de', 'en', 'es', 'fr'; any other value skips this step
      3. keep only words with 4 <= length < 20
      4. singularize the remaining words with TextBlob (best effort: if
         that step fails, the text from step 3 is returned unchanged)

    Args:
        text: the raw document text.
        lang: two-letter language code selecting the stopword list.

    Returns:
        The normalized text as a single string.
    '''
    # remove some symbols
    text = re.sub(r'[,.;:]', r'', text)
    words = text.split()

    # stopword removal
    langMapping = {'de': 'german', 'en': 'english', 'es': 'spanish', 'fr': 'french'}
    langFound = langMapping.get(lang)
    if langFound is not None:
        # build the lookup set once instead of reloading the list per word
        stopwordSet = set(stopwords.words(langFound))
        words = [w for w in words if w not in stopwordSet]

    # drop too long and too short words
    lower = 4
    upper = 20
    words = [w for w in words if lower <= len(w) < upper]
    text = " ".join(words)

    # singularize
    try:
        text = ' '.join(TextBlob(text).words.singularize())
    except Exception:
        # deliberate best effort: keep the un-singularized text, but let
        # KeyboardInterrupt / SystemExit propagate
        pass

    return text

In [4]:
from gensim.models.doc2vec import LabeledSentence
from concurrent.futures import ProcessPoolExecutor
import multiprocessing

class LabeledDocument(object):
    '''Iterable that turns document dicts into LabeledSentence objects.

    Only documents that have the keys 'lang', 'filename' and 'plaintext' and
    whose 'lang' equals the configured language are used; everything else is
    skipped silently. Each yielded LabeledSentence carries the document's
    whitespace-separated tokens as 'words' and '[filename]' as 'tags'.

    Args:
        docGenerator: an iterable of document dicts, or a zero-argument
            callable returning one. A plain generator can be consumed only
            once, so an instance built from one yields nothing on a second
            iteration; pass a callable (or a list) to make the instance
            re-iterable.
        lang: language code a document must have to be used.
        normalizeText: when true, every text is passed through normalize()
            in a pool of worker processes before being tokenized.
        workers: number of worker processes for the normalization pool.
    '''
    def __init__(self, docGenerator, lang='en', normalizeText=True, workers=1):
        self.docGenerator = docGenerator
        self.lang = lang
        self.normalizeText = normalizeText
        self.workers = workers

    def _documents(self):
        '''Yield the source documents that have all required keys and the right language.'''
        if callable(self.docGenerator):
            source = self.docGenerator()
        else:
            source = self.docGenerator
        required = ('lang', 'filename', 'plaintext')
        for doc in source:
            if all(key in doc for key in required) and doc['lang'] == self.lang:
                yield doc

    def __iter__(self):
        if not self.normalizeText:
            # no worker pool needed: tokenize the raw text directly
            for doc in self._documents():
                yield LabeledSentence(words=doc['plaintext'].split(), tags=[doc['filename']])
            return

        # the 'with' block shuts the pool down even if iteration stops early
        with ProcessPoolExecutor(self.workers) as pool:
            # submit everything first so the workers run in parallel; a
            # filename that occurs twice keeps only its last submission
            futures = {}
            for doc in self._documents():
                futures[doc['filename']] = pool.submit(normalize, doc['plaintext'], lang=doc['lang'])

            for filename, future in futures.items():
                # normalize() returns one string; gensim expects a token list
                yield LabeledSentence(words=future.result().split(), tags=[filename])

In [6]:
%%time
import gensim

# Build the vocabulary from one pass over the documents, then train for ten
# passes with a learning rate that is lowered by hand after each pass.
# NOTE(review): newer gensim releases renamed `size` and require
# total_examples/epochs for train() - confirm against the installed version.
it = LabeledDocument(loadItems(), workers=multiprocessing.cpu_count())
model = gensim.models.Doc2Vec(size=300, window=10, min_count=3, workers=8, alpha=0.025, min_alpha=0.025) # use fixed learning rate
model.build_vocab(it)

for i in range(10):
    print('beginning interation #' + str(i))
    # loadItems() is a generator function, so its result can be consumed only
    # once: build a fresh document iterable for every pass
    it = LabeledDocument(loadItems(), workers=multiprocessing.cpu_count())
    model.train(it)
    model.alpha -= 0.002 # decrease the learning rate
    model.min_alpha = model.alpha # fix the learning rate, no decay

print('done')

beginning interation #0
beginning interation #1


KeyboardInterrupt: 

In [None]:
# Persist the trained model under the relative path 'doc2vec.model'.
model.save('doc2vec.model')

In [None]:
# Sanity check: ask the model for the entries it ranks closest to 'market'
# (presumably word vectors - confirm against the gensim version in use).
model.most_similar(positive=['market'])

In [None]:
# Sanity check: similarity score the model reports for 'bank' and 'money'.
model.similarity('bank', 'money')