In [30]:
from multiprocessing import Pool, cpu_count
import numpy as np
from gensim.models import doc2vec
from smart_open import smart_open


def split_words(text):
    """Tokenize ``text`` by splitting it on runs of whitespace.

    Kept as a module-level function so it can be handed to a
    ``multiprocessing`` pool, which must be able to pickle it.

    Returns:
        The list of tokens produced by ``text.split()``; empty when ``text``
        is empty or contains only whitespace.
    """
    tokens = text.split()
    return tokens


def load_corpus(corpus_file, doc2vec_format=True):
    """Lazily yield the tokenized articles of a corpus, one per input line.

    Every line read from ``corpus_file`` is treated as one article and is
    split into words by ``split_words`` inside a pool of ``cpu_count()``
    worker processes.

    Args:
        corpus_file: Location of the corpus; passed straight to ``smart_open``.
        doc2vec_format: If true (the default), each article is wrapped in a
            ``doc2vec.TaggedDocument`` whose only tag is the article's
            zero-based position in the file, which is the input format
            Doc2Vec expects. If false, the bare list of words is yielded.

    Yields:
        One ``doc2vec.TaggedDocument`` or one list of words per line, in file
        order.
    """
    # Open read-only: the corpus is never written to, and a read/write mode
    # ('r+') needlessly requires write access to the file.
    with smart_open(corpus_file, 'r') as corpus, Pool(cpu_count()) as pool:
        # imap returns results in input order, so ``index`` is the line number.
        for index, words in enumerate(pool.imap(split_words, corpus)):
            if doc2vec_format:
                yield doc2vec.TaggedDocument(words, [index])
            else:
                yield words

def train_vectors(path, vector_size=5, min_count=10, epochs=30, local_context=150, workers=16,
                  model_path='model150'):
    """Train a Doc2Vec model on the corpus at ``path`` and save it to disk.

    Args:
        path: Location of the corpus file, forwarded to ``load_corpus``.
        vector_size: Forwarded to ``Doc2Vec`` as ``vector_size``.
        min_count: Forwarded to ``Doc2Vec`` as ``min_count``.
        epochs: Forwarded to ``Doc2Vec`` as ``epochs`` and reused for training.
        local_context: Forwarded to ``Doc2Vec`` as ``window``.
        workers: Forwarded to ``Doc2Vec`` as ``workers``.
        model_path: Where the trained model is saved. Defaults to
            ``'model150'``, the name that was previously hard-coded.

    Returns:
        The trained ``doc2vec.Doc2Vec`` model (also saved to ``model_path``).
    """
    # Materialize the corpus: it is iterated once by build_vocab and again by
    # train, and the generator from load_corpus can only be consumed once.
    corpus = list(load_corpus(path))
    model = doc2vec.Doc2Vec(vector_size=vector_size, min_count=min_count, epochs=epochs,
                            window=local_context, workers=workers)
    model.build_vocab(corpus)
    # corpus_count is set by build_vocab; epochs echoes the constructor value.
    model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)
    model.save(model_path)
    return model