In [1]:
import os
import gensim
from gensim.parsing.porter import PorterStemmer
from gensim.parsing.preprocessing import remove_stopwords
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess
import time


In [17]:
def read_corpus(docs, tokens_only=False):
    p = PorterStemmer()
    t = time.time()
    for i, line in enumerate(docs):
        line = remove_stopwords(line)
        line = p.stem_sentence(line)
        tokens = gensim.utils.simple_preprocess(line)
        if i %10000==0:
            print(i, time.time()-t)
        if tokens_only:
            yield ' '.join(tokens)
        else:
            yield gensim.models.doc2vec.TaggedDocument(tokens, [i])
        

In [8]:
with open('../data/all_text.txt','r', encoding="utf8") as f:
    data = f.read().split(' ---------- ')

In [19]:
#stem/stop word cleaning/tokenizing before saving
tokenized_corpus = list(read_corpus(data,tokens_only=True))
lined_data = '\n'.join(tokenized_corpus)
with open('../data/lined_all_text.txt','w', encoding="utf8") as f:
    f.write(lined_data)

0 0.0020020008087158203
10000 19.860074758529663
20000 40.06347036361694
30000 59.58175325393677
40000 79.59349918365479
50000 98.53574657440186
60000 118.30273652076721
70000 138.0171136856079
80000 157.83890867233276
90000 177.08442401885986
100000 197.218670129776
110000 216.9371201992035


In [2]:
model = Doc2Vec(vector_size=1024,
                epochs=20,
                workers=12,#8
                window=5,
                min_count=5,
                sample=1e-5,
                negative=5,
                dm=1)

In [3]:
model.build_vocab(corpus_file='../data/lined_all_text.txt')

In [4]:
t = time.time()
model.train(corpus_file='../data/lined_all_text.txt',
            #train_corpus,
            total_words=model.corpus_total_words,
            #total_examples=model.corpus_count,
           epochs=model.epochs)
print(time.time()-t)

382.78553795814514


In [10]:
vector = model.infer_vector(data[5454].split(' '))

In [11]:
vector

array([ 0.02453034,  0.12518424,  0.5898173 , ..., -0.20886329,
        0.23093084,  0.29921034], dtype=float32)

In [12]:
model.save("../models/doc2vec.model")