In [14]:
import os
import gensim
from gensim.parsing.porter import PorterStemmer
from gensim.parsing.preprocessing import remove_stopwords
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess
import time


In [2]:
def read_corpus(docs, tokens_only=False):
    p = PorterStemmer()
    t = time.time()
    for i, line in enumerate(docs):
        line = remove_stopwords(line)
        line = p.stem_sentence(line)
        tokens = gensim.utils.simple_preprocess(line)
        if i %10000==0:
            print(i, time.time()-t)
        if tokens_only:
            yield ' '.join(tokens)
        else:
            yield gensim.models.doc2vec.TaggedDocument(tokens, [i])
        

In [3]:
#data was in improper format before
with open('../data/all_text.txt','r', encoding="utf8") as f:
    data = f.read().split(' ---------- ')

In [4]:
#stem/stop word cleaning/tokenizing before saving
tokenized_corpus = list(read_corpus(data,tokens_only=True))
lined_data = '\n'.join(tokenized_corpus)
with open('../data/lined_all_text.txt','w', encoding="utf8") as f:
    f.write(lined_data)

0 0.002005338668823242
10000 19.233164072036743
20000 38.830894470214844
30000 57.575406074523926
40000 76.97895216941833
50000 95.20600819587708
60000 114.19446158409119
70000 133.2357108592987
80000 152.27474641799927
90000 170.84954595565796
100000 190.1770462989807
110000 209.168292760849


In [2]:
model = Doc2Vec(vector_size=1024,
                epochs=20,
                workers=12,#8
                window=5,
                min_count=5,
                sample=1e-5,
                negative=5,
                dm=1)

In [3]:
model.build_vocab(corpus_file='../data/lined_all_text.txt')

In [4]:
#using gensims built in load functionality
t = time.time()
model.train(corpus_file='../data/lined_all_text.txt',
            #train_corpus,
            total_words=model.corpus_total_words,
            #total_examples=model.corpus_count,
           epochs=model.epochs)
print(time.time()-t)

382.78553795814514


In [10]:
vector = model.infer_vector(data[5454].split(' '))

In [11]:
vector

array([ 0.02453034,  0.12518424,  0.5898173 , ..., -0.20886329,
        0.23093084,  0.29921034], dtype=float32)

In [12]:
model.save("../models/doc2vec.model")

In [2]:
from gensim.models import Doc2Vec
model = Doc2Vec.load("../models/doc2vec/doc2vec.model")

In [3]:
vector = model.infer_vector(['hey','whats','up'])

In [10]:
with open('../data/lined_all_text.txt','r', encoding="utf8") as f:
    data = f.read()