In [234]:
import sys
from os import listdir, path
from pyknp import Jumanpp
from gensim import models
from gensim.models.doc2vec import LabeledSentence

In [235]:
def corpus_files():
    dirs = [path.join('./tweets/', x)
            for x in listdir('./tweets/') if not x.endswith('.txt')]
    docs = [path.join(x, y)
            for x in dirs for y in listdir(x)]# if not x.startswith('LICENSE')]
    return docs

In [236]:
def read_document(path):
    with open(path, 'r') as f:
        return f.read()

In [237]:
def split_into_words(text):
    result = Jumanpp().analysis(text)
    return [mrph.midasi for mrph in result.mrph_list()]

In [238]:
def doc_to_sentence(doc, name):
    words = split_into_words(doc)
    return LabeledSentence(words=words, tags=[name])

In [239]:
def corpus_to_sentences(corpus):
    docs   = [read_document(x) for x in corpus]
    for idx, (doc, name) in enumerate(zip(docs, corpus)):
        sys.stdout.write('\r前処理中 {}/{}'.format(idx, len(corpus)))
        yield doc_to_sentence(doc, name)

In [None]:
corpus = corpus_files()
sentences = corpus_to_sentences(corpus)

In [None]:
model = models.Doc2Vec(dm=0, 
                       size=300, 
                       window=15, 
                       alpha=.025, 
                       min_alpha=.025, 
                       min_count=1, 
                       sample=1e-6)
model.build_vocab(sentences)

前処理中 161/19744

In [None]:
print('\n訓練開始')
for epoch in range(20):
    print('Epoch: {}'.format(epoch + 1))
    model.train(sentences,total_examples = len(corpus),epochs = epoch)
    model.alpha -= (0.025 - 0.0001) / 19
    model.min_alpha = model.alpha

In [None]:
model.save('doc2vec.model')
model = models.Doc2Vec.load('doc2vec.model')

In [None]:
model.docvecs.most_similar('./tweets/tweets/onigirichanre1.txt', topn=1)