In [54]:
import sys
import collections
from os import listdir, path
from pyknp import Jumanpp
from gensim import models
from gensim.models.doc2vec import LabeledSentence

In [55]:
def corpus_files():
    dirs = [path.join('./tweets/', x)
            for x in listdir('./tweets/') if not x.endswith('.txt')]
    docs = [path.join(x, y)
            for x in dirs for y in listdir(x)]# if not x.startswith('LICENSE')]
    return docs

In [56]:
def read_document(path):
    with open(path, 'r') as f:
        return f.read()

In [57]:
def split_into_words(text):
    result = Jumanpp().analysis(text)
    return [mrph.midasi for mrph in result.mrph_list()]

In [58]:
def doc_to_sentence(doc, name):
    words = split_into_words(doc)
    return LabeledSentence(words=words, tags=[name])

In [59]:
def corpus_to_sentences(corpus):
    docs   = [read_document(x) for x in corpus]
    for idx, (doc, name) in enumerate(zip(docs, corpus)):
        sys.stdout.write('\r前処理中 {}/{} {}'.format(idx, len(corpus),name))
        yield doc_to_sentence(doc, name)

In [60]:
corpus = corpus_files()
sentences = corpus_to_sentences(corpus)

In [61]:
model = models.Doc2Vec(size=100, 
                       iter=20,
                       alpha=0.025, 
                       min_count=5,
                      workers=4)
model.build_vocab(sentences)

前処理中 7492/7493 ./tweets/onigirichan/onigirichan999.txt

In [62]:
print('\n訓練開始')
for epoch in range(30):
    print('Epoch: {}'.format(epoch + 1))
    model.train(sentences,total_examples = len(corpus),epochs = epoch)
    #model.alpha -= (0.025 - 0.0001) / 19
    #model.min_alpha = model.alpha


訓練開始
Epoch: 1
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
Epoch: 6
Epoch: 7
Epoch: 8
Epoch: 9
Epoch: 10
Epoch: 11
Epoch: 12
Epoch: 13
Epoch: 14
Epoch: 15
Epoch: 16
Epoch: 17
Epoch: 18
Epoch: 19
Epoch: 20
Epoch: 21
Epoch: 22
Epoch: 23
Epoch: 24
Epoch: 25
Epoch: 26
Epoch: 27
Epoch: 28
Epoch: 29
Epoch: 30


In [63]:
model.save('doc2vec.model')
model = models.Doc2Vec.load('doc2vec.model')

In [64]:
result = model.docvecs.most_similar('./tweets/onigirichan/onigirichan1.txt')
#result = model.most_similar(positive=["愛"])
print(result)

[('./tweets/kuzugate/kuzugate1765.txt', 0.3781484067440033), ('./tweets/onigirichan/onigirichan90.txt', 0.3621360659599304), ('./tweets/kuzugate/kuzugate726.txt', 0.3575366735458374), ('./tweets/kuzugate/kuzugate969.txt', 0.3327537477016449), ('./tweets/b1cute/b1cute2074.txt', 0.33220329880714417), ('./tweets/b1cute/b1cute1781.txt', 0.3313763439655304), ('./tweets/kuzugate/kuzugate1424.txt', 0.32063692808151245), ('./tweets/kuzugate/kuzugate1618.txt', 0.317220538854599), ('./tweets/kuzugate/kuzugate211.txt', 0.30157965421676636), ('./tweets/onigirichan/onigirichan1118.txt', 0.30070436000823975)]
