In [1]:
import sys
sys.path.append('/mnt/lovit/git/soynlp/')

import soynlp
from soynlp.utils import EojeolCounter
from soynlp.utils import DoublespaceLineCorpus
from konlpy.tag import Komoran

# NOTE(review): absolute, machine-specific path -- this cell cannot run on
# another machine without editing it.
corpus_path = '/mnt/lovit/works/fastcampus_text_ml/1st/data/corpus_10days/news/2016-10-20_article_all_normed.txt'
# iter_sent=True presumably makes iteration yield sentences rather than whole
# lines -- confirm against the soynlp documentation.
corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)

# Walk the entire corpus once without doing any work. Per the original note,
# the intent is to get the corpus into buffer memory before the timed runs.
for sent in corpus:
    continue

In [2]:
# Create the Komoran tagger and tag one sample string before any timed cell
# (presumably so one-time tagger start-up cost is not counted in the
# benchmarks -- confirm). The last expression displays the sample analysis.
komoran = Komoran()
komoran.pos('테스트문장입니다')

[('테스트', 'NNG'), ('문장', 'NNG'), ('이', 'VCP'), ('ㅂ니다', 'EC')]

In [5]:
import time

# Baseline benchmark: tag every sentence of the corpus with Komoran directly.
# time.perf_counter() is used instead of time.time() because it is monotonic
# and high-resolution; time.time() follows the system clock, which can be
# adjusted while the loop is running and distort the measured duration.
start_time = time.perf_counter()
for i, sent in enumerate(corpus):
    komoran.pos(sent)
    if i % 1000 == 999:
        # '\r' with end='' rewrites the same output line every 1000 sentences.
        print('\rprocessing ... {} sents'.format(i+1), end='')
# process_time holds the elapsed seconds once the cell has finished.
process_time = time.perf_counter() - start_time
print('\rbase model process time = {} sec'.format(process_time))

base model process time = 121.86695957183838 sec


In [6]:
# Build an EojeolCounter over the corpus (presumably eojeol frequency counts --
# confirm against soynlp).
# NOTE(review): eojeol_counter is not referenced by any later cell visible
# here; confirm whether this cell is still needed.
eojeol_counter = EojeolCounter(corpus)

In [11]:
from collections import defaultdict

class PreanalyzedTagger:
    """Wrap a tagger and memoize its output per whitespace-delimited token.

    Each token (eojeol) of a sentence is passed to ``base_tagger.pos`` on its
    own the first time it is seen; the result is stored in ``preanalyzed`` and
    reused for every later occurrence. The base tagger therefore never sees
    the surrounding sentence, only single tokens.
    """

    def __init__(self, base_tagger):
        """Store the wrapped tagger and start with an empty cache.

        Args:
            base_tagger: object exposing ``pos(text)`` that returns an
                iterable of tagged words.
        """
        # Maps token -> tuple of tagged words produced by the base tagger.
        self.preanalyzed = {}
        self.base_tagger = base_tagger

    def pos(self, sent):
        """Tag a sentence token by token.

        Args:
            sent: input string; it is split on whitespace.

        Returns:
            A flat list with the tagged words of every token, in order.
        """
        tagged = []
        for token in sent.split():
            tagged.extend(self._pos(token))
        return tagged

    def _pos(self, eojeol):
        """Return the cached analysis of one token, computing it on a miss.

        Args:
            eojeol: a single whitespace-free token.

        Returns:
            Tuple of tagged words for the token.
        """
        cache = self.preanalyzed
        if eojeol not in cache:
            # First occurrence: ask the base tagger and freeze the result.
            cache[eojeol] = tuple(self.base_tagger.pos(eojeol))
        return cache[eojeol]

# Fresh wrapper with an empty cache: this run pays the base-tagger cost once
# for every distinct token and reuses the cached analysis for repeats.
preanalyzed_tagger = PreanalyzedTagger(komoran)

# time.perf_counter() is used instead of time.time() because it is monotonic
# and high-resolution; time.time() follows the system clock, which can be
# adjusted while the loop is running and distort the measured duration.
start_time = time.perf_counter()
for i, sent in enumerate(corpus):
    preanalyzed_tagger.pos(sent)
    if i % 1000 == 999:
        # '\r' with end='' rewrites the same output line every 1000 sentences.
        print('\rprocessing ... {} sents'.format(i+1), end='')
# process_time holds the elapsed seconds once the cell has finished.
process_time = time.perf_counter() - start_time
print('\rpreanalyzed model process time = {} sec'.format(process_time))

preanalyzed model process time = 40.99457621574402 sec


In [12]:
# Second pass with the same preanalyzed_tagger instance: this cell relies on
# the cache filled by the previous run, so every token is served from
# preanalyzed_tagger.preanalyzed and the base tagger is not called for tokens
# already seen.
# time.perf_counter() is used instead of time.time() because it is monotonic
# and high-resolution; time.time() follows the system clock, which can be
# adjusted while the loop is running and distort the measured duration.
start_time = time.perf_counter()
for i, sent in enumerate(corpus):
    preanalyzed_tagger.pos(sent)
    if i % 1000 == 999:
        # '\r' with end='' rewrites the same output line every 1000 sentences.
        print('\rprocessing ... {} sents'.format(i+1), end='')
# process_time holds the elapsed seconds once the cell has finished.
process_time = time.perf_counter() - start_time
print('\rall eojeol analyzed model process time = {} sec'.format(process_time))

all eojeol analyzed model process time = 3.2747089862823486 sec


In [13]:
# Reference analyses: Komoran applied to each whole sentence.
answers = list(map(komoran.pos, corpus))
# Candidate analyses: the caching wrapper applied to the same sentences.
tests = list(map(preanalyzed_tagger.pos, corpus))

In [15]:
# Sanity check: both result lists should contain one entry per sentence, so
# the two lengths are expected to match before they are zipped together.
len(answers), len(tests)

(223357, 223357)

In [19]:
# Agreement between the two taggers, measured per sentence: a (word, tag) pair
# from the reference analysis counts as recovered when the same pair appears
# anywhere in the wrapper's analysis of that sentence (membership only --
# position and repetition are not compared).
n_recovery = 0
n_difference = 0
for reference_words, candidate_words in zip(answers, tests):
    candidate_set = set(candidate_words)
    matched = sum(1 for word in reference_words if word in candidate_set)
    n_recovery += matched
    n_difference += len(reference_words) - matched
# Fraction of reference pairs that the wrapper reproduced.
print(n_recovery / (n_recovery + n_difference))

0.9938027020710959
