In [1]:
import sys
# Put the parent directory (relative to the current working directory) on the
# import path; `config` and `cohesion_tokenizer` are presumably found there — confirm.
sys.path.append('../')

# Path of the corpus text file that the notebook reads its sentences from.
from config import corpus_path

import cohesion_tokenizer
from cohesion_tokenizer import CohesionScore
from cohesion_tokenizer import WordSequenceGraph

In [2]:
# Show only the file name: the text after the last '/' of the corpus path.
corpus_name = corpus_path.rsplit('/', 1)[-1]
print('corpus name = {}'.format(corpus_name))

corpus name = 2016-10-20_article_all_normed.txt


In [3]:
# Each line of the corpus file is treated as one document; the sentences inside
# a document are separated by two consecutive spaces.
with open(corpus_path, encoding='utf-8') as f:
    sents = []
    for doc in f:
        sents.extend(sent.strip() for sent in doc.split('  '))
print('num sents = {}'.format(len(sents)))

num sents = 223435


In [4]:
import os
import pickle

# File that caches the trained CohesionScore model between notebook runs.
cohesion_cache_path = './cohesion.pkl'

if os.path.exists(cohesion_cache_path):
    # Reuse the cached model instead of retraining on the whole corpus.
    # NOTE: pickle.load can execute arbitrary code — only load a cache file
    # that was produced by this notebook.
    with open(cohesion_cache_path, 'rb') as f:
        cohesion = pickle.load(f)
else:
    # No cache yet (e.g. fresh checkout): train from the corpus sentences and
    # store the result so that later runs take the fast path above.
    cohesion = CohesionScore()
    cohesion.train(sents)
    with open(cohesion_cache_path, 'wb') as f:
        pickle.dump(cohesion, f)

In [5]:
# Look up the cohesion score of a few sample words.
sample_words = '아이오아이 뉴스 대통령 아이돌'.split()
for word in sample_words:
    line = '{} = {}'.format(word, cohesion[word])
    print(line)

아이오아이 = 0.30039353721795514
뉴스 = 0.4855283438945025
대통령 = 0.2919357101768181
아이돌 = 0.09792934452242717


In [6]:
# Print the cohesion score of every prefix of the word, shortest first.
word = '아이오아이'
n = len(word)
for end in range(1, n + 1):
    prefix = word[:end]
    print('{} = {}'.format(prefix, cohesion[prefix]))

아 = 0
아이 = 0.14807442926505623
아이오 = 0.0962206918738307
아이오아 = 0.20118122417377401
아이오아이 = 0.30039353721795514


In [7]:
def cost_func(word, word_score):
    """Return the cost of using `word` as a single token.

    The cost is ``len(word) * (1 - word_score)``: it grows with the length of
    the word and shrinks as the score approaches 1.

    NOTE(review): this function is not passed to WordSequenceGraph in this
    cell; the edge costs displayed later in the notebook agree with this
    formula, so it presumably mirrors the library default — confirm.
    """
    return len(word) * (1 - word_score)
# Example sentence, written without any spaces.
sent = '아이오아이가음악방송에출연했습니다'

# Build the graph object from the cohesion scores, then expand the sentence
# into its edges.
graph = WordSequenceGraph(cohesion)
edges = graph.as_graph(sent)

In [8]:
from shortestpath import list_to_dict_graph

# Convert the edges returned by as_graph into a dict-based graph that can be
# indexed by node (see the g[...] lookup in the next cell).
g = list_to_dict_graph(edges)

In [9]:
# Edges leaving the begin-of-sentence node: each key is (substring, begin, end)
# and each value is the cost of that edge.
g[('BOS', )]

{('아', 0, 1): 1,
 ('아이', 0, 2): 1.7038511414698876,
 ('아이오', 0, 3): 2.7113379243785083,
 ('아이오아', 0, 4): 3.195275103304904,
 ('아이오아이', 0, 5): 3.4980323139102243,
 ('아이오아이가', 0, 6): 4.6662161132798365}

In [10]:
# Compare the cohesion score of the compound with the scores of its two parts.
candidates = '음악방송 음악 방송'.split()
for word in candidates:
    score = cohesion[word]
    print('{} = {}'.format(word, score))

음악방송 = 0.23551949668494931
음악 = 0.3330166270783848
방송 = 0.3180379746835443


In [11]:
from shortestpath import ford

# Start and end nodes of the search.
bos, eos = ('BOS',), ('EOS',)

# Search from bos to eos; the result holds the total 'cost' and the 'paths'.
ford(g, bos, eos)

{'cost': 15.093501943039843,
 'paths': [[('BOS',),
   ('아이오아이', 0, 5),
   ('가', 5, 6),
   ('음악', 6, 8),
   ('방송', 8, 10),
   ('에', 10, 11),
   ('출연', 11, 13),
   ('했습니다', 13, 17),
   ('EOS',)]]}

In [12]:
from cohesion_tokenizer import CohesionGraphWordSegmentor

# Train a segmentor directly on the corpus sentences.
# NOTE(review): presumably this wraps the cohesion-score + graph + shortest-path
# steps run by hand above (its tokenize() output matches the ford() result) — confirm.
word_segmentor = CohesionGraphWordSegmentor()
word_segmentor.train(sents)

training was done with 223435 sents... 


In [13]:
# Check the segmentor's trained flag after the train() call above.
word_segmentor.is_trained

True

In [14]:
# Segment the example sentence; the result has the same 'cost' / 'paths'
# structure as the ford() call earlier in the notebook.
word_segmentor.tokenize(sent)

{'cost': 15.093501943039843,
 'paths': [[('BOS',),
   ('아이오아이', 0, 5),
   ('가', 5, 6),
   ('음악', 6, 8),
   ('방송', 8, 10),
   ('에', 10, 11),
   ('출연', 11, 13),
   ('했습니다', 13, 17),
   ('EOS',)]]}

## Comparison with soynlp MaxScoreTokenizer

In [15]:
import os
import sys

# Location of a local soynlp checkout. Set the SOYNLP_PATH environment variable
# to override it; the fallback is the machine-specific path this notebook was
# originally run with.
soynlp_path = os.environ.get('SOYNLP_PATH', '/mnt/lovit/git/soynlp/')
if soynlp_path not in sys.path:
    # Guarded so that re-running the cell does not append the same entry again.
    sys.path.append(soynlp_path)

import soynlp
print(soynlp.__version__)

from soynlp.tokenizer import MaxScoreTokenizer

# Feed the same word scores to soynlp's tokenizer for comparison.
# NOTE(review): this reads the private `_scores` attribute of the cohesion
# model; switch to a public accessor if the library offers one.
tokenizer = MaxScoreTokenizer(scores = cohesion._scores)
tokenizer.tokenize(sent)

0.0.46


['아이오아이', '가', '음악', '방송', '에', '출연', '했습니다']

In [16]:
# flatten=False returns tuples of (word, begin, end, score, <int>) instead of
# plain strings; the last field appears to be the token length — confirm.
tokenizer.tokenize(sent, flatten=False)

[[('아이오아이', 0, 5, 0.30039353721795514, 5),
  ('가', 5, 6, 0.0, 1),
  ('음악', 6, 8, 0.3330166270783848, 2),
  ('방송', 8, 10, 0.3180379746835443, 2),
  ('에', 10, 11, 0.0, 1),
  ('출연', 11, 13, 0.271523178807947, 2),
  ('했습니다', 13, 17, 0.3898437024326576, 4)]]

## Replace cohesion with noun score

In [18]:
from soynlp.noun import LRNounExtractor_v2

# Train the noun extractor on the corpus sentences and extract nouns in one call.
# `nouns` maps each noun to a NounScore(frequency, score) record (see the lookups below).
noun_extractor = LRNounExtractor_v2()
nouns = noun_extractor.train_extract(sents)

[Noun Extractor] use default predictors
[Noun Extractor] num features: pos=1260, neg=1173, common=12
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 402962 from 223435 sents. mem=0.378 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. mem=1.104 Gb
[Noun Extractor] batch prediction was completed for 146445 words
[Noun Extractor] checked compounds. discovered 193 compounds
[Noun Extractor] postprocessing detaching_features : 51655 -> 51471
[Noun Extractor] postprocessing ignore_features : 51471 -> 51368
[Noun Extractor] postprocessing ignore_NJ : 51368 -> 50999
[Noun Extractor] 50999 nouns (193 compounds) with min count=1
[Noun Extractor] flushing was done. mem=1.290 Gb                    
[Noun Extractor] 74.55 % eojeols are covered


In [19]:
# Noun-extractor result for the words of the example sentence; a word that was
# not extracted falls back to (0, 0).
for word in '아이오아이 음악방송 음악 방송 출연'.split():
    found = nouns.get(word, (0, 0))
    print('{} = {}'.format(word, found))

아이오아이 = NounScore(frequency=250, score=1.0)
음악방송 = NounScore(frequency=55, score=1.0)
음악 = NounScore(frequency=847, score=0.992)
방송 = NounScore(frequency=2848, score=0.9988228369629194)
출연 = NounScore(frequency=2294, score=1.0)


In [20]:
# Keep only the `score` field of every NounScore, keyed by the noun itself.
noun_scores = dict(
    (noun, noun_score.score) for noun, noun_score in nouns.items()
)

# Tokenize the example sentence using noun scores in place of cohesion scores.
noun_tokenizer = MaxScoreTokenizer(scores=noun_scores)
noun_tokenizer.tokenize(sent, flatten=False)

[[('아이오아이', 0, 5, 1.0, 5),
  ('가', 5, 6, 0.0, 1),
  ('음악방송', 6, 10, 1.0, 4),
  ('에', 10, 11, 0.0, 1),
  ('출연', 11, 13, 1.0, 2),
  ('했습니다', 13, 17, 0.0, 4)]]

## Merge noun score and cohesion

### graph segmentation

In [22]:
# Start from a copy of the noun scores, then add each word's cohesion score on
# top (words without a noun score start from 0).
merge_noun_cohesion = dict(noun_scores)
# The loop variable is deliberately NOT named `cohesion`: reusing that name
# would overwrite the global CohesionScore object with a float, so this cell and
# every earlier cell that uses `cohesion` would fail when run again.
for word, cohesion_score in cohesion._scores.items():
    merge_noun_cohesion[word] = merge_noun_cohesion.get(word, 0) + cohesion_score

# Swap the merged scores into the trained segmentor. This mutates
# `word_segmentor` in place, so the earlier tokenize() cells give different
# costs if they are re-run after this one.
# NOTE(review): summed scores can exceed 1 (see the tokenizer output further
# down), so a cost of the form len(word) * (1 - score) goes negative — confirm
# this is intended.
word_segmentor.cohesion._scores = merge_noun_cohesion

In [23]:
# Segment the example sentence again, now with the merged noun + cohesion
# scores installed in the segmentor by the previous cell.
word_segmentor.tokenize(sent)

{'cost': 4.1118562691140035,
 'paths': [[('BOS',),
   ('아이오아이', 0, 5),
   ('가', 5, 6),
   ('음악', 6, 8),
   ('방송', 8, 10),
   ('에', 10, 11),
   ('출연', 11, 13),
   ('했습니다', 13, 17),
   ('EOS',)]]}

### max score tokenizer

In [24]:
# Run soynlp's MaxScoreTokenizer with the merged noun + cohesion scores and
# show the per-token tuples rather than plain strings.
noun_cohesion_tokenizer = MaxScoreTokenizer(scores = merge_noun_cohesion)
noun_cohesion_tokenizer.tokenize(sent, flatten=False)

[[('아이오아이', 0, 5, 1.3003935372179551, 5),
  ('가', 5, 6, 0.0, 1),
  ('음악', 6, 8, 1.3250166270783847, 2),
  ('방송', 8, 10, 1.3168608116464637, 2),
  ('에', 10, 11, 0.0, 1),
  ('출연', 11, 13, 1.271523178807947, 2),
  ('했습니다', 13, 17, 0.3898437024326576, 4)]]