In [1]:
import sys
sys.path.append('../')

from crf_postagger import TrigramTagger
from crf_postagger import TrigramParameter

# Relative path of the trained trigram CRF model file (JSON) handed to
# TrigramParameter below.
model_path = '../models/trigram_crf_sejong_simple.json'

# Pre-analyzed surface forms: each key maps to a tuple of analyses.
# The single analysis here is a 4-tuple of two morphemes followed by two tags
# (presumably: stem, ending, stem tag, ending tag -- confirm against
# TrigramParameter).
preanalyzed_eojeols = {'해쪄': (('하', '아쪄', 'Verb', 'Eomi'),)}

# Build the parameter object first, then wrap it in the tagger that every
# later cell uses as `trained_crf`.
parameter = TrigramParameter(model_path, preanalyzed_eojeols=preanalyzed_eojeols)
trained_crf = TrigramTagger(parameter)

In [2]:
# One hand-written analysis, given as [morpheme, tag] pairs in order.
sentence = [
    ['뭐', 'Noun'],
    ['타', 'Verb'],
    ['고', 'Eomi'],
    ['가', 'Verb'],
    ['ㅏ', 'Eomi'],
]

# Score the sequence with the trained model; the recorded run shows a single
# float as the result.
trained_crf.evaluate(sentence)

3.9667383324286725

In [3]:
# Tag an unsegmented string with the default options.
# In the recorded output the result is a list of
# (list of (morpheme, tag) pairs, score) tuples, listed from highest to
# lowest score.
text = '머리쿵해쪄'
trained_crf.tag(text)

[([('머리', 'Noun'),
   ('쿵', 'Adverb'),
   ('하', 'Verb'),
   ('아', 'Eomi'),
   ('찌', 'Verb'),
   ('어', 'Eomi')],
  2.2663016077612634),
 ([('머리', 'Noun'),
   ('쿵', 'Adverb'),
   ('하', 'Adjective'),
   ('아', 'Eomi'),
   ('찌', 'Verb'),
   ('어', 'Eomi')],
  2.02389491102375),
 ([('머리', 'Noun'), ('쿵', 'Adverb'), ('하', 'Verb'), ('아쪄', 'Eomi')],
  2.0138780418933813),
 ([('머', 'Exclamation'),
   ('리', 'Noun'),
   ('쿵', 'Adverb'),
   ('하', 'Verb'),
   ('아', 'Eomi'),
   ('찌', 'Verb'),
   ('어', 'Eomi')],
  1.1750489760745113),
 ([('머', 'Noun'),
   ('리', 'Noun'),
   ('쿵', 'Adverb'),
   ('하', 'Verb'),
   ('아', 'Eomi'),
   ('찌', 'Verb'),
   ('어', 'Eomi')],
  1.1617270448882735)]

In [4]:
# Register two additional entries under the 'Eomi' tag, each with value 1
# (presumably a score/weight for the entry -- confirm in crf_postagger).
extra_eomis = {'아쪄': 1, '아써': 1}
trained_crf.add_user_dictionary('Eomi', extra_eomis)

# Tag the same string again. In the recorded outputs the
# 하/Verb + 아쪄/Eomi candidate goes from 2.0138... (before this call)
# to 3.0138... and becomes the first result.
trained_crf.tag('머리쿵해쪄')

[([('머리', 'Noun'), ('쿵', 'Adverb'), ('하', 'Verb'), ('아쪄', 'Eomi')],
  3.0138780418933813),
 ([('머리', 'Noun'),
   ('쿵', 'Adverb'),
   ('하', 'Verb'),
   ('아', 'Eomi'),
   ('찌', 'Verb'),
   ('어', 'Eomi')],
  2.2663016077612634),
 ([('머리', 'Noun'),
   ('쿵', 'Adverb'),
   ('하', 'Adjective'),
   ('아', 'Eomi'),
   ('찌', 'Verb'),
   ('어', 'Eomi')],
  2.02389491102375),
 ([('머', 'Exclamation'),
   ('리', 'Noun'),
   ('쿵', 'Adverb'),
   ('하', 'Verb'),
   ('아쪄', 'Eomi')],
  1.922625410206629),
 ([('머', 'Noun'),
   ('리', 'Noun'),
   ('쿵', 'Adverb'),
   ('하', 'Verb'),
   ('아쪄', 'Eomi')],
  1.909303479020391)]

In [5]:
# Same input, but without flattening and with beam_size=2.
# In the recorded output two candidates come back, and each item of a
# candidate is a 4-tuple: a 'morph/Tag' label (joined with ' + ' when the span
# holds two morphemes), two integers that look like begin/end offsets into the
# input, and a per-item score.
trained_crf.tag(
    '머리쿵해쪄',
    flatten=False,
    beam_size=2,
)

[([('머리/Noun', 0, 2, 0.034934023119340685),
   ('쿵/Adverb', 2, 3, 0.003655173321901925),
   ('하/Verb + 아쪄/Eomi', 3, 5, 1.7794701366648282)],
  3.0138780418933813),
 ([('머리/Noun', 0, 2, 0.034934023119340685),
   ('쿵/Adverb', 2, 3, 0.003655173321901925),
   ('하/Verb + 아/Eomi', 3, 4, 1.1916701170597677),
   ('찌/Verb + 어/Eomi', 4, 5, 0.5608400622094903)],
  2.2663016077612634)]

In [6]:
# Alternative hand-written analyses to compare; each is a list of
# (morpheme, tag) tuples.
candidates = [
    # '고' as Eomi, '가' as Noun
    [('뭐', 'Noun'), ('타', 'Verb'), ('고', 'Eomi'), ('가', 'Noun')],
    # '고' as Noun, '가' as Noun
    [('뭐', 'Noun'), ('타', 'Verb'), ('고', 'Noun'), ('가', 'Noun')],
    # '고' as Eomi, '가' as Verb followed by 'ㅏ' as Eomi
    [('뭐', 'Noun'), ('타', 'Verb'), ('고', 'Eomi'), ('가', 'Verb'), ('ㅏ', 'Eomi')],
    # '고' as Noun, '가' as Verb followed by 'ㅏ' as Eomi
    [('뭐', 'Noun'), ('타', 'Verb'), ('고', 'Noun'), ('가', 'Verb'), ('ㅏ', 'Eomi')],
]

# Print each candidate (preceded by a blank line) and then its model score.
for sent in candidates:
    header = '\n{}'.format(sent)
    print(header)
    score = trained_crf.evaluate(sent, debug=False)
    print(score)


[('뭐', 'Noun'), ('타', 'Verb'), ('고', 'Eomi'), ('가', 'Noun')]
2.037970832351861

[('뭐', 'Noun'), ('타', 'Verb'), ('고', 'Noun'), ('가', 'Noun')]
-0.26347716229917634

[('뭐', 'Noun'), ('타', 'Verb'), ('고', 'Eomi'), ('가', 'Verb'), ('ㅏ', 'Eomi')]
3.9667383324286725

[('뭐', 'Noun'), ('타', 'Verb'), ('고', 'Noun'), ('가', 'Verb'), ('ㅏ', 'Eomi')]
1.684868477842624


In [10]:
# Register extra dictionary entries: two nouns with value 1 and one verb with
# value 0.5 (presumably entry scores/weights -- confirm in crf_postagger).
user_nouns = {'아이오아이': 1, '아이돌룸': 1}
user_verbs = {'나오': 0.5}
trained_crf.add_user_dictionary('Noun', user_nouns)
trained_crf.add_user_dictionary('Verb', user_verbs)

# Unsegmented input to analyze.
sent = '아이돌룸에아이오아이가나올수있을까'

# NOTE(review): these are underscore-prefixed attributes assigned directly on
# the tagger. Their names suggest a penalty for one-syllable morphemes and a
# bonus for nouns, but that is not visible here -- confirm in crf_postagger
# and prefer a public setter if one exists.
trained_crf._a_syllable_penalty = -0.3
trained_crf._noun_preference = 0.2

# Tag with flattened output and print every returned candidate: its score
# first (after a blank line), then one (morpheme, tag) pair per line.
top_poses = trained_crf.tag(sent, flatten=True)
for poses, score in top_poses:
    score_line = '\nscore = {}'.format(score)
    print(score_line)
    for pos in poses:
        print(pos)


score = 8.741956263158434
('아이돌룸', 'Noun')
('에', 'Josa')
('아이오아이', 'Noun')
('가', 'Josa')
('나오', 'Verb')
('ㄹ', 'Eomi')
('수', 'Noun')
('있', 'Verb')
('을까', 'Eomi')

score = 8.700471735794014
('아이돌룸', 'Noun')
('에', 'Josa')
('아이오아이', 'Noun')
('가', 'Josa')
('나오', 'Verb')
('ㄹ', 'Eomi')
('수', 'Noun')
('있', 'Verb')
('을', 'Eomi')
('끄', 'Verb')
('아', 'Eomi')

score = 8.631769824885843
('아이돌룸', 'Noun')
('에', 'Josa')
('아이오아이', 'Noun')
('가', 'Josa')
('나오', 'Verb')
('ㄹ', 'Eomi')
('수', 'Noun')
('있', 'Verb')
('을', 'Eomi')
('까', 'Eomi')

score = 8.506254643400048
('아이돌룸', 'Noun')
('에', 'Josa')
('아이오아이', 'Noun')
('가나', 'Noun')
('오', 'Verb')
('ㄹ', 'Eomi')
('수', 'Noun')
('있', 'Verb')
('을', 'Eomi')
('끄', 'Verb')
('아', 'Eomi')

score = 8.437552732491877
('아이돌룸', 'Noun')
('에', 'Josa')
('아이오아이', 'Noun')
('가나', 'Noun')
('오', 'Verb')
('ㄹ', 'Eomi')
('수', 'Noun')
('있', 'Verb')
('을', 'Eomi')
('까', 'Eomi')


In [9]:
# Ask for a single candidate (beam_size=1) without flattening, and unpack the
# first -- and, in the recorded run, only displayed -- result into its items
# and its total score. `sent` is the input string assigned in an earlier cell.
# NOTE(review): this cell's execution count (9) is lower than that of the cell
# above it (10), so the recorded output may come from a different kernel
# state; re-run the notebook top to bottom to confirm it.
single_best = trained_crf.tag(sent, flatten=False, beam_size=1)
poses, score = single_best[0]

# Total score first, then one item per line.
print(score)
for pos in poses:
    print(pos)

8.700471735794014
('아이돌룸/Noun', 0, 4, 1)
('에/Josa', 4, 5, 0.8190128120533081)
('아이오아이/Noun', 5, 10, 1)
('가/Josa', 10, 11, 0.41837169731542345)
('나오/Verb + ㄹ/Eomi', 11, 13, 0.6485539073324389)
('수/Noun', 13, 14, 0.18038937990949483)
('있/Verb', 14, 15, 0.4063281916380028)
('을/Eomi', 15, 16, 0.3606708588333233)
('끄/Verb + 아/Eomi', 16, 17, 0.42202906523364403)
