In [1]:
import sys
sys.path.append('../')

import crf_postagger
from crf_postagger import Corpus
from pprint import pprint

# Training corpus location, relative to the notebook's working directory.
corpus_path = '../data/sejong_corpus_lr_sepxsv.txt'
# NOTE(review): num_sent=1 presumably limits iteration to a single sentence
# (the recorded output shows exactly one) -- confirm against Corpus.
corpus = Corpus(corpus_path, num_sent=1)

# Each sentence yielded by the corpus prints as a list of [word, tag] pairs.
for sentence in corpus:
    pprint(sentence)

[['프랑스', 'Noun'],
 ['의', 'Josa'],
 ['세계적', 'Noun'],
 ['이', 'Adjective'],
 ['ㄴ', 'Eomi'],
 ['의상', 'Noun'],
 ['디자이너', 'Noun'],
 ['엠마누엘', 'Noun'],
 ['웅가로', 'Noun'],
 ['가', 'Josa'],
 ['실내', 'Noun'],
 ['장식용', 'Noun'],
 ['직물', 'Noun'],
 ['디자이너', 'Noun'],
 ['로', 'Josa'],
 ['나서', 'Verb'],
 ['었다', 'Eomi']]


In [2]:
from pprint import pprint

from crf_postagger.trigram import TrigramFeatureTransformer

# Callable that maps a tagged sentence to (per-token feature lists, tag sequence).
sentence_to_xy = TrigramFeatureTransformer()

# A short [word, tag] sequence used to illustrate the generated features.
sentence = [
    ['뭐', 'Noun'],
    ['타', 'Verb'],
    ['고', 'Eomi'],
    ['가', 'Verb'],
    ['ㅏ', 'Eomi'],
]
features, tags = sentence_to_xy(sentence)

# One feature list per token, followed by the tag sequence for the sentence.
pprint(features)
print(tags)

[['x[0]=뭐',
  'x[0]=뭐, y[-1]=BOS',
  'x[-1:0]=BOS-뭐',
  'x[0:1]=뭐-타',
  'x[0:1]=뭐-타, y[1]=Verb',
  'x[-1,1]=BOS-타',
  'x[-1:1]=BOS-뭐-타'],
 ['x[0]=타',
  'x[0]=타, y[-1]=Noun',
  'x[-1:0]=뭐-타',
  'x[0:1]=타-고',
  'x[0:1]=타-고, y[1]=Eomi',
  'x[-1,1]=뭐-고',
  'x[-1:1]=뭐-타-고'],
 ['x[0]=고',
  'x[0]=고, y[-1]=Verb',
  'x[-1:0]=타-고',
  'x[0:1]=고-가',
  'x[0:1]=고-가, y[1]=Verb',
  'x[-1,1]=타-가',
  'x[-1:1]=타-고-가'],
 ['x[0]=가',
  'x[0]=가, y[-1]=Eomi',
  'x[-1:0]=고-가',
  'x[0:1]=가-ㅏ',
  'x[0:1]=가-ㅏ, y[1]=Eomi',
  'x[-1,1]=고-ㅏ',
  'x[-1:1]=고-가-ㅏ'],
 ['x[0]=ㅏ',
  'x[0]=ㅏ, y[-1]=Verb',
  'x[-1:0]=가-ㅏ',
  'x[0:1]=ㅏ-EOS',
  'x[0:1]=ㅏ-EOS, y[1]=EOS',
  'x[-1,1]=가-EOS',
  'x[-1:1]=가-ㅏ-EOS']]
('Noun', 'Verb', 'Eomi', 'Verb', 'Eomi')


In [6]:
from crf_postagger import Trainer

# Destination of the trained model; the tagger cell later loads it from this path.
model_path = '../models/trigram_crf_sejong_lr_sepxsv.json'

# Reuse the feature transformer defined earlier so training and inspection
# share the same feature templates.
trainer = Trainer(
    sentence_to_xy=sentence_to_xy,
    max_iter=30,
    l1_cost=0,
    verbose=False,
)

# NOTE(review): num_sent=-1 presumably means "read every sentence" -- confirm
# against Corpus before relying on it.
trainer.train(Corpus(corpus_path, num_sent=-1), model_path)

In [7]:
from crf_postagger.trigram import TrigramTagger
from crf_postagger.trigram import TrigramParameter

# Hand-written analyses for specific eojeols, keyed by surface form.
# NOTE(review): each analysis looks like (left word, right word, left tag,
# right tag); the later recorded output renders '해쪄' as '하/Verb + 아쪄/Eomi'.
# Confirm the expected tuple layout against TrigramParameter.
preanalyzed_eojeols = {'해쪄': (('하', '아쪄', 'Verb', 'Eomi'),)}

# Load the trained model file and wrap it in a tagger.
trigram_parameter = TrigramParameter(
    model_path,
    preanalyzed_eojeols=preanalyzed_eojeols,
)
trained_crf = TrigramTagger(trigram_parameter)

In [8]:
# Same tagged sentence as in the feature-extraction cell, now scored by the trained model.
sentence = [['뭐', 'Noun'], ['타', 'Verb'], ['고', 'Eomi'], ['가', 'Verb'], ['ㅏ', 'Eomi']]
# evaluate() takes a [word, tag] sequence; the recorded output is a single float score.
trained_crf.evaluate(sentence)

3.0270748263528033

In [9]:
# NOTE(review): both attributes are underscore-prefixed, i.e. private to the
# tagger; prefer a public setter or constructor argument if one exists.
# In the next cell's recorded output, spans such as 'aa' and '오늘저녁' carry a
# word score of -0.1, matching the unknown penalty set here.
trained_crf._noun_preference = 0.01
trained_crf._unknown_penalty = -0.1

In [10]:
# With flatten=False the recorded output is a list of (words, total_score) pairs,
# where each word is ('surface/Tag', begin, end, word_score).
# NOTE(review): guess_tag=True presumably lets the tagger propose tags for
# unknown spans ('aa' shows up as Noun, Adjective and Verb) -- confirm against
# TrigramTagger.tag.
trained_crf.tag('오늘저녁에운동을aa', flatten=False, guess_tag=True)

[([('오늘/Noun', 0, 2, 0.02939046921384876),
   ('저녁/Noun', 2, 4, 0.03495215180293302),
   ('에/Josa', 4, 5, 0.7988212775958619),
   ('운동/Noun', 5, 7, 0.04151176192894883),
   ('을/Josa', 7, 8, 0.4941871877320825),
   ('aa/Noun', 8, 10, -0.1)],
  3.0548825368173684),
 ([('오늘/Noun', 0, 2, 0.02939046921384876),
   ('저녁/Noun', 2, 4, 0.03495215180293302),
   ('에/Josa', 4, 5, 0.7988212775958619),
   ('운동/Noun', 5, 7, 0.04151176192894883),
   ('을/Josa', 7, 8, 0.4941871877320825),
   ('aa/Adjective', 8, 10, -0.1)],
  2.8719620723936874),
 ([('오늘/Noun', 0, 2, 0.02939046921384876),
   ('저녁/Noun', 2, 4, 0.03495215180293302),
   ('에/Josa', 4, 5, 0.7988212775958619),
   ('운동/Noun', 5, 7, 0.04151176192894883),
   ('을/Josa', 7, 8, 0.4941871877320825),
   ('aa/Verb', 8, 10, -0.1)],
  2.8323334795148774),
 ([('오늘저녁/Noun', 0, 4, -0.1),
   ('에/Josa', 4, 5, 0.7988212775958619),
   ('운동/Noun', 5, 7, 0.04151176192894883),
   ('을/Josa', 7, 8, 0.4941871877320825),
   ('aa/Noun', 8, 10, -0.1)],
  2.83150840274065

In [11]:
# Register two colloquial endings under the Eomi tag.
# NOTE(review): the dict values are presumably per-word scores -- confirm
# against add_user_dictionary.
user_eomis = {'아쪄': 1, '아써': 1}
trained_crf.add_user_dictionary('Eomi', user_eomis)

# Default call: the recorded output lists candidates as ([(word, tag), ...], score).
trained_crf.tag('머리쿵해쪄')

[([('머리', 'Noun'), ('쿵', 'Adverb'), ('하', 'Verb'), ('아쪄', 'Eomi')],
  2.2417971454993966),
 ([('머', 'Exclamation'),
   ('리', 'Noun'),
   ('쿵', 'Adverb'),
   ('하', 'Verb'),
   ('아쪄', 'Eomi')],
  1.6144674032983075),
 ([('머리', 'Noun'),
   ('쿵', 'Adverb'),
   ('하', 'Verb'),
   ('아', 'Eomi'),
   ('찌', 'Verb'),
   ('어', 'Eomi')],
  1.5775719515293989),
 ([('머리', 'Noun'), ('쿵하', 'Verb'), ('아쪄', 'Eomi')], 1.539755672532912),
 ([('머리쿵', 'Unk'), ('하', 'Verb'), ('아쪄', 'Eomi')], 1.5391687714592723)]

In [12]:
# flatten=False keeps ('surface/Tag', begin, end, word_score) entries, so compound
# analyses such as '하/Verb + 아쪄/Eomi' stay on one span.
# With beam_size=2 the recorded output contains exactly two candidates.
trained_crf.tag('머리쿵해쪄', flatten=False, beam_size=2)

[([('머리/Noun', 0, 2, 0.044167659920634116),
   ('쿵/Adverb', 2, 3, 0.003490086780958086),
   ('하/Verb + 아쪄/Eomi', 3, 5, 1.7945860985687603)],
  2.2417971454993966),
 ([('머리/Noun', 0, 2, 0.044167659920634116),
   ('쿵/Adverb', 2, 3, 0.003490086780958086),
   ('하/Verb + 아/Eomi', 3, 4, 1.2142712310384374),
   ('찌/Verb + 어/Eomi', 4, 5, 0.566420316555797)],
  1.5775719515293989)]

In [13]:
# Alternative analyses of the same surface string, scored side by side so the
# model's preference between them is visible.
candidates = [
    [('뭐', 'Noun'), ('타', 'Verb'), ('고', 'Eomi'), ('가', 'Noun')],
    [('뭐', 'Noun'), ('타', 'Verb'), ('고', 'Noun'), ('가', 'Noun')],
    [('뭐', 'Noun'), ('타', 'Verb'), ('고', 'Eomi'), ('가', 'Verb'), ('ㅏ', 'Eomi')],
    [('뭐', 'Noun'), ('타', 'Verb'), ('고', 'Noun'), ('가', 'Verb'), ('ㅏ', 'Eomi')]
]

# The loop variable is deliberately not called `sent`: loop variables leak into
# the notebook namespace, and other cells use `sent` for the raw input string.
# Reusing the name here would leave a list behind for those cells when the
# notebook is executed out of order.
for candidate in candidates:
    print('\n{}'.format(candidate))
    print(trained_crf.evaluate(candidate, debug=False))


[('뭐', 'Noun'), ('타', 'Verb'), ('고', 'Eomi'), ('가', 'Noun')]
1.5817504724400573

[('뭐', 'Noun'), ('타', 'Verb'), ('고', 'Noun'), ('가', 'Noun')]
-0.7660424801447979

[('뭐', 'Noun'), ('타', 'Verb'), ('고', 'Eomi'), ('가', 'Verb'), ('ㅏ', 'Eomi')]
3.0270748263528033

[('뭐', 'Noun'), ('타', 'Verb'), ('고', 'Noun'), ('가', 'Verb'), ('ㅏ', 'Eomi')]
0.6979560697959537


In [14]:
# Raw, unsegmented input to tag.
sent = '아이돌룸에아이오아이가나올수있을까'

# Register proper nouns and a verb stem the model has presumably not seen.
user_nouns = {'아이오아이': 1, '아이돌룸': 1}
user_verbs = {'나오': 0.5}
trained_crf.add_user_dictionary('Noun', user_nouns)
trained_crf.add_user_dictionary('Verb', user_verbs)

# NOTE(review): private tagger attributes are adjusted directly here; prefer a
# public configuration hook if the tagger offers one.
trained_crf._a_syllable_penalty = -0.3
trained_crf._noun_preference = 0.2

# flatten=True yields plain (word, tag) pairs. Span information is dropped, so
# candidates that differ only in how words are grouped can print identically
# (see the first and third entries of the recorded output).
top_poses = trained_crf.tag(sent, flatten=True)
for poses, score in top_poses:
    print('\nscore = {}'.format(score))
    for pos in poses:
        print(pos)


score = 8.631863016811264
('아이돌룸', 'Noun')
('에', 'Josa')
('아이오아이', 'Noun')
('가', 'Josa')
('나오', 'Verb')
('ㄹ', 'Eomi')
('수', 'Noun')
('있', 'Verb')
('을', 'Eomi')
('끄', 'Verb')
('아', 'Eomi')

score = 8.620568685716137
('아이돌룸', 'Noun')
('에', 'Josa')
('아이오아이', 'Noun')
('가', 'Josa')
('나오', 'Verb')
('ㄹ', 'Eomi')
('수', 'Noun')
('있', 'Verb')
('을까', 'Eomi')

score = 8.490259180745435
('아이돌룸', 'Noun')
('에', 'Josa')
('아이오아이', 'Noun')
('가', 'Josa')
('나오', 'Verb')
('ㄹ', 'Eomi')
('수', 'Noun')
('있', 'Verb')
('을', 'Eomi')
('끄', 'Verb')
('아', 'Eomi')

score = 8.348945183077209
('아이돌룸', 'Noun')
('에', 'Josa')
('아이오아이', 'Noun')
('가', 'Josa')
('나오', 'Verb')
('ㄹ', 'Eomi')
('수', 'Noun')
('있', 'Adjective')
('을', 'Eomi')
('끄', 'Verb')
('아', 'Eomi')

score = 8.309264699122824
('아이돌룸', 'Noun')
('에', 'Josa')
('아이오아이', 'Noun')
('가나', 'Noun')
('오', 'Verb')
('ㄹ', 'Eomi')
('수', 'Noun')
('있', 'Verb')
('을', 'Eomi')
('끄', 'Verb')
('아', 'Eomi')


In [15]:
# Ask for a single best analysis, keeping span offsets and per-word scores.
best_candidates = trained_crf.tag(sent, flatten=False, beam_size=1)
poses, score = best_candidates[0]

# Total score first, then one ('surface/Tag', begin, end, word_score) entry per line.
print(score)
for pos in poses:
    print(pos)

8.631863016811264
('아이돌룸/Noun', 0, 4, 1)
('에/Josa', 4, 5, 0.7988212775958619)
('아이오아이/Noun', 5, 10, 1)
('가/Josa', 10, 11, 0.49605081451643446)
('나오/Verb + ㄹ/Eomi', 11, 13, 0.8343134145802069)
('수/Noun', 13, 14, 0.21230885221277132)
('있/Verb', 14, 15, 0.43585965299758406)
('을/Eomi', 15, 16, 0.2617042416834795)
('끄/Verb + 아/Eomi', 16, 17, 0.4272818702989071)
