In [1]:
import sys
sys.path.append('../')

import crf_postagger
from crf_postagger import Corpus
from pprint import pprint

# Relative path to the corpus file used throughout this notebook.
corpus_path = '../data/sejong_corpus_lr_sepxsv.txt'
# num_sent=1 presumably caps iteration at one sentence (the recorded output
# shows exactly one) -- TODO confirm against Corpus.
corpus = Corpus(corpus_path, num_sent=1)

# Each item yielded by the corpus prints as a list of two-element
# [surface, tag] lists (see the recorded output).
# NOTE: the loop variable `sentence` stays bound at notebook scope afterwards.
for sentence in corpus:
    pprint(sentence)

[['프랑스', 'Noun'],
 ['의', 'Josa'],
 ['세계적', 'Noun'],
 ['이', 'Adjective'],
 ['ㄴ', 'Eomi'],
 ['의상', 'Noun'],
 ['디자이너', 'Noun'],
 ['엠마누엘', 'Noun'],
 ['웅가로', 'Noun'],
 ['가', 'Josa'],
 ['실내', 'Noun'],
 ['장식용', 'Noun'],
 ['직물', 'Noun'],
 ['디자이너', 'Noun'],
 ['로', 'Josa'],
 ['나서', 'Verb'],
 ['었다', 'Eomi']]


In [2]:
from crf_postagger.hmm_style import HMMStyleFeatureTransformer

# Callable that turns a tagged sentence into a (features, tags) pair; the same
# object is handed to Trainer in a later cell.
sentence_to_xy = HMMStyleFeatureTransformer()

# Hand-written example in the same [surface, tag] pair format the corpus yields.
sentence = [['뭐', 'Noun'], ['타', 'Verb'], ['고', 'Eomi'], ['가', 'Verb'], ['ㅏ', 'Eomi']]
features, tags = sentence_to_xy(sentence)

# In the recorded output every position carries a single 'x[0]=<surface>'
# feature, and the tags come back as a tuple in sentence order.
pprint(features)
print(tags)

[['x[0]=뭐'], ['x[0]=타'], ['x[0]=고'], ['x[0]=가'], ['x[0]=ㅏ']]
('Noun', 'Verb', 'Eomi', 'Verb', 'Eomi')


In [4]:
from crf_postagger import Trainer

# Build the trainer around the feature transformer created in the earlier cell.
# NOTE(review): l1_cost=0 presumably switches L1 regularisation off -- confirm
# against Trainer.
trainer = Trainer(
    sentence_to_xy=sentence_to_xy,
    max_iter=30,
    l1_cost=0,
    verbose=False,
)

# Location handed to train(); the next cell passes the same path to
# HMMStyleParameter, so train() is expected to write the model here -- confirm.
model_path = '../models/hmmstyle_crf_sejong_lr_sepxsv.json'

# num_sent=-1 presumably means "no sentence limit" -- TODO confirm against Corpus.
trainer.train(Corpus(corpus_path, num_sent=-1), model_path)

In [5]:
from crf_postagger.hmm_style import HMMStyleTagger
from crf_postagger.hmm_style import HMMStyleParameter

# Hand-supplied analysis for one surface form: the key maps to a tuple of
# candidate analyses, each written as (morph, morph, tag, tag).
# NOTE(review): how the tagger matches these keys against input is not visible
# here -- confirm in HMMStyleParameter.
preanalyzed_eojeols = {'해쪄': (('하', '아쪄', 'Verb', 'Eomi'),)}

# Wrap the parameters read from `model_path` (plus the entries above) in a tagger.
trained_crf = HMMStyleTagger(
    HMMStyleParameter(model_path, preanalyzed_eojeols=preanalyzed_eojeols)
)

In [6]:
# Same tagged example as in the feature-transformer cell, redefined here with
# identical contents.
sentence = [['뭐', 'Noun'], ['타', 'Verb'], ['고', 'Eomi'], ['가', 'Verb'], ['ㅏ', 'Eomi']]
# Returns a single float (see the recorded output); presumably the model's
# score for this tag sequence -- TODO confirm in HMMStyleTagger.evaluate.
trained_crf.evaluate(sentence)

2.2604475458241815

In [7]:
# tag() on a raw, untagged string returns a pair: a list of (surface, tag)
# tuples and a float score, as the recorded output shows.
morphtags, score = trained_crf.tag('머리쿵해쪄')
pprint(morphtags)
print(score)

[('머리', 'Noun'), ('쿵하', 'Verb'), ('아', 'Eomi'), ('찌', 'Verb'), ('어', 'Eomi')]
1.824542768296078


In [8]:
# Register two extra 'Eomi' entries on the existing tagger. The value 1 attached
# to each word is presumably a score/weight -- TODO confirm.
trained_crf.add_user_dictionary('Eomi', {'아쪄':1, '아써':1})
# Re-tag the same input: the recorded output now keeps '아쪄' as a single Eomi,
# so the dictionary update changes what `trained_crf` returns from here on.
trained_crf.tag('머리쿵해쪄')

[[('머리', 'Noun'), ('쿵하', 'Verb'), ('아쪄', 'Eomi')], 2.464843399123463]

In [9]:
# flatten=False keeps the result grouped: each entry in the recorded output is
# ('<surface>/<tag> + ...', int, int, float). The two ints look like begin/end
# character offsets into the input and the float like a per-segment score --
# TODO confirm against HMMStyleTagger.tag.
trained_crf.tag('머리쿵해쪄', flatten=False)

[[('머리/Noun', 0, 2, 0.04017745747461684),
  ('쿵하/Verb + 아쪄/Eomi', 2, 5, 1.0001009508245116)],
 2.464843399123463]