In [1]:
import sys
# Add the parent directory to the module search path before importing
# hmm_postagger (presumably the package lives one level above this
# notebook -- confirm against the repository layout).
sys.path.append('../')
import hmm_postagger

In [2]:
from hmm_postagger import Corpus
from hmm_postagger import CorpusTrainer

# (corpus path, model path) pairs -- one entry per tag set
paths = [
    ('../data/sejong_simpletag.txt', '../models/sejong_simple_hmm.json'),
    ('../data/sejong_fulltag.txt', '../models/sejong_full_hmm.json')
]

for sejong_path, model_path in paths:

    print('\n\nTraining with {}'.format(sejong_path))

    # Peek at the first four sentences as a quick sanity check of the corpus.
    print('Corpus ... ')
    corpus = Corpus(sejong_path)
    for sent_index, sent in enumerate(corpus):
        if sent_index == 4:
            break
        print(sent)

    # Train on the full corpus; model_path is handed to train()
    # (presumably the model is written there, since a later cell loads
    # '../models/sejong_simple_hmm.json' -- confirm in CorpusTrainer).
    trainer = CorpusTrainer(min_count_tag=5, min_count_word=1, verbose=True)
    trainer.train(corpus, model_path)

    # Report the sizes of the trainer's pos2words_ and transition_ tables.
    num_pos = len(trainer.pos2words_)
    num_transition = len(trainer.transition_)
    print('num pos = {}, num transition = {}'.format(num_pos, num_transition))



Training with ../data/sejong_simpletag.txt
Corpus ... 
[['뭐', 'Noun'], ['타', 'Verb'], ['고', 'Eomi'], ['가', 'Verb'], ['ㅏ', 'Eomi']]
[['지하철', 'Noun']]
[['기차', 'Noun']]
[['아침', 'Noun'], ['에', 'Josa'], ['몇', 'Determiner'], ['시', 'Noun'], ['에', 'Josa'], ['타', 'Verb'], ['고', 'Eomi'], ['가', 'Verb'], ['는데', 'Eomi']]
training observation/transition prob from 1046918 sents was done
num pos = 8, num transition = 72


Training with ../data/sejong_fulltag.txt
Corpus ... 
[['뭐', 'NP'], ['타', 'VV'], ['고', 'EC'], ['가', 'VV'], ['ㅏ', 'EF']]
[['지하철', 'NNG']]
[['기차', 'NNG']]
[['아침', 'NNG'], ['에', 'JKB'], ['몇', 'MM'], ['시', 'NNB'], ['에', 'JKB'], ['타', 'VV'], ['고', 'EC'], ['가', 'VV'], ['는데', 'EF']]
training observation/transition prob from 1046920 sents was done
num pos = 33, num transition = 946


In [3]:
import json

# Read the simple-tag model JSON back into a plain dict.
simple_model_path = '../models/sejong_simple_hmm.json'
with open(simple_model_path, encoding='utf-8') as model_file:
    model = json.load(model_file)

In [4]:
# Show the top-level sections of the loaded model dict.
model.keys()

dict_keys(['emission', 'transition', 'begin'])

In [5]:
# Display the full transition table. Keys are 'TAG_A TAG_B' strings; the
# values look like log-probabilities (all negative floats) -- TODO confirm
# against CorpusTrainer.
model['transition']

{'Adjective Adjective': -10.720074389437459,
 'Adjective Adverb': -13.285023746898995,
 'Adjective Determiner': -13.285023746898995,
 'Adjective EOS': -11.270120726356732,
 'Adjective Eomi': -0.00018019074769268445,
 'Adjective Exclamation': -12.591876566339051,
 'Adjective Josa': -9.343441939229304,
 'Adjective Noun': -10.314609281329295,
 'Adjective Verb': -10.933648489735518,
 'Adverb Adjective': -2.492732590632981,
 'Adverb Adverb': -2.7345515141136674,
 'Adverb Determiner': -3.0468551060686653,
 'Adverb EOS': -3.750064143153076,
 'Adverb Eomi': -9.58671989918925,
 'Adverb Exclamation': -5.353630947465341,
 'Adverb Josa': -3.7888507467565016,
 'Adverb Noun': -0.6943158483515862,
 'Adverb Verb': -1.3683300419640723,
 'Determiner Adjective': -4.160227320793998,
 'Determiner Adverb': -5.258470196823995,
 'Determiner Determiner': -4.009433989966624,
 'Determiner EOS': -5.593613676744334,
 'Determiner Eomi': -10.271879417802976,
 'Determiner Exclamation': -6.441428066068641,
 'Determine