In [1]:
import sys
# Add the parent directory (relative to the current working directory) to the
# module search path so the import below can be resolved from there.
sys.path.append('../')
# Must come after the path tweak above; presumably the hmm_postagger package
# lives in the parent directory of this notebook — confirm.
import hmm_postagger

In [2]:
from hmm_postagger import Corpus
from hmm_postagger import CorpusTrainer

# (tagged corpus file, output model file) pairs; each pair is trained in turn.
paths = [
    ('../data/sejong_simpletag.txt', '../models/sejong_simple_hmm.json'),
    ('../data/sejong_fulltag.txt', '../models/sejong_full_hmm.json'),
]

for sejong_path, model_path in paths:
    print('\n\nTraining with {}'.format(sejong_path))

    # Sanity check: show the first four sentences of the corpus.
    print('Corpus ... ')
    corpus = Corpus(sejong_path)
    for i, sent in enumerate(corpus):
        if i >= 4:
            break
        print(sent)

    # Fit the trainer on the corpus; model_path is handed to train(),
    # presumably as the location where the model JSON is written — confirm.
    trainer = CorpusTrainer(min_count_tag=5, min_count_word=1, verbose=True)
    trainer.train(corpus, model_path)

    # Report the sizes of the learned tables.
    num_pos = len(trainer.pos2words_)
    num_transition = len(trainer.transition_)
    print('num pos = {}, num transition = {}'.format(num_pos, num_transition))



Training with ../data/sejong_simpletag.txt
Corpus ... 
[['뭐', 'Noun'], ['타', 'Verb'], ['고', 'Eomi'], ['가', 'Verb'], ['ㅏ', 'Eomi']]
[['지하철', 'Noun']]
[['기차', 'Noun']]
[['아침', 'Noun'], ['에', 'Josa'], ['몇', 'Determiner'], ['시', 'Noun'], ['에', 'Josa'], ['타', 'Verb'], ['고', 'Eomi'], ['가', 'Verb'], ['는데', 'Eomi']]
training observation/transition prob from 1046918 sents was done
num pos = 8, num transition = 64


Training with ../data/sejong_fulltag.txt
Corpus ... 
[['뭐', 'NP'], ['타', 'VV'], ['고', 'EC'], ['가', 'VV'], ['ㅏ', 'EF']]
[['지하철', 'NNG']]
[['기차', 'NNG']]
[['아침', 'NNG'], ['에', 'JKB'], ['몇', 'MM'], ['시', 'NNB'], ['에', 'JKB'], ['타', 'VV'], ['고', 'EC'], ['가', 'VV'], ['는데', 'EF']]
training observation/transition prob from 1046920 sents was done
num pos = 33, num transition = 914


In [3]:
import json
# Read the simple-tagset model file as UTF-8 text and parse it as JSON.
with open('../models/sejong_simple_hmm.json', encoding='utf-8') as f:
    model = json.loads(f.read())

In [5]:
# Display the 'transition' table of the loaded model. In the output below the
# keys are space-separated tag pairs (e.g. 'Adjective Eomi') and the values are
# negative floats — presumably log transition probabilities; confirm.
model['transition']

{'Adjective Adjective': -10.720061641159432,
 'Adjective Adverb': -13.285010998620969,
 'Adjective Determiner': -13.285010998620969,
 'Adjective Eomi': -0.0001674424696662001,
 'Adjective Exclamation': -12.591863818061023,
 'Adjective Josa': -9.343429190951278,
 'Adjective Noun': -10.314596533051269,
 'Adjective Verb': -10.933635741457492,
 'Adverb Adjective': -2.4689354336654614,
 'Adverb Adverb': -2.710754357146148,
 'Adverb Determiner': -3.023057949101146,
 'Adverb Eomi': -9.562922742221732,
 'Adverb Exclamation': -5.329833790497822,
 'Adverb Josa': -3.765053589788982,
 'Adverb Noun': -0.6705186913840667,
 'Adverb Verb': -1.344532884996553,
 'Determiner Adjective': -4.156498823539567,
 'Determiner Adverb': -5.254741699569564,
 'Determiner Determiner': -4.005705492712193,
 'Determiner Eomi': -10.268150920548544,
 'Determiner Exclamation': -6.43769956881421,
 'Determiner Josa': -6.273013008409892,
 'Determiner Noun': -0.049007486621678774,
 'Determiner Verb': -5.257515626452289,
 'Eom