In [1]:
import sys
# Add the parent directory to the import search path before importing
# hmm_postagger (presumably the package lives one level up — confirm).
sys.path.append('../')
import hmm_postagger

In [2]:
from hmm_postagger import Corpus
from hmm_postagger import CorpusTrainer

import os

# (corpus file, output model file) pairs; one model is trained per pair.
paths = [
    ('../data/sejong_corpus_lr_sepxsv.txt', '../models/sejong_lr_sepxsv_hmm.json'),
    ('../data/sejong_corpus_lr_unsepxsv.txt', '../models/sejong_lr_unsepxsv_hmm.json')
]

for sejong_path, model_path in paths:

    print('\n\nTraining with {}'.format(sejong_path))

    # Sanity check: print only the first sentence the corpus yields.
    print('Corpus ... ')
    corpus = Corpus(sejong_path)
    for sent in corpus:
        print(sent)
        break

    # Make sure the output directory exists before the long training pass,
    # so a missing directory cannot fail the run only at save time.
    # NOTE(review): assumes train() writes the model to model_path — confirm.
    model_dir = os.path.dirname(model_path)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)

    # Train on the corpus, handing the destination model path to the trainer.
    trainer = CorpusTrainer(min_count_tag=5, min_count_word=1, verbose=True)
    trainer.train(corpus, model_path)

    # Report the sizes of the trained emission and transition tables.
    print('num pos = {}, num transition = {}'.format(
        len(trainer.emission_), len(trainer.transition_)))



Training with ../data/sejong_corpus_lr_sepxsv.txt
Corpus ... 
[['프랑스', 'Noun'], ['의', 'Josa'], ['세계적', 'Noun'], ['이', 'Adjective'], ['ㄴ', 'Eomi'], ['의상', 'Noun'], ['디자이너', 'Noun'], ['엠마누엘', 'Noun'], ['웅가로', 'Noun'], ['가', 'Josa'], ['실내', 'Noun'], ['장식용', 'Noun'], ['직물', 'Noun'], ['디자이너', 'Noun'], ['로', 'Josa'], ['나서', 'Verb'], ['었다', 'Eomi']]
training observation/transition prob from 1031957 sents was done
num pos = 11, num transition = 144


Training with ../data/sejong_corpus_lr_unsepxsv.txt
Corpus ... 
[['프랑스', 'Noun'], ['의', 'Josa'], ['세계적이', 'Adjective'], ['ㄴ', 'Eomi'], ['의상', 'Noun'], ['디자이너', 'Noun'], ['엠마누엘', 'Noun'], ['웅가로', 'Noun'], ['가', 'Josa'], ['실내', 'Noun'], ['장식용', 'Noun'], ['직물', 'Noun'], ['디자이너', 'Noun'], ['로', 'Josa'], ['나서', 'Verb'], ['었다', 'Eomi']]
training observation/transition prob from 1031957 sents was done
num pos = 11, num transition = 144


In [3]:
import json
# Inspect the last configured model (the "unsepxsv" variant). Select it
# explicitly from `paths` rather than relying on the loop variable that
# happens to be left over from the training cell.
model_path = paths[-1][1]
with open(model_path, encoding='utf-8') as f:
    model = json.load(f)

In [4]:
# Show the top-level keys of the loaded model dict.
model.keys()

dict_keys(['emission', 'transition'])

In [5]:
# Show the keys of the 'emission' table (tag names, per the recorded output).
model['emission'].keys()

dict_keys(['Noun', 'Josa', 'Adjective', 'Eomi', 'Verb', 'Pronoun', 'Adverb', 'Determiner', 'Number', 'Unk', 'Exclamation'])

In [6]:
# Peek at up to the first 50 (word, score) pairs stored under the 'Noun' tag.
# NOTE(review): the scores look like log-probabilities — confirm against the trainer.
list(model['emission']['Noun'].items())[:50]

[('프랑스', -7.794385810258628),
 ('의상', -9.641646878244101),
 ('디자이너', -11.150280091556237),
 ('엠마누엘', -14.747592352144682),
 ('웅가로', -14.342127244036519),
 ('실내', -9.842317573706254),
 ('장식용', -11.885391471215215),
 ('직물', -10.825619015863369),
 ('침실', -10.653247789922581),
 ('식당', -8.800863698878091),
 ('욕실', -10.353143197472244),
 ('갖가지', -9.647725924320484),
 ('직물제품', -15.440739532704628),
 ('최근', -7.137482411851687),
 ('파리', -9.020744604557486),
 ('갤러리', -11.612098136215533),
 ('라파예트백화점', -15.440739532704628),
 ('색', -8.727783332027558),
 ('이름', -6.8632041122822285),
 ('전시회', -9.927310786539646),
 ('목욕가운', -14.054445171584737),
 ('탁자보', -14.747592352144682),
 ('냅킨', -12.039542151042472),
 ('앞치마', -11.829821620060404),
 ('작품들', -9.546336698439777),
 ('것', -3.634995211177466),
 ('남미풍', -14.747592352144682),
 ('원색끼리', -15.440739532704628),
 ('조화', -8.940952492048774),
 ('수채화', -11.944231971238148),
 ('배색', -11.415387841969478),
 ('등', -4.943759402768738),
 ('분위기', -7.712323752863585),


In [7]:
# Display the whole 'transition' table; keys are two tag names joined by a space.
model['transition']

{'Adjective Adjective': -6.504927738257543,
 'Adjective Adverb': -6.824663182523231,
 'Adjective Determiner': -7.848160300445497,
 'Adjective EOS': -5.333859290136682,
 'Adjective Eomi': -0.019305546822270488,
 'Adjective Exclamation': -9.293925215163716,
 'Adjective Josa': -9.11487698371473,
 'Adjective Noun': -5.4235415316409235,
 'Adjective Number': -9.059085624086315,
 'Adjective Pronoun': -7.153859051667445,
 'Adjective Unk': -6.436779104895871,
 'Adjective Verb': -5.470626282498171,
 'Adverb Adjective': -1.9937441713141584,
 'Adverb Adverb': -2.7840413713537475,
 'Adverb Determiner': -3.0614116823793585,
 'Adverb EOS': -3.7721295123576057,
 'Adverb Eomi': -8.906638603151944,
 'Adverb Exclamation': -5.494705406345194,
 'Adverb Josa': -3.766982071909051,
 'Adverb Noun': -1.0846174981773558,
 'Adverb Number': -5.249938047402242,
 'Adverb Pronoun': -3.0317346249317048,
 'Adverb Unk': -7.269029813751147,
 'Adverb Verb': -1.1627372099237359,
 'BOS Adjective': -2.8106283021621152,
 'BOS