In [1]:
import sys
sys.path.append('../')

import shortestpath
import hmm_pos_tagger

## simple tokenizer

In [2]:
from hmm_pos_tagger import Dictionary

# Toy lexicon for the demo: every part-of-speech tag maps to the set of
# surface forms that may carry it. A form may appear under several tags
# (e.g. '청하' is both a Noun and a Verb, '는' is both a Josa and an Eomi),
# which is what makes the later look-up ambiguous.
pos2words = {
    tag: set(surface_forms.split())
    for tag, surface_forms in (
        ('Noun', '아이 아이오 아이오아이 청하 출신 청'),
        ('Josa', '은 는 이 가 의 를 을'),
        ('Verb', '청하 이 있 하 했 입'),
        ('Eomi', '다 었다 는 니다'),
    )
}

# Wrap the tag -> word-set lexicon in the tagger's Dictionary object; it is
# handed to WordSequenceGraph in a later cell together with the HMM model.
dictionary = Dictionary(pos2words)

In [3]:
from hmm_pos_tagger import TrainedHMM
from hmm_pos_tagger import WordSequenceGraph

# Hand-written tag-to-tag scores, keyed by (previous tag, next tag).
# These are scores rather than normalised probabilities: the Verb -> Josa
# entry is negative, and it surfaces as a positive (i.e. unfavourable) edge
# cost in the printed edge list.
transition = {
    (prev_tag, next_tag): score
    for prev_tag, followers in (
        ('Noun', (('Josa', 0.7), ('Noun', 0.3))),
        ('Verb', (('Eomi', 0.5), ('Noun', 0.5), ('Josa', -0.1))),
    )
    for next_tag, score in followers
}

# Per-tag word scores; only two nouns get an explicit preference here,
# every other (tag, word) pair is left to the model's default handling.
generation = {
    'Noun': dict([
        ('아이오아이', 0.5),
        ('청하', 0.2),
    ]),
}

# Bundle the hand-written transition and generation scores into a model object.
hmm_model = TrainedHMM(transition, generation)
# Combine the lexicon and the model; `lookuper.as_graph(sentence)` is what the
# next cell calls to turn a sentence into a weighted word graph.
lookuper = WordSequenceGraph(dictionary, hmm_model)

In [4]:
# Demo sentence to segment and tag.
sentence = '청하는 아이오아이의 출신입니다'

# as_graph returns four values. The two inspected below are:
#   graph - edge list of (from_node, to_node, weight) triples
#   sent  - one list per position holding the dictionary matches that start
#           there, each match being (word, tag, begin, end)
# Judging by the printed offsets, positions count characters with the spaces
# of the sentence left out.
encoded_graph, idx2node, graph, sent = lookuper.as_graph(sentence)

print('Look-up words')
for begin, candidates in enumerate(sent):
    print(begin, candidates)

print('\nEdges')
for weighted_edge in graph:
    print(weighted_edge)

Look-up words
0 [('청', 'Noun', 0, 1), ('청하', 'Noun', 0, 2), ('청하', 'Verb', 0, 2)]
1 [('하', 'Verb', 1, 2)]
2 [('는', 'Josa', 2, 3), ('는', 'Eomi', 2, 3)]
3 [('아이', 'Noun', 3, 5), ('아이오', 'Noun', 3, 6), ('아이오아이', 'Noun', 3, 8)]
4 [('이', 'Josa', 4, 5), ('이', 'Verb', 4, 5)]
5 []
6 [('아이', 'Noun', 6, 8)]
7 [('이', 'Josa', 7, 8), ('이', 'Verb', 7, 8)]
8 [('의', 'Josa', 8, 9)]
9 [('출신', 'Noun', 9, 11)]
10 []
11 [('입', 'Verb', 11, 12)]
12 [('니다', 'Eomi', 12, 14)]
13 [('다', 'Eomi', 13, 14)]
14 [('EOS', 'EOS', 15, 15)]

Edges
(('BOS', 'BOS', 0, 0), ('청', 'Noun', 0, 1), -0.00030000000000000003)
(('청', 'Noun', 0, 1), ('하', 'Verb', 1, 2), -0.00030000000000000003)
(('BOS', 'BOS', 0, 0), ('청하', 'Noun', 0, 2), -0.2003)
(('BOS', 'BOS', 0, 0), ('청하', 'Verb', 0, 2), -0.00030000000000000003)
(('청하', 'Noun', 0, 2), ('는', 'Josa', 2, 3), -0.9002999999999999)
(('청하', 'Noun', 0, 2), ('는', 'Eomi', 2, 3), -0.20029999999999998)
(('청하', 'Verb', 0, 2), ('는', 'Josa', 2, 3), 0.0997)
(('청하', 'Verb', 0, 2), ('는', 'Eomi', 2,

In [5]:
from shortestpath import list_to_dict_graph

# Re-shape the flat edge list into a nested mapping
# {from_node: {to_node: weight}}, the form passed to `ford` in the next cell.
g_dict = list_to_dict_graph(graph)
# Bare last expression so the notebook displays the adjacency mapping.
g_dict

{('BOS', 'BOS', 0, 0): {('청', 'Noun', 0, 1): -0.00030000000000000003,
  ('청하', 'Noun', 0, 2): -0.2003,
  ('청하', 'Verb', 0, 2): -0.00030000000000000003},
 ('는', 'Eomi', 2, 3): {('아이', 'Noun', 3, 5): -0.00030000000000000003,
  ('아이오', 'Noun', 3, 6): -0.00030000000000000003,
  ('아이오아이', 'Noun', 3, 8): -0.5003},
 ('는', 'Josa', 2, 3): {('아이', 'Noun', 3, 5): -0.00030000000000000003,
  ('아이오', 'Noun', 3, 6): -0.00030000000000000003,
  ('아이오아이', 'Noun', 3, 8): -0.5003},
 ('니다', 'Eomi', 12, 14): {('EOS', 'EOS', 15, 15): -0.00030000000000000003},
 ('다', 'Eomi', 13, 14): {('EOS', 'EOS', 15, 15): -0.00030000000000000003},
 ('아이', 'Noun', 3, 5): {('오', 'Unk', 5, 6): -0.00030000000000000003},
 ('아이', 'Noun', 6, 8): {('의', 'Josa', 8, 9): -0.7002999999999999},
 ('아이오', 'Noun', 3, 6): {('아이', 'Noun', 6, 8): -0.30029999999999996},
 ('아이오아이', 'Noun', 3, 8): {('의', 'Josa', 8, 9): -1.2003},
 ('오', 'Unk', 5, 6): {('아이', 'Noun', 6, 8): -0.00030000000000000003},
 ('의', 'Josa', 8, 9): {('출신', 'Noun', 9, 11): -

In [6]:
from shortestpath import ford

# End points of the path search. Nodes are (word, tag, begin, end) tuples,
# the same objects that key `g_dict`.
# BOS is the fixed source node that every sentence-initial edge starts from.
bos = ('BOS', 'BOS', 0, 0)
# The EOS node is the sole entry of the last look-up slot, so read it from
# `sent` instead of hard-coding offsets that only fit this one sentence.
eos = sent[-1][0]

# Lowest-cost path from BOS to EOS. Edge weights are negated scores, so the
# cheapest path is the best-scoring segmentation. debug=True prints one
# "cost[a -> b] = old -> new" line per update; the returned dict carries the
# total 'cost' and the list of best 'paths'.
ford(g_dict, bos, eos, debug=True)

cost[('BOS', 'BOS', 0, 0) -> ('청', 'Noun', 0, 1)] = 24.193399999999997 -> -0.00030000000000000003
cost[('BOS', 'BOS', 0, 0) -> ('청하', 'Noun', 0, 2)] = 24.193399999999997 -> -0.2003
cost[('BOS', 'BOS', 0, 0) -> ('청하', 'Verb', 0, 2)] = 24.193399999999997 -> -0.00030000000000000003
cost[('청', 'Noun', 0, 1) -> ('하', 'Verb', 1, 2)] = 24.193399999999997 -> -0.0006000000000000001
cost[('청하', 'Noun', 0, 2) -> ('는', 'Josa', 2, 3)] = 24.193399999999997 -> -1.1005999999999998
cost[('청하', 'Noun', 0, 2) -> ('는', 'Eomi', 2, 3)] = 24.193399999999997 -> -0.40059999999999996
cost[('청하', 'Verb', 0, 2) -> ('는', 'Eomi', 2, 3)] = -0.40059999999999996 -> -0.5005999999999999
cost[('하', 'Verb', 1, 2) -> ('는', 'Eomi', 2, 3)] = -0.5005999999999999 -> -0.5009
cost[('는', 'Josa', 2, 3) -> ('아이', 'Noun', 3, 5)] = 24.193399999999997 -> -1.1008999999999998
cost[('는', 'Josa', 2, 3) -> ('아이오', 'Noun', 3, 6)] = 24.193399999999997 -> -1.1008999999999998
cost[('는', 'Josa', 2, 3) -> ('아이오아이', 'Noun', 3, 8)] = 24.1933999999

{'cost': -3.3024000000000004,
 'paths': [[('BOS', 'BOS', 0, 0),
   ('청하', 'Noun', 0, 2),
   ('는', 'Josa', 2, 3),
   ('아이오아이', 'Noun', 3, 8),
   ('의', 'Josa', 8, 9),
   ('출신', 'Noun', 9, 11),
   ('입', 'Verb', 11, 12),
   ('니다', 'Eomi', 12, 14),
   ('EOS', 'EOS', 15, 15)]]}