## Check corpus

In [1]:
import sys
sys.path.append('../')

import crf_postagger
from crf_postagger import Corpus

# Location of the tagged corpus, relative to this notebook's directory.
# `corpus_path` is reused by the training cell further down.
corpus_path = '../data/sejong_simpletag.txt'

# Open a small preview of the corpus and print each sentence it yields.
# NOTE(review): num_sent presumably caps how many sentences are read, but the
# recorded output below shows four sentences for num_sent=3 -- confirm the
# exact semantics against the Corpus implementation.
corpus = Corpus(corpus_path, num_sent=3)
for tagged_sentence in corpus:
    print(tagged_sentence)

[['뭐', 'Noun'], ['타', 'Verb'], ['고', 'Eomi'], ['가', 'Verb'], ['ㅏ', 'Eomi']]
[['지하철', 'Noun']]
[['기차', 'Noun']]
[['아침', 'Noun'], ['에', 'Josa'], ['몇', 'Determiner'], ['시', 'Noun'], ['에', 'Josa'], ['타', 'Verb'], ['고', 'Eomi'], ['가', 'Verb'], ['는데', 'Eomi']]


## Training

### Potential function

In [2]:
from pprint import pprint

from crf_postagger import HMMStyleFeatureTransformer, TrigramFeatureTransformer

# Feature extractor shared by this demo and the training cell below.
# HMMStyleFeatureTransformer() is the alternative extractor; to try it,
# assign it to `sentence_to_xy` instead.
sentence_to_xy = TrigramFeatureTransformer()

# One tagged sentence, given as [morpheme, tag] pairs.
sentence = [
    ['뭐', 'Noun'],
    ['타', 'Verb'],
    ['고', 'Eomi'],
    ['가', 'Verb'],
    ['ㅏ', 'Eomi'],
]

# Calling the transformer returns the per-position feature lists and the tags.
features, tags = sentence_to_xy(sentence)

pprint(features)
print(tags)

[['x[0]=뭐',
  'x[0]=뭐, y[-1]=BOS',
  'x[-1:0]=BOS-뭐',
  'x[0:1]=뭐-타',
  'x[0:1]=뭐-타, y[1]=Verb',
  'x[-1,1]=BOS-타',
  'x[-1:1]=BOS-뭐-타'],
 ['x[0]=타',
  'x[0]=타, y[-1]=Noun',
  'x[-1:0]=뭐-타',
  'x[0:1]=타-고',
  'x[0:1]=타-고, y[1]=Eomi',
  'x[-1,1]=뭐-고',
  'x[-1:1]=뭐-타-고'],
 ['x[0]=고',
  'x[0]=고, y[-1]=Verb',
  'x[-1:0]=타-고',
  'x[0:1]=고-가',
  'x[0:1]=고-가, y[1]=Verb',
  'x[-1,1]=타-가',
  'x[-1:1]=타-고-가'],
 ['x[0]=가',
  'x[0]=가, y[-1]=Eomi',
  'x[-1:0]=고-가',
  'x[0:1]=가-ㅏ',
  'x[0:1]=가-ㅏ, y[1]=Eomi',
  'x[-1,1]=고-ㅏ',
  'x[-1:1]=고-가-ㅏ'],
 ['x[0]=ㅏ',
  'x[0]=ㅏ, y[-1]=Verb',
  'x[-1:0]=가-ㅏ',
  'x[0:1]=ㅏ-EOS',
  'x[0:1]=ㅏ-EOS, y[1]=EOS',
  'x[-1,1]=가-EOS',
  'x[-1:1]=가-ㅏ-EOS']]
('Noun', 'Verb', 'Eomi', 'Verb', 'Eomi')


### Training

In [3]:
from crf_postagger import Trainer

# Training hyperparameters.
# `max_iter` and `verbose` were referenced without ever being defined in this
# notebook, so this cell raised NameError on a fresh kernel (Restart & Run All).
# NOTE(review): the values below are placeholders -- confirm them against the
# settings used to produce the saved model.
max_iter = 100
verbose = True

# Build the trainer from the corpus, using the feature extractor chosen in the
# "Potential function" section.
# NOTE(review): num_sent=-1 presumably means "use every sentence" -- confirm.
trainer = Trainer(
    Corpus(corpus_path, num_sent=-1),
    sentence_to_xy=sentence_to_xy,
    max_iter=max_iter,
    l1_cost=0,
    verbose=verbose,
)

# Persist the trained parameters as JSON for the tagging section.
# NOTE(review): _save_as_json is underscore-prefixed (private by convention);
# switch to a public save method if the package exposes one.
model_path = '../models/trigram_crf_sejong_simple.json'
trainer._save_as_json(model_path)

## Tagging

### Loading trained model

In [4]:
from crf_postagger import TrigramParameter
from crf_postagger import TrigramTagger

# Same JSON file the training cell writes; it is repeated here so the tagging
# section can be run without re-training.
model_path = '../models/trigram_crf_sejong_simple.json'

# Wrap the stored parameters in a tagger used by all cells below.
trained_crf = TrigramTagger(TrigramParameter(model_path))

### Evaluating

In [5]:
# Hand-written analyses of the same phrase, each a sequence of
# (morpheme, tag) pairs, to compare the scores the model assigns to them.
candidates = [
    [('뭐', 'Noun'), ('타', 'Verb'), ('고', 'Eomi'), ('가', 'Noun')],
    [('뭐', 'Noun'), ('타', 'Verb'), ('고', 'Noun'), ('가', 'Noun')],
    [('뭐', 'Noun'), ('타', 'Verb'), ('고', 'Eomi'), ('가', 'Verb'), ('ㅏ', 'Eomi')],
    [('뭐', 'Noun'), ('타', 'Verb'), ('고', 'Noun'), ('가', 'Verb'), ('ㅏ', 'Eomi')],
]

# Print each candidate (preceded by a blank line) followed by its score.
for candidate in candidates:
    print('\n{}'.format(candidate))
    candidate_score = trained_crf.score(candidate, debug=False)
    print(candidate_score)


[('뭐', 'Noun'), ('타', 'Verb'), ('고', 'Eomi'), ('가', 'Noun')]
27.858939

[('뭐', 'Noun'), ('타', 'Verb'), ('고', 'Noun'), ('가', 'Noun')]
-3.6017170000000003

[('뭐', 'Noun'), ('타', 'Verb'), ('고', 'Eomi'), ('가', 'Verb'), ('ㅏ', 'Eomi')]
54.225075

[('뭐', 'Noun'), ('타', 'Verb'), ('고', 'Noun'), ('가', 'Verb'), ('ㅏ', 'Eomi')]
23.032051000000003


### Decoding

In [6]:
# Unsegmented input string; the user-dictionary cell below tags it again.
sent = '주간아이돌에서아이오아이가소개했던오늘의날씨'

# Request three analyses; each result is unpacked as (path, score).
paths = trained_crf.tag(sent, k=3)

for morphemes, path_score in paths:
    print('\nscore = {}'.format(path_score))
    # One tagged unit per line.
    for morpheme in morphemes:
        print(morpheme)


score = 140.8013020000001
('주간', 'Noun', 0, 2)
('아이돌', 'Noun', 2, 5)
('에', 'Josa', 5, 6)
('서', 'Verb', 6, 7)
('아', 'Eomi', 7, 8)
('이오', 'Noun', 8, 10)
('아이', 'Noun', 10, 12)
('가', 'Josa', 12, 13)
('소개', 'Noun', 13, 15)
('하 + 았', 'Verb + Eomi', 15, 16)
('던', 'Eomi', 16, 17)
('오늘', 'Noun', 17, 19)
('의', 'Josa', 19, 20)
('날씨', 'Noun', 20, 22)

score = 140.8013020000001
('주간', 'Noun', 0, 2)
('아이돌', 'Noun', 2, 5)
('에', 'Josa', 5, 6)
('서', 'Verb', 6, 7)
('아', 'Eomi', 7, 8)
('이오', 'Noun', 8, 10)
('아이', 'Noun', 10, 12)
('가', 'Josa', 12, 13)
('소개', 'Noun', 13, 15)
('하 + 았', 'Adjective + Eomi', 15, 16)
('던', 'Eomi', 16, 17)
('오늘', 'Noun', 17, 19)
('의', 'Josa', 19, 20)
('날씨', 'Noun', 20, 22)

score = 137.47260700000007
('주간', 'Noun', 0, 2)
('아이돌', 'Noun', 2, 5)
('에', 'Josa', 5, 6)
('서', 'Verb', 6, 7)
('아', 'Eomi', 7, 8)
('이오', 'Noun', 8, 10)
('아이', 'Noun', 10, 12)
('가', 'Josa', 12, 13)
('소개', 'Noun', 13, 15)
('했던', 'Unk', 15, 17)
('오늘', 'Noun', 17, 19)
('의', 'Josa', 19, 20)
('날씨', 'Noun', 20, 22)

### User dictionary with preference

In [7]:
# Register two extra nouns, each with a numeric preference value.
# NOTE(review): this presumably mutates `trained_crf` in place, so re-running
# earlier decoding cells afterwards may give different results -- confirm.
trained_crf.add_user_dictionary(
    'Noun',
    {'아이오아이':20, '주간아이돌':30},
)

# Tag the same sentence again, this time asking for a single analysis.
paths = trained_crf.tag(sent, k=1)

for morphemes, path_score in paths:
    print('\nscore = {}'.format(path_score))
    # One tagged unit per line.
    for morpheme in morphemes:
        print(morpheme)


score = 161.9815650000001
('주간아이돌', 'Noun', 0, 5)
('에서', 'Josa', 5, 7)
('아이오아이', 'Noun', 7, 12)
('가', 'Josa', 12, 13)
('소개', 'Noun', 13, 15)
('하 + 았', 'Verb + Eomi', 15, 16)
('던', 'Eomi', 16, 17)
('오늘', 'Noun', 17, 19)
('의', 'Josa', 19, 20)
('날씨', 'Noun', 20, 22)
