In [1]:
from konlpy.tag import Twitter, Kkma, Hannanum
from config import sentence_kkma_fname, sentence_hannanum_fname, sentence_twitter_fname
from config import sentence_fname, sentence_tagged_fname
import sys

In [25]:
if False:
    output_fnames = [sentence_twitter_fname, sentence_kkma_fname, sentence_hannanum_fname]
    taggers = [Twitter(), Kkma(), Hannanum()]

    for output_fname, tagger in zip(output_fnames, taggers):
        with open(sentence_fname, encoding='utf-8') as fi:
            with open(output_fname, 'w', encoding='utf-8') as fo:
                for i, doc in enumerate(fi):
                    doc = '  '.join(['+'.join(['%s/%s' % (t[0], t[1]) for t in tagger.pos(eojeol)]) for eojeol in doc.split()])
                    fo.write('%s\n' % doc)
                    if (i+1) % 1000 == 0:
                        sys.stdout.write('\r%s ... %d' % (output_fname, (i+1)))
        sys.stdout.write('\r%s was done\n' % output_fname)/

../data/sentences_twitter.txt was done
../data/sentences_kkma.txt was done
../data/sentences_hannanum.txt was done


In [2]:
def load(fname, tagged=True):
    split = lambda doc: doc.replace('//','').strip().split()
    with open(fname, encoding='utf-8') as f:
        docs = [[eojeol.split('+') if tagged else eojeol for eojeol in split(doc)] for doc in f]
    return docs

answers = load(sentence_tagged_fname)
sentences = load(sentence_fname, False)

len(answers), len(sentences)

(1054566, 1054566)

In [43]:
from collections import Counter

answer_noun_counter = Counter((word.split('/')[0] for doc in answers for eojeol in doc for word in eojeol if '/N' in word and len(word.split('/')[0]) > 1 ) )
answer_noun_counter = {word:freq for word, freq in answer_noun_counter.items() if freq >= 10}

sorted(answer_noun_counter.items(), key=lambda x:x[1], reverse=True)[:20]

[('사람', 54901),
 ('우리', 42698),
 ('생각', 31221),
 ('때문', 28192),
 ('그것', 25966),
 ('사회', 21304),
 ('문제', 18652),
 ('경우', 17210),
 ('하나', 16427),
 ('그녀', 15750),
 ('자신', 15676),
 ('시간', 14118),
 ('자기', 13577),
 ('아이', 12964),
 ('시작', 12703),
 ('소리', 12399),
 ('세계', 12343),
 ('정도', 12121),
 ('인간', 11993),
 ('한국', 11751)]

In [39]:
with open(sentence_tagged_fname) as f:
    for _ in range(3):
        print(next(f).strip())
        
answers[2]

…/UNC 삼/NR 단/NNB 사/NR 단/NNB+이/VCP+면/EC 사/NR 단/NNB+,/SP
어/IC+,/SP
거기/NP 있/VA+을/ETM 때/NNG+,/SP


[['거기/NP'], ['있/VA', '을/ETM'], ['때/NNG', ',/SP']]

In [40]:
with open(sentence_fname) as f:
    for _ in range(3):
        print(next(f).strip())

sentences[2]

… 삼 단 사 단이면 사 단,
어::,
거기 있을 때::,


['거기', '있을', '때::,']

In [27]:
def to_lr(e, w, t):
    from hangle import decompose, compose, jaum_begin, jaum_end
    tag = t[0][0]
    i = 0
    n = len(t)
    for i_, ti in enumerate(t):
        if t[0][0] == 'N' and ti[0] == 'V':
            break
        if t[0][0] == 'V' and (ti == 'ETN' and len(w[i_]) == 1 and jaum_begin <= ord(w[i_][0]) <= jaum_end):
            tag = 'N'
            break
        if not (ti[0] == 'N' or ti == 'XSN' or ti[:2] == 'VV' or ti[:2] == 'VA' or ti == 'XR'):
            break
        i = i_
    lw = e[:len(''.join(w[:i+1]))]
    r = e[len(lw):]
    
    # 아빤 = 아빠/N + ㄴ/J
    # 갈꺼야 = 가/V + ㄹ/E + 꺼야/E
    if (t[i][0] == 'N' or t[i][0] == 'V') and (i+1 < n) and (jaum_begin <= ord(w[i+1][0]) <= jaum_end):
        last_l = decompose(lw[-1])
        l0 = lw[:-1] + compose(last_l[0], last_l[1], ' ')
        return lw, r, tag == 'N'

    # 가? = 가/V + ㅏ/E + ?/S
    # 먹었어 = 먹/V + 었어/E
    return lw, r, tag.replace('X','N') == 'N'

def eojeol_to_wt(eojeol):
    w = [e.split('/')[0] for e in eojeol]
    t = [e.split('/')[1] for e in eojeol]
    return w,t

In [28]:
to_lr('단이면', ['단', '이', '면'], ['NNB','VCP','EC'])

('단', '이면', True)

In [29]:
to_lr('단이면', *eojeol_to_wt(['단/NNB', '이/VCP', '면/EC']))

('단', '이면', True)

In [44]:
def accuracy(sentences, answers, taggeds):
    import sys
    print('#sentences= %d, #answers= %d, #taggeds= %d' % (len(sentences), len(answers), len(taggeds)))
    if not (len(sentences) == len(answers) and len(answers) == len(taggeds)):
        raise ValueError('not equal length')
    
    n_errors = 0
    n_apos_is_tpos_t = 0
    n_apos_is_tpos_f = 0
    n_apos_is_tneg = 0
    n_aneg_is_tpos = 0
    n_aneg_is_tneg = 0
    stop = False
    
    for k, (s, a, t) in enumerate(zip(sentences, answers, taggeds)):
        if stop: break
        if k % 100 == 0:
            sys.stdout.write('\r... %d in %d' % (k+1, len(answers)))
            
        if not (len(s) == len(a) and len(a) == len(t)):
            n_errors += 1
            continue
        for e, ai, ti in zip(s, a, t):
            try:
                la, ra, a_is_Noun = to_lr(e, *eojeol_to_wt(ai))
                lt, rt, t_is_Noun = to_lr(e, *eojeol_to_wt(ti))
                if not (la in answer_noun_counter):
                    continue
            except Exception as e:
#                 stop = True
                n_errors += 1
                continue
            
            if a_is_Noun and t_is_Noun:
                if la == lt: n_apos_is_tpos_t += 1
                else: n_apos_is_tpos_f += 1
            elif a_is_Noun and not t_is_Noun: n_apos_is_tneg += 1
            elif not a_is_Noun and t_is_Noun: n_aneg_is_tpos += 1
            else: n_aneg_is_tneg += 1
    
    return n_errors, n_apos_is_tpos_t, n_apos_is_tpos_f, n_apos_is_tneg, n_aneg_is_tpos, n_aneg_is_tneg


In [45]:
performances = {}
print('twitter')
performances['twitter'] = accuracy(sentences, answers, load(sentence_twitter_fname))

print('\n\nkkma')
performances['kkma'] = accuracy(sentences, answers, load(sentence_kkma_fname))

print('\n\nhannanum')
performances['hannanum'] = accuracy(sentences, answers, load(sentence_hannanum_fname))

twitter
#sentences= 1054566, #answers= 1054566, #taggeds= 1054566
... 1054501 in 1054566

kkma
#sentences= 1054566, #answers= 1054566, #taggeds= 1054566
... 1054501 in 1054566

hannanum
#sentences= 1054566, #answers= 1054566, #taggeds= 1054566
... 1054501 in 1054566

In [46]:
performances

{'hannanum': (11387, 3683456, 347378, 76220, 84957, 221331),
 'kkma': (13046, 3956310, 82752, 67536, 45493, 260714),
 'twitter': (12021, 3587705, 136442, 382898, 235123, 71090)}

In [51]:
for tagger_name, performance in performances.items():
    print('\n\n# tagger= %s' % tagger_name)
    
    n_errors, n_apos_is_tpos_t, n_apos_is_tpos_f, n_apos_is_tneg, n_aneg_is_tpos, n_aneg_is_tneg = performance
    
    print('a=True, p=True, diff str: ', n_apos_is_tpos_f / (n_apos_is_tpos_t + n_apos_is_tpos_f + n_aneg_is_tneg))
    print('a=True, p=True, same str: ', n_apos_is_tpos_t / (n_apos_is_tpos_t + n_apos_is_tpos_f + n_aneg_is_tneg))
    print('accuracy wo str: ', (n_apos_is_tpos_f + n_apos_is_tpos_t) / (n_apos_is_tpos_t + n_apos_is_tpos_f + n_aneg_is_tneg))
    print('accuracy w str: ', (n_apos_is_tpos_t) / (n_apos_is_tpos_t + n_apos_is_tpos_f + n_aneg_is_tneg))
    print('recall wo str: ', (n_apos_is_tpos_f + n_apos_is_tpos_t) / (n_apos_is_tpos_t + n_apos_is_tpos_f + n_apos_is_tneg))
    print('recall w str: ', n_apos_is_tpos_t / (n_apos_is_tpos_t + n_apos_is_tpos_f + n_apos_is_tneg))
    print('precision wo str: ', (n_apos_is_tpos_f + n_apos_is_tpos_t) / (n_apos_is_tpos_t + n_apos_is_tpos_f + n_aneg_is_tpos))
    print('precision w str: ', (n_apos_is_tpos_t) / (n_apos_is_tpos_t + n_apos_is_tpos_f + n_aneg_is_tpos))



# tagger= kkma
a=True, p=True, diff str:  0.019245653727077875
a=True, p=True, same str:  0.9201200248571089
accuracy wo str:  0.9393656785841867
accuracy w str:  0.9201200248571089
recall wo str:  0.9835542704691328
recall w str:  0.9634032841782907
precision wo str:  0.9888621893939487
precision w str:  0.9686024548573835


# tagger= twitter
a=True, p=True, diff str:  0.03595085102722175
a=True, p=True, same str:  0.9453177759386304
accuracy wo str:  0.9812686269658522
accuracy w str:  0.9453177759386304
recall wo str:  0.9067704395739515
recall w str:  0.8735489871671726
precision wo str:  0.9406145577341277
precision w str:  0.9061531544956519


# tagger= hannanum
a=True, p=True, diff str:  0.08169438391972089
a=True, p=True, same str:  0.8662542493059417
accuracy wo str:  0.9479486332256627
accuracy w str:  0.8662542493059417
recall wo str:  0.9814416854514209
recall w str:  0.8968608642593937
precision wo str:  0.9793582813121463
precision w str:  0.894957008264025


In [48]:
import konlpy
konlpy.__version__

'0.4.4'