In [20]:
# Dataset tag; passed as the first argument to load_data() below.
head = 'l30_r15'
# Data directory; passed as the second argument to load_data() below.
directory = '../data/'
# Path of the pickled classifier that is opened and unpickled below.
model_fname ='../models/Logistic + L2 (C=1.00) norm l30_r15.pkl'


import pickle    
from py.utils import load_data, Sentences
from py.noun import TrainedNounExtractor
from config import sentence_fname, sentence_tagged_fname

# load_data() returns four values. Judging by the names and the shapes printed
# by this cell they are presumably the feature matrix, the labels, the L-words
# and the feature vocabulary -- TODO confirm against py.utils.load_data.
X, y, x_words, vocabs = load_data(head, directory)
# NOTE(security): pickle.load can execute arbitrary code contained in the file;
# only unpickle model files that come from a trusted source.
with open(model_fname, 'rb') as f:
    classifier = pickle.load(f)
    
def load(fname, tagged=True):
    """Read a UTF-8 corpus file into a list of documents, one per line.

    Every occurrence of '//' is removed from a line before it is stripped and
    split on whitespace into eojeols.

    Args:
        fname: Path of the text file to read.
        tagged: When truthy, each eojeol is further split on '+' into a list
            of tokens; otherwise each eojeol is kept as a plain string.

    Returns:
        A list with one entry per line of the file. Each entry is a list of
        eojeols (strings, or lists of strings when `tagged` is truthy).
    """
    docs = []
    with open(fname, encoding='utf-8') as f:
        for line in f:
            eojeols = line.replace('//', '').strip().split()
            if tagged:
                eojeols = [eojeol.split('+') for eojeol in eojeols]
            docs.append(eojeols)
    return docs

# Tagged corpus: one list per line, each eojeol split on '+' into its tokens.
answers = load(sentence_tagged_fname)
# Raw corpus: one list of eojeol strings per line (no '+' splitting).
sentences = load(sentence_fname, False)

# Show both lengths; proposed_accuracy() raises ValueError when they differ.
len(answers), len(sentences)

x shape = (15106, 2770)
y shape = (15106,)
# features = 2770
# L words = 15106


(1054566, 1054566)

In [24]:
from collections import Counter

# Surface form (text before the first '/') of every token in the tagged corpus
# whose string contains '/N'.
noun_surfaces = (
    token.split('/')[0]
    for doc in answers
    for eojeol in doc
    for token in eojeol
    if '/N' in token
)
# Count only surfaces longer than one character ...
answer_noun_counter = Counter(surface for surface in noun_surfaces if len(surface) > 1)
# ... and keep only those seen at least 10 times (result is a plain dict).
answer_noun_counter = {
    surface: count
    for surface, count in answer_noun_counter.items()
    if count >= 10
}

# Display the 20 most frequent entries.
sorted(answer_noun_counter.items(), key=lambda item: item[1], reverse=True)[:20]

[('사람', 54901),
 ('우리', 42698),
 ('생각', 31221),
 ('때문', 28192),
 ('그것', 25966),
 ('사회', 21304),
 ('문제', 18652),
 ('경우', 17210),
 ('하나', 16427),
 ('그녀', 15750),
 ('자신', 15676),
 ('시간', 14118),
 ('자기', 13577),
 ('아이', 12964),
 ('시작', 12703),
 ('소리', 12399),
 ('세계', 12343),
 ('정도', 12121),
 ('인간', 11993),
 ('한국', 11751)]

In [11]:
# Map vocabs[j] to the j-th weight in the first row of the classifier's coef_.
feature_weights = classifier.coef_[0]
coefficient = {vocabs[j]: feature_weights[j] for j in range(len(feature_weights))}
noun_extractor = TrainedNounExtractor(coefficient)
scores = noun_extractor.extract(
    Sentences('../data/sentences_onlyhangle.txt'),
    min_count=10,
    min_noun_score=0.1,
)
# NOTE(review): the exact-type comparison is deliberate to preserve; the
# non-compound values expose `.p_eojeol` and may be tuple subclasses, in which
# case isinstance() would misclassify them -- confirm against py.noun.
# Non-tuple entries whose p_eojeol is below 0.9.
scores_ = {
    noun: score
    for noun, score in scores.items()
    if type(score) != tuple and score.p_eojeol < 0.9
}
# Entries whose value is a plain tuple.
compounds_ = {
    noun: score
    for noun, score in scores.items()
    if type(score) == tuple
}

lrgraph has been built. (#L= 173620, #R= 71297, #E=2022472), mem= 4.394 Gb
compute branching entropy ... done
computing noun score has been done.
n_candidates= 62448
n_nouns after substring processing= 52152
n_nouns after compound processing= 46248


In [16]:
def to_lr(e, w, t):
    """Split a surface eojeol into a left and right part using its tagged morphemes.

    Args:
        e: Surface string of the eojeol.
        w: Morpheme surface strings of the eojeol (parallel to `t`).
        t: Tag strings of the eojeol (parallel to `w`); must be non-empty,
            since `t[0][0]` is read unconditionally.

    Returns:
        A tuple `(lw, r, is_noun)`: `lw` is the prefix of `e` whose length
        equals the combined length of the morphemes kept in the left part,
        `r` is the remainder of `e`, and `is_noun` is a bool derived from the
        first character of the first tag (see the two return statements).
    """
    from hangle import decompose, compose, jaum_begin, jaum_end
    # Label of the whole eojeol: first character of the first tag.
    tag = t[0][0]
    # Index of the last morpheme that belongs to the left part.
    i = 0
    n = len(t)
    for i_, ti in enumerate(t):
        # An eojeol that starts with an N* tag ends its left part at the first V* tag.
        if t[0][0] == 'N' and ti[0] == 'V':
            break
        # An eojeol that starts with a V* tag is relabelled 'N' when an ETN
        # morpheme is a single character inside [jaum_begin, jaum_end]
        # (presumably a bare consonant jamo -- confirm against `hangle`).
        if t[0][0] == 'V' and (ti == 'ETN' and len(w[i_]) == 1 and jaum_begin <= ord(w[i_][0]) <= jaum_end):
            tag = 'N'
            break
        # Only N*, XSN, VV*, VA* and XR tags may extend the left part.
        if not (ti[0] == 'N' or ti == 'XSN' or ti[:2] == 'VV' or ti[:2] == 'VA' or ti == 'XR'):
            break
        i = i_
    # The left part is cut from the surface string by length, not by content.
    lw = e[:len(''.join(w[:i+1]))]
    r = e[len(lw):]
    
    # Case: the morpheme right after the left part starts with a character in
    # [jaum_begin, jaum_end]. Examples:
    # 아빤 = 아빠/N + ㄴ/J
    # 갈꺼야 = 가/V + ㄹ/E + 꺼야/E
    if (t[i][0] == 'N' or t[i][0] == 'V') and (i+1 < n) and (jaum_begin <= ord(w[i+1][0]) <= jaum_end):
        # NOTE(review): `l0` is computed but never used or returned. The
        # decompose/compose calls still run, so any exception they raise
        # (or an IndexError on an empty `lw`) propagates to the caller.
        last_l = decompose(lw[-1])
        l0 = lw[:-1] + compose(last_l[0], last_l[1], ' ')
        # NOTE(review): unlike the final return, an 'X' label is not treated
        # as a noun here -- confirm this asymmetry is intended.
        return lw, r, tag == 'N'

    # Default case. Examples:
    # 가? = 가/V + ㅏ/E + ?/S
    # 먹었어 = 먹/V + 었어/E
    # An 'X' label (first tag starts with 'X') counts as a noun here.
    return lw, r, tag.replace('X','N') == 'N'

def eojeol_to_wt(eojeol):
    """Separate a tagged eojeol into parallel surface and tag lists.

    Args:
        eojeol: Iterable of 'surface/tag' strings.

    Returns:
        A tuple `(surfaces, tags)` of two lists: the text before the first
        '/' of each item and the text between the first and second '/'.

    Raises:
        IndexError: If an item contains no '/'.
    """
    surfaces, tags = [], []
    for token in eojeol:
        fields = token.split('/')
        surfaces.append(fields[0])
        tags.append(fields[1])
    return surfaces, tags

def left_tokenize(eojeol, nouns, reverse=True):
    """Split an eojeol at the first prefix that is a known noun.

    Args:
        eojeol: Surface string to split.
        nouns: Container supporting `in`, holding the known nouns.
        reverse: When truthy, prefixes are tried from longest to shortest;
            otherwise from shortest to longest.

    Returns:
        `(prefix, rest, True)` for the first prefix found in `nouns`, or
        `(eojeol, '', False)` when no prefix matches.
    """
    length = len(eojeol)
    cut_points = range(length, 0, -1) if reverse else range(1, length + 1)
    for cut in cut_points:
        prefix = eojeol[:cut]
        if prefix in nouns:
            return prefix, eojeol[cut:], True
    return eojeol, '', False

In [25]:
def proposed_accuracy(sentences, answers, scores, reverse=True):
    """Compare predicted noun splits against the tagged answers, eojeol by eojeol.

    For every (surface eojeol, tagged eojeol) pair, the answer split comes
    from `to_lr` and the predicted split from `left_tokenize`. Pairs whose
    answer left part is not a key of the module-level `answer_noun_counter`
    are skipped. Any exception raised while processing a pair is counted in
    `n_errors` and the pair is skipped.

    Args:
        sentences: Sequence of sentences, each a sequence of surface eojeols.
        answers: Sequence of tagged sentences aligned with `sentences`.
        scores: Container of known nouns, passed on to `left_tokenize`.
        reverse: Passed on to `left_tokenize`.

    Returns:
        A 6-tuple of counts, in this order:
        errors,
        answer noun / predicted noun with equal left strings,
        answer noun / predicted noun with different left strings,
        answer noun / predicted non-noun,
        answer non-noun / predicted noun,
        answer non-noun / predicted non-noun.

    Raises:
        ValueError: If `sentences` and `answers` differ in length.
    """
    import sys

    n_sentences, n_answers = len(sentences), len(answers)
    print('#sentences= %d, #answers= %d' % (n_sentences, n_answers))
    if n_sentences != n_answers:
        raise ValueError('not equal length')

    n_errors = 0
    both_noun_same = 0
    both_noun_diff = 0
    missed_noun = 0
    spurious_noun = 0
    both_not_noun = 0

    for sent_idx, (sentence, tagged_sentence) in enumerate(zip(sentences, answers)):
        # Progress indicator, rewritten in place every 100 sentences.
        if sent_idx % 100 == 0:
            sys.stdout.write('\r... %d in %d' % (sent_idx + 1, n_answers))

        for surface, tagged_eojeol in zip(sentence, tagged_sentence):
            try:
                answer_l, _, answer_is_noun = to_lr(surface, *eojeol_to_wt(tagged_eojeol))
                predicted_l, _, predicted_is_noun = left_tokenize(surface, scores, reverse)
                # Only eojeols whose answer left part is a counted noun are scored.
                if answer_l not in answer_noun_counter:
                    continue
            except Exception:
                n_errors += 1
                continue

            if answer_is_noun:
                if not predicted_is_noun:
                    missed_noun += 1
                elif answer_l == predicted_l:
                    both_noun_same += 1
                else:
                    both_noun_diff += 1
            elif predicted_is_noun:
                spurious_noun += 1
            else:
                both_not_noun += 1
    print('\ndone')

    return n_errors, both_noun_same, both_noun_diff, missed_noun, spurious_noun, both_not_noun

In [29]:
# score, reverse=True
# Candidate nouns: every key of `scores` together with every key of `compounds_`.
nouns = set(scores.keys()).union(compounds_.keys())
(n_errors, n_apos_is_ppos_t, n_apos_is_ppos_f,
 n_apos_is_pneg, n_aneg_is_ppos, n_aneg_is_pneg) = proposed_accuracy(sentences, answers, nouns, True)

# Naming: a = answer label, p = predicted label; _t / _f = left strings equal / different.
n_both_pos = n_apos_is_ppos_t + n_apos_is_ppos_f
# NOTE(review): the "accuracy" ratios below divide by the agreeing cases only
# (both mismatch counts are left out of the denominator) and do not count the
# both-negative cases in the numerator -- confirm this definition is intended.
n_agree = n_both_pos + n_aneg_is_pneg
n_answer_pos = n_both_pos + n_apos_is_pneg
n_predicted_pos = n_both_pos + n_aneg_is_ppos

for label, numerator, denominator in (
    ('a=True, p=True, diff str: ', n_apos_is_ppos_f, n_agree),
    ('a=True, p=True, same str: ', n_apos_is_ppos_t, n_agree),
    ('accuracy wo str: ', n_both_pos, n_agree),
    ('accuracy w str: ', n_apos_is_ppos_t, n_agree),
    ('recall wo str: ', n_both_pos, n_answer_pos),
    ('recall w str: ', n_apos_is_ppos_t, n_answer_pos),
    ('precision wo str: ', n_both_pos, n_predicted_pos),
    ('precision w str: ', n_apos_is_ppos_t, n_predicted_pos),
):
    print(label, numerator / denominator)

#sentences= 1054566, #answers= 1054566
... 1054501 in 1054566
done
a=True, p=True, diff str:  0.03309291627348113
a=True, p=True, same str:  0.9597533212243158
accuracy wo str:  0.9928462374977969
accuracy w str:  0.9597533212243158
recall wo str:  0.9913643301903533
recall w str:  0.9583208079041766
precision wo str:  0.9361807618708302
precision w str:  0.9049765830167971


In [30]:
# score_, reverse=True
# Candidate nouns: every key of `scores_` together with every key of `compounds_`.
nouns = set(scores_.keys()).union(compounds_.keys())
(n_errors, n_apos_is_ppos_t, n_apos_is_ppos_f,
 n_apos_is_pneg, n_aneg_is_ppos, n_aneg_is_pneg) = proposed_accuracy(sentences, answers, nouns, True)

# Naming: a = answer label, p = predicted label; _t / _f = left strings equal / different.
n_both_pos = n_apos_is_ppos_t + n_apos_is_ppos_f
# NOTE(review): the "accuracy" ratios below divide by the agreeing cases only
# (both mismatch counts are left out of the denominator) and do not count the
# both-negative cases in the numerator -- confirm this definition is intended.
n_agree = n_both_pos + n_aneg_is_pneg
n_answer_pos = n_both_pos + n_apos_is_pneg
n_predicted_pos = n_both_pos + n_aneg_is_ppos

for label, numerator, denominator in (
    ('a=True, p=True, diff str: ', n_apos_is_ppos_f, n_agree),
    ('a=True, p=True, same str: ', n_apos_is_ppos_t, n_agree),
    ('accuracy wo str: ', n_both_pos, n_agree),
    ('accuracy w str: ', n_apos_is_ppos_t, n_agree),
    ('recall wo str: ', n_both_pos, n_answer_pos),
    ('recall w str: ', n_apos_is_ppos_t, n_answer_pos),
    ('precision wo str: ', n_both_pos, n_predicted_pos),
    ('precision w str: ', n_apos_is_ppos_t, n_predicted_pos),
):
    print(label, numerator / denominator)

#sentences= 1054566, #answers= 1054566
... 1054501 in 1054566
done
a=True, p=True, diff str:  0.035012448298133214
a=True, p=True, same str:  0.9533350407321534
accuracy wo str:  0.9883474890302866
accuracy w str:  0.9533350407321534
recall wo str:  0.990058683511878
recall w str:  0.9549856157363041
precision wo str:  0.9401283068862559
precision w str:  0.9068240347513996


In [31]:
# score_, reverse=False
# Candidate nouns: every key of `scores_` together with every key of `compounds_`.
nouns = set(scores_.keys()).union(compounds_.keys())
(n_errors, n_apos_is_ppos_t, n_apos_is_ppos_f,
 n_apos_is_pneg, n_aneg_is_ppos, n_aneg_is_pneg) = proposed_accuracy(sentences, answers, nouns, False)

# Naming: a = answer label, p = predicted label; _t / _f = left strings equal / different.
n_both_pos = n_apos_is_ppos_t + n_apos_is_ppos_f
# NOTE(review): the "accuracy" ratios below divide by the agreeing cases only
# (both mismatch counts are left out of the denominator) and do not count the
# both-negative cases in the numerator -- confirm this definition is intended.
n_agree = n_both_pos + n_aneg_is_pneg
n_answer_pos = n_both_pos + n_apos_is_pneg
n_predicted_pos = n_both_pos + n_aneg_is_ppos

for label, numerator, denominator in (
    ('a=True, p=True, diff str: ', n_apos_is_ppos_f, n_agree),
    ('a=True, p=True, same str: ', n_apos_is_ppos_t, n_agree),
    ('accuracy wo str: ', n_both_pos, n_agree),
    ('accuracy w str: ', n_apos_is_ppos_t, n_agree),
    ('recall wo str: ', n_both_pos, n_answer_pos),
    ('recall w str: ', n_apos_is_ppos_t, n_answer_pos),
    ('precision wo str: ', n_both_pos, n_predicted_pos),
    ('precision w str: ', n_apos_is_ppos_t, n_predicted_pos),
):
    print(label, numerator / denominator)

#sentences= 1054566, #answers= 1054566
... 1054501 in 1054566
done
a=True, p=True, diff str:  0.6742100810074885
a=True, p=True, same str:  0.31413740802279816
accuracy wo str:  0.9883474890302866
accuracy w str:  0.31413740802279816
recall wo str:  0.990058683511878
recall w str:  0.3146812958811034
precision wo str:  0.9401283068862559
precision w str:  0.2988113723280377
