In [1]:
head = 'l30_r15'
directory = '../data/'
model_fname ='../models/Logistic + L2 (C=1.00) norm l30_r15.pkl'


import pickle    
from py.utils import load_data

x, y, x_words, vocabs = load_data(head, directory)
with open(model_fname, 'rb') as f:
    classifier = pickle.load(f)

x shape = (15715, 4551)
y shape = (15715,)
# features = 4551
# L words = 15715


In [2]:
coefficient = {vocabs[j]:coef for j, coef in enumerate(classifier.coef_[0])}

In [8]:
list(coefficient.items())[:5]

[('어오고', -0.0035026965225787533),
 ('어놓는', -0.0017447214860182797),
 ('느냐고', -0.027019952035748179),
 ('라느니', 0.0013030948275454116),
 ('으셔서', -0.3228328145095205)]

In [5]:
import sys
from soynlp.utils import get_process_memory
import numpy as np

In [109]:
class TrainedNounExtractor:
    def __init__(self, coefficient, max_length=8):
        self._coef = coefficient
        self.lmax = max_length
        
    def extract(self, sents, min_count=10, min_noun_score=0.1):
        self.lrgraph, self.lset, self.rset = self._build_lrgraph(sents, min_count)
        self.lentropy, self.rentropy = self._branching_entropy(lset, rset)
        scores = self._compute_noun_score(self.lrgraph)
        return scores, self.lrgraph, self.lset, self.rset
        scores = self._postprocessing(lrgraph, scores)

    def _build_lrgraph(self, sents, min_count, pruning_min_count=2):
        lset = {}
        rset = {}
        for n_sent, sent in enumerate(sents):
            for eojeol in sent.split():
                for e in range(1, min(len(eojeol), self.lmax)+1):
                    l = eojeol[:e]
                    r = eojeol[e:]
                    lset[l] = lset.get(l, 0) + 1
                    rset[r] = rset.get(r, 0) + 1
            if n_sent % 1000 == 999:
                args = (n_sent+1, len(lset), len(rset), get_process_memory())
                sys.stdout.write('\rscaning vocabulary ... %d sents #(l= %d, r= %d), mem= %.3f Gb' % args)
            if n_sent % 500000 == 499999:
                lset = {l:f for l,f in lset.items() if f >= pruning_min_count}
                rset = {l:f for l,f in rset.items() if f >= pruning_min_count}
        lset = {l:f for l,f in lset.items() if f >= min_count}
        rset = {l:f for l,f in rset.items() if f >= min_count}
        
        n_sents = n_sent
        
        lrgraph = {}
        for n_sent, sent in enumerate(sents):
            for eojeol in sent.split():
                for e in range(1, min(len(eojeol), self.lmax)+1):
                    l = eojeol[:e]
                    r = eojeol[e:]
                    if not (l in lset) or not (r in rset):
                        continue
                    rdict = lrgraph.get(l, {})
                    rdict[r] = rdict.get(r, 0) + 1
                    lrgraph[l] = rdict            
            if n_sent % 1000 == 999:
                args = (100*(n_sent+1)/n_sents, '%', n_sent+1, n_sents, get_process_memory())
                sys.stdout.write('\rbuilding lrgraph ... (%.3f %s, %d in %d), mem= %.3f Gb' % args)
        args = (len(lset), len(rset), sum((len(rdict) for rdict in lrgraph.values())), get_process_memory())
        print('\rlrgraph has been built. (#L= %d, #R= %d, #E=%d), mem= %.3f Gb' % args)
        return lrgraph, lset, rset
    
    def _branching_entropy(self, lset, rset):
        def entropy(d):
            sum_ = sum(d.values())
            if sum_ == 0: return 0
            return -1 * sum((v/sum_) * np.log(v/sum_) for v in d.values())
        def to_branching_map(d, get_root=lambda x:x[:-1]):
            tree = {}
            for w,f in d.items():
                if len(w) <= 1: continue
                root = get_root(w)
                branch = tree.get(root, {})
                branch[w] = branch.get(w, 0) + f
                tree[root] = branch
            return tree
        print('compute branching entropy ...', end='')
        lentropy = {l:entropy(branch) for l, branch in to_branching_map(lset).items()}
        rentropy = {r:entropy(branch) for r, branch in to_branching_map(rset, get_root=lambda x:x[1:]).items()}    
        print(' done')
        return lentropy, rentropy
        
    def _compute_noun_score(self, lrgraph):
        from collections import namedtuple
        Score = namedtuple('Score', 'score frequency branching_entropy feature_fraction eojeol_fraction')
        scores = {}
        n = len(lrgraph)
        for i, (l, rdict) in enumerate(lrgraph.items()):
            rdict_ = {r:f for r,f in rdict.items() if r in self._coef}
            rsum = sum((f for r,f in rdict.items() if r != ''))
            frequency = rsum + rdict.get('', 0)
            feature_fraction = sum(rdict_.values()) / rsum if rsum > 0 else 0
            eojeol_fraction = 1 - rsum / frequency
            if not rdict_:
                score = 0
            else:
                score = sum(f*self._coef[r] for r, f in rdict_.items()) / sum(rdict_.values())
            scores[l] = Score(score, frequency, self.lentropy.get(l, 0), feature_fraction, eojeol_fraction)
            if (i+1) % 1000 == 0:
                args = (100*(i+1)/n, '%', i+1, n)
                sys.stdout.write('\rcompute noun score ... (%.3f %s, %d in %d)' % args)
        print('\rcomputing noun score has been done.')
        return sorted(scores.items(), key=lambda x:x[1].score, reverse=True)
        
    def _postprocessing(self, lrgraph, scores):
        print('hi')

In [110]:
from config import sentence_fname
class Sentences:
    def __init__(self, fname):
        self.fname = fname
    def __iter__(self):
        with open(self.fname, encoding='utf-8') as f:
            for doc in f:
                yield doc.strip()

sentences = Sentences(sentence_fname)
noun_extractor = TrainedNounExtractor(coefficient)
scores, lrgraph, lset, rset = noun_extractor.extract(sentences)

lentropy, rentropy = noun_extractor.lentropy, noun_extractor.rentropy

lrgraph has been built. (#L= 201559, #R= 86324, #E=2487566), mem= 2.068 Gb
compute branching entropy ... done
computing noun score has been done.


## Preprocessor, Postprocessing의 필요성

- lrgraph를 만들 때 한글 외의 기호를 삭제한 어절로 이뤄져야 함 
- N = Nsub + J problem
    - 중세보편주 + {의, 의의}
- [N+Jsub] + J problem
    - 대학생과 + {'', 의}
    - {의, 과의} 모두 조사


In [98]:
lrgraph['중세보편주']

{'의': 6, '의를': 7, '의의': 2}

In [99]:
lrgraph['대학생과']

{'': 24, ',': 1, '의': 1}

In [100]:
sorted(lrgraph['대학생'].items(), key=lambda x:x[1], reverse=True)[:10]

[('', 165),
 ('이', 74),
 ('들이', 73),
 ('들의', 67),
 ('들은', 36),
 ('의', 26),
 ('을', 26),
 ('과', 24),
 ('들을', 23),
 ('은', 21)]

In [111]:
list(filter(lambda x:x[1].frequency > 100 and len(x[0]) > 1, scores))[:10]

[('개인과',
  Score(score=6.2108250932929217, frequency=124, branching_entropy=0, feature_fraction=0.75, eojeol_fraction=0.967741935483871)),
 ('로버트',
  Score(score=6.2108250932929217, frequency=175, branching_entropy=0, feature_fraction=0.25, eojeol_fraction=0.9771428571428571)),
 ('자유주',
  Score(score=6.2108250932929217, frequency=182, branching_entropy=-0.0, feature_fraction=0.14835164835164835, eojeol_fraction=0.0)),
 ('한낱',
  Score(score=6.2108250932929217, frequency=127, branching_entropy=0, feature_fraction=1.0, eojeol_fraction=0.984251968503937)),
 ('우르',
  Score(score=6.2108250932929217, frequency=158, branching_entropy=0.40846145222649699, feature_fraction=0.006369426751592357, eojeol_fraction=0.006329113924050667)),
 ('그런대로',
  Score(score=6.2108250932929217, frequency=188, branching_entropy=0, feature_fraction=0.5, eojeol_fraction=0.9893617021276596)),
 ('당신과',
  Score(score=6.2108250932929217, frequency=128, branching_entropy=0, feature_fraction=0.8888888888888888, eojeol_fra

In [112]:
list(scores)[:10]

[('좌익과',
  Score(score=6.2108250932929217, frequency=17, branching_entropy=0, feature_fraction=1.0, eojeol_fraction=0.8235294117647058)),
 ('과거부터',
  Score(score=6.2108250932929217, frequency=22, branching_entropy=0, feature_fraction=1.0, eojeol_fraction=0.9545454545454546)),
 ('<시인',
  Score(score=6.2108250932929217, frequency=17, branching_entropy=0, feature_fraction=0.5, eojeol_fraction=0.8823529411764706)),
 ('負',
  Score(score=6.2108250932929217, frequency=3, branching_entropy=0, feature_fraction=1.0, eojeol_fraction=0.0)),
 ('<밤',
  Score(score=6.2108250932929217, frequency=4, branching_entropy=0, feature_fraction=0.25, eojeol_fraction=0.0)),
 ('각권',
  Score(score=6.2108250932929217, frequency=17, branching_entropy=0, feature_fraction=1.0, eojeol_fraction=0.9411764705882353)),
 ('개인과',
  Score(score=6.2108250932929217, frequency=124, branching_entropy=0, feature_fraction=0.75, eojeol_fraction=0.967741935483871)),
 ('자와',
  Score(score=6.2108250932929217, frequency=82, branching_e

## Dev: post-processing

### branching entropy는 좀 구해볼까? 
    
    L, right-side 
    R, left-side
    
    ## Complete
    
    >>> for w in ['대학', '대학생', '대학생과']:
    >>>     print(w, '%.3f' % lentropy.get(w, 0))

    대학 2.832
    대학생 1.721
    대학생과 0.000

In [146]:
score_dict = dict(scores)

# useful function
def pretty(namedtuple_instance):
    print('%s(%s)' % (namedtuple_instance.__class__.__name__, ', '.join(['%s=%.3f' % (field, value) if type(value) == float else '%s=%d' % (field, value) for field, value in namedtuple_instance._asdict().items()])))
    
pretty(score_dict['떡볶'])
pretty(score_dict['떡볶이'])

Score(score=1, frequency=67, branching_entropy=0, feature_fraction=0.761, eojeol_fraction=0.000)
Score(score=2, frequency=64, branching_entropy=0, feature_fraction=0.739, eojeol_fraction=0.281)


### Nsub + J: 떡볶 + 이

    f(떡볶) ~= f(떡볶이): drop-rate, branching entropy로 하자
    (떡볶이 in Noun) and (이 in Josa)
    주로 한글자 짜리 조사가 문제 

### [N + Jsub] + J: 대학생과 + 의

    f(대학생) >> f(대학생과)
    Right side branching entropy(대학생) ~= high
    (과의 in Josa) and (대학생 in Noun)

### Compound: 소수 + [집단 + 의]

    소수 + 집단의 (집단의 = 집단 + 의 인지 확인)
    Noun score 대체
