In [1]:
head = 'l30_r15'
directory = '../data/'
model_fname ='../models/Logistic + L2 (C=1.00) norm l30_r15.pkl'


import pickle    
from py.utils import load_data

x, y, x_words, vocabs = load_data(head, directory)
with open(model_fname, 'rb') as f:
    classifier = pickle.load(f)

x shape = (15715, 4551)
y shape = (15715,)
# features = 4551
# L words = 15715


In [2]:
coefficient = {vocabs[j]:coef for j, coef in enumerate(classifier.coef_[0])}

In [8]:
list(coefficient.items())[:5]

[('어오고', -0.0035026965225787533),
 ('어놓는', -0.0017447214860182797),
 ('느냐고', -0.027019952035748179),
 ('라느니', 0.0013030948275454116),
 ('으셔서', -0.3228328145095205)]

In [5]:
import sys
from soynlp.utils import get_process_memory


In [40]:
class TrainedNounExtractor:
    def __init__(self, coefficient, max_length=8):
        self._coef = coefficient
        self.lmax = max_length
        
    def extract(self, sents, min_count=10, min_noun_score=0.1):
        lrgraph = self._build_lrgraph(sents, min_count)
        scores = self._compute_noun_score(lrgraph)
        return scores, lrgraph
        scores = self._postprocessing(lrgraph, scores)

    def _build_lrgraph(self, sents, min_count, pruning_min_count=2):
        lset = {}
        rset = {}
        for n_sent, sent in enumerate(sents):
            for eojeol in sent.split():
                for e in range(1, min(len(eojeol), self.lmax)+1):
                    l = eojeol[:e]
                    r = eojeol[e:]
                    lset[l] = lset.get(l, 0) + 1
                    rset[r] = rset.get(r, 0) + 1
            if n_sent % 1000 == 999:
                args = (n_sent+1, len(lset), len(rset), get_process_memory())
                sys.stdout.write('\rscaning vocabulary ... %d sents #(l= %d, r= %d), mem= %.3f Gb' % args)
            if n_sent % 500000 == 499999:
                lset = {l:f for l,f in lset.items() if f >= pruning_min_count}
                rset = {l:f for l,f in rset.items() if f >= pruning_min_count}
        lset = {l:f for l,f in lset.items() if f >= min_count}
        rset = {l:f for l,f in lset.items() if f >= min_count}
        
        n_sents = n_sent
        
        lrgraph = {}
        for n_sent, sent in enumerate(sents):
            for eojeol in sent.split():
                for e in range(1, min(len(eojeol), self.lmax)+1):
                    l = eojeol[:e]
                    r = eojeol[e:]
                    if not (l in lset) or not (r in rset):
                        continue
                    rdict = lrgraph.get(l, {})
                    rdict[r] = rdict.get(r, 0) + 1
                    lrgraph[l] = rdict            
            if n_sent % 1000 == 999:
                args = (100*(n_sent+1)/n_sents, '%', n_sent+1, n_sents, get_process_memory())
                sys.stdout.write('\rbuilding lrgraph ... (%.3f %s, %d in %d), mem= %.3f Gb' % args)
        args = (len(lset), len(rset), sum((len(rdict) for rdict in lrgraph.values())), get_process_memory())
        print('\rlrgraph has been built. (#L= %d, #R= %d, #E=%d), mem= %.3f Gb' % args)
        return lrgraph
        
    def _compute_noun_score(self, lrgraph):
        from collections import namedtuple
        Score = namedtuple('Score', 'score frequency feature_fraction eojeol_fraction')
        scores = {}
        n = len(lrgraph)
        for i, (l, rdict) in enumerate(lrgraph.items()):
            rdict_ = {r:f for r,f in rdict.items() if r in self._coef}
            rsum = sum((f for r,f in rdict.items() if r != ''))
            frequency = rsum + rdict.get('', 0)
            feature_fraction = sum(rdict_.values()) / rsum if rsum > 0 else 0
            eojeol_fraction = 1 - rsum / frequency
            if not rdict_:
                score = 0
            else:
                score = sum(f*self._coef[r] for r, f in rdict_.items()) / sum(rdict_.values())
            scores[l] = Score(score, frequency, feature_fraction, eojeol_fraction)
            if (i+1) % 1000 == 0:
                args = (100*(i+1)/n, '%', i+1, n)
                sys.stdout.write('\rcompute noun score ... (%.3f %s, %d in %d)' % args)
        print('\rcomputing noun score has been done.')
        return sorted(scores.items(), key=lambda x:x[1].score, reverse=True)
        
    def _postprocessing(self, lrgraph, scores):
        print('hi')

In [41]:
from config import sentence_fname
class Sentences:
    def __init__(self, fname):
        self.fname = fname
    def __iter__(self):
        with open(self.fname, encoding='utf-8') as f:
            for doc in f:
                yield doc.strip()

sentences = Sentences(sentence_fname)
noun_extractor = TrainedNounExtractor(coefficient)
scores, lrgraph = noun_extractor.extract(sentences)


lrgraph has been built. (#L= 201559, #R= 201559, #E=1601616), mem= 2.148 Gb
computing noun score has been done.


## Preprocessor, Postprocessing의 필요성

- lrgraph를 만들 때 한글 외의 기호를 삭제한 어절로 이뤄져야 함 
- N = Nsub + J problem
    - 중세보편주 + {의, 의의}
- [N+Jsub] + J problem
    - 대학생과 + {'', 의}
    - {의, 과의} 모두 조사


In [43]:
lrgraph['중세보편주']

{'의': 6, '의의': 2}

In [44]:
lrgraph['대학생과']

{',': 1, '의': 1}

In [49]:
sorted(lrgraph['자유주'].items(), key=lambda x:x[1], reverse=True)[:10]

[('의적', 50),
 ('의', 27),
 ('의의', 14),
 ('의가', 8),
 ('의로', 4),
 ('의자', 2),
 ('의자와', 1),
 ('의해서', 1),
 ('의자는', 1),
 ('의자를', 1)]

In [48]:
list(filter(lambda x:x[1].frequency > 100 and len(x[0]) > 1, scores))[:10]

[('자유주',
  Score(score=6.2108250932929217, frequency=113, feature_fraction=0.23893805309734514, eojeol_fraction=0.0)),
 ('우르',
  Score(score=6.2108250932929217, frequency=130, feature_fraction=0.007692307692307693, eojeol_fraction=0.0)),
 ('낭만주',
  Score(score=6.2108250932929217, frequency=107, feature_fraction=0.37383177570093457, eojeol_fraction=0.0)),
 ('권위주',
  Score(score=6.2108250932929217, frequency=167, feature_fraction=0.47305389221556887, eojeol_fraction=0.0)),
 ('지역사',
  Score(score=6.2108250932929217, frequency=241, feature_fraction=0.004149377593360996, eojeol_fraction=0.0)),
 ('민주주',
  Score(score=6.2108250932929217, frequency=593, feature_fraction=0.3254637436762226, eojeol_fraction=0.0)),
 ('제국주',
  Score(score=6.2108250932929217, frequency=258, feature_fraction=0.4069767441860465, eojeol_fraction=0.0)),
 ('공산주',
  Score(score=6.2108250932929217, frequency=334, feature_fraction=0.5568862275449101, eojeol_fraction=0.0)),
 ('사회주',
  Score(score=6.2108250932929217, frequen