## Word adjacent graph

In [1]:
import config
from navernews_10days import get_news_paths
from soynlp.utils import DoublespaceLineCorpus

# Locate the news data for 2016-10-20 in its Komoran-tokenized form.
# NOTE(review): the return type of get_news_paths is not visible here -- presumably a file path.
path = get_news_paths(date='2016-10-20', tokenize='komoran')
# Corpus reader over that path; iter_sent=True / num_sent=10000 presumably make it
# yield individual sentences and stop after 10,000 of them -- confirm against soynlp.
sents = DoublespaceLineCorpus(path, iter_sent=True, num_sent=10000)

def tokenizer(sent):
    """Return the noun-like and verb-like tokens of a tagged sentence.

    Parameters
    ----------
    sent : str
        Whitespace-separated tokens, each written as ``word/TAG``.

    Returns
    -------
    list of str
        The tokens, in their original order, that contain the substring
        ``/N`` or ``/V`` (e.g. ``/NNG``, ``/NNP``, ``/VV``, ``/VA``).
    """
    return [token for token in sent.split() if '/N' in token or '/V' in token]

# Sample sentence in `word/TAG` form used to demonstrate the tokenizer.
sent = '성씨/NNP 는/JX 강북/NNP 서/JKB 인근/NNG 치킨/NNP 집/NNG 까지/JX 이씨/NNP 뒤/NNG 를/JKO 쫓/VV 으며/EC 실랑이/NNG 하/XSV 다/EC 쓰러뜨리/VV ㄴ/ETM 후/NNG 총기/NNG 와/JC 함께/MAG 가져오/VV ㄴ/ETM 망치/NNP 로/JKB 이씨/NNP 머리/NNG 를/JKO 때리/VV 었/EP 다/EC'
# Last expression of the cell: the filtered token list is shown as the cell output.
tokenizer(sent)

soynlp=0.0.491
added lovit_textmining_dataset


['성씨/NNP',
 '강북/NNP',
 '인근/NNG',
 '치킨/NNP',
 '집/NNG',
 '이씨/NNP',
 '뒤/NNG',
 '쫓/VV',
 '실랑이/NNG',
 '쓰러뜨리/VV',
 '후/NNG',
 '총기/NNG',
 '가져오/VV',
 '망치/NNP',
 '이씨/NNP',
 '머리/NNG',
 '때리/VV']

In [2]:
from graphutils import sents_to_adjacent_graph

# Build the word graph from the tokenized sentences.
# idx_to_vocab is indexed by node index to recover the word (see the keyword-printing cell).
# NOTE(review): min_count / min_cooccurrence look like frequency cut-offs for words and
# for word pairs respectively -- confirm in graphutils.
X, idx_to_vocab = sents_to_adjacent_graph(sents, tokenizer,
    min_count=10, min_cooccurrence=3, verbose=True)

# Last expression of the cell: the matrix shape is shown as the cell output.
X.shape

construct adjacent graph was done                    
transforming dict to sparse was done                    


(3430, 3430)

## PageRank

PageRank 의 구현체입니다. matrix multiplication 으로 구현하면 빠른 속도로 계산이 가능합니다.

In [3]:
import numpy as np
import scipy.sparse as sp
from sklearn.preprocessing import normalize


def pagerank(inbound_matrix, df=0.85, max_iter=30,
    bias=None, ranksum=1.0, converge_threshold=0.0001):
    """Compute a rank vector for a graph by repeated matrix-vector updates.

    Parameters
    ----------
    inbound_matrix : matrix-like
        Square graph matrix; it must expose ``shape`` and ``dot``.
    df : float
        Damping factor: weight of the propagated rank versus the bias.
    max_iter : int
        Upper bound on the number of update steps.
    bias : numpy.ndarray or None
        Bias vector mixed into every update; ``None`` selects the uniform
        vector built by ``_initialize_rank_parameters``.
    ranksum : float
        Scale of the rank vector; also scales the convergence tolerance.
    converge_threshold : float
        Iteration stops once the Euclidean distance between two successive
        rank vectors is at most ``ranksum * converge_threshold``.

    Returns
    -------
    numpy.ndarray
        The rank vector after the last update performed.

    Progress is reported on stdout: one line per non-converged iteration and
    a final message when the tolerance is reached.
    """
    tolerance = ranksum * converge_threshold

    # Only the starting rank and the bias are needed from the initializer.
    _, _, rank, bias = _initialize_rank_parameters(
        inbound_matrix, df, bias, ranksum)

    for step in range(1, max_iter + 1):
        updated = _update_pagerank(inbound_matrix, rank, bias, df, ranksum)
        # Euclidean distance between the previous and the updated ranks.
        diff = np.sqrt(np.square(rank - updated).sum())
        rank = updated

        if diff <= tolerance:
            print('Early stop. because it already converged.')
            break
        print('iter {}, diff = {}'.format(step, diff))

    return rank

def _initialize_rank_parameters(inbound_matrix, df, bias, ranksum):
    # Check number of nodes and initial weight
    n_nodes = inbound_matrix.shape[0]
    initial_weight = ranksum / n_nodes

    # Initialize rank and bias
    rank = np.asarray([initial_weight] * n_nodes)    
    if not bias:
        bias = rank.copy()
    elif not isinstance(bias, np.ndarray):
        raise ValueError('bias must be numpy.ndarray type or None')

    return n_nodes, initial_weight, rank, bias

def _update_pagerank(inbound_matrix, rank, bias, df, ranksum=1.0):
    """Perform one rank update step.

    The current ``rank`` is propagated through ``inbound_matrix``, rescaled so
    that its L2 norm equals ``ranksum``, and then blended with ``bias`` using
    the damping factor ``df``.

    Parameters
    ----------
    inbound_matrix : matrix-like
        Graph matrix exposing ``dot``.
    rank : numpy.ndarray
        Current rank vector.
    bias : numpy.ndarray
        Bias vector added with weight ``1 - df``.
    df : float
        Damping factor applied to the propagated rank.
    ranksum : float
        Scale applied after normalization.

    Returns
    -------
    numpy.ndarray
        The updated rank vector.

    NOTE(review): the propagated vector is normalized with the L2 norm, while
    the default bias from ``_initialize_rank_parameters`` is built so that its
    entries *sum* to ``ranksum`` -- confirm the L2 (rather than L1) scaling is
    intended.
    """
    propagated = inbound_matrix.dot(rank)

    # sklearn's normalize works on 2-D input, so go through a single-row view.
    as_row = propagated.reshape(1, -1)
    unit_row = normalize(as_row, norm='l2')
    scaled = unit_row.reshape(-1) * ranksum

    return df * scaled + (1 - df) * bias

In [4]:
# Rank the word graph with the defaults of pagerank(): df=0.85, at most 30 iterations.
pr = pagerank(X)

iter 1, diff = 0.8466561348919707
iter 2, diff = 0.44062735983446144
iter 3, diff = 0.26686915892547636
iter 4, diff = 0.19848804324946864
iter 5, diff = 0.16350166069114333
iter 6, diff = 0.14425921489525345
iter 7, diff = 0.13235490490988563
iter 8, diff = 0.12375492077623557
iter 9, diff = 0.11667014101824227
iter 10, diff = 0.11034544071710771
iter 11, diff = 0.10447019812905214
iter 12, diff = 0.09891811041387799
iter 13, diff = 0.09363769044597278
iter 14, diff = 0.0886070034445748
iter 15, diff = 0.08381515790090775
iter 16, diff = 0.07925490557011872
iter 17, diff = 0.0749197436294102
iter 18, diff = 0.07080293571301226
iter 19, diff = 0.06689723521148647
iter 20, diff = 0.06319493137026334
iter 21, diff = 0.059687959926565906
iter 22, diff = 0.05636805162041969
iter 23, diff = 0.053226854437629145
iter 24, diff = 0.05025604300006645
iter 25, diff = 0.04744740251625334
iter 26, diff = 0.04479289258518155
iter 27, diff = 0.042284697415324465
iter 28, diff = 0.039915256814848914


soygraph 의 PageRank 에 위의 코드를 정리해 두었습니다.

In [5]:
from soygraph.ranking import PageRank

# Same ranking through soygraph's packaged PageRank; this rebinds `pr`.
# The logged diffs match those of the hand-written pagerank() run above.
pr = PageRank().rank(X)

iter 1 : diff = 0.8466561348919707 (0.001 sec)
iter 2 : diff = 0.44062735983446144 (0.001 sec)
iter 3 : diff = 0.26686915892547636 (0.001 sec)
iter 4 : diff = 0.19848804324946864 (0.001 sec)
iter 5 : diff = 0.16350166069114333 (0.001 sec)
iter 6 : diff = 0.14425921489525345 (0.001 sec)
iter 7 : diff = 0.13235490490988563 (0.001 sec)
iter 8 : diff = 0.12375492077623557 (0.001 sec)
iter 9 : diff = 0.11667014101824227 (0.001 sec)
iter 10 : diff = 0.11034544071710771 (0.001 sec)
iter 11 : diff = 0.10447019812905214 (0.001 sec)
iter 12 : diff = 0.09891811041387799 (0.001 sec)
iter 13 : diff = 0.09363769044597278 (0.001 sec)
iter 14 : diff = 0.0886070034445748 (0.001 sec)
iter 15 : diff = 0.08381515790090775 (0.001 sec)
iter 16 : diff = 0.07925490557011872 (0.001 sec)
iter 17 : diff = 0.0749197436294102 (0.001 sec)
iter 18 : diff = 0.07080293571301226 (0.001 sec)
iter 19 : diff = 0.06689723521148647 (0.001 sec)
iter 20 : diff = 0.06319493137026334 (0.001 sec)
iter 21 : diff = 0.0596879599265

## TextRank

Rank 가 높은 단어들을 선택하면 `서울`, `연합뉴스`와 같이 특정 주제와 무관하게 널리 쓰이는 단어들이 키워드로 선택됩니다. 문서 집합에 여러 주제가 섞여 있기 때문입니다.

In [6]:
# Indices of the 100 highest-ranked nodes, best first, and their rank values.
top_idxs = pr.argsort()[::-1][:100]
top_ranks = pr[top_idxs]

# Print the keywords as a table with five entries per row.
for i, idx in enumerate(top_idxs):
    vocab = idx_to_vocab[idx]
    if i % 5 == 0:
        # Start a new table row.
        print('\n| ', end='')
    # '{:6}' pads each word to a minimum width of six characters. The rank value
    # was previously passed to format() as well but never consumed by the format
    # string, so dropping it leaves the printed output unchanged.
    print('{:6}'.format(vocab), end=' | ')


| 서울/NNP | 연합뉴스/NNP | 일/NNB  | 이/VCP  | 오전/NNP | 
| 있/VV   | 이/NP   | 것/NNB  | 수/NNB  | 제공/NNG | 
| 하/VV   | 기자/NNG | 오후/NNG | 자료/NNG | 밝히/VV  | 
| 말/NNG  | 현지/NNG | 예정/NNG | 고양/NNP | 국회/NNG | 
| 되/VV   | 중/NNB  | 부산/NNP | 지나/VV  | 없/VA   | 
| 않/VX   | 계획/NNG | 백/NR   | 김/NNP  | 시/NNB  | 
| 워싱턴/NNP | 서명/NNG | 년/NNB  | 받/VV   | 보이/VV  | 
| 맞/VV   | 때문/NNB | 보/VV   | 대통령/NNG | 김현태/NNP | 
| 배/NNG  | 대표/NNG | 김주성/NNP | 명/NNB  | 경기도/NNP | 
| 장관/NNG | 알/VV   | 미국/NNP | 가/VV   | 등/NNB  | 
| 이정훈/NNP | 이날/NNG | 열리/VV  | 늘/VV   | 제주/NNP | 
| 트럼프/NNP | 세종/NNP | 황/NNP  | 서초구/NNP | 오/VV   | 
| 박철홍/NNP | 전망/NNG | 주장/NNG | 원/NNB  | 전하/VV  | 
| 주/VX   | 조/NR   | 클린턴/NNP | 홍기/NNP | 류/NNP  | 
| 홍해/NNP | 설명/NNG | 위하/VV  | 여의도/NNP | 천/NR   | 
| 시작/NNG | 강남구/NNP | 자료/NNP | 윤/NNP  | 지난달/NNG | 
| 발언/NNG | 김주형/NNP | 위원장/NNG | 신준희/NA | 이재희/NNP | 
| 북한/NNP | 세종로/NNP | 사진/NNP | 따르/VV  | 중앙/NNP | 
| 서울시/NNP | 캡처/NNP | 지/VX   | 아니/VCN | 확인/NNG | 
| 임/NNP  | 달/NNG  | 모습/NNG | 시간/NNG | 마련/NNG | 