## Word adjacent graph

In [1]:
import config
from navernews_10days import get_news_paths
from soynlp.utils import DoublespaceLineCorpus

# Locate the 2016-10-20 news corpus, requested with tokenize='komoran'
# (presumably the KoMoran POS-tagged version -- confirm in navernews_10days).
path = get_news_paths(date='2016-10-20', tokenize='komoran')
# Iterate the corpus sentence by sentence (iter_sent=True). num_sent=10000
# presumably caps how many sentences are read -- confirm against soynlp.
sents = DoublespaceLineCorpus(path, iter_sent=True, num_sent=10000)

def tokenizer(sent):
    """Split a POS-tagged sentence on whitespace and keep selected tokens.

    A token is kept when it contains the substring '/N' or '/V', i.e. when
    its 'surface/TAG' form carries a tag starting with N or V.

    Args:
        sent: Whitespace-separated string of 'surface/TAG' tokens.

    Returns:
        List of the kept tokens, in their original order.
    """
    return [token for token in sent.split() if '/N' in token or '/V' in token]

# Sample sentence in 'surface/TAG' form used to sanity-check the tokenizer:
# only tokens containing '/N' or '/V' should remain in the displayed result.
sent = '성씨/NNP 는/JX 강북/NNP 서/JKB 인근/NNG 치킨/NNP 집/NNG 까지/JX 이씨/NNP 뒤/NNG 를/JKO 쫓/VV 으며/EC 실랑이/NNG 하/XSV 다/EC 쓰러뜨리/VV ㄴ/ETM 후/NNG 총기/NNG 와/JC 함께/MAG 가져오/VV ㄴ/ETM 망치/NNP 로/JKB 이씨/NNP 머리/NNG 를/JKO 때리/VV 었/EP 다/EC'
tokenizer(sent)

soynlp=0.0.492
added lovit_textmining_dataset


['성씨/NNP',
 '강북/NNP',
 '인근/NNG',
 '치킨/NNP',
 '집/NNG',
 '이씨/NNP',
 '뒤/NNG',
 '쫓/VV',
 '실랑이/NNG',
 '쓰러뜨리/VV',
 '후/NNG',
 '총기/NNG',
 '가져오/VV',
 '망치/NNP',
 '이씨/NNP',
 '머리/NNG',
 '때리/VV']

In [2]:
from graphutils import sents_to_adjacent_graph

# Build the word graph from the corpus. The helper returns the graph matrix X
# and idx_to_vocab, an index-to-word lookup. min_count / min_cooccurrence
# presumably drop rare words and weak edges -- confirm in graphutils.
X, idx_to_vocab = sents_to_adjacent_graph(sents, tokenizer,
    min_count=10, min_cooccurrence=3, verbose=True)

# Matrix dimensions, then the number of nonzero entries in X.
print(X.shape)
print(X.nonzero()[0].shape)

construct adjacent graph was done                    
transforming dict to sparse was done                    
(3430, 3430)
(18816,)


In [3]:
# Rebuild the graph with window=-1; this overwrites X and idx_to_vocab from the
# previous cell, so later cells use this version. window=-1 presumably means
# "whole sentence as the context window" (the nonzero count grows sharply) --
# confirm in graphutils.
X, idx_to_vocab = sents_to_adjacent_graph(sents, tokenizer,
    min_count=10, min_cooccurrence=3, window=-1, verbose=True)

# Matrix dimensions, then the number of nonzero entries in X.
print(X.shape)
print(X.nonzero()[0].shape)

construct adjacent graph was done                    
transforming dict to sparse was done                    
(3430, 3430)
(497512,)


## PageRank

PageRank 의 구현체입니다. matrix multiplication 으로 구현하면 빠른 속도로 계산이 가능합니다.

In [4]:
import numpy as np
import scipy.sparse as sp
from sklearn.preprocessing import normalize


def pagerank(inbound_matrix, df=0.85, max_iter=30, tol=None, verbose=True):
    """Rank the nodes of a graph by repeated matrix-vector propagation.

    Starts from a uniform rank vector and applies ``update_pagerank`` up to
    ``max_iter`` times, tracking the Euclidean distance between consecutive
    rank vectors.

    Args:
        inbound_matrix: Square matrix supporting ``.shape`` and ``.dot``;
            one row/column per node.
        df: Damping factor blending the propagated rank with the uniform bias.
        max_iter: Maximum number of update rounds.
        tol: Optional convergence threshold. When given, iteration stops as
            soon as the distance between consecutive rank vectors drops below
            it. ``None`` (default) always runs all ``max_iter`` rounds.
        verbose: When True (default), print the distance after every round.

    Returns:
        1-D numpy array holding one score per node.
    """
    # The helper also returns n_nodes and initial_weight; they are not needed here.
    _, _, rank, bias = initialize_rank_parameters(inbound_matrix, df)

    for n_iter in range(1, max_iter + 1):
        rank_new = update_pagerank(inbound_matrix, rank, bias, df)
        # L2 distance between consecutive iterates, used as the convergence measure.
        diff = np.sqrt(((rank - rank_new) **2).sum())
        rank = rank_new
        if verbose:
            print('iter {}, diff = {}'.format(n_iter, diff))
        # Stop early once the update no longer moves the rank vector noticeably.
        if tol is not None and diff < tol:
            break

    return rank

def initialize_rank_parameters(inbound_matrix, df):
    """Create the uniform starting rank vector and bias vector for PageRank.

    Args:
        inbound_matrix: Object whose ``shape[0]`` gives the number of nodes.
        df: Damping factor; accepted for signature symmetry but not used here.

    Returns:
        Tuple ``(n_nodes, initial_weight, rank, bias)`` where ``initial_weight``
        is ``1 / n_nodes`` and ``rank`` / ``bias`` are two independent arrays
        of length ``n_nodes`` filled with that weight.
    """
    n_nodes = inbound_matrix.shape[0]
    initial_weight = 1 / n_nodes

    # Two separate buffers so that updating one never aliases the other.
    rank = np.full(n_nodes, initial_weight)
    bias = np.full(n_nodes, initial_weight)
    return n_nodes, initial_weight, rank, bias

def update_pagerank(inbound_matrix, rank, bias, df):
    """Perform one PageRank update step.

    The current rank is propagated through ``inbound_matrix``, scaled to unit
    L2 norm, and then blended with ``bias`` using the damping factor ``df``.
    Because the scaling is L2 rather than L1, the result need not sum to 1.

    Args:
        inbound_matrix: Matrix supporting ``.dot`` with a 1-D vector.
        rank: Current rank vector (1-D array).
        bias: Bias vector of the same length as ``rank``.
        df: Damping factor; weight given to the propagated rank.

    Returns:
        The updated rank vector as a 1-D array.
    """
    propagated = inbound_matrix.dot(rank)
    # normalize() works row-wise on 2-D input: view the vector as a single
    # row, scale it, then flatten back to 1-D.
    unit = normalize(propagated.reshape(1, -1), norm='l2').reshape(-1)
    return df * unit + (1 - df) * bias

In [5]:
# Score every word in the graph with the pagerank() defined above; by default
# it prints the change in the rank vector after each iteration.
pr = pagerank(X)

iter 1, diff = 0.8461472508410279
iter 2, diff = 0.0777242695496277
iter 3, diff = 0.007689184017572718
iter 4, diff = 0.0018597117256751708
iter 5, diff = 0.0007008521153693605
iter 6, diff = 0.00028895253132882245
iter 7, diff = 0.0001208755391157564
iter 8, diff = 5.068918444458727e-05
iter 9, diff = 2.1265807713830784e-05
iter 10, diff = 8.922442081999404e-06
iter 11, diff = 3.743625331167742e-06
iter 12, diff = 1.5707329574645392e-06
iter 13, diff = 6.590412604139278e-07
iter 14, diff = 2.7651768319780554e-07
iter 15, diff = 1.160200964195273e-07
iter 16, diff = 4.867921227165706e-08
iter 17, diff = 2.0424614080422035e-08
iter 18, diff = 8.569671135601644e-09
iter 19, diff = 3.5956262515624685e-09
iter 20, diff = 1.5086368010609492e-09
iter 21, diff = 6.329886936877739e-10
iter 22, diff = 2.6558579463019024e-10
iter 23, diff = 1.1143336226411139e-10
iter 24, diff = 4.675530644067518e-11
iter 25, diff = 1.9616589718925564e-11
iter 26, diff = 8.23153875546137e-12
iter 27, diff = 3.4

soygraph 의 PageRank 에 위의 코드를 정리해 두었습니다.

In [6]:
from soygraph.ranking import PageRank

# Rank the same graph with soygraph's packaged PageRank (constructed with its
# default arguments); this rebinds pr.
pr = PageRank().rank(X)

iter 1 : diff = 0.8461472508410279 (0.002 sec)
iter 2 : diff = 0.0777242695496277 (0.002 sec)
iter 3 : diff = 0.007689184017572718 (0.001 sec)
iter 4 : diff = 0.0018597117256751708 (0.001 sec)
iter 5 : diff = 0.0007008521153693605 (0.001 sec)
iter 6 : diff = 0.00028895253132882245 (0.001 sec)
iter 7 : diff = 0.0001208755391157564 (0.001 sec)
Early stop. because it already converged.


## TextRank

Rank 가 높은 단어들을 선택하면 `서울`, `연합뉴스`와 같이 여러 문서에 두루 등장하는 일반적인 단어들이 키워드로 선택됩니다. 문서 집합에 여러 주제가 섞여 있기 때문입니다.

In [7]:
# Indices of the 100 highest-scoring nodes (descending score) and their scores.
top_idxs = pr.argsort()[::-1][:100]
top_ranks = pr[top_idxs]

# Print the top words as a table with five words per row. Only the word is
# shown; the score is intentionally not part of the output.
for i, idx in enumerate(top_idxs):
    if i % 5 == 0:
        print('\n| ', end='')
    print('{:6}'.format(idx_to_vocab[idx]), end=' | ')


| 이/VCP  | 있/VV   | 일/NNB  | 연합뉴스/NNP | 것/NNB  | 
| 하/VV   | 서울/NNP | 기자/NNG | 등/NNB  | 이/NP   | 
| 수/NNB  | 열리/VV  | 오전/NNP | 말/NNG  | 대하/VV  | 
| 년/NNB  | 위하/VV  | 되/VV   | 미국/NNP | 북한/NNP | 
| 않/VX   | 받/VV   | 미/NNP  | 밝히/VV  | 없/VA   | 
| 천/NR   | 명/NNB  | 관련/NNG | 원/NNB  | 만/NR   | 
| 가/VV   | 국회/NNG | 대표/NNG | 장관/NNG | 제공/NNG | 
| 씨/NNB  | 한국/NNP | 오후/NNG | 문제/NNG | 지/VX   | 
| 중/NNB  | 늘/VV   | 대통령/NNG | 보/VV   | 의원/NNG | 
| 트럼프/NNP | 우리/NP  | 이날/NNG | 회의/NNG | 그/NP   | 
| 국가/NNG | 따르/VV  | 오/VV   | 시간/NNG | 억/NR   | 
| 억제/NNG | 주/VX   | 간/NNB  | 아니/VCN | 앞/NNG  | 
| 김/NNP  | 지나/VV  | 정부/NNG | 중국/NNP | 관계자/NNG | 
| 통하/VV  | 대선/NNG | 을/NNG  | 조사/NNG | 시/NNB  | 
| 클린턴/NNP | 개/NNB  | 운영/NNG | 미사일/NNG | 현지/NNG | 
| 만들/VV  | 차/NNB  | 계획/NNG | 외교/NNG | 맞/VV   | 
| 부산/NNP | 이번/NNG | 대/NNB  | 뒤/NNG  | 보이/VV  | 
| 왼쪽/NNG | 같/VA   | 확장/NNG | 크/VA   | 경찰/NNG | 
| 참석/NNG | 때문/NNB | 때/NNG  | 교육/NNG | 주장/NNG | 
| 위원회/NNP | 핵/NNG  | 발언/NNG | 사업/NNG | 수석/NNP | 