In [9]:
# Make a local checkout of soynlp importable.
# NOTE(review): this absolute path is machine-specific; on another machine the
# import below only works if soynlp is installed or the path is adjusted.
import sys
sys.path.append('/mnt/lovit/git/soynlp/')
import soynlp
import pickle
# Load the pickled inputs used by the rest of the notebook.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('2016-10-20-nountokenizer.pkl', 'rb') as f:
    params = pickle.load(f)

In [10]:
# Unpack the loaded parameters.
# x: document-term matrix; later cells index rows by document and columns by
#    term, and read x.indptr/x.indices/x.data — presumably scipy CSR, TODO confirm.
# idx2vocab / vocab2idx: term index -> term and term -> term index lookups.
x = params['x']
idx2vocab = params['idx2vocab']
vocab2idx = params['vocab2idx']
print(x.shape)

(30091, 96131)


In [11]:
from soynlp.word import pmi as pmi_func

# Apply soynlp's pmi function to the document-term matrix.
# The result is a DOK sparse matrix (see the type check in the next cell).
bow_pmi = pmi_func(x, verbose=True)

computing pmi was done                              


In [12]:
# Check which sparse format pmi_func returned.
type(bow_pmi)

scipy.sparse.dok.dok_matrix

In [13]:
# Convert from DOK to CSR before the similarity searches and the SVD below.
bow_pmi = bow_pmi.tocsr()

In [14]:
# Row indices of the documents whose column for '아이오아이' is non-zero,
# i.e. the documents that contain that term.
document_idx = x[:,vocab2idx['아이오아이']].nonzero()[0]
print(document_idx)

[ 6884  6897  6956  7338  7345  7582  8011  8053  9180  9228  9494  9539
  9876  9894 13059 13231 13691 13856 14117 15573 15836 15868 15880 16198
 16485 16487 16489 16490 16492 17304 17441 17699 18072 18078 18240 20825
 20839 20902 21208 21247 21249 21251 21414 21481 21513 21631 21670 21674
 21747 21921 21949 22279 22282 22314 22316 22620 22731 23315 23562 23660
 23952 24977 25050 25145 25179 25181 25184 25277 25328 25832 26231 26235
 26317 26320 26364 26394 26396 26397 26706 26707 26709 26814 26816 26818
 26819 26821 27513 27664 27825 28927 28931 28947 28966 28969 28982 29670
 29745]


In [15]:
import numpy as np
from sklearn.metrics import pairwise_distances

def most_similar_docs(idx, x, topn=10):
    """Return the rows of ``x`` closest to row ``idx`` by cosine similarity.

    Parameters
    ----------
    idx : int
        Row index of the query document.
    x : numpy.ndarray or sparse matrix
        Matrix with one row per document.
    topn : int
        Number of nearest rows to return (applied as a plain ``[:topn]`` slice).

    Returns
    -------
    list of (int, float)
        ``(row index, cosine similarity)`` pairs ordered by increasing cosine
        distance. The query row is compared against every row, itself included.
    """
    query = x[idx, :]
    # Indexing a dense array yields a 1-D vector, but pairwise_distances needs
    # 2-D input. isinstance (rather than an exact type check) also covers
    # ndarray subclasses.
    if isinstance(x, np.ndarray):
        query = query.reshape(1, -1)

    # Cosine distance from every row to the query, flattened to 1-D.
    dist = pairwise_distances(x, query, metric='cosine').reshape(-1)

    # argsort returns a new index array and leaves dist untouched, so no copy
    # is needed before sorting.
    similar_idx = dist.argsort()[:topn]

    # Convert distance to similarity.
    return [(i, 1 - d) for i, d in zip(similar_idx, dist[similar_idx])]

def get_bow(idx, x, topn=10, vocab=None):
    """Return the bag-of-words of row ``idx`` as ``(term, value)`` pairs.

    Parameters
    ----------
    idx : int
        Row (document) index.
    x : sparse matrix
        Document-term matrix. The row is read through ``indptr`` / ``indices``
        / ``data``; a sparse matrix reporting a format other than CSR is
        converted with ``tocsr()`` first, because those arrays describe rows
        only in CSR layout.
    topn : int
        Keep only the first ``topn`` pairs after sorting; values <= 0 keep all.
    vocab : sequence or mapping, optional
        Lookup from term index to term. Defaults to the notebook-level
        ``idx2vocab``.

    Returns
    -------
    list of (term, value)
        Sorted by decreasing value, ties broken by term in ascending order.
    """
    if vocab is None:
        vocab = idx2vocab

    # In a non-CSR layout (e.g. CSC) indptr delimits columns, so reading it as
    # row boundaries would silently return the wrong entries.
    if getattr(x, 'format', 'csr') != 'csr':
        x = x.tocsr()

    # The stored entries of row idx live in [indptr[idx], indptr[idx + 1]).
    start, end = x.indptr[idx], x.indptr[idx + 1]
    terms = x.indices[start:end]
    frequencies = x.data[start:end]

    bow = [(vocab[t], f) for t, f in zip(terms, frequencies)]

    # Decreasing value first, then term, so ties are ordered deterministically.
    bow.sort(key=lambda pair: (-pair[1], pair[0]))

    if topn > 0:
        bow = bow[:topn]

    return bow

In [16]:
# Nearest neighbours of document 6884, measured on the raw matrix x.
# Each neighbour is shown by its highest-valued terms in x.
for idx, sim in most_similar_docs(6884, x):
    print('\ndocument #{}'.format(idx))
    print(get_bow(idx, x))


document #6884
[('아이오아이', 6), ('불독', 4), ('비슷', 3), ('활동', 3), ('01', 2), ('경쟁', 2), ('매일경제', 2), ('선의', 2), ('쇼케이스', 2), ('프로듀스', 2)]

document #21481
[('아이오아이', 5), ('불독', 3), ('20일', 2), ('경쟁', 2), ('만나', 2), ('카운트다운', 2), ('형은', 2), ('01', 1), ('5인조', 1), ('걸그룹', 1)]

document #25328
[('불독', 4), ('걸그룹', 3), ('아이오아이', 3), ('01', 2), ('20일', 2), ('롤링홀', 2), ('마포구', 2), ('멤버들', 2), ('쇼케이스', 2), ('자고', 2)]

document #6897
[('불독', 11), ('콘셉트', 5), ('강하', 4), ('걸그룹', 4), ('걸크러쉬', 4), ('아이오아이', 4), ('프로듀스101', 4), ('가요계', 3), ('형은', 3), ('경쟁', 2)]

document #27513
[('아이오아이', 4), ('불독', 3), ('멤버들', 2), ('쇼케이스', 2), ('인사', 2), ('자고', 2), ('프로그램', 2), ('형은', 2), ('1시', 1), ('20일', 1)]

document #17304
[('불독', 3), ('아이오아이', 3), ('노컷뉴스', 2), ('쇼케이스', 2), ('우리', 2), ('형은', 2), ('01', 1), ('1시', 1), ('20일', 1), ('5인조', 1)]

document #6888
[('불독', 4), ('강하', 3), ('팀명', 3), ('가요계', 2), ('걸그룹', 2), ('매일경제', 2), ('온순', 2), ('형은', 2), ('01', 1), ('20일', 1)]

document #27664
[('불독', 8), ('걸크러쉬', 5), 

In [17]:
# Same query, but neighbours are measured on the PMI matrix.
# The printed terms still come from x, so they can be compared with the previous cell.
for idx, sim in most_similar_docs(6884, bow_pmi):
    print('\ndocument #{}'.format(idx))
    print(get_bow(idx, x))


document #6884
[('아이오아이', 6), ('불독', 4), ('비슷', 3), ('활동', 3), ('01', 2), ('경쟁', 2), ('매일경제', 2), ('선의', 2), ('쇼케이스', 2), ('프로듀스', 2)]

document #21481
[('아이오아이', 5), ('불독', 3), ('20일', 2), ('경쟁', 2), ('만나', 2), ('카운트다운', 2), ('형은', 2), ('01', 1), ('5인조', 1), ('걸그룹', 1)]

document #6888
[('불독', 4), ('강하', 3), ('팀명', 3), ('가요계', 2), ('걸그룹', 2), ('매일경제', 2), ('온순', 2), ('형은', 2), ('01', 1), ('20일', 1)]

document #6889
[('불독', 3), ('세이', 3), ('지니', 3), ('진아', 3), ('매일경제', 2), ('열심히', 2), ('응원', 2), ('01', 1), ('20일', 1), ('5인조', 1)]

document #25328
[('불독', 4), ('걸그룹', 3), ('아이오아이', 3), ('01', 2), ('20일', 2), ('롤링홀', 2), ('마포구', 2), ('멤버들', 2), ('쇼케이스', 2), ('자고', 2)]

document #6897
[('불독', 11), ('콘셉트', 5), ('강하', 4), ('걸그룹', 4), ('걸크러쉬', 4), ('아이오아이', 4), ('프로듀스101', 4), ('가요계', 3), ('형은', 3), ('경쟁', 2)]

document #6861
[('매일경제', 2), ('무대', 2), ('01', 1), ('20일', 1), ('걸그룹', 1), ('구성', 1), ('김민지', 1), ('롤링홀', 1), ('무단', 1), ('불독', 1)]

document #6839
[('매일경제', 2), ('무대', 2), ('01', 1),

In [18]:
import time
from sklearn.decomposition import TruncatedSVD

# Reduce the PMI matrix to 300 dimensions.
# random_state is fixed because TruncatedSVD's default 'randomized' solver is
# stochastic: without a seed, docvecs/wordvecs (and every neighbour list
# derived from them) change from run to run.
svd = TruncatedSVD(n_components=300, random_state=0)

# perf_counter is a monotonic clock meant for measuring durations; the start
# time gets its own name so process_time only ever holds the elapsed seconds.
fit_started = time.perf_counter()
docvecs = svd.fit_transform(bow_pmi)
process_time = time.perf_counter() - fit_started

In [19]:
# Elapsed time of the SVD fit, in seconds.
process_time

1306.4729008674622

In [20]:
# One row per document, one column per SVD component.
docvecs.shape

(30091, 300)

In [21]:
# Same query again, now with neighbours measured on the dense SVD document vectors.
# The printed terms still come from x.
for idx, sim in most_similar_docs(6884, docvecs):
    print('\ndocument #{}'.format(idx))
    print(get_bow(idx, x))


document #6884
[('아이오아이', 6), ('불독', 4), ('비슷', 3), ('활동', 3), ('01', 2), ('경쟁', 2), ('매일경제', 2), ('선의', 2), ('쇼케이스', 2), ('프로듀스', 2)]

document #6889
[('불독', 3), ('세이', 3), ('지니', 3), ('진아', 3), ('매일경제', 2), ('열심히', 2), ('응원', 2), ('01', 1), ('20일', 1), ('5인조', 1)]

document #6888
[('불독', 4), ('강하', 3), ('팀명', 3), ('가요계', 2), ('걸그룹', 2), ('매일경제', 2), ('온순', 2), ('형은', 2), ('01', 1), ('20일', 1)]

document #21481
[('아이오아이', 5), ('불독', 3), ('20일', 2), ('경쟁', 2), ('만나', 2), ('카운트다운', 2), ('형은', 2), ('01', 1), ('5인조', 1), ('걸그룹', 1)]

document #21493
[('01', 3), ('기획사', 3), ('불독', 3), ('생각', 3), ('프로듀스', 3), ('20일', 2), ('서도', 2), ('연습', 2), ('저희', 2), ('형은', 2)]

document #21486
[('20일', 2), ('불독', 2), ('사촌언니', 2), ('응원해줬다', 2), ('진아', 2), ('진희', 2), ('친오빠', 2), ('01', 1), ('5인조', 1), ('가족관계', 1)]

document #21502
[('콘셉트', 5), ('걸크러쉬', 4), ('불독', 4), ('01', 3), ('프로듀스', 3), ('20일', 2), ('가능성', 2), ('선배님들', 2), ('소라', 2), ('앨범', 2)]

document #21497
[('걸크러쉬', 4), ('세이', 3), ('20일', 2), ('

In [22]:
# Transpose the fitted components so that each row corresponds to one term
# (shape (96131, 300) in the next cell, matching the number of columns of x).
wordvecs = svd.components_.transpose()

In [23]:
# One row per term, one column per SVD component.
wordvecs.shape

(96131, 300)

In [24]:
def most_similar_terms(term, topn=10):
    """Return the terms whose word vectors are closest to ``term`` by cosine.

    Reads the notebook-level ``vocab2idx``, ``idx2vocab`` and ``wordvecs``.

    Parameters
    ----------
    term : str
        Query term.
    topn : int
        Number of nearest terms to return (applied as a plain ``[:topn]`` slice).

    Returns
    -------
    list of (str, float)
        ``(term, cosine similarity)`` pairs ordered by increasing cosine
        distance; an empty list when ``term`` has no usable row in ``wordvecs``.
    """
    # Encode the term. Unknown terms, and indices beyond the rows of wordvecs,
    # have no vector to query with.
    idx = vocab2idx.get(term, -1)
    if idx < 0 or idx >= wordvecs.shape[0]:
        return []

    query = wordvecs[idx, :].reshape(1, -1)

    # Cosine distance from every word vector to the query, flattened to 1-D.
    dist = pairwise_distances(wordvecs, query, metric='cosine').reshape(-1)

    # argsort returns a new index array, so dist needs no copy.
    similar_idx = dist.argsort()[:topn]

    # Decode each index back to its term and convert distance to similarity.
    return [(idx2vocab[i], 1 - dist[i]) for i in similar_idx]

# Example query: nearest terms to '아이오아이' in the SVD word-vector space.
most_similar_terms('아이오아이')

[('아이오아이', 1.0),
 ('신용재', 0.9552835826769179),
 ('오블리스', 0.9547380602818735),
 ('너무너무너무', 0.9523250537862377),
 ('백퍼센트', 0.948786989664277),
 ('갓세븐', 0.9487118024619006),
 ('브레인', 0.9485585332530823),
 ('펜타곤', 0.939067391282393),
 ('미니팬미팅', 0.9341851792363978),
 ('컴백무대', 0.9332633597875625)]

In [25]:
# NOTE(review): explained_variance_ is the per-component variance of the
# transformed data, not eigenvalues or singular values (those are exposed as
# svd.singular_values_) — confirm the name 'eigenvalue' matches the intended use.
eigenvalue = svd.explained_variance_

In [26]:
# Persist the SVD outputs so they can be reloaded without refitting
# (the fit above took about 1306 seconds).
with open('2016-10-20-docvecs_bow_pmi.pkl', 'wb') as f:
    params2 = {
        'docvecs': docvecs,
        'wordvecs': wordvecs,
        'eigenvalue': eigenvalue
    }
    pickle.dump(params2, f)

In [27]:
from pprint import pprint

# Print the nearest terms for a handful of query words.
for word in ['박근혜', '대통령', '정부', '아프리카', '방송', '이화여대']:
    print('\n{}'.format(word))
    pprint(most_similar_terms(word))


박근혜
[('박근혜', 0.9999999999999997),
 ('대통령', 0.7988756238727761),
 ('전진', 0.634871453118881),
 ('콘서트관', 0.60674475714106),
 ('박수치고', 0.6007970609253678),
 ('미래교실관', 0.5999878373939997),
 ('온라인공개강의', 0.5978125040754223),
 ('광호', 0.5941498380214437),
 ('자유학기제홍보대사', 0.5819652384201459),
 ('공교육관', 0.571431717222399)]

대통령
[('대통령', 0.9999999999999994),
 ('박근혜', 0.7988756238727761),
 ('전진', 0.5679018736826036),
 ('수석비서관회의', 0.5524557205200482),
 ('청와대', 0.5109848021893318),
 ('수석비서관', 0.4816071830963704),
 ('집현실', 0.4749375355467407),
 ('경축사', 0.47215529075722174),
 ('광호', 0.44846383255937516),
 ('콘서트관', 0.44635857535256396)]

정부
[('정부', 1.0),
 ('대책', 0.47601975627780746),
 ('재건축시장', 0.4551833238767743),
 ('경제부', 0.4451573839138965),
 ('랴오닝성', 0.42082992194020497),
 ('정책', 0.42063731518519654),
 ('경제정책', 0.420576333853117),
 ('선대인경', 0.4181376693571981),
 ('정부역시', 0.41562144024380887),
 ('통화기구', 0.41562144024380887)]

아프리카
[('아프리카', 1.0),
 ('언어들', 0.6182983468452946),
 ('제국주의', 0.615921565180

In [28]:
# NOTE(review): pprint is already imported by the previous cell; this repeat is harmless.
from pprint import pprint

# Print the nearest terms for a second set of query words.
for word in ['식민지배', '케냐', '대도서관', '수요미식회']:
    print('\n{}'.format(word))
    pprint(most_similar_terms(word))


식민지배
[('식민지배', 0.9999999999999997),
 ('김지하', 0.9751327420357969),
 ('십자가', 0.9726322695447782),
 ('티옹오', 0.9721444217719373),
 ('기쿠유어', 0.9719829749699749),
 ('리즈대', 0.9679631525180397),
 ('투옥', 0.9669936889425889),
 ('기원국제회의실', 0.965253834057054),
 ('대중가수', 0.9645828690682032),
 ('언어전사', 0.9644906492669713)]

케냐
[('케냐', 1.0),
 ('비교문학', 0.9901423839967789),
 ('꽃잎', 0.9899747744222358),
 ('노벨문학상', 0.9884787991228627),
 ('식민지', 0.986683823204836),
 ('박경리문학상', 0.9775373584154181),
 ('하루키', 0.9762734329444406),
 ('시옹오', 0.9674095803682912),
 ('독립전쟁', 0.9653599630448637),
 ('뉴욕대학교', 0.9649866017424072)]

대도서관
[('대도서관', 1.0),
 ('윰댕', 0.8991838643983184),
 ('밴쯔', 0.8907429763808172),
 ('호스팅비', 0.8641165940455023),
 ('상업방송', 0.8234322329595634),
 ('광고방송', 0.7556187056879555),
 ('방송정지', 0.7401599045464253),
 ('아프리카보다', 0.7031076465188734),
 ('나동', 0.6940303146375215),
 ('수익금보다', 0.6485224235527318)]

수요미식회
[('수요미식회', 1.0),
 ('홍신애', 0.7746152697726325),
 ('뚜렷해서', 0.7127567736129532),
 ('게스트보다',

In [29]:
# Row indices of the documents that contain the term '수요미식회'.
# Overwrites document_idx from the earlier cell.
document_idx = x[:,vocab2idx['수요미식회']].nonzero()[0]
print(document_idx)

[ 7007 12382 17775 17938 21351 21382 23539 26944]


In [30]:
# Highest-valued terms of document 17775, one of the documents listed above.
get_bow(17775, x)

[('김가연', 4),
 ('서울신문', 3),
 ('남편', 2),
 ('수요미식회', 2),
 ('전현무', 2),
 ('폭로', 2),
 ('19일', 1),
 ('게스트', 1),
 ('남부', 1),
 ('되니까', 1)]

In [31]:
# The 20 documents closest to #17775 in the SVD document-vector space,
# printed with their cosine similarity and their highest-valued terms in x.
for idx, sim in most_similar_docs(17775, docvecs, topn=20):
    print('\ndocument #{} (cosine={})'.format(idx, '%.3f'%sim))
    print(get_bow(idx, x))


document #17775 (cosine=1.000)
[('김가연', 4), ('서울신문', 3), ('남편', 2), ('수요미식회', 2), ('전현무', 2), ('폭로', 2), ('19일', 1), ('게스트', 1), ('남부', 1), ('되니까', 1)]

document #17791 (cosine=0.612)
[('김가연', 7), ('남자친구', 3), ('박보검', 3), ('서울신문', 3), ('하령', 3), ('임요환', 2), ('지목', 2), ('19일', 1), ('겼다', 1), ('공개', 1)]

document #17675 (cosine=0.581)
[('서울신문', 2), ('무단', 1), ('배포금지', 1), ('재미', 1)]

document #25477 (cosine=0.576)
[('박명수', 5), ('샤이니', 5), ('민호', 2), ('은진', 2), ('텐아시아', 2), ('해피투게더', 2), ('20일', 1), ('3573', 1), ('등장', 1), ('무단', 1)]

document #17858 (cosine=0.571)
[('손호영', 6), ('서울신문', 3), ('김태우', 2), ('놨다', 2), ('방송', 2), ('서문탁', 2), ('야기', 2), ('컬투쇼', 2), ('퍼포먼스', 2), ('20일', 1)]

document #17626 (cosine=0.569)
[('서울신문', 2), ('은행연합회', 2), ('전무이사', 2), ('19일', 1), ('32회', 1), ('경제학과', 1), ('경제협력개발기구', 1), ('공사참사관', 1), ('금융위원회', 1), ('금융허브기획과장', 1)]

document #17627 (cosine=0.565)
[('프로그램', 4), ('방송', 3), ('24개', 2), ('서울신문', 2), ('소년', 2), ('이다', 2), ('이름', 2), ('장학퀴즈', 2), ('19일', 1)

In [32]:
# Row indices of the documents that contain the term '이대'.
# Overwrites document_idx again.
document_idx = x[:,vocab2idx['이대']].nonzero()[0]
print(document_idx)

[  129   134   162   614   702  1066  1497  1954  2144  2198  2297  2508
  2729  2749  2756  2770  4813  5114  5319  5480  5749  5790  5796  5852
  5862  6457  7089  7217  7524  7654  9422  9437  9443  9446  9590  9593
  9647  9793 10141 10599 11008 11282 11433 11585 11768 11945 11946 11997
 12054 12055 12056 12110 12111 12112 12114 12115 12117 12118 12119 12156
 12226 12285 12492 12511 12562 12601 12844 13658 13838 13939 14089 14156
 14173 14282 14404 14435 14477 14566 14911 14928 15052 15477 15647 17183
 17193 17270 17575 17592 17671 17688 17928 18146 18381 18531 21802 22594
 24097 24865 25231 29255 29399 29499 29519 30055]


In [33]:
# Top 20 terms of document 29499, one of the documents listed above.
get_bow(29499, x, topn=20)

[('교수', 9),
 ('전화', 6),
 ('지도교수', 4),
 ('경고', 3),
 ('윤수영', 3),
 ('학장', 3),
 ('고소', 2),
 ('그래', 2),
 ('자리', 2),
 ('정유라', 2),
 ('정유라씨', 2),
 ('주장', 2),
 ('최순실', 2),
 ('최순실씨', 2),
 ('최씨', 2),
 ('01', 1),
 ('166', 1),
 ('고성', 1),
 ('과정', 1),
 ('과제', 1)]

In [34]:
# The 20 documents closest to #29499 in the SVD document-vector space,
# printed with their cosine similarity and their highest-valued terms in x.
for idx, sim in most_similar_docs(29499, docvecs, topn=20):
    print('\ndocument #{} (cosine={})'.format(idx, '%.3f'%sim))
    print(get_bow(idx, x))


document #29499 (cosine=1.000)
[('교수', 9), ('전화', 6), ('지도교수', 4), ('경고', 3), ('윤수영', 3), ('학장', 3), ('고소', 2), ('그래', 2), ('자리', 2), ('정유라', 2)]

document #29519 (cosine=0.989)
[('교수', 10), ('전화', 6), ('경고', 3), ('윤수영', 3), ('이화', 3), ('지도교수', 3), ('총장', 3), ('최경희', 3), ('학장', 3), ('고소', 2)]

document #11945 (cosine=0.825)
[('교수', 10), ('정씨', 7), ('학장', 6), ('전화', 5), ('지도교수', 4), ('최씨', 4), ('경고', 3), ('교체', 3), ('학생들', 3), ('19일', 2)]

document #17954 (cosine=0.820)
[('교수', 10), ('전화', 4), ('지도교수', 4), ('최순실', 4), ('학장', 4), ('경고', 3), ('고소', 2), ('정유라', 2), ('19일', 1), ('고성', 1)]

document #12226 (cosine=0.820)
[('교수', 10), ('전화', 5), ('정유라', 4), ('최순실', 4), ('학장', 4), ('경고', 3), ('인터뷰', 3), ('지도교수', 3), ('고소', 2), ('19일', 1)]

document #17270 (cosine=0.807)
[('교수', 9), ('최씨', 5), ('경고', 4), ('정씨', 4), ('정유라', 4), ('지도교수', 4), ('학장', 4), ('전화', 3), ('고소', 2), ('노컷뉴스', 2)]

document #13658 (cosine=0.802)
[('교수', 9), ('지도교수', 5), ('이화', 4), ('교수님', 3), ('인터뷰', 3), ('정유라', 3), ('지도',

In [39]:
def get_wordvec(term):
    """Look up the word vector of ``term`` as a ``(1, n_components)`` array.

    Reads the notebook-level ``vocab2idx`` and ``wordvecs``. Returns ``None``
    when the term is unknown or its index has no row in ``wordvecs``.
    """
    row = vocab2idx.get(term, -1)
    if 0 <= row < wordvecs.shape[0]:
        # Keep the result 2-D so it can be passed to pairwise_distances directly.
        return wordvecs[row, :].reshape(1, -1)
    return None

# Sanity check: a known term yields a single-row 2-D vector.
get_wordvec('아이오아이').shape

(1, 300)

In [50]:
def get_most_similar_docs_from_term(term, topn=10):
    """Return the documents whose vectors are closest to the vector of ``term``.

    The word vector from ``get_wordvec`` is compared by cosine distance with
    every row of the notebook-level ``docvecs``.

    Returns a list of ``(document index, cosine similarity)`` pairs ordered by
    increasing distance, or an empty list when the term has no word vector.
    """
    query = get_wordvec(term)
    if query is None:
        return []

    # Result has shape (1, n_documents); flatten it to one distance per document.
    distances = pairwise_distances(query, docvecs, metric='cosine').reshape(-1)
    nearest = distances.argsort()[:topn]

    # Convert distance to similarity.
    return [(doc, 1 - distances[doc]) for doc in nearest]

In [57]:
# Documents whose SVD vectors are closest to the word vector of '최순실',
# each shown by its highest-valued terms in x.
for doc_idx, sim in get_most_similar_docs_from_term('최순실'):
    print('doc#{}: {}'.format(doc_idx, get_bow(doc_idx, x)))

doc#11027: [('꼬리', 2), ('동아일보', 2), ('의혹', 2), ('20일', 1), ('50분', 1), ('논란', 1), ('더블루케이', 1), ('사무실', 1), ('소유주', 1), ('스포츠재단', 1)]
doc#11438: [('스포츠재단', 9), ('의혹', 6), ('대통령', 4), ('정씨', 4), ('게이트', 3), ('서도', 3), ('설립', 3), ('최씨', 3), ('검찰', 2), ('공익재단', 2)]
doc#12077: [('독일', 4), ('가능성', 3), ('국내', 2), ('대비', 2), ('도피', 2), ('사실', 2), ('세탁', 2), ('의혹', 2), ('이다', 2), ('정유라씨', 2)]
doc#14606: [('모녀', 6), ('확인', 6), ('독일', 5), ('의혹', 5), ('최순실', 5), ('검찰', 4), ('출국', 4), ('사실', 3), ('가능성', 2), ('개입', 2)]
doc#14532: [('모녀', 6), ('확인', 6), ('독일', 5), ('의혹', 5), ('최순실', 5), ('검찰', 4), ('출국', 4), ('사실', 3), ('가능성', 2), ('개입', 2)]
doc#11030: [('의혹', 7), ('새누리당', 4), ('동아일보', 3), ('박근혜', 3), ('서도', 3), ('최순실', 3), ('검찰', 2), ('규명', 2), ('기자들', 2), ('논의', 2)]
doc#11997: [('스포츠재단', 6), ('문제', 4), ('의혹', 4), ('정씨', 4), ('관리', 3), ('대가', 3), ('모녀', 3), ('지도교수', 3), ('출석', 3), ('권력', 2)]
doc#10692: [('최순실', 12), ('주장', 9), ('국정조사', 8), ('대통령', 7), ('게이트', 6), ('의혹', 6), ('비주류', 5), ('관련', 4), (

In [61]:
# The word '최순실' is not present; only '최씨' and '최순실씨' are.
# get_bow(2778, x, -1)