In [27]:
import config

from soynlp.utils import DoublespaceLineCorpus
from soynlp.vectorizer import sent_to_word_contexts_matrix

corpus_path = config.data_directory + '/corpus_10days/news/2016-10-20_article_all_normed_ltokenize.txt'
corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)

x, idx2vocab = sent_to_word_contexts_matrix(
    corpus,
    windows=3,
    min_tf=10,
    tokenizer=lambda x:x.split(), # (default) lambda x:x.split(),
    dynamic_weight=True,
    verbose=True)

Create (word, contexts) matrix
  - counting word frequency from 223356 sents, mem=1.565 Gb
  - scanning (word, context) pairs from 223356 sents, mem=1.904 Gb
  - (word, context) matrix was constructed. shape = (36002, 36002)                    
  - done


In [2]:
x.shape

(36002, 36002)

    class Glove(object):
        """
        Class for estimating GloVe word embeddings using the
        corpus coocurrence matrix.
        """

        def fit(self, matrix, epochs=5, no_threads=2, verbose=False):
            """
            Estimate the word embeddings.
            Parameters:
            - scipy.sparse.coo_matrix matrix: coocurrence matrix
            - int epochs: number of training epochs
            - int no_threads: number of training threads
            - bool verbose: print progress messages if True
            """
            ...


In [28]:
%%time
from glove import Glove

glove = Glove(no_components=100, learning_rate=0.05, max_count=30)
glove.fit(x.tocoo(), epochs=5, no_threads=4, verbose=True)

Performing 5 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
CPU times: user 1min 51s, sys: 16 ms, total: 1min 51s
Wall time: 28.8 s


    class Glove(object):
        """
        Class for estimating GloVe word embeddings using the
        corpus coocurrence matrix.
        """

        def add_dictionary(self, dictionary):
            """
            Supply a word-id dictionary to allow similarity queries.
            """



In [29]:
dictionary = {vocab:idx for idx, vocab in enumerate(idx2vocab)}
glove.add_dictionary(dictionary)

In [30]:
from pprint import pprint

words = '아이오아이 아프리카 밴쯔 박근혜 뉴스 날씨 이화여대 아프리카발톱개구리'.split()
for word in words:
    print('\n{}'.format(word))
    similars = glove.most_similar(word, number=10)
    for sim_word, sim in similars:
        print(' - {} ({:f})'.format(sim_word, sim))


아이오아이
 - 빅브레인 (0.825536)
 - 에이핑크 (0.755784)
 - 샤이니 (0.742245)
 - 신용재 (0.738495)
 - 63 (0.735941)
 - 흡연자 (0.724316)
 - 변호사 (0.720277)
 - 49가구 (0.715663)
 - 47 (0.713656)

아프리카
 - 인도네시아 (0.788612)
 - 터키 (0.786974)
 - 위원회 (0.758486)
 - 관리관 (0.756677)
 - 의회 (0.751656)
 - 프랑스 (0.749682)
 - 아내 (0.740632)
 - 지자체 (0.737634)
 - 독일 (0.735503)

밴쯔
 - 마이크로소프트 (0.881536)
 - 평론가 (0.873967)
 - 러에코 (0.862367)
 - 부산교통공사 (0.860085)
 - 전소미 (0.857028)
 - 왕씨 (0.851574)
 - 주택금융공사 (0.844223)
 - 스트레인지 (0.840058)
 - 컵스 (0.836951)

박근혜
 - 백승렬 (0.837885)
 - 대통령 (0.813706)
 - 모아소아 (0.790270)
 - 강지우 (0.788004)
 - 김준배 (0.768029)
 - 박준형 (0.764778)
 - 집현실 (0.754623)
 - 정 (0.751644)
 - 영상공모 (0.751037)

뉴스
 - 현입니다 (0.927284)
 - 돈이 (0.897314)
 - 리얼타임 (0.845308)
 - 정시내 (0.844449)
 - 보이는 (0.841029)
 - 머니 (0.816798)
 - 가치 (0.806059)
 - 마이데일리 (0.802414)
 - 머니투데이 (0.792055)

날씨
 - 쌀쌀 (0.724462)
 - 담배 (0.719716)
 - 이어지겠습니다 (0.702675)
 - 추운 (0.692823)
 - 갑가량 (0.679861)
 - 줄기 (0.647080)
 - 용어 (0.635079)
 - 상륙 (0.631704)
 - 시나리

In [31]:
print(glove.word_vectors.shape)
print(type(glove.word_vectors))

(36002, 100)
<class 'numpy.ndarray'>


In [32]:
print(glove.word_biases.shape)
print(type(glove.word_biases))

(36002,)
<class 'numpy.ndarray'>


In [33]:
import numpy as np

vocab_frequency = np.asarray(x.sum(axis=0)).reshape(-1)
print(type(vocab_frequency))
print(vocab_frequency.shape)

<class 'numpy.ndarray'>
(36002,)


In [34]:
import scipy as sp
sp.stats.pearsonr(glove.word_biases, vocab_frequency)

(0.1768017150356146, 1.3044250082638122e-250)

In [35]:
sp.stats.pearsonr(glove.word_biases, np.log(vocab_frequency + 1))

(0.43208580547485437, 0.0)

In [36]:
bin_size = 200
num_bin = 20

for i in range(num_bin):
    b = i * bin_size
    e = (i + 1) * bin_size
    indices = vocab_frequency.argsort()[::-1][b:e]
    corr, p_value = sp.stats.pearsonr(
        glove.word_biases[indices],
        np.log(vocab_frequency[indices])
    )
    print('top {} ~ {} frequent words = {:f}'.format(
        b, e, corr))

correlation of 0 ~ 200 frequent words = 0.842055
correlation of 200 ~ 400 frequent words = 0.000998
correlation of 400 ~ 600 frequent words = 0.043573
correlation of 600 ~ 800 frequent words = 0.101054
correlation of 800 ~ 1000 frequent words = 0.015101
correlation of 1000 ~ 1200 frequent words = -0.034661
correlation of 1200 ~ 1400 frequent words = -0.125970
correlation of 1400 ~ 1600 frequent words = 0.048151
correlation of 1600 ~ 1800 frequent words = 0.080136
correlation of 1800 ~ 2000 frequent words = -0.071327
correlation of 2000 ~ 2200 frequent words = 0.074374
correlation of 2200 ~ 2400 frequent words = 0.076970
correlation of 2400 ~ 2600 frequent words = 0.009619
correlation of 2600 ~ 2800 frequent words = 0.093729
correlation of 2800 ~ 3000 frequent words = -0.065978
correlation of 3000 ~ 3200 frequent words = 0.041012
correlation of 3200 ~ 3400 frequent words = 0.073790
correlation of 3400 ~ 3600 frequent words = 0.120550
correlation of 3600 ~ 3800 frequent words = 0.000758


## Cooccurrence vs PMI

In [37]:
corpus_path = config.data_directory + '/corpus_10days/news/2016-10-20_article_all_normed_ltokenize.txt'
corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)

x, idx2vocab = sent_to_word_contexts_matrix(
    corpus,
    windows=3,
    min_tf=10,
    tokenizer=lambda x:x.split(), # (default) lambda x:x.split(),
    dynamic_weight=True,
    verbose=True)

from soynlp.word import pmi

pmi_dok = pmi(
    x,
    min_pmi=0,
    alpha=0.0001,
    verbose=True)

Create (word, contexts) matrix
  - counting word frequency from 223356 sents, mem=1.603 Gb
  - scanning (word, context) pairs from 223356 sents, mem=1.933 Gb
  - (word, context) matrix was constructed. shape = (36002, 36002)                    
  - done
computing pmi was done                              


In [13]:
pmi_coo = pmi_dok.tocoo()
pmi_coo.data = np.exp(pmi_coo.data)

In [38]:
glove = Glove(no_components=100, learning_rate=0.05, max_count=3)
glove.fit(pmi_coo, epochs=10, no_threads=4, verbose=True)

dictionary = {vocab:idx for idx, vocab in enumerate(idx2vocab)}
glove.add_dictionary(dictionary)

Performing 10 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9


In [39]:
words = '아이오아이 아프리카 밴쯔 박근혜 뉴스 날씨 이화여대 아프리카발톱개구리'.split()
for word in words:
    print('\n{}'.format(word))
    similars = glove.most_similar(word, number=10)
    for sim_word, sim in similars:
        print(' - {} ({:f})'.format(sim_word, sim))


아이오아이
 - 세븐 (0.821385)
 - 에이핑크 (0.818971)
 - 몬스타엑스 (0.787898)
 - 보이그룹 (0.764891)
 - 조해진 (0.752306)
 - 오블리스 (0.748496)
 - 에일리 (0.747067)
 - 익산 (0.745284)
 - ㅣ이정아 (0.744192)

아프리카
 - 태평양지사 (0.697227)
 - 한번씩 (0.688202)
 - 넘기고 (0.686094)
 - 태평양 (0.685456)
 - 부천 (0.683222)
 - 22억원 (0.678707)
 - 사이언스 (0.678692)
 - 바닷가 (0.667071)
 - 찾았던 (0.665898)

밴쯔
 - 분양광고 (0.966198)
 - 프라다 (0.958264)
 - 30만명 (0.952945)
 - 취득세 (0.949870)
 - 기억상실 (0.946373)
 - 심씨 (0.943394)
 - 상표권 (0.939953)
 - 탐구 (0.936034)
 - 계열회사 (0.930536)

박근혜
 - 역적패당 (0.589079)
 - 주체위성들 (0.588547)
 - 대통령 (0.580765)
 - 정권 (0.565015)
 - 내자 (0.516892)
 - 취임식 (0.510350)
 - 노무현 (0.506976)
 - 가소로운 (0.490007)
 - 채송무기자 (0.486237)

뉴스
 - 기다립니다 (0.755410)
 - 머니투데이 (0.658422)
 - 리얼타임 (0.644828)
 - 가치 (0.625832)
 - 뉴미디어 (0.599867)
 - 마이데일리 (0.563720)
 - 보이는 (0.555548)
 - 화제성 (0.550258)
 - 미란다 (0.533638)

날씨
 - 이어지겠습니다 (0.801522)
 - 불어오는 (0.668363)
 - 더운 (0.642125)
 - 쌀쌀 (0.637030)
 - 맑고 (0.631879)
 - 맑은 (0.606140)
 - 선선 (0.580214)
 - 완연한 (0.5771

## input is X or log X?

In [40]:
x_log = x.copy()
x_log.data = np.log(x_log.data)

In [41]:
%%time
from glove import Glove

glove_log = Glove(no_components=100, learning_rate=0.05, max_count=3)
glove_log.fit(x_log.tocoo(), epochs=5, no_threads=4, verbose=True)

Performing 5 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
CPU times: user 1min 51s, sys: 16 ms, total: 1min 51s
Wall time: 28.8 s


In [42]:
glove_log.add_dictionary(dictionary)

In [43]:
from pprint import pprint

words = '아이오아이 아프리카 박근혜 뉴스 날씨 이화여대 아프리카발톱개구리'.split()
for word in words:
    print('\n{}'.format(word))
    similars = glove.most_similar(word, number=10)
    for sim_word, sim in similars:
        print(' - {} ({:f})'.format(sim_word, sim))


아이오아이
 - 세븐 (0.821385)
 - 에이핑크 (0.818971)
 - 몬스타엑스 (0.787898)
 - 보이그룹 (0.764891)
 - 조해진 (0.752306)
 - 오블리스 (0.748496)
 - 에일리 (0.747067)
 - 익산 (0.745284)
 - ㅣ이정아 (0.744192)

아프리카
 - 태평양지사 (0.697227)
 - 한번씩 (0.688202)
 - 넘기고 (0.686094)
 - 태평양 (0.685456)
 - 부천 (0.683222)
 - 22억원 (0.678707)
 - 사이언스 (0.678692)
 - 바닷가 (0.667071)
 - 찾았던 (0.665898)

박근혜
 - 역적패당 (0.589079)
 - 주체위성들 (0.588547)
 - 대통령 (0.580765)
 - 정권 (0.565015)
 - 내자 (0.516892)
 - 취임식 (0.510350)
 - 노무현 (0.506976)
 - 가소로운 (0.490007)
 - 채송무기자 (0.486237)

뉴스
 - 기다립니다 (0.755410)
 - 머니투데이 (0.658422)
 - 리얼타임 (0.644828)
 - 가치 (0.625832)
 - 뉴미디어 (0.599867)
 - 마이데일리 (0.563720)
 - 보이는 (0.555548)
 - 화제성 (0.550258)
 - 미란다 (0.533638)

날씨
 - 이어지겠습니다 (0.801522)
 - 불어오는 (0.668363)
 - 더운 (0.642125)
 - 쌀쌀 (0.637030)
 - 맑고 (0.631879)
 - 맑은 (0.606140)
 - 선선 (0.580214)
 - 완연한 (0.577185)
 - 보이겠습니다 (0.565806)

이화여대
 - 입학 (0.657255)
 - 이대 (0.651385)
 - 모모영화관 (0.631653)
 - 정유라씨 (0.622455)
 - 아트하우스 (0.619704)
 - 총장 (0.589111)
 - 특혜 (0.585906)
 - 정유연 (0.

## Use only noun

In [44]:
corpus_path = config.data_directory + 'corpus_10days/news/2016-10-20_article_all_normed_nountokenized.txt'
corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)

x, idx2vocab = sent_to_word_contexts_matrix(
    corpus,
    windows=3,
    min_tf=10,
    tokenizer=lambda x:x.split(), # (default) lambda x:x.split(),
    dynamic_weight=True,
    verbose=True)

x.shape

Create (word, contexts) matrix
  - counting word frequency from 30001 sents, mem=1.610 Gb
  - scanning (word, context) pairs from 30001 sents, mem=1.877 Gb
  - (word, context) matrix was constructed. shape = (24907, 24907)                    
  - done


(24907, 24907)

In [45]:
%%time
from glove import Glove

glove = Glove(no_components=100, learning_rate=0.05, max_count=30)
glove.fit(x.tocoo(), epochs=5, no_threads=4, verbose=True)

Performing 5 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
CPU times: user 1min 43s, sys: 8 ms, total: 1min 43s
Wall time: 26.4 s


In [46]:
dictionary = {vocab:idx for idx, vocab in enumerate(idx2vocab)}
glove.add_dictionary(dictionary)

words = '아이오아이 아프리카 밴쯔 박근혜 뉴스 날씨 이화여대 아프리카발톱개구리'.split()
for word in words:
    print('\n{}'.format(word))
    similars = glove.most_similar(word, number=10)
    for sim_word, sim in similars:
        print(' - {} ({:f})'.format(sim_word, sim))


아이오아이
 - 신용재 (0.788213)
 - 완전체 (0.783201)
 - 너무너무너무 (0.746413)
 - 성진환 (0.661771)
 - 에이핑크 (0.653405)
 - 정채연 (0.651380)
 - 공포증 (0.614557)
 - 몬스타엑스 (0.600836)
 - 김규 (0.600183)

아프리카
 - 밴쯔 (0.764979)
 - 동남아시아 (0.627443)
 - 댈러스 (0.618848)
 - 중동 (0.611323)
 - 뉴욕증시 (0.582824)
 - 자원봉사단 (0.582330)
 - 매체들 (0.574021)
 - 비상식량 (0.561443)
 - 현장경영 (0.558286)

밴쯔
 - 대도서관 (0.814754)
 - 아프리카 (0.764979)
 - 주간아이돌 (0.716317)
 - 관료 (0.699244)
 - 남미 (0.697823)
 - 바이어 (0.693456)
 - 중남미 (0.689812)
 - 이천시 (0.677001)
 - 캄보디아 (0.674063)

박근혜
 - 역적패당 (0.873995)
 - 대통령 (0.788461)
 - 2002년 (0.731508)
 - 취임식 (0.728809)
 - 비선 (0.717803)
 - 방북 (0.712427)
 - 핵심사업 (0.703182)
 - 노무현 (0.703076)
 - 전진 (0.686775)

뉴스
 - 미란다 (0.896527)
 - 여러분 (0.883907)
 - 마이데일리 (0.858831)
 - 제보 (0.835693)
 - 리얼 (0.820783)
 - 취재원과 (0.818968)
 - 공감 (0.812822)
 - 721 (0.811476)
 - 1105 (0.800457)

날씨
 - 쌀쌀 (0.841931)
 - 추운 (0.828799)
 - 강원영동 (0.633724)
 - 아침 (0.627951)
 - 대체 (0.618444)
 - 선선 (0.617151)
 - 새벽 (0.601603)
 - 완연 (0.594135)
 - 가을 (

In [47]:
import numpy as np

vocab_frequency = np.asarray(x.sum(axis=0)).reshape(-1)
print(type(vocab_frequency))
print(vocab_frequency.shape)

<class 'numpy.ndarray'>
(24907,)


In [48]:
bin_size = 200
num_bin = 20

for i in range(num_bin):
    b = i * bin_size
    e = (i + 1) * bin_size
    indices = vocab_frequency.argsort()[::-1][b:e]
    corr, p_value = sp.stats.pearsonr(
        glove.word_biases[indices],
        np.log(vocab_frequency[indices])
    )
    print('correlation of {} ~ {} frequent words = {:f}'.format(
        b, e, corr))

correlation of 0 ~ 200 frequent words = 0.540041
correlation of 200 ~ 400 frequent words = 0.026582
correlation of 400 ~ 600 frequent words = 0.107916
correlation of 600 ~ 800 frequent words = 0.078556
correlation of 800 ~ 1000 frequent words = -0.015475
correlation of 1000 ~ 1200 frequent words = 0.110209
correlation of 1200 ~ 1400 frequent words = 0.151474
correlation of 1400 ~ 1600 frequent words = 0.095489
correlation of 1600 ~ 1800 frequent words = -0.063102
correlation of 1800 ~ 2000 frequent words = 0.084501
correlation of 2000 ~ 2200 frequent words = 0.055400
correlation of 2200 ~ 2400 frequent words = 0.084159
correlation of 2400 ~ 2600 frequent words = 0.007878
correlation of 2600 ~ 2800 frequent words = -0.006996
correlation of 2800 ~ 3000 frequent words = 0.124360
correlation of 3000 ~ 3200 frequent words = -0.019789
correlation of 3200 ~ 3400 frequent words = 0.053977
correlation of 3400 ~ 3600 frequent words = -0.050349
correlation of 3600 ~ 3800 frequent words = 0.189509