In [2]:
import sys
sys.path.append('/mnt/lovit/git/soynlp/')  # make the soynlp sources at this path importable

from soynlp.utils import DoublespaceLineCorpus
from soynlp.vectorizer import sent_to_word_contexts_matrix

# Input text file. iter_sent=True is passed to the corpus reader; the build
# log reports its progress in "sents", so iteration is presumably per sentence.
corpus_path = '/mnt/lovit/works/fastcampus_text_ml/2nd/data/corpus_10days/news/2016-10-20_article_all_normed_nountokenized.txt'
corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)

# Build the (word, context) matrix `x` and the index -> word list `idx2vocab`.
# The whitespace tokenizer is spelled out explicitly; it is reportedly the
# same as the library default.
# NOTE(review): windows / min_tf / dynamic_weight semantics are defined by
# soynlp and are not visible here — confirm against the library docs.
x, idx2vocab = sent_to_word_contexts_matrix(
    corpus,
    tokenizer=lambda sent: sent.split(),
    windows=3,
    min_tf=10,
    dynamic_weight=True,
    verbose=True,
)

Create (word, contexts) matrix
  - counting word frequency from 30001 sents, mem=0.085 Gb
  - scanning (word, context) pairs from 30001 sents, mem=0.601 Gb
  - (word, context) matrix was constructed. shape = (24907, 24907)                    
  - done


In [3]:
# Dimensions of the (word, context) matrix built in the previous cell.
x.shape

(24907, 24907)

    class Glove(object):
        """
        Class for estimating GloVe word embeddings using the
        corpus co-occurrence matrix.
        """

        def fit(self, matrix, epochs=5, no_threads=2, verbose=False):
            """
            Estimate the word embeddings.

            Parameters:
            - scipy.sparse.coo_matrix matrix: co-occurrence matrix
            - int epochs: number of training epochs
            - int no_threads: number of training threads
            - bool verbose: print progress messages if True
            """
            ...


In [6]:
%%time
from glove import Glove

# Model with 100 embedding components and a learning rate of 0.05.
glove = Glove(no_components=100, learning_rate=0.05)
# fit() is documented as taking a scipy.sparse.coo_matrix, hence the
# tocoo() conversion of the co-occurrence matrix before training.
glove.fit(x.tocoo(), epochs=10, no_threads=4, verbose=True)

Performing 10 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
CPU times: user 3min 24s, sys: 20 ms, total: 3min 24s
Wall time: 52.6 s


    class Glove(object):
        """
        Class for estimating GloVe word embeddings using the
        corpus co-occurrence matrix.
        """

        def add_dictionary(self, dictionary):
            """
            Supply a word-id dictionary to allow similarity queries.

            Parameters:
            - dict dictionary: maps each word to its integer id
            """



In [7]:
# Invert idx2vocab (position -> word) into a word -> position lookup.
dictionary = dict(zip(idx2vocab, range(len(idx2vocab))))

In [8]:
# Attach the word -> index mapping to the trained model; per the library
# docstring this is what allows similarity queries by word.
glove.add_dictionary(dictionary)

In [55]:
from pprint import pprint

# Query words whose nearest neighbours in the embedding space are printed.
words = '아이오아이 아프리카 박근혜 뉴스 날씨 이화여대 아프리카발톱개구리'.split()
for word in words:
    print('\n{}'.format(word))
    # The recorded output lists 9 neighbours per word for number=10.
    neighbours = glove.most_similar(word, number=10)
    pprint(neighbours)


아이오아이
[('완전체', 0.8110422744510709),
 ('신용재', 0.7714571475478809),
 ('너무너무너무', 0.7215638366243295),
 ('개헌론자', 0.686157851347118),
 ('홈플러스', 0.6179901481976946),
 ('메르세데스', 0.6116926738502735),
 ('에이핑크', 0.5982829613872631),
 ('벤츠', 0.5979917733415333),
 ('산골', 0.5888233714205227)]

아프리카
[('밴쯔', 0.7313049966795623),
 ('대중소기업협력재단', 0.5966980195934859),
 ('자원봉사단', 0.5757705075858978),
 ('비상식량', 0.5669954560028978),
 ('아시아', 0.5379778598627764),
 ('방송사', 0.5369378429569682),
 ('태평양', 0.5357861391625623),
 ('홈쇼핑', 0.5354333151012627),
 ('연준', 0.5346106313898341)]

박근혜
[('역적패당', 0.8742352455386201),
 ('대통령', 0.8144689554935732),
 ('취임식', 0.7822032376162812),
 ('노무현', 0.7685164882471495),
 ('2002년', 0.753336645290518),
 ('포로', 0.753204440099081),
 ('핵심사업', 0.7405972261337707),
 ('방북', 0.735674823178534),
 ('이복형', 0.7290167256216834)]

뉴스
[('여러분', 0.8518878027673972),
 ('미란다', 0.8499598965893723),
 ('머니투데이', 0.8282909648811618),
 ('리얼', 0.8262764154211826),
 ('마이데일리', 0.8185270159522261),
 ('실

In [21]:
# Inspect the embedding matrix held by the fitted model: its shape and
# container type.
print(glove.word_vectors.shape)
print(type(glove.word_vectors))

(24907, 100)
<class 'numpy.ndarray'>


In [22]:
# Inspect the bias array held by the fitted model: its shape and
# container type.
print(glove.word_biases.shape)
print(type(glove.word_biases))

(24907,)
<class 'numpy.ndarray'>


In [32]:
import numpy as np

# Column sums of the co-occurrence matrix, flattened to a 1-D array with
# one entry per vocabulary index.
vocab_frequency = np.asarray(x.sum(axis=0)).ravel()
print(type(vocab_frequency))
print(vocab_frequency.shape)

<class 'numpy.ndarray'>
(24907,)


In [33]:
import scipy as sp
# `import scipy` on its own does not guarantee that the `stats` submodule is
# loaded (older SciPy raises AttributeError on `sp.stats` in a fresh kernel),
# so import the submodule explicitly. `sp` stays bound for the later cells.
import scipy.stats

# Pearson correlation (coefficient, p-value) between the learned per-word
# bias and the raw frequency of each word.
sp.stats.pearsonr(glove.word_biases, vocab_frequency)

(0.22297902936046934, 3.5819032631339966e-278)

In [34]:
# Same correlation, but against the log of the frequency instead of the raw value.
sp.stats.pearsonr(glove.word_biases, np.log(vocab_frequency))

(0.43664360593289386, 0.0)

In [47]:
bin_size = 200
num_bin = 20

# Rank the vocabulary once, most frequent first. The ranking is the same for
# every bin, so it is computed outside the loop instead of re-sorting the
# whole frequency array on each iteration.
frequency_rank = vocab_frequency.argsort()[::-1]

# For each consecutive block of `bin_size` words in that ranking, report the
# Pearson correlation between the word bias and the log frequency.
for i in range(num_bin):
    b = i * bin_size
    e = (i + 1) * bin_size
    indices = frequency_rank[b:e]
    corr, p_value = sp.stats.pearsonr(
        glove.word_biases[indices],
        np.log(vocab_frequency[indices])
    )
    print('correlation of {} ~ {} frequent words = {:f}'.format(
        b, e, corr))

correlation of 0 ~ 200 frequent words = 0.575500
correlation of 200 ~ 400 frequent words = 0.018898
correlation of 400 ~ 600 frequent words = 0.110320
correlation of 600 ~ 800 frequent words = 0.089869
correlation of 800 ~ 1000 frequent words = -0.031591
correlation of 1000 ~ 1200 frequent words = 0.113555
correlation of 1200 ~ 1400 frequent words = 0.164107
correlation of 1400 ~ 1600 frequent words = 0.088576
correlation of 1600 ~ 1800 frequent words = -0.069694
correlation of 1800 ~ 2000 frequent words = 0.113063
correlation of 2000 ~ 2200 frequent words = 0.075235
correlation of 2200 ~ 2400 frequent words = 0.085103
correlation of 2400 ~ 2600 frequent words = 0.004857
correlation of 2600 ~ 2800 frequent words = -0.013375
correlation of 2800 ~ 3000 frequent words = 0.122389
correlation of 3000 ~ 3200 frequent words = -0.023897
correlation of 3200 ~ 3400 frequent words = 0.045011
correlation of 3400 ~ 3600 frequent words = -0.060463
correlation of 3600 ~ 3800 frequent words = 0.179952

In [48]:
# Copy the co-occurrence matrix and replace every stored value with its
# natural log; the original `x` is left untouched.
# NOTE(review): log maps stored values equal to 1 to 0 and values below 1 to
# negative numbers — confirm that is acceptable for the training step.
x_log = x.copy()
x_log.data = np.log(x_log.data)

In [49]:
%%time
from glove import Glove

# Second model with the same settings as `glove`, trained on the
# log-transformed matrix instead of the raw one.
glove_log = Glove(no_components=100, learning_rate=0.05)
glove_log.fit(x_log.tocoo(), epochs=10, no_threads=4, verbose=True)

Performing 10 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
CPU times: user 3min 20s, sys: 32 ms, total: 3min 20s
Wall time: 51.6 s


In [58]:
# Attach the same word -> index mapping to the second model so it can be
# queried by word.
glove_log.add_dictionary(dictionary)

In [59]:
from pprint import pprint

# Same query words as for the first model, now looked up in `glove_log`.
words = '아이오아이 아프리카 박근혜 뉴스 날씨 이화여대 아프리카발톱개구리'.split()
for word in words:
    print('\n{}'.format(word))
    # The recorded output lists 9 neighbours per word for number=10.
    neighbours = glove_log.most_similar(word, number=10)
    pprint(neighbours)


아이오아이
[('저유가', 0.6763766934512719),
 ('자활사업', 0.6762899551714665),
 ('개막식', 0.6748371592492104),
 ('24년', 0.6708396305297788),
 ('금융투자업계', 0.666910025191982),
 ('김장', 0.6658309910286825),
 ('희현', 0.6607676843999316),
 ('두산그룹', 0.6577247539629182),
 ('결실', 0.6574939949167108)]

아프리카
[('토지', 0.5250511879809441),
 ('여론조사', 0.5077297609615892),
 ('여행주간', 0.5037917226552694),
 ('심은경', 0.4987474151148324),
 ('모색', 0.4958612904573909),
 ('후원', 0.49222607794925727),
 ('이대호', 0.49202697420582975),
 ('2000', 0.4898551354924104),
 ('국내외', 0.4795965814975639)]

박근혜
[('제기', 0.717966820803743),
 ('얘기', 0.7137186259032721),
 ('그래서', 0.7115330709333866),
 ('경제', 0.7073272566137323),
 ('시대', 0.7066019149574528),
 ('실제', 0.7055290121013276),
 ('미국', 0.6970868537549029),
 ('있다고', 0.6968010651676112),
 ('중요', 0.6966457843195364)]

뉴스
[('자세', 0.7091585480589409),
 ('화제성이', 0.701761400919492),
 ('수가', 0.6902325753881542),
 ('한다고', 0.6726047361489028),
 ('기회', 0.6620048779251585),
 ('소개', 0.6620014728778199