In [1]:
import config

from soynlp.utils import DoublespaceLineCorpus
from soynlp.vectorizer import sent_to_word_contexts_matrix

# Sentence-level iterator over the pre-tokenized news corpus for 2016-10-20.
corpus_path = config.data_directory + '/corpus_10days/news/2016-10-20_article_all_normed_ltokenize.txt'
corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)

# Options for the (word, context) matrix. Sentences are already tokenized,
# so plain whitespace splitting is used as the tokenizer.
# NOTE(review): windows / min_tf / dynamic_weight semantics are defined by
# soynlp and are not visible here -- confirm against its documentation.
matrix_options = dict(
    windows=3,
    min_tf=10,
    tokenizer=str.split,
    dynamic_weight=True,
    verbose=True,
)

# x is the square (word, context) matrix; idx2vocab is iterated in index
# order later in the notebook to build the word -> index dictionary.
x, idx2vocab = sent_to_word_contexts_matrix(corpus, **matrix_options)

Create (word, contexts) matrix
  - counting word frequency from 223356 sents, mem=0.098 Gb
  - scanning (word, context) pairs from 223356 sents, mem=0.692 Gb
  - (word, context) matrix was constructed. shape = (36002, 36002)                    
  - done


In [2]:
# Display the shape of the (word, context) matrix built in the previous cell.
x.shape

(36002, 36002)

    class Glove(object):
        """
        Class for estimating GloVe word embeddings using the
        corpus co-occurrence matrix.
        """

        def fit(self, matrix, epochs=5, no_threads=2, verbose=False):
            """
            Estimate the word embeddings.
            Parameters:
            - scipy.sparse.coo_matrix matrix: co-occurrence matrix
            - int epochs: number of training epochs
            - int no_threads: number of training threads
            - bool verbose: print progress messages if True
            """
            ...


In [3]:
%%time
from glove import Glove

# Train 100-dimensional GloVe embeddings on the co-occurrence matrix.
# Glove.fit takes a COO matrix, hence the conversion.
# NOTE(review): no random seed is set here, so neighbours may differ between
# runs -- confirm whether a seed should be passed to Glove.
glove = Glove(learning_rate=0.05, no_components=100)
glove.fit(matrix=x.tocoo(), epochs=30, no_threads=4, verbose=True)

Performing 30 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29
CPU times: user 11min 19s, sys: 144 ms, total: 11min 19s
Wall time: 2min 54s


    class Glove(object):
        """
        Class for estimating GloVe word embeddings using the
        corpus co-occurrence matrix.
        """

        def add_dictionary(self, dictionary):
            """
            Supply a word-id dictionary to allow similarity queries.
            """



In [4]:
# Invert idx2vocab into a word -> row index mapping and hand it to the model
# so that similarity queries can be made by word.
dictionary = dict((term, position) for position, term in enumerate(idx2vocab))
glove.add_dictionary(dictionary)

In [6]:
from pprint import pprint

# Probe words used to eyeball embedding quality.
words = '아이오아이 아프리카 밴쯔 박근혜 뉴스 날씨 이화여대 아프리카발톱개구리'.split()
for query in words:
    print('\n{}'.format(query))
    # (word, similarity) pairs; the recorded output lists nine neighbours
    # per query for number=10.
    neighbours = glove.most_similar(query, number=10)
    pprint(neighbours)


아이오아이
[('신용재', 0.6330379191527024),
 ('빅브레인', 0.5123793494284452),
 ('전소미', 0.5114836971245029),
 ('춘천시', 0.5013883818565743),
 ('남남서쪽', 0.499423588901108),
 ('교수협의회', 0.48165639085548845),
 ('에이핑크', 0.4751089790222733),
 ('스카이컨벤션웨딩', 0.4639742051458616),
 ('너무너무너무', 0.462583507196116)]

아프리카
[('오세아니아', 0.5468469915590192),
 ('아시아', 0.523738394896139),
 ('밴쯔', 0.5164662410586703),
 ('지역', 0.5157234799324472),
 ('유럽', 0.500360932766298),
 ('맨스에비뉴', 0.4916878088569188),
 ('서해안', 0.48839876263481596),
 ('연준은행', 0.484459463846144),
 ('태평양', 0.47942268656046266)]

밴쯔
[('페란테', 0.6780002382000481),
 ('히들스턴', 0.6550006153192783),
 ('역북지구', 0.6537082697322909),
 ('왕씨', 0.6506106387525962),
 ('구윤재', 0.6441480886063712),
 ('러에코', 0.6434737034372586),
 ('제보자', 0.6405363650424534),
 ('고어', 0.6350559752278742),
 ('대너', 0.6302981702705076)]

박근혜
[('대통령', 0.7529166024278048),
 ('노무현', 0.6699908024338649),
 ('정권', 0.6686890352854226),
 ('백승렬', 0.6513448409075263),
 ('역적패당', 0.6403554473185094),
 ('정부'

In [7]:
# Inspect the learned embedding matrix: one row per vocabulary word.
word_vectors = glove.word_vectors
print(word_vectors.shape)
print(type(word_vectors))

(36002, 100)
<class 'numpy.ndarray'>


In [8]:
# Inspect the learned bias terms: one scalar per vocabulary word.
word_biases = glove.word_biases
print(word_biases.shape)
print(type(word_biases))

(36002,)
<class 'numpy.ndarray'>


In [9]:
import numpy as np

# Column totals of the (word, context) matrix, flattened to a 1-D array.
# NOTE(review): these are summed co-occurrence weights used as a proxy for
# word frequency, not raw corpus counts -- confirm this is intended.
column_totals = x.sum(axis=0)
vocab_frequency = np.asarray(column_totals).reshape(-1)
print(type(vocab_frequency))
print(vocab_frequency.shape)

<class 'numpy.ndarray'>
(36002,)


In [10]:
import scipy as sp
# `import scipy` alone does not guarantee that the `stats` submodule is
# loaded; import it explicitly so `sp.stats` works on a fresh kernel without
# relying on another library having imported it first.
import scipy.stats

# Pearson correlation (coefficient, p-value) between the learned GloVe bias
# of each word and its summed co-occurrence weight.
sp.stats.pearsonr(glove.word_biases, vocab_frequency)

(0.13917726082382556, 3.717865648249997e-155)

In [11]:
# Same correlation as the previous cell, but against log(frequency + 1);
# the +1 keeps the logarithm finite for any zero entry.
sp.stats.pearsonr(glove.word_biases, np.log(vocab_frequency + 1))

(0.13022318865789767, 6.383992827394151e-136)

In [12]:
bin_size = 200
num_bin = 20

# Word indices ordered from most to least frequent. The ranking does not
# depend on the bin, so it is computed once instead of re-sorting the whole
# vocabulary on every iteration.
frequency_order = vocab_frequency.argsort()[::-1]

for i in range(num_bin):
    # Half-open rank interval [b, e) covered by this bin.
    b = i * bin_size
    e = (i + 1) * bin_size
    indices = frequency_order[b:e]
    # Correlation between learned bias and log-frequency within the bin.
    corr, p_value = sp.stats.pearsonr(
        glove.word_biases[indices],
        np.log(vocab_frequency[indices])
    )
    print('correlation of {} ~ {} frequent words = {:f}'.format(
        b, e, corr))

correlation of 0 ~ 200 frequent words = 0.903466
correlation of 200 ~ 400 frequent words = 0.039069
correlation of 400 ~ 600 frequent words = 0.063367
correlation of 600 ~ 800 frequent words = 0.082983
correlation of 800 ~ 1000 frequent words = -0.063851
correlation of 1000 ~ 1200 frequent words = -0.044165
correlation of 1200 ~ 1400 frequent words = -0.105217
correlation of 1400 ~ 1600 frequent words = 0.026320
correlation of 1600 ~ 1800 frequent words = 0.057185
correlation of 1800 ~ 2000 frequent words = -0.094153
correlation of 2000 ~ 2200 frequent words = 0.028439
correlation of 2200 ~ 2400 frequent words = 0.077549
correlation of 2400 ~ 2600 frequent words = -0.001549
correlation of 2600 ~ 2800 frequent words = 0.072927
correlation of 2800 ~ 3000 frequent words = -0.029316
correlation of 3000 ~ 3200 frequent words = 0.013317
correlation of 3200 ~ 3400 frequent words = 0.089424
correlation of 3400 ~ 3600 frequent words = 0.100484
correlation of 3600 ~ 3800 frequent words = 0.01968

## Co-occurrence vs. PMI

In [24]:
# Rebuild the (word, context) matrix from the tokenized news corpus so this
# section can be run on its own.
corpus_path = config.data_directory + '/corpus_10days/news/2016-10-20_article_all_normed_ltokenize.txt'
corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)

# Sentences are already tokenized, so whitespace splitting is sufficient.
cooccurrence_options = dict(
    windows=3,
    min_tf=10,
    tokenizer=str.split,
    dynamic_weight=True,
    verbose=True,
)
x, idx2vocab = sent_to_word_contexts_matrix(corpus, **cooccurrence_options)

from soynlp.word import pmi

# Convert co-occurrence weights to PMI scores.
# NOTE(review): min_pmi=0 presumably drops pairs with negative PMI and alpha
# is presumably a smoothing term -- confirm against the soynlp documentation.
pmi_dok = pmi(x, min_pmi=0, alpha=0.0001, verbose=True)

Create (word, contexts) matrix
  - counting word frequency from 223356 sents, mem=0.762 Gb
  - scanning (word, context) pairs from 223356 sents, mem=1.148 Gb
  - (word, context) matrix was constructed. shape = (36002, 36002)                    
  - done
computing pmi was done                              


In [28]:
# Convert the PMI matrix to COO format, the format passed to Glove.fit below.
pmi_coo = pmi_dok.tocoo()
# Replace every stored PMI value with exp(PMI).
# NOTE(review): presumably done so that a logarithm applied inside GloVe
# recovers the PMI value itself -- confirm against the glove implementation.
pmi_coo.data = np.exp(pmi_coo.data)

In [29]:
# Train GloVe on exp(PMI) instead of raw co-occurrence weights.
glove = Glove(learning_rate=0.05, no_components=100)
glove.fit(matrix=pmi_coo, epochs=10, no_threads=4, verbose=True)

# Word -> index mapping, required by the model for similarity queries.
dictionary = dict((term, position) for position, term in enumerate(idx2vocab))
glove.add_dictionary(dictionary)

Performing 10 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9


In [30]:
# Same probe words as for the count-based model, now queried against the
# PMI-based embeddings.
words = '아이오아이 아프리카 밴쯔 박근혜 뉴스 날씨 이화여대 아프리카발톱개구리'.split()
for query in words:
    print('\n{}'.format(query))
    neighbours = glove.most_similar(query, number=10)
    pprint(neighbours)


아이오아이
[('신용재', 0.7914284209632875),
 ('세븐', 0.7489299970993364),
 ('완전체', 0.71126339593965),
 ('빅브레인', 0.6741679949075243),
 ('너무너무너무', 0.6738632698821577),
 ('바스타즈', 0.6465278467299003),
 ('전소미', 0.6440235347213449),
 ('에스에프나인', 0.6310242247797081),
 ('최유정', 0.6294763752012386)]

아프리카
[('오세아니아', 0.7241348027333279),
 ('밴쯔', 0.7018811286614235),
 ('발톱', 0.6872967933505756),
 ('순방', 0.6829228441116788),
 ('남미', 0.671354948555566),
 ('대기오염', 0.6670380713287991),
 ('중남미', 0.652867547635567),
 ('연수', 0.6428026341834174),
 ('연합군', 0.6420441710452911)]

밴쯔
[('괴담', 0.843470048705245),
 ('출신들', 0.8312542079825843),
 ('동국대학교', 0.8267818230809405),
 ('대도서관', 0.82376742132087),
 ('발톱', 0.8229074783810726),
 ('어우러지는', 0.8208860999048154),
 ('운용사들', 0.8173702382642954),
 ('청년위원회', 0.8173343666943348),
 ('정시내', 0.8165243983617699)]

박근혜
[('백승렬', 0.7634579650936018),
 ('뉴시스', 0.685499446709774),
 ('전진', 0.6719308223698037),
 ('역적패당', 0.6608761405192142),
 ('최진석', 0.632234850897908),
 ('취임식', 0.62923

## Is the input X or log X?

In [13]:
# Copy of the co-occurrence matrix whose stored values are replaced by their
# natural logarithm; x itself is left untouched.
# NOTE(review): entries equal to 1 become explicit zeros and entries below 1
# become negative -- confirm that this is acceptable input for Glove.fit.
x_log = x.copy()
x_log.data = np.log(x.data)

In [14]:
%%time
from glove import Glove

# Train a second model on the log-transformed matrix, with the same
# hyper-parameters as the count-based model, to compare the two inputs.
glove_log = Glove(learning_rate=0.05, no_components=100)
glove_log.fit(matrix=x_log.tocoo(), epochs=30, no_threads=4, verbose=True)

Performing 30 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29
CPU times: user 11min 4s, sys: 168 ms, total: 11min 4s
Wall time: 2min 50s


In [15]:
# Attach the existing word -> index mapping so glove_log can answer
# similarity queries by word.
glove_log.add_dictionary(dictionary)

In [16]:
from pprint import pprint

# Probe words for the log-input model.
words = '아이오아이 아프리카 박근혜 뉴스 날씨 이화여대 아프리카발톱개구리'.split()
for query in words:
    print('\n{}'.format(query))
    neighbours = glove_log.most_similar(query, number=10)
    pprint(neighbours)


아이오아이
[('토크쇼', 0.714453936019567),
 ('정보위', 0.7036633406860543),
 ('마주', 0.7032775745889824),
 ('알레포', 0.6726947920305449),
 ('크루즈', 0.6585095182682378),
 ('중학교', 0.6554945811709223),
 ('경계', 0.6521894532401543),
 ('일방적', 0.6479916391944842),
 ('운영위원회', 0.6432933923521571)]

아프리카
[('도를', 0.8322087201781329),
 ('보수', 0.8032509461006283),
 ('급등', 0.795653345410944),
 ('변호사', 0.7941358153127909),
 ('서명', 0.7846240109991088),
 ('기조', 0.7843483922600653),
 ('대피', 0.7781592999464791),
 ('핵무기', 0.7769994447356362),
 ('물가', 0.7761496793459782)]

박근혜
[('매각', 0.4404301725962579),
 ('살갑고', 0.4380179217211654),
 ('독일', 0.40669228538130314),
 ('호', 0.4045846109016733),
 ('조직', 0.4042742124774094),
 ('개봉', 0.40327956421652233),
 ('야당', 0.4024726352164049),
 ('6시', 0.393807253461551),
 ('재단', 0.39283910729643584)]

뉴스
[('진입니다', 0.582575755753948),
 ('권리', 0.5579738691321225),
 ('것처럼', 0.49352934460591424),
 ('인프라', 0.4830418505515455),
 ('강남', 0.4822897994943563),
 ('한화', 0.46749752669912326),
 ('케미

## Use only nouns

In [17]:
# Noun-only version of the same news corpus.
# The relative part now starts with '/', matching the other corpus paths in
# this notebook; without it the path is only valid when
# config.data_directory happens to end with a path separator.
corpus_path = config.data_directory + '/corpus_10days/news/2016-10-20_article_all_normed_nountokenized.txt'
corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)

# Build the (word, context) matrix with the same settings as for the full
# corpus; sentences are pre-tokenized, so whitespace splitting is sufficient.
x, idx2vocab = sent_to_word_contexts_matrix(
    corpus,
    windows=3,
    min_tf=10,
    tokenizer=lambda x:x.split(), # (default) lambda x:x.split(),
    dynamic_weight=True,
    verbose=True)

# Display the matrix shape as the cell output.
x.shape

Create (word, contexts) matrix
  - counting word frequency from 30001 sents, mem=0.482 Gb
  - scanning (word, context) pairs from 30001 sents, mem=0.883 Gb
  - (word, context) matrix was constructed. shape = (24907, 24907)                    
  - done


(24907, 24907)

In [18]:
%%time
from glove import Glove

# Train GloVe on the noun-only co-occurrence matrix.
glove = Glove(learning_rate=0.05, no_components=100)
glove.fit(matrix=x.tocoo(), epochs=50, no_threads=4, verbose=True)

Performing 50 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29
Epoch 30
Epoch 31
Epoch 32
Epoch 33
Epoch 34
Epoch 35
Epoch 36
Epoch 37
Epoch 38
Epoch 39
Epoch 40
Epoch 41
Epoch 42
Epoch 43
Epoch 44
Epoch 45
Epoch 46
Epoch 47
Epoch 48
Epoch 49
CPU times: user 17min 24s, sys: 208 ms, total: 17min 24s
Wall time: 4min 28s


In [20]:
# Word -> index mapping for the noun vocabulary, attached to the model so
# that similarity queries can be made by word.
dictionary = dict((term, position) for position, term in enumerate(idx2vocab))
glove.add_dictionary(dictionary)

# Probe words for the noun-only model.
words = '아이오아이 아프리카 밴쯔 박근혜 뉴스 날씨 이화여대 아프리카발톱개구리'.split()
for query in words:
    print('\n{}'.format(query))
    neighbours = glove.most_similar(query, number=10)
    pprint(neighbours)


아이오아이
[('신용재', 0.5805331729320036),
 ('너무너무너무', 0.5741224744112465),
 ('완전체', 0.5726943482665132),
 ('정채연', 0.5021193024856118),
 ('엠카운트다운', 0.4530081679972609),
 ('상큼', 0.4245737037567245),
 ('전소미', 0.41846291947213665),
 ('세븐', 0.40930687997524856),
 ('에이핑크', 0.40125288862530734)]

아프리카
[('밴쯔', 0.7481097142457359),
 ('대도서관', 0.6069598956105416),
 ('남미', 0.5616162470321773),
 ('윰댕', 0.47132781819711234),
 ('중남미', 0.44717903427322764),
 ('태평양', 0.4393260339807186),
 ('특수관계자', 0.4368596996605185),
 ('아시아', 0.4131500263233034),
 ('30만명', 0.404283440142136)]

밴쯔
[('아프리카', 0.7481097142457359),
 ('대도서관', 0.5698692430747861),
 ('윰댕', 0.5521730368818649),
 ('남미', 0.5073757989578754),
 ('중남미', 0.5032227200370767),
 ('구주', 0.5027751709863969),
 ('특수관계자', 0.4916473575900808),
 ('게스트하우스', 0.4816951876666276),
 ('끌면서', 0.47871655211856484)]

박근혜
[('대통령', 0.7230460227501027),
 ('역적패당', 0.6157415625599676),
 ('비선', 0.6054084124585757),
 ('전진', 0.584424604313606),
 ('정권', 0.5754621621205595),
 ('200

In [21]:
import numpy as np

# Column totals of the noun (word, context) matrix, flattened to 1-D.
# NOTE(review): summed co-occurrence weights serve as a proxy for word
# frequency here, not raw corpus counts -- confirm this is intended.
column_totals = x.sum(axis=0)
vocab_frequency = np.asarray(column_totals).reshape(-1)
print(type(vocab_frequency))
print(vocab_frequency.shape)

<class 'numpy.ndarray'>
(24907,)


In [22]:
bin_size = 200
num_bin = 20

# Word indices ordered from most to least frequent. The ranking is the same
# for every bin, so it is computed once rather than re-sorting the whole
# vocabulary inside the loop.
frequency_order = vocab_frequency.argsort()[::-1]

for i in range(num_bin):
    # Half-open rank interval [b, e) covered by this bin.
    b = i * bin_size
    e = (i + 1) * bin_size
    indices = frequency_order[b:e]
    # Correlation between learned bias and log-frequency within the bin.
    corr, p_value = sp.stats.pearsonr(
        glove.word_biases[indices],
        np.log(vocab_frequency[indices])
    )
    print('correlation of {} ~ {} frequent words = {:f}'.format(
        b, e, corr))

correlation of 0 ~ 200 frequent words = 0.550328
correlation of 200 ~ 400 frequent words = 0.114059
correlation of 400 ~ 600 frequent words = 0.159792
correlation of 600 ~ 800 frequent words = 0.111992
correlation of 800 ~ 1000 frequent words = -0.045210
correlation of 1000 ~ 1200 frequent words = 0.104710
correlation of 1200 ~ 1400 frequent words = 0.147151
correlation of 1400 ~ 1600 frequent words = 0.054132
correlation of 1600 ~ 1800 frequent words = -0.058026
correlation of 1800 ~ 2000 frequent words = 0.085568
correlation of 2000 ~ 2200 frequent words = 0.066846
correlation of 2200 ~ 2400 frequent words = 0.106371
correlation of 2400 ~ 2600 frequent words = 0.012924
correlation of 2600 ~ 2800 frequent words = -0.031278
correlation of 2800 ~ 3000 frequent words = 0.139948
correlation of 3000 ~ 3200 frequent words = 0.011712
correlation of 3200 ~ 3400 frequent words = 0.035133
correlation of 3400 ~ 3600 frequent words = -0.068364
correlation of 3600 ~ 3800 frequent words = 0.178145
