In [1]:
import config
from config import corpus_path

import wordvec_infer
from wordvec_infer import sents_to_word_contexts_matrix
from wordvec_infer import sents_to_unseen_word_contexts_matrix
from wordvec_infer import get_process_memory
from wordvec_infer import train_pmi
from wordvec_infer import Word2Vec

## From sentences to a (word, context) matrix

In [2]:
import soynlp
from soynlp.utils import DoublespaceLineCorpus

# Open the corpus file at `corpus_path` (imported from config).
# NOTE(review): `iter_sent=True` presumably makes iteration and len() count
# sentences rather than raw lines — confirm against the soynlp documentation.
corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)
len(corpus)

30002

In [3]:
def my_tokenizer(sent, passwords={'아이오아이'}):
    """Split `sent` on whitespace and discard every token found in `passwords`.

    Parameters
    ----------
    sent : str
        Sentence to tokenize.
    passwords : set of str
        Tokens to drop from the result. The default hides '아이오아이' so that
        the word is absent from anything built with this tokenizer.

    Returns
    -------
    list of str
        Remaining tokens, in their original order.
    """
    kept = []
    for token in sent.split():
        if token in passwords:
            continue
        kept.append(token)
    return kept

In [4]:
# Build the (word, context) matrix `x` from the corpus. `idx2vocab` is later
# enumerated to map each word to its index. Because my_tokenizer drops
# '아이오아이', that word is deliberately missing from this vocabulary.
x, idx2vocab = sents_to_word_contexts_matrix(corpus, tokenizer=my_tokenizer)

Create (word, contexts) matrix
  - counting word frequency from 30001 sents, mem=0.089 Gb #vocabs = 24906
  - scanning (word, context) pairs from 30001 sents, mem=0.466 Gb
  - (word, context) matrix was constructed. shape = (24906, 24906)                    
  - done


In [5]:
# Train PMI on the (word, context) matrix. The second return value `px` is kept
# because it is passed back into train_pmi when scoring the unseen word later.
# NOTE(review): `px` is presumably the context/word probability vector — confirm
# in wordvec_infer.train_pmi.
pmi, px = train_pmi(x, as_csr=True, verbose=True)

In [6]:
from sklearn.metrics import pairwise_distances

# Reverse lookup: word -> its position in idx2vocab.
vocab2idx = dict((term, position) for position, term in enumerate(idx2vocab))

def most_similar(word, query_vec=None, topk=10):
    """Return the vocabulary words whose PMI rows are closest to `word`.

    Parameters
    ----------
    word : str
        Query word. It must be a key of the module-level `vocab2idx`;
        otherwise an empty list is returned.
    query_vec : single-row matrix, optional
        Vector to compare against every row of the module-level `pmi`.
        When omitted, the row of `pmi` at the word's index is used.
    topk : int
        Maximum number of neighbours to return.

    Returns
    -------
    list of (str, float)
        `(word, cosine similarity)` pairs, most similar first. The row at the
        query word's own index is skipped. An empty list is returned when the
        word is unknown, or when no `query_vec` is given and the word's index
        has no corresponding row in `pmi`.
    """
    if word not in vocab2idx:
        return []

    query_idx = vocab2idx[word]

    # Identity test on purpose: `== None` against an array or sparse matrix is
    # not a plain boolean check and can raise or give a misleading result.
    if query_vec is None:
        # A word may have been registered in vocab2idx after `pmi` was built
        # (an inferred word); it then has no row to look up.
        if query_idx >= pmi.shape[0]:
            return []
        query_vec = pmi[query_idx, :]

    # Cosine distance from the query to every row of pmi; [0] takes the single
    # result row.
    dist = pairwise_distances(query_vec, pmi, metric='cosine')[0]

    similars = []
    for similar_idx in dist.argsort():
        if similar_idx == query_idx:
            continue

        if len(similars) >= topk:
            break

        # Convert cosine distance back to cosine similarity.
        similars.append((idx2vocab[similar_idx], 1 - dist[similar_idx]))

    return similars

In [7]:
# Expected to return []: my_tokenizer removed '아이오아이' from the training
# vocabulary, so the word is not in vocab2idx.
most_similar('아이오아이')

[]

In [8]:
# Neighbours of an in-vocabulary word, ranked by cosine similarity of PMI rows.
most_similar('너무너무너무')

[('박진영', 0.44681138124242525),
 ('완전체', 0.41005233807102337),
 ('전소미', 0.38464430215667345),
 ('타이틀곡', 0.32795140778493626),
 ('엠카운트다운', 0.3060834033627373),
 ('잠깐', 0.3048808527425122),
 ('수록곡', 0.30083062875988964),
 ('중독성', 0.2627468512304537),
 ('상큼', 0.25309509167253696),
 ('안무', 0.25293617731426754)]

In [9]:
# Build the (word, context) rows for the word that was hidden during training.
# The existing `vocab2idx` is passed in.
# NOTE(review): presumably so that the context columns line up with the
# original matrix — confirm in wordvec_infer.
x_, idx2vocab_ = sents_to_unseen_word_contexts_matrix(
    corpus, {'아이오아이'}, vocab2idx)

Create (unseen word, contexts) matrix
  - counting word frequency from 30001 sents, mem=0.368 Gb #vocabs = 24907
  - scanning (word, context) pairs from 30001 sents, mem=0.368 Gb
  - (word, context) matrix was constructed. shape = (24907, 23638)                    
  - done


In [10]:
# Sanity check: only one unseen word was requested, so a single row is expected.
x_.shape

(1, 24906)

In [14]:
# Compute PMI for the unseen-word row, passing `px` from the original training
# run as the second argument.
# NOTE(review): presumably this scores the new row against the same statistics
# as the trained matrix — confirm in wordvec_infer.train_pmi.
pmi_, _ = train_pmi(x_, px, as_csr=True, verbose=True)

In [11]:
# Register the inferred word under the next free index so that most_similar()
# no longer rejects it as unknown. The membership guard keeps this cell
# idempotent: re-running it does not move the word to a new index.
if '아이오아이' not in vocab2idx:
    vocab2idx['아이오아이'] = len(vocab2idx)

In [15]:
# Inspect the inferred PMI row (shown as a sparse-matrix summary).
pmi_

<1x24906 sparse matrix of type '<class 'numpy.float64'>'
	with 411 stored elements in Compressed Sparse Row format>

In [16]:
# Query with the inferred row. The word must already be registered in vocab2idx,
# otherwise most_similar returns [] before looking at query_vec.
most_similar('아이오아이', query_vec=pmi_, topk=10)

[('너무너무너무', 0.49239779565031516),
 ('엠카운트다운', 0.49202847947770345),
 ('완전체', 0.4428418916283481),
 ('신용재', 0.40748588636187133),
 ('전소미', 0.3580439186104495),
 ('타이틀곡', 0.34539111085223295),
 ('엠넷', 0.31874386494750506),
 ('박진영', 0.31090804599849897),
 ('오블리스', 0.3072393400941481),
 ('컴백', 0.28942854144841923)]

## Get components (truncated SVD)

In [6]:
from sklearn.decomposition import TruncatedSVD

# Reduce the PMI matrix to 100 dimensions. A fixed random_state makes the
# embedding reproducible across kernel restarts.
svd = TruncatedSVD(n_components=100, random_state=0)
y = svd.fit_transform(pmi)

In [7]:
# Compare the shapes of the reduced embedding, the PMI matrix and the SVD components.
y.shape, pmi.shape, svd.components_.shape

((24906, 100), (24906, 24906), (100, 24906))

In [9]:
from sklearn.utils.extmath import safe_sparse_dot

# Project the PMI rows onto the fitted components by hand (pmi @ components_.T).
# NOTE(review): presumably this reproduces svd.transform(pmi), and is the
# operation used to embed inferred rows — confirm.
y_ = safe_sparse_dot(pmi, svd.components_.T)

## Package test

In [4]:
# Same pipeline through the packaged Word2Vec class. my_tokenizer again hides
# '아이오아이' from training.
word2vec = Word2Vec(tokenizer=my_tokenizer)
word2vec.train(corpus)

Create (word, contexts) matrix
  - counting word frequency from 30001 sents, mem=0.089 Gb #vocabs = 24906
  - scanning (word, context) pairs from 30001 sents, mem=0.466 Gb
  - (word, context) matrix was constructed. shape = (24906, 24906)                    
  - done
Training PMI ... done
Training SVD ... done


In [5]:
# Neighbours of an in-vocabulary word before any inference.
word2vec.most_similar('너무너무너무')

[('신용재', 0.9270464973913624),
 ('완전체', 0.914970384426004),
 ('타이틀곡', 0.905864914102968),
 ('엠카운트다운', 0.9041150133215388),
 ('백퍼센트', 0.9014783849405605),
 ('몬스타엑스', 0.9010203753612409),
 ('곡으로', 0.8990371268486782),
 ('안무', 0.8907120459528796),
 ('박진영', 0.8871723098381121),
 ('신곡', 0.8833824952633795)]

In [6]:
# The hidden word is unknown to the trained model, so this returns [] at this point.
word2vec.most_similar('아이오아이')

[]

In [7]:
# Infer a vector for the word that was hidden during training. A plain
# whitespace tokenizer is passed here so that '아이오아이' is no longer filtered.
# The log shows the vocabulary growing by one with append=True.
wordvec, index = word2vec.infer(
    corpus,
    words={'아이오아이'},
    append=True,
    tokenizer=lambda x:x.split()
)

Create (unseen word, contexts) matrix
  - counting word frequency from 30001 sents, mem=0.303 Gb #vocabs = 24907
  - scanning (word, context) pairs from 30001 sents, mem=0.303 Gb
  - (word, context) matrix was constructed. shape = (24907, 23772)                    
  - done
Training PMI ... done
Applying trained SVD ... done
vocabs : 24906 -> 24907


In [8]:
# Same query as before inference; the appended word can now appear among the neighbours.
word2vec.most_similar('너무너무너무')

[('신용재', 0.9270464973913624),
 ('아이오아이', 0.9162263412577677),
 ('완전체', 0.914970384426004),
 ('타이틀곡', 0.905864914102968),
 ('엠카운트다운', 0.9041150133215388),
 ('백퍼센트', 0.9014783849405605),
 ('몬스타엑스', 0.9010203753612409),
 ('곡으로', 0.8990371268486782),
 ('안무', 0.8907120459528796),
 ('박진영', 0.8871723098381121)]

In [9]:
# The inferred word is now queryable because it was appended to the model.
word2vec.most_similar('아이오아이')

[('엠카운트다운', 0.9243012341336443),
 ('엠넷', 0.9219115581331467),
 ('완전체', 0.91625257534599),
 ('너무너무너무', 0.9162263412577677),
 ('타이틀곡', 0.9074516014443481),
 ('몬스타엑스', 0.9061148638752767),
 ('멤버들', 0.9013150455703564),
 ('오블리스', 0.9005074700480684),
 ('신용재', 0.8961139817184961),
 ('백퍼센트', 0.8934708002132166)]