# Distributed Representation - word2vec
단어를 continuous vector로 표현하는 방법인 word2vec (skip-gram)을 실습해 봅니다. 예제 데이터로는 arXiv에서 scraping한 text mining 관련 논문 초록의 단어들을 사용합니다.
  
* _nltk를 활용합니다._
* gensim을 활용합니다.
* nltk : http://www.nltk.org/book/
* gensim : https://radimrehurek.com/gensim/index.html

## word2vec
### Load modules

In [1]:
import os, sys
import nltk
import re
import pandas as pd
from gensim.models.word2vec import Word2Vec
# List the entries of the current working directory (shown as the cell output).
os.listdir()

Slow version of gensim.models.doc2vec is being used


['.ipynb_checkpoints',
 'Distributed Representation (word2vec).ipynb',
 'Document Representation (term frequency, tf-idf).ipynb',
 'Scrapping text mining papers in arXiv.py',
 'Simple NLP for English.ipynb',
 'Simple NLP for Korean.ipynb',
 'text_mining_paper.csv']

### Load abstracts of text mining papers

In [2]:
# Load the paper table from the CSV next to this notebook; the file is decoded as cp949.
papers = pd.read_csv('./text_mining_paper.csv', encoding = 'cp949')
# Preview the first rows of the table.
papers.head()

Unnamed: 0,abstract,author,meta,subject,title
0,"The complicated, evolving landscape of cancer ...","Rocco Piazza, Daniele Ramazzotti, Roberta Spin...","Thu, 9 Mar 2017 01:24:23 GMT (948kb)",Genomics (q-bio.GN),"OncoScore: a novel, Internet-based tool to ass..."
1,"Mining textual patterns in news, tweets, paper...","Meng Jiang, Jingbo Shang, Taylor Cassidy, Xian...","Mon, 13 Mar 2017 01:06:19 GMT (1150kb,D) [v2] ...",Computation and Language (cs.CL),MetaPAD: Meta Pattern Discovery from Massive T...
2,This paper is a tutorial on Formal Concept Ana...,Dmitry I. Ignatov,"Wed, 8 Mar 2017 12:53:21 GMT (3541kb,D)",Information Retrieval (cs.IR),Introduction to Formal Concept Analysis and It...
3,Topic models have been widely used in discover...,"Jarvan Law, Hankz Hankui Zhuo, Junhua He, Erhu...","Thu, 23 Feb 2017 07:16:03 GMT (96kb,D)",Computation and Language (cs.CL),LTSG: Latent Topical Skip-Gram for Mutually Le...
4,Entity extraction is fundamental to many text ...,"Zeyi Wen, Dong Deng, Rui Zhang, Kotagiri Ramam...","Sun, 12 Feb 2017 12:46:40 GMT (89kb)",Databases (cs.DB),A Technical Report: Entity Extraction using Bo...


In [3]:
# Pull the 'abstract' column out of the DataFrame as a plain Python list (one entry per row).
abstracts = list(papers['abstract'])

### Preprocessing
1. 2글자 이상의 영단어 추출, 모두 소문자로 변환
2. gensim의 Word2Vec class가 input으로 받을 수 있는 corpus 형태로 변환  
(nested list 형태이며, 바깥 list의 각 요소는 한 초록의 token을 원래 순서대로 담은 list)

In [4]:
# Tokenize every abstract: lowercase it, then keep each run of two or more
# ASCII letters. The result is a list of token lists, one per abstract.
# The class must be [a-z], not [A-z]: the A-z range also covers the
# punctuation that sits between 'Z' and 'a' in ASCII ([ \ ] ^ _ `), so
# strings such as "__" or "a_b" would otherwise be accepted as words.
corpus = [re.findall(r'[a-z]{2,}', abstract.lower()) for abstract in abstracts]

In [5]:
# Set aside a separate list of tokens for the similarity checks further down.
from collections import Counter

# Count occurrences directly from the nested corpus. Flattening with
# sum(corpus, []) copies the growing list once per document, which is
# quadratic in the size of the corpus.
token_counts = Counter(token for doc in corpus for token in doc)
# Keep only tokens seen at least twice, in first-seen order
# (Counter is a dict, so it preserves insertion order).
tokens = [token for token, count in token_counts.items() if count >= 2]
tokens[0:5]

['the', 'complicated', 'evolving', 'of', 'cancer']

###  Training word2vec
관련하여 자세한 옵션은 공식 문서를 참조할 것, 본 예제에서는 다음과 같은 parameter로 training  
참고 : https://radimrehurek.com/gensim/models/word2vec.html
1. 100차원의 벡터로 embedding
2. 초기 learning rate = 0.025
3. window size = 5
4. min_count = 2 (최소 2회이상 나타난 단어만)
5. skip-gram

In [6]:
# Training hyper-parameters, forwarded to Word2Vec as keyword arguments.
config = dict(
    size=100,     # embedding dimensionality
    alpha=0.025,  # initial learning rate
    window=5,     # context window size
    sg=1,         # 1 selects skip-gram, 0 selects CBOW
    min_count=2,  # only words that appear at least twice
)
model = Word2Vec(sentences=corpus, **config)



In [7]:
# NOTE(review): per the gensim docs, replace=True keeps only the normalized
# vectors, so the model presumably cannot be trained further afterwards - confirm.
model.init_sims(replace = True) # unload memory that is no longer needed

In [8]:
# Similarity between two words in the embedding space.
print(tokens[0], tokens[1])
# The score has to be printed explicitly: a bare expression that is not the
# last line of a cell is evaluated and then silently discarded.
# The lookup goes through model.wv, the same accessor the vocabulary cell uses.
print(model.wv.similarity(tokens[0], tokens[1]))

the complicated


In [9]:
# Show the 20 words most similar to tokens[0].
# Queried through model.wv (the word-vector container) rather than the model
# object itself, matching how the vocabulary is accessed elsewhere in this notebook.
model.wv.most_similar(tokens[0], topn=20)

[('each', 0.9989919066429138),
 ('number', 0.9989718794822693),
 ('problem', 0.9989573955535889),
 ('words', 0.9989050030708313),
 ('one', 0.9988996982574463),
 ('is', 0.9988754391670227),
 ('further', 0.9988439679145813),
 ('by', 0.9988322257995605),
 ('both', 0.9988305568695068),
 ('concepts', 0.9988234043121338),
 ('topics', 0.9988216757774353),
 ('content', 0.9988120198249817),
 ('these', 0.9987977147102356),
 ('system', 0.9987806081771851),
 ('web', 0.9987776279449463),
 ('them', 0.9987676739692688),
 ('process', 0.9987626075744629),
 ('not', 0.9987606406211853),
 ('features', 0.9987539649009705),
 ('result', 0.9987524747848511)]

In [10]:
# Explore relationships between words: 'experimental' and 'words' are passed
# as positive terms and 'that' as a negative term of the query.
# Queried through model.wv rather than the model object itself, matching how
# the vocabulary is accessed elsewhere in this notebook.
model.wv.most_similar(positive=['experimental', 'words'], negative=['that'])

[('because', 0.9987838268280029),
 ('type', 0.9987679719924927),
 ('context', 0.9987528324127197),
 ('operations', 0.9987417459487915),
 ('up', 0.9987417459487915),
 ('attributes', 0.9987099766731262),
 ('detection', 0.9987073540687561),
 ('embedding', 0.9987009167671204),
 ('traditional', 0.998698353767395),
 ('was', 0.998693585395813)]

In [11]:
# Vocabulary for which word embeddings were produced:
# print the first five entries, then the vocabulary size.
print(model.wv.index2word[0:5])
print(len(model.wv.index2word))

['the', 'of', 'and', 'to', 'in']
2213


In [12]:
# Build the embedding matrix for all words
import numpy as np
# Vocabulary list; the vectors collected in the next cells follow this order.
my_word = model.wv.index2word

In [13]:
# Look up the vector of every vocabulary word, in vocabulary order.
# Indexing goes through model.wv (the word-vector container) instead of the
# model object, consistent with the model.wv.index2word access above.
embedding = [model.wv[token] for token in my_word]
# Peek at the first two vectors.
embedding[0:2]

[array([ 0.01988189,  0.13139629,  0.01627134, -0.0089903 ,  0.11221685,
         0.17640017,  0.00949641,  0.21792442, -0.15304418,  0.13018884,
         0.13977255, -0.16033855,  0.10094168, -0.10551531, -0.21017449,
         0.17190437, -0.04920886,  0.06440064,  0.07505376, -0.06977551,
         0.0194921 ,  0.02675535,  0.004789  , -0.03555432,  0.00410689,
         0.12984014, -0.08025134,  0.03181918,  0.06659508, -0.03575774,
        -0.00317442,  0.01312721, -0.10238264,  0.05749114,  0.10639635,
         0.03436963,  0.02878631,  0.0412176 ,  0.10581778,  0.02268814,
        -0.26732701, -0.18780009,  0.10348438,  0.11659094, -0.08257636,
        -0.09209964,  0.03796738,  0.08150806,  0.14146197,  0.03125246,
        -0.08332349,  0.0649891 ,  0.03386664, -0.08881352, -0.08040347,
         0.09037085, -0.09085443,  0.0528989 ,  0.06728957,  0.00124398,
         0.01503391,  0.04555376,  0.13048171, -0.06746729,  0.13963263,
        -0.07229198,  0.01079031,  0.03317163, -0.1

In [14]:
# Stack the list of per-word vectors into a single NumPy array.
embedding = np.asarray(embedding)
# Display the array's shape.
embedding.shape

(2213, 100)