## Word2Vec

In [14]:
import re
from lxml import etree
import urllib.request
import zipfile
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# Fetch NLTK's "punkt" tokenizer data (the log below shows it is a no-op when
# already installed); presumably needed by sent_tokenize / word_tokenize.
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/leeseungjun/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
# Download the TED transcript XML dump into the working directory.
ted_xml_url = "https://raw.githubusercontent.com/GaoleMeng/RNN-and-FFNN-textClassification/master/ted_en-20160408.xml"
urllib.request.urlretrieve(ted_xml_url, filename="ted_en-20160408.xml")

('ted_en-20160408.xml', <http.client.HTTPMessage at 0x169a2ead0>)

In [13]:
# Parse the downloaded XML. The context manager closes the file handle as soon
# as the tree has been built, instead of leaving it open for the kernel's lifetime.
with open("ted_en-20160408.xml", "r", encoding="UTF8") as targetXML:
    target_text = etree.parse(targetXML)

# Keep only the text found between <content> and </content> in the XML file.
parse_text = "\n".join(target_text.xpath("//content/text()"))

# Remove background-sound annotations such as (Audio) or (Laughter) that appear
# inside the content: the pattern deletes every parenthesised "(...)" span.
content_text = re.sub(r"\([^)]*\)", "", parse_text)

### 데이터 살펴보기

In [15]:
# Preview the first 100 characters of the cleaned transcript text.
content_text[:100]

"Here are two reasons companies fail: they only do more of the same, or they only do what's new.\nTo m"

In [16]:
# Split the input corpus into sentences with NLTK's sentence tokenizer.
sent_text = sent_tokenize(content_text)

In [17]:
# Show the first three sentences to sanity-check the split.
sent_text[:3]

["Here are two reasons companies fail: they only do more of the same, or they only do what's new.",
 'To me the real, real solution to quality growth is figuring out the balance between two activities: exploration and exploitation.',
 'Both are necessary, but it can be too much of a good thing.']

In [18]:
# Lower-case each sentence and replace every run of characters other than
# a-z / 0-9 (punctuation, whitespace, symbols) with a single space.
normalized_text = [
    re.sub(r"[^a-z0-9]+", " ", sentence.lower()) for sentence in sent_text
]

# Word-tokenize each normalized sentence with NLTK.
result = list(map(word_tokenize, normalized_text))

In [19]:
# Print the token lists of the first three sentences.
print(result[:3])

[['here', 'are', 'two', 'reasons', 'companies', 'fail', 'they', 'only', 'do', 'more', 'of', 'the', 'same', 'or', 'they', 'only', 'do', 'what', 's', 'new'], ['to', 'me', 'the', 'real', 'real', 'solution', 'to', 'quality', 'growth', 'is', 'figuring', 'out', 'the', 'balance', 'between', 'two', 'activities', 'exploration', 'and', 'exploitation'], ['both', 'are', 'necessary', 'but', 'it', 'can', 'be', 'too', 'much', 'of', 'a', 'good', 'thing']]


### 영어 word2vec 훈련

In [22]:
from gensim.models import Word2Vec

# Train a Word2Vec model on the tokenized sentences.
model = Word2Vec(
    sentences=result,  # corpus (list of token lists)
    vector_size=100,  # embedding dimension, i.e. the size of each word vector
    window=5,  # size of the context window
    min_count=5,  # minimum word frequency threshold
    workers=4,  # number of CPU workers used for training
    sg=1,  # use skip-gram or not => 0: CBOW, 1: skip-gram
)

In [23]:
# Look up the words most similar to a query word.
# most_similar: ranked by cosine similarity
model.wv.most_similar("man")

[('woman', 0.7464737296104431),
 ('guy', 0.7192587852478027),
 ('soldier', 0.6922350525856018),
 ('rabbi', 0.6873827576637268),
 ('boy', 0.6782869100570679),
 ('son', 0.6768252849578857),
 ('adage', 0.6663954257965088),
 ('michelangelo', 0.6655485033988953),
 ('handsome', 0.6485349535942078),
 ('shepherd', 0.6447014808654785)]

In [24]:
# Same nearest-neighbour query for "woman".
model.wv.most_similar("woman")

[('girl', 0.7980387210845947),
 ('man', 0.7464736700057983),
 ('child', 0.743230938911438),
 ('lady', 0.7197898626327515),
 ('soldier', 0.7186397314071655),
 ('boy', 0.7042767405509949),
 ('parent', 0.7007882595062256),
 ('husband', 0.6846175789833069),
 ('son', 0.677384614944458),
 ('daughter', 0.6771542429924011)]

In [26]:
# Inspect the vector the model learned for a word.
model.wv["man"]  # wv supports dict-style lookup by word

array([-0.25788373, -0.44084692, -0.01848858, -0.41008654,  0.06461044,
       -0.00098784,  0.04530404,  0.27290663, -0.24753846,  0.04860221,
       -0.23623542, -0.40986082,  0.20925735,  0.2960284 , -0.33408472,
       -0.1963368 , -0.2462806 , -0.02860613,  0.05724161, -0.39650035,
        0.4787234 ,  0.0252343 ,  0.47673836, -0.04523695,  0.15992147,
       -0.22721612, -0.27062297, -0.22787645, -0.29539725, -0.18134838,
        0.2610867 ,  0.40853724,  0.22275853,  0.15678097,  0.00819362,
        0.25696972, -0.16722701,  0.00358507, -0.11458305, -0.03936708,
        0.02665408, -0.3851479 , -0.17574129, -0.06721475, -0.07976396,
       -0.38861525, -0.2516326 ,  0.07662547, -0.22281861,  0.08232789,
       -0.06064706, -0.46023953,  0.02285396,  0.36257708,  0.10212065,
       -0.0214293 , -0.15527436, -0.6803391 , -0.43982115, -0.3863792 ,
       -0.35221553,  0.25182706, -0.0345569 , -0.12098335, -0.5997773 ,
       -0.06665862, -0.21589656,  0.3067192 , -0.28151193,  0.28

In [27]:
# Length of a single word vector; matches vector_size=100 used for training.
len(model.wv["man"])

100

In [30]:
# Save the trained word vectors in word2vec format, then load them back
# from the same file as standalone KeyedVectors.
from gensim.models import KeyedVectors

w2v_path = "../../data/nlp/w2v/eng_w2v"
model.wv.save_word2vec_format(w2v_path)
loaded_model = KeyedVectors.load_word2vec_format(w2v_path)

## FastText

In [47]:
# Nearest neighbours of a correctly spelled word, using the reloaded Word2Vec vectors.
loaded_model.most_similar("memory")

[('cognition', 0.6716188192367554),
 ('implanted', 0.6652306318283081),
 ('plasticity', 0.6529497504234314),
 ('locomotion', 0.6473685503005981),
 ('cpu', 0.6427909731864929),
 ('brain', 0.6423183679580688),
 ('appetite', 0.6408774852752686),
 ('willpower', 0.640343189239502),
 ('wiring', 0.6349008679389954),
 ('simulator', 0.633720338344574)]

In [48]:
# What if the query contains a typo?
# Word2Vec vectors are looked up by exact vocabulary key, so a word that is not
# in the vocabulary raises KeyError. The error is the point of this cell, so it
# is caught and printed; an uncaught exception would stop "Restart & Run All"
# before the FastText cells that follow.
try:
    typo_neighbors = loaded_model.most_similar("memorrry")
except KeyError as err:
    print(f"KeyError: {err}")
else:
    # Only reached if the word unexpectedly exists in the vocabulary.
    print(typo_neighbors)

KeyError: "Key 'memorrry' not present in vocabulary"

In [45]:
from gensim.models import FastText

# Train FastText on the same tokenized corpus, with the same hyperparameters
# (vector_size, window, min_count, workers, sg) as the Word2Vec model above.
fasttext_model = FastText(
    sentences=result, vector_size=100, window=5, min_count=5, workers=4, sg=1
)

In [49]:
# Same misspelled query as in the Word2Vec lookup, now against FastText: here it
# returns neighbours (see output) instead of raising KeyError — presumably via
# FastText's subword (character n-gram) vectors; confirm against the gensim docs.
fasttext_model.wv.most_similar("memorrry")

[('memo', 0.8510879874229431),
 ('memoir', 0.8043996691703796),
 ('forgery', 0.7965269684791565),
 ('memorize', 0.7776286602020264),
 ('memory', 0.7707096934318542),
 ('nemo', 0.7623752355575562),
 ('memoirs', 0.7604741454124451),
 ('emory', 0.757304310798645),
 ('rehearsal', 0.7433197498321533),
 ('tattoo', 0.7401583194732666)]