<h2> Input Embedding

In [28]:
import re
import urllib.request
import zipfile
from lxml import etree
from nltk.tokenize import word_tokenize, sent_tokenize

In [29]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/ukairia777/tensorflow-nlp-tutorial/main/09.%20Word%20Embedding/dataset/ted_en-20160408.xml",
                           filename="ted_en-20160408.xml")

('ted_en-20160408.xml', <http.client.HTTPMessage at 0x221860a8b50>)

In [30]:
targetXML = open('ted_en-20160408.xml', 'r', encoding='UTF8')
target_text = etree.parse(targetXML)

# xml 파일로부터 <content>와 </content> 사이의 내용만 가져온다.
parse_text = '\n'.join(target_text.xpath('//content/text()'))

# 정규 표현식의 sub 모듈을 통해 content 중간에 등장하는 (Audio), (Laughter) 등의 배경음 부분을 제거.
# 해당 코드는 괄호로 구성된 내용을 제거.
content_text = re.sub(r'\([^)]*\)', '', parse_text)

# 입력 코퍼스에 대해서 NLTK를 이용하여 문장 토큰화를 수행.
sent_text = sent_tokenize(content_text)

# 각 문장에 대해서 구두점을 제거하고, 대문자를 소문자로 변환.
normalized_text = []
for string in sent_text:
     tokens = re.sub(r"[^a-z0-9]+", " ", string.lower())
     normalized_text.append(tokens)

# 각 문장에 대해서 NLTK를 이용하여 단어 토큰화를 수행.
result = [word_tokenize(sentence) for sentence in normalized_text]

In [31]:
print('총 샘플의 개수 : {}'.format(len(result)))

총 샘플의 개수 : 273424


In [32]:
for line in result[:3]:
    print(line)

['here', 'are', 'two', 'reasons', 'companies', 'fail', 'they', 'only', 'do', 'more', 'of', 'the', 'same', 'or', 'they', 'only', 'do', 'what', 's', 'new']
['to', 'me', 'the', 'real', 'real', 'solution', 'to', 'quality', 'growth', 'is', 'figuring', 'out', 'the', 'balance', 'between', 'two', 'activities', 'exploration', 'and', 'exploitation']
['both', 'are', 'necessary', 'but', 'it', 'can', 'be', 'too', 'much', 'of', 'a', 'good', 'thing']


In [33]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

import numpy
from sklearn.metrics.pairwise import cosine_similarity



In [34]:
model = Word2Vec(sentences=result, vector_size=512, window=2,  min_count = 5, workers = 4,sg=1)

In [35]:
model_result = model.wv.most_similar("man")
print(model_result)

[('woman', 0.7456675171852112), ('guy', 0.6916314363479614), ('girl', 0.6650304794311523), ('lady', 0.6575959324836731), ('gentleman', 0.654638409614563), ('soldier', 0.6453593969345093), ('boy', 0.6297329664230347), ('cop', 0.611029326915741), ('dancer', 0.6102009415626526), ('rabbi', 0.6100612282752991)]


In [36]:
query = "the black cat sat on the couch and the brown dog slept on the rug"

In [37]:
embedding = [ model.wv[word] for word in word_tokenize(query)]

In [38]:
tokens = word_tokenize(query)

In [39]:
max = 0
for i in range(len(embedding)):
    for j in range(len(embedding)):
        value = cosine_similarity(numpy.array([embedding[i]]),numpy.array([embedding[j]]))
        if value[0][0] > max and tokens[i] != tokens[j]:
            max = value[0][0]
            print(max)
            print(tokens[i],tokens[j])
    

0.2759055
the black
0.2796517
the sat
0.3395251
the and
0.39774162
the rug
0.46662903
black cat
0.53861237
black brown
0.61383814
cat couch
0.72466075
cat dog
0.73215926
sat slept
0.7610631
couch rug


<h2> Positional Encoding

In [14]:
import numpy
import math
from sklearn.metrics.pairwise import cosine_similarity

In [12]:
def positional_encoding(pos,d_model = 512):
    pe = numpy.zeros(d_model)
    for i in range(0,d_model,2):
        pe[i] = math.sin(pos / (10000**((2*i)/d_model)))
        pe[i+1] = math.cos(pos / (10000**((2*i)/d_model)))
    return pe

In [23]:
pos2 = numpy.array([positional_encoding(2)])

In [24]:
pos10 = numpy.array([positional_encoding(10)])

In [25]:
cosine_similarity(pos2,pos10)

array([[0.86000133]])

In [27]:
def PE(y,pos,d_model = 512):
    pe = numpy.zeros(d_model)
    for i in range(0,512,2):
        pe[i] = math.sin(pos/(10000 ** ((2*i) / d_model)))
        pe[i] = (y[pos][i] * math.sqrt(d_model)) + pe[i]

        pe[i+1] = math.cos(pos/(10000 ** ((2*i) / d_model)))
        pe[i+1] = (y[pos][i+1] * math.sqrt(d_model)) + pe[i+1]
        
    return pe

In [47]:
black = embedding[2]
brown = embedding[10]

In [41]:
pe_black = PE(embedding,2)

In [42]:
pe_brown = PE(embedding,10)

In [49]:
word_similarity = cosine_similarity(numpy.array([black]),numpy.array([brown]))

In [50]:
positional_similarity = cosine_similarity(pos2,pos10)

In [51]:
final_similarity = cosine_similarity(numpy.array([pe_black]),numpy.array([pe_brown]))

In [55]:
print("word_similarity: {value}".format(value=word_similarity) )
print("positional_similarity: {value}".format(value=positional_similarity) )
print("final_similarity: {value}".format(value=final_similarity) )

word_similarity: [[0.72466075]]
positional_similarity: [[0.86000133]]
final_similarity: [[0.72887171]]
