In [203]:
# A->a B->?, Let's guess '?'

import torch
import torch.nn as nn
from torchsummary import summary

In [204]:
with open('../../dataset/NLP/tokens_wiki.txt', encoding='UTF-8') as f:
    token = f.read()
    
token = token.split()
print(token[:10])

['propaganda', 'is', 'a', 'concerted', 'set', 'of', 'messages', 'aimed', 'at', 'influencing']


In [205]:
from collections import Counter

# 시퀀스 만들기
def generate_sorted_words(tokens):
    counter = Counter(tokens)
    l = []
    for (word, cnt) in counter.most_common():
        l.append(word)
    return l

def generate_word2code(sorted_words): 
    d = {}
    for idx, word in enumerate(sorted_words):
        d[word] = idx
    return d

def convert_tokens_to_codes(tokens, word2code):
    return [word2code[word] for word in tokens]
    

# 시퀀스 기반으로 word-context 만들기
def generate_word_by_context(
    codes,                    # 시퀀스 (정수)
    max_vocab_words = 1000,   # 중심어 가능 코드 (정수코드 최댓값, 행)
    max_context_words = 1000, # 스캔 가능 코드 (정수코드 최댓값, 열)
    context_size = 2,         # 좌우 단어 개수
    weight_by_distance = True # 거리 고려 유무
):
    context = [[0 for _ in range(max_context_words)] for _ in range(max_vocab_words)]
    for idx, number in enumerate(codes):
        if number >= max_vocab_words:
            continue
        left = max(0, idx-context_size)
        right = min(len(codes)-1, idx+context_size)
        for i in range(left, right+1):
            if i == idx:
                continue
            if codes[i] >= max_context_words:
                continue
            if weight_by_distance:
                context[number][codes[i]] += 1 / abs(idx-i)
            else:
                context[number][codes[i]] += 1
    return context

In [311]:
sorted_word = generate_sorted_words(token)
word2code = generate_word2code(sorted_word)
codes = convert_tokens_to_codes(token, word2code)
context = generate_word_by_context(codes, max_vocab_words=10000, max_context_words=1000, context_size=4, weight_by_distance=True)

for i in range(10):
    for j in range(10):
        print(context[i][j], end=' ')
    print('')
    
# print(token[:10])
# print(codes[:10])

86321.49999993434 140434.916666573 146811.91666662702 218371.91666685182 73127.83333334778 104576.24999995445 68207.66666668332 17028.58333333861 29950.083333325125 24884.499999997257 
140434.916666573 97901.66666659727 34584.74999998814 50216.9166666931 112055.83333327656 51930.3333333527 27496.83333332475 38234.999999998705 18023.25000000103 23744.999999997235 
146811.91666662702 34584.74999998814 1513.9999999999886 40901.50000000441 25339.333333326445 51535.66666667933 18721.083333334413 23911.74999999613 16202.08333333642 10781.083333333627 
218371.91666685182 50216.9166666931 40901.50000000441 17275.166666670786 28603.916666655427 20769.999999998257 12971.250000002641 35277.083333327886 9844.50000000001 8918.24999999945 
73127.83333334778 112055.83333327656 25339.333333326445 28603.916666655427 9421.166666666344 20517.166666665733 15447.750000003129 15304.500000001972 7141.916666666315 6216.41666666631 
104576.24999995445 51930.3333333527 51535.66666667933 20769.999999998257 20517

In [371]:
class AutoEncoder(nn.Module):

	def __init__(self, context_words, d):
		super(AutoEncoder, self).__init__()
		self.encoder = nn.Sequential(
			nn.Linear(context_words, int((context_words+d)/2), bias=False),
			nn.Linear(int((context_words+d)/2), d, bias=False)
		)

		self.decoder = nn.Sequential(
			nn.Linear(d, int((context_words+d)/2), bias=False),
			nn.Linear(int((context_words+d)/2), context_words, bias=False)
		)

	def forward(self, x):
		x = self.encoder(x)
		x = self.decoder(x)
		return x

In [372]:
d = 200
model = AutoEncoder(context_words=1000, d=d)
model = model.to('cuda')
summary(model, (1000,))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                  [-1, 600]         600,000
            Linear-2                  [-1, 200]         120,000
            Linear-3                  [-1, 600]         120,000
            Linear-4                 [-1, 1000]         600,000
Total params: 1,440,000
Trainable params: 1,440,000
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.02
Params size (MB): 5.49
Estimated Total Size (MB): 5.52
----------------------------------------------------------------


In [373]:
epoch = 100
batch = 250
optim = torch.optim.Adam(params=model.parameters(), lr=10**-6)
loss_fn = nn.MSELoss()

In [315]:
context = torch.tensor(context)
context = nn.functional.normalize(context, dim=1)

In [374]:
from tqdm.auto import tqdm

for epoch_cnt in range(epoch):
	print_loss = 0

	for batch_cnt in range(int(10000/batch)):
		input = context[batch_cnt * batch : (batch_cnt + 1) * batch]
		input = input.to('cuda')
		output = model(input)

		loss = loss_fn(output, input)
		loss.backward()

		optim.step()
		print_loss += loss.item()
		
	print(round(print_loss, 6))

0.040363
0.040064
0.039733
0.039394
0.039055
0.038712
0.038365
0.038009
0.037643
0.037262
0.036865
0.036449
0.036013
0.035554
0.03507
0.034563
0.03403
0.033472
0.032888
0.03228
0.031647
0.030992
0.030314
0.029615
0.028898
0.028163
0.027414
0.026651
0.025878
0.025097
0.02431
0.02352
0.022729
0.021941
0.021157
0.020381
0.019617
0.018872
0.01815
0.017458
0.0168
0.016183
0.015614
0.015098
0.014638
0.014235
0.013891
0.013602
0.013371
0.013203
0.013102
0.013068
0.013102
0.01319
0.013324
0.013513
0.013757
0.014043
0.014361
0.014698
0.015049
0.015402
0.01574
0.016066
0.016383
0.016678
0.016934
0.017149
0.01731
0.017399
0.017415
0.017367
0.017269
0.017122
0.016924
0.016677
0.016386
0.016059
0.015715
0.015367
0.015026
0.014704
0.014402
0.01412
0.013858
0.013618
0.013401
0.013215
0.013067
0.012963
0.012899
0.012871
0.012872
0.012898
0.012945
0.01301
0.013092
0.01319
0.013304
0.013433


In [379]:
# france : paris = germanry : ?

word1 = "beijing"
word2 = "china"
word3 = "ottawa"

code_word1 = word2code[word1]
code_word2 = word2code[word2]
code_word3 = word2code[word3]

context_word1 = nn.functional.normalize(context[code_word1].to('cuda').unsqueeze(0))
context_word2 = nn.functional.normalize(context[code_word2].to('cuda').unsqueeze(0))
context_word3 = nn.functional.normalize(context[code_word3].to('cuda').unsqueeze(0))

word4_embedded = model.encoder(context_word2) - model.encoder(context_word1) + model.encoder(context_word3)

max_cos_sim = -1
idx = 0
for i in range(10000):
	data = context[i]
	cos_sim = nn.functional.cosine_similarity(word4_embedded, model.encoder(data.to('cuda')).unsqueeze(0))
	if cos_sim>max_cos_sim:
		max_cos_sim=cos_sim
		idx = i
print(max_cos_sim)
for item in word2code:
	if word2code[item] == idx:
		print(f"{word1} -> {word2} as {word3} -> {item}")
		break

tensor([0.9978], device='cuda:0', grad_fn=<SumBackward1>)
beijing -> china as ottawa -> minnesota


In [353]:
import torch

# 입력 데이터 전처리
with open('../../dataset/NLP/questions-words.txt') as f:
	txt = f.read()
txt = txt.split('\n')
txt = [s.lower().split() for s in txt if not s.startswith(':')]

input_words = []
for n in range(len(txt)):
    word1, word2, word3, word4 = txt[n]
    if word1 not in word2code or word2 not in word2code or word3 not in word2code or word4 not in word2code:
        continue
    code_word1 = word2code[word1]
    code_word2 = word2code[word2]
    code_word3 = word2code[word3]
    if code_word1 >= 10000 or code_word2 >= 10000 or code_word3 >= 10000:
        continue
    input_words.append((code_word1, code_word2, code_word3, word4))

# GPU로 모델 이동
model.to('cuda')

# 입력 데이터를 GPU로 이동
input_data = [(context[code_word1].to('cuda'), context[code_word2].to('cuda'), context[code_word3].to('cuda'), word4) for code_word1, code_word2, code_word3, word4 in input_words]

# 모든 데이터를 한 번에 모아서 모델에 입력
input_word2_embedded = torch.stack([model.encoder(context_word2) for context_word1, context_word2, context_word3, word4 in input_data])
input_word1_embedded = torch.stack([model.encoder(context_word1) for context_word1, context_word2, context_word3, word4 in input_data])
input_word3_embedded = torch.stack([model.encoder(context_word3) for context_word1, context_word2, context_word3, word4 in input_data])

# 유사도 계산을 위한 데이터를 GPU로 이동
context_embeddings = context.to('cuda')

# 모델 평가
correct = 0
for idx, (input_word1, input_word2, input_word3, word4) in enumerate(input_data):
    word4_embedded = input_word2_embedded[idx] - input_word1_embedded[idx] + input_word3_embedded[idx]

    # 코사인 유사도 계산
    cos_sim = torch.nn.functional.cosine_similarity(word4_embedded.unsqueeze(0), model.encoder(context_embeddings))
    max_cos_sim, max_idx = torch.max(cos_sim, dim=0)

    word4_idx = max_idx.item()
    ans = ''
    for item, code in word2code.items():
        if code == word4_idx:
            ans = item
            break
    #print(word4.lower(), ans)
    if word4.lower() == ans:
        correct += 1

print(correct / len(input_data) * 100)
print(correct)

2.8172621334357792
220
