### Word2Vec

In [1]:
import numpy as np

c = np.array([[1, 0, 0, 0, 0, 0, 0]]) # one-hot encoding
W = np.random.randn(7, 3) # random한 weights 구함
print(W)
h = np.matmul(c, W) # W_in과 c(input word)를 matmul 시켜서 행뽑기를 하는 것임
print(h)

[[-1.90066591  1.63807606 -0.84538405]
 [ 1.30965265 -0.63327602  1.44838936]
 [-0.29903844 -0.38673405 -0.84598899]
 [ 0.29202567  0.83916396 -0.98961364]
 [ 0.1370709   2.31409889 -0.72664352]
 [-0.6505606   1.37691074  1.17466226]
 [-0.54524173 -0.06539849  1.81200707]]
[[-1.90066591  1.63807606 -0.84538405]]


In [4]:
# common.layer(만든 함수)에 Matmul 함수를 넣어서 똑같은 결과를 봅시다
import sys
sys.path.append('..')
import numpy as np
from common.layers import Matmul

c = np.array([[1, 0, 0, 0, 0, 0, 0]])
W = np.random.randn(7, 3)
print(W)
layer = Matmul(W)
h = layer.forward(c) # weight와 연산 할 x를 넣어주는 것임
print(h)

[[ 1.42292931  1.51499091 -0.1182692 ]
 [ 0.29217769  0.48745789  0.50117567]
 [-1.49255114 -0.38795633  0.39407587]
 [-1.77329012  0.98185021 -0.15736354]
 [ 0.39865025  0.54464587 -1.1002162 ]
 [-0.76040161 -0.84158808 -1.02621694]
 [-0.83752641  1.68234119 -0.72969668]]
[[ 1.42292931  1.51499091 -0.1182692 ]]


### CBOW

In [7]:
import sys
sys.path.append('..')
import numpy as np
from common.layers import Matmul

# 샘플 context 데이터
c0 = np.array([[1, 0, 0, 0, 0, 0, 0]])
c1 = np.array([[0, 1, 0, 0, 0, 0, 0]])

# 가중치 초기화
# 두개의 weight는 tranfer된 상태의 행렬이 아님!!
W_in = np.random.randn(7, 3) 
W_out = np.random.randn(3, 7)

# 계층 생성
in_layer0 = Matmul(W_in) # input(context)가 두개가 들어오기 때문
in_layer1 = Matmul(W_in)
out_layer = Matmul(W_out)

# forward
h0 = in_layer0.forward(c0)
h1 = in_layer1.forward(c1)
h = 0.5 * (h0 + h1) # contexts가 모여서 평균 내는 구간
s = out_layer.forward(h)

print(s)

[[-3.59836701  1.34803703 -1.44426032  2.02358586 -1.91176526 -2.58540393
  -2.1412246 ]]


In [8]:
# 어떻게 동작하는지를 알았으니, 이제 실제 text를 넣어서 학습시켜보기
import sys
sys.path.append('..')
from common.util import preprocess

text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
print(corpus)

[0 1 2 3 4 1 5 6]


In [9]:
print(id_to_word)

{0: 'you', 1: 'say', 2: 'goodbye', 3: 'and', 4: 'i', 5: 'hello', 6: '.'}


In [11]:
def create_contexts_target(corpus, window_size=1):
    '''
    context, target 생성
    corpus: 말뭉치 (단어 ID 목록)
    window_size: 윈도우 크기
    return (context, target)의 array
    '''

    target = corpus[window_size:-window_size]
    contexts = []

    for idx in range(window_size, len(corpus)-window_size): 
        cs = []
        # window 크기만큼 타겟 단어 좌우 context 가져오기 
        # [-1, 0, 1] 이렇게 배치되어있을 때 0은 target, -1, 1은 contexts를 의미함
        for t in range(-window_size, window_size+1):
            if t != 0:
                cs.append(corpus[idx + t]) # context인 경우 cs에 추가 / idx의 의미는 해당 corpus[idx]가 target이라는 말
        contexts.append(cs)
    
    return np.array(contexts), np.array(target)

In [14]:
import sys
sys.path.append('..')
# from common.util import create_contexts_target

contexts, target = create_contexts_target(corpus, window_size=1)
print(contexts)
print(target)

[[0 2]
 [1 3]
 [2 4]
 [3 1]
 [4 5]
 [1 6]]
[1 2 3 4 1 5]


In [15]:
def convert_one_hot(corpus, vocab_size):
    '''
    원핫 표현으로 변환
    corpus: 단어 ID 목록(1차원 또는 2차원 넘파이 배열)
    vocab_size: 어휘 수
    return: 원핫 표현(2차원 또는 3차원 넘파이 배열)
    '''
    N = corpus.shape[0]
    if corpus.ndim == 1:
        one_hot = np.zeros((N, vocab_size), dtype=np.int32)
        for idx, word_id in enumerate(corpus):
            one_hot[idx, word_id] = 1

    elif corpus.ndim == 2:
        C = corpus.shape[1]
        one_hot = np.zeros((N, C, vocab_size), dtype=np.int32)
        for idx_0, word_ids in enumerate(corpus):
            for idx_1, word_id in enumerate(word_ids):
                one_hot[idx_0, idx_1, word_id] = 1

    return one_hot

In [20]:
import sys
sys.path.append('..')
# from common.util import preprocess, create_contexts_target, convert_one_hot

text = 'You say goodby and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)

contexts, target = create_contexts_target(corpus, window_size=1)

vocab_size = len(word_to_id)
target = convert_one_hot(target, vocab_size)
contexts = convert_one_hot(contexts, vocab_size)

In [21]:
print(target)

[[0 1 0 0 0 0 0]
 [0 0 1 0 0 0 0]
 [0 0 0 1 0 0 0]
 [0 0 0 0 1 0 0]
 [0 1 0 0 0 0 0]
 [0 0 0 0 0 1 0]]


In [22]:
print(contexts)

[[[1 0 0 0 0 0 0]
  [0 0 1 0 0 0 0]]

 [[0 1 0 0 0 0 0]
  [0 0 0 1 0 0 0]]

 [[0 0 1 0 0 0 0]
  [0 0 0 0 1 0 0]]

 [[0 0 0 1 0 0 0]
  [0 1 0 0 0 0 0]]

 [[0 0 0 0 1 0 0]
  [0 0 0 0 0 1 0]]

 [[0 1 0 0 0 0 0]
  [0 0 0 0 0 0 1]]]


### CBOW 모델 구현

In [24]:
import sys
sys.path.append('..')
import numpy as np
from common.layers import Matmul, SoftmaxWithLoss

class SimpleCBOW:
    def __init__(self, vocab_size, hidden_size):
        V, H = vocab_size, hidden_size

        # 가중치 초기화
        W_in = 0.01 * np.random.randn(V, H).astype('f') # astype('f'): 부동소수점 형식
        W_out = 0.01 * np.random.randn(H, V).astype('f')

        # 레이어 생성
        self.in_layer0 = Matmul(W_in)
        self.in_layer1 = Matmul(W_in)
        self.out_layer = Matmul(W_out)
        self.loss_layer = SoftmaxWithLoss()

        # 모든 가중치와 기울기를 리스트에 모으기
        layers = [self.in_layer0, self.in_layer1, self.out_layer]
        self.params, self.grads = [], []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

        # 인스턴스 변수에 단어의 분산 표현 저장 # 무슨 용도지?
        self.word_vecs1 = W_in
        self.word_vecs2 = W_out.T

    def forward(self, contexts, target):
        h0 = self.in_layer0.forward(contexts)
        h1 = self.in_layer1.forward(contexts)
        h = 0.5 * (h0 + h1)
        print("h", h.shape)
        score = self.out_layer.forward(h)
        print(score.shape, target.shape)
        loss = self.loss_layer.forward(score, target)
        return loss
    
    def backward(self, dout = 1):
        ds = self.loss_layer.backward(dout)
        da = self.out_layer.backward(ds)
        da *= 0.5
        self.in_layer1.backward(da)
        self.in_layer0.backward(da)
        return None

In [29]:
# import sys
# sys.path.append('..')
from common.trainer import Trainer
from common.optimization import Adam
# from simple_cbow import SimpleCBOW
from common.util import preprocess, create_contexts_target, convert_one_hot

window_size = 1
hidden_size = 5
batch_size = 3
max_epoch = 1000

text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)

vocab_size = len(word_to_id)
# cbow 학습 데이터셋 생성
contexts, target = create_contexts_target(corpus, window_size)
# Input에 맞는 one-hot 표현 변환
target = convert_one_hot(target, vocab_size)
contexts = convert_one_hot(contexts, vocab_size)
# contexts.shape -> (6,2,7) context 벡터 크기 (6,7). target 주변의 context 벡터 2개, 
print(contexts.shape, target.shape)
# 모델 초기화
model = SimpleCBOW(vocab_size, hidden_size)
optimizer = Adam()
trainer = Trainer(model, optimizer)

(6, 2, 7) (6, 7)


In [30]:
trainer.fit(contexts, target, max_epoch, batch_size)

h (3, 2, 5)
(3, 2, 7) (3, 7)


IndexError: shape mismatch: indexing arrays could not be broadcast together with shapes (3,) (3,7) 