케라스로 Word2Vec 구현

1. CBOW

  - 데이터 전처리

In [2]:
# 데이터 로드
from sklearn.datasets import fetch_20newsgroups

dataset = fetch_20newsgroups(shuffle=True, random_state=1, 
                             remove=('headers', 'footers', 'quotes'))
documents = dataset.data
print(len(documents))

11314


In [3]:
import re
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

def clean_text(d):
    pattern = r'[^a-zA-Z\s]'
    text = re.sub(pattern, '', d)
    return text

def clean_stopword(d):
    stop_words = stopwords.words('english')
    return ' '.join([w.lower() for w in d.split() if w not in stop_words and len(w) > 3])

def tokenize(d):
    return word_tokenize(d)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bitcamp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bitcamp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# 판다스 데이터프레임으로 변환
import pandas as pd
news_df = pd.DataFrame({'article' : documents})

news_df.replace("", float("NaN"), inplace=True)
news_df.dropna(inplace=True)
len(news_df)

11096

In [5]:
# 정규표현식 적용
news_df['article'] = news_df['article'].apply(clean_text)

# 불용어 제거
news_df['article'] = news_df['article'].apply(clean_stopword)

# 토크나이즈
tokenized_news = news_df['article'].apply(tokenize)
tokenized_news = tokenized_news.to_list()

In [6]:
import numpy as np

drop_news = [index for index, sentence in enumerate(tokenized_news) if len(sentence) <= 1]
news_texts = np.delete(tokenized_news, drop_news, axis=0)
len(news_texts)

  return array(a, dtype, copy=False, order=order)


10945

In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer

news_2000 = news_texts[:2000]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(news_2000)

idx2word = {value:key for key, value in tokenizer.word_index.items()}
sequences = tokenizer.texts_to_sequences(news_2000)

In [11]:
vocab_size = len(tokenizer.word_index) + 1

print(vocab_size)
print(sequences[1])

29769
[1263, 457, 2, 60, 119, 419, 61, 1374, 22, 69, 3498, 397, 6874, 412, 1173, 373, 2256, 458, 59, 12478, 458, 1900, 3850, 397, 22, 10, 4325, 8749, 177, 303, 136, 154, 664, 12479, 316, 12480, 15, 12481, 4, 790, 12482, 12483, 4917, 8750]


2. Skip-gram

   - 네거티브 샘플링(Negative Sampling)
     - Word2Vec은 출력층이 내놓는 값에 소프트맥스 함수를 적용해 학률값으로 변환한 후 이를 정답과 비교해 역전파(Backpropagation)
     - 소프트맥스를 적용하려면 분모에 해당하는 값, 즉 중심단어와 나머지 모든 단어의 내적을 한 뒤 이를 다시 exp 계산을 하는데 전체 단어가 많을 경우 엄청난 계산량 발생
     - 네거티브 샘플링은 소프트맥스 확률을 구할 때 전체 단어를 대상으로 구하지 않고, 일부 단어만 뽑아서 계산을 하는 방식
     - 네거티브 샘플링 동작은 사용자가 지정한 윈도우 사이즈 내에 등장하지 않는 단어(negative sample)를 5~20개 정도 뽑고, 이를 정답 단어와 합쳐 전체 단어처럼 소프트맥스 확률을 계산하여 파라미터 업데이트

In [12]:
# 샘플링
from tensorflow.keras.preprocessing.sequence import skipgrams

skip_grams = [skipgrams(sample, vocabulary_size=vocab_size, window_size=10) for sample in sequences[:10]]

 
pairs, labels = skip_grams[0][0], skip_grams[0][1]
for i in range(5):
    print("{:s}({:d}), {:s}({:d}) -> {:d}".format(
        idx2word[pairs[i][0]], pairs[i][0],
        idx2word[pairs[i][1]], pairs[i][1],
        labels[i]
    ))
    
print(len(skip_grams))
print(len(pairs))
print(len(labels))

think(7), atrocities(4915) -> 1
ruin(12474), devilschicagoblackhawks(27523) -> 0
commited(4916), soldiers(864) -> 1
existance(2744), least(114) -> 1
report(627), what(34) -> 1
10
2420
2420


In [13]:
skip_grams = [skipgrams(seq, vocabulary_size=vocab_size, window_size=10) for seq in sequences]

  - Skipgram 모델 구성

In [14]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Embedding, Reshape, Activation, Input, Dot
from tensorflow.keras.utils import plot_model

embed_size = 50

# 모델 생성
def word2vec():
    target_inputs = Input(shape=(1, ), dtype='int32')
    target_embedding = Embedding(vocab_size, embed_size)(target_inputs)
    
    context_inputs = Input(shape=(1, ), dtype='int32')
    context_embedding = Embedding(vocab_size, embed_size)(context_inputs)
    
    dot_product = Dot(axes=2)([target_embedding, context_embedding])
    dot_product = Reshape((1,), input_shape=(1, 1))(dot_product) 
    output = Activation('sigmoid')(dot_product)
    
    model = Model(inputs=[target_inputs, context_inputs], outputs=output)
    model.compile(loss='binary_crossentropy', optimizer='adam')
    
    return model

In [15]:
model = word2vec()
model.summary()
plot_model(model, show_shapes=True, show_layer_names=True)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 embedding (Embedding)          (None, 1, 50)        1488450     ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, 1, 50)        1488450     ['input_2[0][0]']                
                                                                                              

In [17]:
# 모델 훈련
for epoch in range(1, 11):
    loss = 0
    for _, elem in enumerate(skip_grams):
        first_elem = np.array(list(zip(*elem[0]))[0], dtype='int32')
        second_elem = np.array(list(zip(*elem[0]))[1], dtype='int32')
        labels = np.array(elem[1], dtype='int32')
        X = [first_elem, second_elem]
        Y = labels
        loss += model.train_on_batch(X, Y)
        
    print('Epoch : ', epoch, "Loss : ", loss)

Epoch :  1 Loss :  1224.5839930176735
Epoch :  2 Loss :  923.0617003887892
Epoch :  3 Loss :  826.4403100013733
Epoch :  4 Loss :  781.8529791980982
Epoch :  5 Loss :  756.450382605195
Epoch :  6 Loss :  737.7231607586145
Epoch :  7 Loss :  719.2551698684692
Epoch :  8 Loss :  697.2652211561799
Epoch :  9 Loss :  670.0547076575458
Epoch :  10 Loss :  637.1437278930098


In [16]:
import gensim

f = open('skipgram.txt', 'w')
f.write('{} {}\n'.format(vocab_size-1, embed_size))
vectors = model.get_weights()[0]
for word, i in tokenizer.word_index.items():
    f.write('{} {}\n'.format(word, ' '.join(map(str, list(vectors[i, :])))))
f.close()

skipgrams = gensim.models.KeyedVectors.load_word2vec_format('skipgram.txt', binary=False)

In [17]:
skipgrams.most_similar(positive=['soldier'])

[('rocketship', 0.557788074016571),
 ('apparent', 0.5494934320449829),
 ('meaxtfcxcxtexcfxcbnxxixte', 0.538479745388031),
 ('myriads', 0.5340452790260315),
 ('grider', 0.5026323795318604),
 ('eaten', 0.5016666054725647),
 ('laomer', 0.492244154214859),
 ('missouri', 0.4906013309955597),
 ('homosexuals', 0.4789530634880066),
 ('mussinas', 0.47832080721855164)]

In [18]:
skipgrams.most_similar(positive=['word'])

[('xtry', 0.5104212760925293),
 ('edelweiss', 0.4897487461566925),
 ('athens', 0.48410314321517944),
 ('appears', 0.4814095199108124),
 ('wordings', 0.4810832142829895),
 ('breakfast', 0.47731584310531616),
 ('jesuss', 0.47518065571784973),
 ('oftentimes', 0.47448301315307617),
 ('abhor', 0.47324761748313904),
 ('jointly', 0.4714374542236328)]

CBOW

  - CBOW 전처리

In [24]:
def skipgram2cbow(skipgrams):
    cbows = []
    flag = 0
    for n in skipgrams:
        temp1 = []
        for t in n:
            if flag == 1:
                flag = 0
                temp1.append(t)
            else:
                flag = 1
                temp2 = []
                for x in t:
                    temp2.append([x[1], x[0]])
                temp1.append(temp2)
        cbows.append(temp1)
    return cbows
    

In [20]:
cbows = skipgram2cbow(skip_grams)

In [21]:
pairs, labels = cbows[0][0], cbows[0][1]
for i in range(5):
    print("{:s}({:d}), {:s}({:d}) -> {:d}".format(
        idx2word[pairs[i][0]], pairs[i][0],
        idx2word[pairs[i][1]], pairs[i][1],
        labels[i]))

print(len(cbows))
print(len(pairs))
print(len(labels))

walkertiscom(17317), realize(966) -> 0
avengers(7744), what(34) -> 0
mlohfvcpj(26153), after(346) -> 0
proisraeli(12476), rediculous(12475) -> 1
treating(3849), received(387) -> 1
2000
2420
2420


 - CBOW 모델 학습

In [22]:
model = word2vec()
model.summary()
plot_model(model, show_shapes=True, show_layer_names=True)

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, 1, 50)        1488450     ['input_3[0][0]']                
                                                                                                  
 embedding_3 (Embedding)        (None, 1, 50)        1488450     ['input_4[0][0]']                
                                                                                            

In [23]:
# 모델 훈련
for epoch in range(1, 10):
    loss = 0
    for _, elem in enumerate(cbows):
        first_elem = np.array(list(zip(*elem[0]))[0], dtype='int32')
        second_elem = np.array(list(zip(*elem[0]))[1], dtype='int32')
        labels = np.array(elem[1], dtype='int32')
        X = [first_elem, second_elem]
        Y = labels
        loss += model.train_on_batch(X, Y)
        
    print('Epoch : ', epoch, "Loss : ", loss)

Epoch :  1 Loss :  1226.4911558181047
Epoch :  2 Loss :  924.3692298233509
Epoch :  3 Loss :  826.8954248577356
Epoch :  4 Loss :  781.359145000577
Epoch :  5 Loss :  754.5315544530749
Epoch :  6 Loss :  733.9005327746272
Epoch :  7 Loss :  713.3908961378038
Epoch :  8 Loss :  689.6724891737103
Epoch :  9 Loss :  661.4013098366559


In [None]:
import gensim

f = open('cbow.txt', 'w')
f.write('{} {}\n'.format(vocab_size-1, embed_size))
vectors = model.get_weights()[0]
for word, i in tokenizer.word_index.items():
    f.write('{} {}\n'.format(word, ' '.join(map(str, list(vectors[i, :])))))
f.close()

skipgrams = gensim.models.KeyedVectors.load_word2vec_format('cbow.txt', binary=False)

In [None]:
cbows.most_similar(positive=['soldier'])

In [None]:
cbows.most_similar(positive=['word'])