In [1]:
import tensorflow as tf
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.layers import Dense, Embedding, Input, Add, Dot, Reshape, Flatten
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.sequence import skipgrams
from tensorflow.python.keras.models import Model, load_model

import tarfile
from urllib.request import urlretrieve
import os
import nltk
from scipy.sparse import csr_matrix, lil_matrix
import numpy as np
from scipy.sparse import save_npz, load_npz
from tensorflow.python.keras import backend as K
import random
import matplotlib.pyplot as plt
from tensorflow.python.keras.utils import plot_model
import pandas as pd
%matplotlib inline

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Chuẩn bị dữ liệu 

In [2]:
def read_data(filename, n_lines):
    """ Reading the zip file to extract text """
    docs = []
    i = 0
    with open(filename, 'r', encoding='utf-8') as f:
        for row in f:
            file_string = nltk.word_tokenize(row)
            # First token is the movie ID
            docs.append(' '.join(file_string[1:]))
            i += 1
            if n_lines and i == n_lines:
                break
    return docs

In [3]:
docs = read_data("MovieSummaries/plot_summaries.txt", 100)

In [4]:
docs[1]

"The nation of Panem consists of a wealthy Capitol and twelve poorer districts . As punishment for a past rebellion , each district must provide a boy and girl between the ages of 12 and 18 selected by lottery for the annual Hunger Games . The tributes must fight to the death in an arena ; the sole survivor is rewarded with fame and wealth . In her first Reaping , 12-year-old Primrose Everdeen is chosen from District 12 . Her older sister Katniss volunteers to take her place . Peeta Mellark , a baker 's son who once gave Katniss bread when she was starving , is the other District 12 tribute . Katniss and Peeta are taken to the Capitol , accompanied by their frequently drunk mentor , past victor Haymitch Abernathy . He warns them about the `` Career '' tributes who train intensively at special academies and almost always win . During a TV interview with Caesar Flickerman , Peeta unexpectedly reveals his love for Katniss . She is outraged , believing it to be a ploy to gain audience supp

In [5]:
vocab_size = 3000
tokenizer = Tokenizer(num_words=vocab_size, oov_token='UNK')
tokenizer.fit_on_texts(docs)

In [6]:
doc_0 = tokenizer.texts_to_sequences(docs)
doc_0[0]

[1,
 5,
 621,
 327,
 1,
 1190,
 4,
 1,
 5,
 1,
 2087,
 5,
 1548,
 70,
 1549,
 197,
 4,
 234,
 32,
 1,
 1550,
 29,
 27,
 96,
 73,
 803,
 35,
 44]

In [7]:
docs[0]

"Shlykov , a hard-working taxi driver and Lyosha , a saxophonist , develop a bizarre love-hate relationship , and despite their prejudices , realize they are n't so different after all ."

In [8]:
tokenizer.sequences_to_texts(doc_0)[0]

"UNK a hard working UNK driver and UNK a UNK develop a bizarre love hate relationship and despite their UNK realize they are n't so different after all"

In [9]:
tokenizer.word_index

{'UNK': 1,
 'the': 2,
 'to': 3,
 'and': 4,
 'a': 5,
 'of': 6,
 'in': 7,
 'is': 8,
 'his': 9,
 'he': 10,
 "'s": 11,
 'her': 12,
 'with': 13,
 'that': 14,
 'for': 15,
 'him': 16,
 'by': 17,
 'on': 18,
 'as': 19,
 'she': 20,
 'who': 21,
 'but': 22,
 'at': 23,
 'from': 24,
 'it': 25,
 'an': 26,
 'are': 27,
 'has': 28,
 'they': 29,
 'when': 30,
 'up': 31,
 'their': 32,
 'into': 33,
 'out': 34,
 'after': 35,
 'then': 36,
 "''": 37,
 'be': 38,
 'not': 39,
 'kid': 40,
 'was': 41,
 'which': 42,
 'this': 43,
 'all': 44,
 'one': 45,
 'about': 46,
 'them': 47,
 'while': 48,
 'father': 49,
 '’': 50,
 'film': 51,
 'where': 52,
 'back': 53,
 'man': 54,
 'jubei': 55,
 'will': 56,
 'time': 57,
 'cao': 58,
 's': 59,
 'before': 60,
 'other': 61,
 'however': 62,
 'off': 63,
 'can': 64,
 'tells': 65,
 'life': 66,
 'two': 67,
 'does': 68,
 'through': 69,
 'love': 70,
 'had': 71,
 'get': 72,
 'so': 73,
 'girl': 74,
 'only': 75,
 'wife': 76,
 'bhagwaan': 77,
 'have': 78,
 'now': 79,
 'home': 80,
 'family': 81

# Tính ma trận đồng xuất hiện 
* lil_matrix của scipy.sparse hỗ trợ lưu trữ và tính toán trên các ma trận thưa hiệu năng cao

In [10]:
def generate_cooc_matrix(text, tokenizer, window_size, n_vocab, use_weighting=True):
    sequences = tokenizer.texts_to_sequences(text)
    
    cooc_mat = lil_matrix((n_vocab, n_vocab), dtype=np.float32)
    
    for sequence in sequences:
        ## lay tung cua so mot
        for i, wi in zip(np.arange(window_size, len(sequence)-window_size), sequence[window_size:-window_size]):
            # lay chi so cua cac cua so
            context_window = sequence[i-window_size: i+window_size+1]
            # tinh khoang cach de khoi tao trong so
            distances = np.abs(np.arange(-window_size, window_size+1))
            distances[window_size] = 1.0
            nom = np.ones(shape=(window_size*2 + 1,), dtype=np.float32)
            nom[window_size] = 0.0

            if use_weighting:
                cooc_mat[wi, context_window] += nom/distances
            else:
                cooc_mat[wi, context_window] += nom
    
    return cooc_mat

In [11]:
generate_cooc = False
window_size = 4
if generate_cooc:
    cooc_mat = generate_cooc_matrix(docs, tokenizer, window_size, vocab_size, True)
    save_npz('cooc_mat.npz', cooc_mat.tocsr())
else:
    cooc_mat = load_npz('cooc_mat.npz').tolil()
    print('Cooc matrix of type {} was loaded from disk'.format(type(cooc_mat).__name__))

Cooc matrix of type lil_matrix was loaded from disk


In [12]:
cooc_mat.shape

(3000, 3000)

In [13]:
cooc_mat.getrow(1).toarray()[0][:5]

array([  0.     , 669.749  , 874.4147 , 501.83417, 612.0003 ],
      dtype=float32)

# Định nghĩa GloVe model với Keras
Hàm mất mát của GloVe được định nghĩa như sau

$J = \sum_{i,j=1}^{V} f(X_{ij})(w_i^T\tilde{w}_j + b_i + \tilde{b}_j - log(X_{ij}))^2$

Ở đây $X_{ij}$ là phần tử  $i,j$  của ma trận đồng xuất hiện và $w,\tilde{w},b,\tilde{b}$ are weights and biases.

Đồng thời, hàm f được lựa chọn là :
$f(x) = (x/x_{max})^\alpha$ nếu $x < x_{max}$ và $f(x)=1$ ngược lại.

Tác giả lựa chọn $\alpha=3/4, x_{max}=100$

Hàm mất mát trên có dạng như sau,

$J = \sum A ( B - C)^2$

In [14]:
def glove_loss(y_true, y_pred):
    return K.sum(K.pow((y_true-1)/100.0, 0.75)*K.square(y_pred - K.log(y_true)))

In [15]:
vocab_size = 3000

In [28]:
from keras.models import Model
from keras.layers import Embedding, Input, Add, Dot
import keras

In [29]:
w_i = Input(shape=(1,))
w_j = Input(shape=(1,))
    ## size of dense vector = 96
emb_i = Embedding(input_dim=vocab_size, output_dim=96, input_length=1)(w_i)
emb_j = Embedding(input_dim=vocab_size, output_dim=96, input_length=1)(w_j)

ij_dot = Dot(axes=-1)([emb_i,emb_j])
    
b_i = Embedding(vocab_size, 1, input_length=1)(w_i)
b_j = Embedding(vocab_size, 1, input_length=1)(w_j)
    ## predict wT*w+bi+bj
pred = Add()([ij_dot, b_i, b_j])
    
model = Model(inputs=[w_i, w_j],outputs=pred)
model.compile(loss=glove_loss, optimizer=keras.optimizers.Adam(lr=0.0001))

In [30]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_9 (Embedding)         (None, 1, 96)        288000      input_5[0][0]                    
__________________________________________________________________________________________________
embedding_10 (Embedding)        (None, 1, 96)        288000      input_6[0][0]                    
__________________________________________________________________________________________________
dot_3 (Dot

# Huấn luyện và đánh giá 

In [31]:
batch_size =16
copy_docs = list(docs)
index2word = dict(zip(tokenizer.word_index.values(), tokenizer.word_index.keys()))
""" Each epoch """
for ep in range(5):
    
    #valid_words = get_valid_words(docs, 20, tokenizer)
    ## xáo trộn lại bộ dữ liệu sau mỗi epoch
#     random.shuffle(copy_docs)
    losses = []
    """ Duyệt qua các document """
    for doc in copy_docs:
        
        seq = tokenizer.texts_to_sequences([doc])[0]

        """ Getting skip-gram data """
        # Tạo ra các cặp ngẫu nhiên để cập nhật tham số
        wpairs, labels = skipgrams(
            sequence=seq, vocabulary_size=vocab_size, window_size=6, negative_samples=0.0, shuffle=True
        )
        
        if len(wpairs)==0:
            continue
#         print("len(seq)",len(seq))
#         print(seq)
#         print("len(wpairs)", len(wpairs))
#         print(wpairs)
        sg_in, sg_out = zip(*wpairs)
        ## đầu vào
        sg_in, sg_out = np.array(sg_in).reshape(-1,1), np.array(sg_out).reshape(-1,1)
        ## nhãn
        x_ij = cooc_mat[sg_in[:,0], sg_out[:,0]].toarray().reshape(-1,1) + 1
#         print(sg_in.shape)
#         print(len(seq))
#         print(seq)
#         print(sg_in[1], sg_out[1])
#         print(labels)
#         print(cooc_mat[sg_in[:,0], sg_out[:,0]].toarray().reshape(-1,1)+1)
        model.fit([sg_in, sg_out], x_ij, batch_size = batch_size, epochs=1, verbose=0)
        loss = model.evaluate([sg_in, sg_out], x_ij, batch_size=batch_size, verbose=0)
        losses.append(loss)
    print('Loss trung bình ở epoch {}: {}'.format(ep, np.mean(losses)))

Loss trung bình ở epoch 0: 1663.1114875682701
Loss trung bình ở epoch 1: 803.1005166501362
Loss trung bình ở epoch 2: 526.8423363520943
Loss trung bình ở epoch 3: 402.9512019259093
Loss trung bình ở epoch 4: 337.03478341915286


# Lấy embedding weights

In [32]:
model.save("glove_weights.h5")

In [38]:
embedding_weight = None
for layer in model.layers:
    if layer.name == 'embedding_9':
        embedding_weight = layer.get_weights()[0]
    if layer.name == 'embedding_10':
        embedding_weight = embedding_weight + layer.get_weights()[0]
        break
print(embedding_weight)

[[-0.04734902  0.02360551 -0.03272984 ... -0.01553813 -0.00681086
   0.03427841]
 [ 0.5238979  -0.29287356  0.32245368 ... -0.467974   -0.09829116
   0.2598217 ]
 [ 0.47304267 -0.35983425  0.3644105  ... -0.42344904 -0.16215557
   0.37350252]
 ...
 [ 0.13633487 -0.24083564  0.18599996 ... -0.204664   -0.13266142
   0.19938882]
 [ 0.09343953 -0.22414808  0.22733359 ... -0.25340846 -0.17432213
   0.17909451]
 [ 0.24181968 -0.14836887  0.24850723 ... -0.17887104 -0.18795313
   0.236525  ]]


In [39]:
embedding_weight = np.array(embedding_weight)
## 3000 is vocab's size and 96 is embedding's size 
embedding_weight.shape

(3000, 96)

# Tính embedding của từ his

In [42]:
tokenizer.word_index

{'UNK': 1,
 'the': 2,
 'to': 3,
 'and': 4,
 'a': 5,
 'of': 6,
 'in': 7,
 'is': 8,
 'his': 9,
 'he': 10,
 "'s": 11,
 'her': 12,
 'with': 13,
 'that': 14,
 'for': 15,
 'him': 16,
 'by': 17,
 'on': 18,
 'as': 19,
 'she': 20,
 'who': 21,
 'but': 22,
 'at': 23,
 'from': 24,
 'it': 25,
 'an': 26,
 'are': 27,
 'has': 28,
 'they': 29,
 'when': 30,
 'up': 31,
 'their': 32,
 'into': 33,
 'out': 34,
 'after': 35,
 'then': 36,
 "''": 37,
 'be': 38,
 'not': 39,
 'kid': 40,
 'was': 41,
 'which': 42,
 'this': 43,
 'all': 44,
 'one': 45,
 'about': 46,
 'them': 47,
 'while': 48,
 'father': 49,
 '’': 50,
 'film': 51,
 'where': 52,
 'back': 53,
 'man': 54,
 'jubei': 55,
 'will': 56,
 'time': 57,
 'cao': 58,
 's': 59,
 'before': 60,
 'other': 61,
 'however': 62,
 'off': 63,
 'can': 64,
 'tells': 65,
 'life': 66,
 'two': 67,
 'does': 68,
 'through': 69,
 'love': 70,
 'had': 71,
 'get': 72,
 'so': 73,
 'girl': 74,
 'only': 75,
 'wife': 76,
 'bhagwaan': 77,
 'have': 78,
 'now': 79,
 'home': 80,
 'family': 81

In [43]:
embedding_weight[tokenizer.word_index['his']-1]

array([ 0.56928164, -0.49628985,  0.48866215, -0.52556205,  0.4171348 ,
       -0.56177014,  0.4760769 ,  0.5888324 , -0.49591023,  0.58702576,
       -0.53860706,  0.5598077 ,  0.34337765, -0.51072955, -0.32305774,
       -0.4731152 ,  0.55655754,  0.3424262 ,  0.5840586 ,  0.49474412,
       -0.5133854 ,  0.3983161 , -0.40498334, -0.5340134 ,  0.5653129 ,
       -0.6197592 , -0.47593993, -0.5426516 ,  0.4645278 , -0.293815  ,
       -0.4483766 , -0.4602381 ,  0.62275493, -0.54747045, -0.4960499 ,
        0.520857  , -0.51416737,  0.4158871 , -0.37949127, -0.36097464,
        0.4237674 , -0.55578005, -0.57555294, -0.51318556, -0.44903332,
        0.5150433 ,  0.35691977, -0.45317954, -0.2826779 ,  0.3498969 ,
       -0.6165937 , -0.5322361 ,  0.51673627, -0.3059477 , -0.26244393,
       -0.35902175, -0.52702636, -0.5049096 ,  0.50229585,  0.47296536,
       -0.60040504,  0.53765404,  0.57628286, -0.37819743, -0.24559835,
       -0.6733581 , -0.48287928,  0.3815036 ,  0.6196392 , -0.55

# Tính toán khoảng cách của các word embedding 

In [44]:
from scipy.spatial import distance
distance.euclidean(embedding_weight[tokenizer.word_index['his']-1], embedding_weight[tokenizer.word_index['he']-1])

0.6767275333404541

In [51]:
distance.euclidean(embedding_weight[tokenizer.word_index['his']-1], embedding_weight[tokenizer.word_index['like']-1])

1.4823721647262573

In [49]:
distance.euclidean(embedding_weight[tokenizer.word_index['small']-1], embedding_weight[tokenizer.word_index['big']-1])

1.1149847507476807

In [50]:
distance.euclidean(embedding_weight[tokenizer.word_index['small']-1], embedding_weight[tokenizer.word_index['little']-1])

0.6185420155525208