In [44]:
import numpy as np
from tqdm import tqdm_notebook
import re

# Chuẩn bị dữ liệu 

In [45]:
def get_word_piece(word, n_gram=3):
    word_pieces = []
    word = "<"+word+">"
    for index, w in enumerate(word):
        if index<=len(word)-n_gram:
            word_pieces.append(word[index:index+n_gram])
    return word_pieces

def tokenize(text):
    # obtains tokens with a least 1 alphabet
    pattern = re.compile(r'[A-Za-z]+[\w^\']*|[\w^\']*[A-Za-z]+[\w^\']*')
    word_tokens = pattern.findall(text.lower())
    word_piece_tokens = []
    for word_token in word_tokens:
        word_piece_token = get_word_piece(word_token)
        for piece in word_piece_token:
            word_piece_tokens.append(piece)
    return word_piece_tokens


def mapping(tokens):
    word_to_id = dict()
    id_to_word = dict()

    for i, token in enumerate(set(tokens)):
        word_to_id[token] = i
        id_to_word[i] = token

    return word_to_id, id_to_word

def generate_training_data(tokens, word_to_id, window_size):
    N = len(tokens)
    X, Y = [], []

    for i in range(N):
        nbr_inds = list(range(max(0, i - window_size), i)) + \
                   list(range(i + 1, min(N, i + window_size + 1)))
        for j in nbr_inds:
            X.append(word_to_id[tokens[i]])
            Y.append(word_to_id[tokens[j]])
            
    X = np.array(X)
#     X = np.expand_dims(X, axis=0)
    Y = np.array(Y)
#     Y = np.expand_dims(Y, axis=0)
            
    return X, Y

In [69]:
doc = "After the deduction of the costs of his investing, " \
      "Cosin similarity beating the stock market is a loser's game."
tokens = tokenize(doc)
tokens

['<af',
 'aft',
 'fte',
 'ter',
 'er>',
 '<th',
 'the',
 'he>',
 '<de',
 'ded',
 'edu',
 'duc',
 'uct',
 'cti',
 'tio',
 'ion',
 'on>',
 '<of',
 'of>',
 '<th',
 'the',
 'he>',
 '<co',
 'cos',
 'ost',
 'sts',
 'ts>',
 '<of',
 'of>',
 '<hi',
 'his',
 'is>',
 '<in',
 'inv',
 'nve',
 'ves',
 'est',
 'sti',
 'tin',
 'ing',
 'ng>',
 '<co',
 'cos',
 'osi',
 'sin',
 'in>',
 '<si',
 'sim',
 'imi',
 'mil',
 'ila',
 'lar',
 'ari',
 'rit',
 'ity',
 'ty>',
 '<be',
 'bea',
 'eat',
 'ati',
 'tin',
 'ing',
 'ng>',
 '<th',
 'the',
 'he>',
 '<st',
 'sto',
 'toc',
 'ock',
 'ck>',
 '<ma',
 'mar',
 'ark',
 'rke',
 'ket',
 'et>',
 '<is',
 'is>',
 '<a>',
 '<lo',
 'los',
 'ose',
 'ser',
 "er'",
 "r's",
 "'s>",
 '<ga',
 'gam',
 'ame',
 'me>']

In [70]:
get_word_piece("after")

['<af', 'aft', 'fte', 'ter', 'er>']

In [71]:
word_to_id, id_to_word = mapping(tokens)
word_to_id

{'ves': 0,
 '<in': 1,
 'ame': 2,
 'ost': 3,
 'est': 4,
 'inv': 5,
 '<th': 6,
 'ing': 7,
 'of>': 8,
 'mil': 9,
 'tin': 10,
 'lar': 11,
 'rit': 12,
 'his': 13,
 'the': 14,
 'ty>': 15,
 'toc': 16,
 "'s>": 17,
 'me>': 18,
 'sts': 19,
 '<hi': 20,
 'gam': 21,
 'duc': 22,
 'cti': 23,
 'on>': 24,
 'mar': 25,
 'nve': 26,
 '<ga': 27,
 'ark': 28,
 'ck>': 29,
 'edu': 30,
 'ati': 31,
 '<is': 32,
 'in>': 33,
 '<st': 34,
 '<co': 35,
 'tio': 36,
 '<ma': 37,
 'sim': 38,
 '<lo': 39,
 'is>': 40,
 'ose': 41,
 'er>': 42,
 'ter': 43,
 'ng>': 44,
 'osi': 45,
 '<si': 46,
 'eat': 47,
 'los': 48,
 "r's": 49,
 '<a>': 50,
 'aft': 51,
 '<of': 52,
 'sin': 53,
 'sti': 54,
 'rke': 55,
 "er'": 56,
 '<af': 57,
 'ity': 58,
 'ts>': 59,
 'ock': 60,
 'ded': 61,
 'uct': 62,
 'ket': 63,
 'et>': 64,
 'cos': 65,
 'fte': 66,
 'sto': 67,
 'he>': 68,
 'ion': 69,
 'imi': 70,
 '<be': 71,
 'bea': 72,
 '<de': 73,
 'ser': 74,
 'ari': 75,
 'ila': 76}

In [72]:
X, Y = generate_training_data(tokens, word_to_id, 3)
X.shape

(534,)

In [73]:
Y.shape

(534,)

In [74]:
for i in range(10):
    print(id_to_word[X[i]],"-->", id_to_word[Y[i]])

<af --> aft
<af --> fte
<af --> ter
aft --> <af
aft --> fte
aft --> ter
aft --> er>
fte --> <af
fte --> aft
fte --> ter


In [75]:
X[:5]

array([57, 57, 57, 51, 51])

In [76]:
Y[:5]

array([51, 66, 43, 57, 66])

In [77]:
from keras.utils import to_categorical
X_train = to_categorical(X)

In [78]:
X_train[1]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)

In [79]:
Y_train = to_categorical(Y)
Y_train[1]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)

In [80]:
print(X_train.shape, Y_train.shape)

(534, 77) (534, 77)


# Huấn luyện 

In [81]:
from keras.models import Sequential
from keras.layers import Dense, Activation
import keras

In [82]:
size_of_vocab = X_train.shape[1]
size_of_vocab

77

In [83]:
emb_size = 112

In [84]:
model = Sequential()
model.add(Dense(emb_size, activation='linear', input_dim=size_of_vocab ))
model.add(Dense(size_of_vocab, activation='linear'))
model.add(Activation("softmax"))

In [85]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_5 (Dense)              (None, 112)               8736      
_________________________________________________________________
dense_6 (Dense)              (None, 77)                8701      
_________________________________________________________________
activation_3 (Activation)    (None, 77)                0         
Total params: 17,437
Trainable params: 17,437
Non-trainable params: 0
_________________________________________________________________


In [86]:
model.compile(loss='binary_crossentropy', optimizer=keras.optimizers.Adam(lr=0.001), metrics=['accuracy'])

In [87]:
model.fit(X_train, Y_train, epochs=100, batch_size=10)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x7f25082b7fd0>

# Get trained word embedding weights

In [88]:
Word_embeding = model.get_weights()[0]

In [89]:
Word_embeding

array([[ 0.29162848, -0.07783902, -0.2280984 , ...,  0.5298977 ,
        -0.44802997, -0.70827174],
       [-0.01633759, -0.3140511 , -0.28413194, ..., -0.19401434,
        -0.35291213, -0.07612595],
       [ 0.23913978, -0.505189  ,  0.23383209, ..., -0.12183629,
         0.14598939,  0.0737638 ],
       ...,
       [-0.409593  , -0.24502864,  0.11214375, ..., -0.01677183,
         0.26012078, -0.32655013],
       [ 0.271678  ,  0.40443122, -0.03090577, ...,  0.36683407,
        -0.42701516,  0.5837345 ],
       [ 0.18708327,  0.1078411 , -0.2054083 , ...,  0.15078776,
        -0.14029126,  0.51986206]], dtype=float32)

# Testing get embedding of the word "after"

In [90]:
Word_embeding.shape

(77, 112)

In [91]:
embbeding = None
word_pieces = get_word_piece("after", n_gram=3)
for piece in word_pieces:
    piece_id = word_to_id[piece]
    if embbeding is None:
        embbeding = Word_embeding[piece_id]
    else:
        embbeding += Word_embeding[piece_id]
embbeding = embbeding/len(word_pieces)
print(embbeding)

[-0.01444846 -0.431602   -0.00264669  0.17870767  0.3952416   0.27600938
  0.14046016 -0.36320776  0.19632185 -0.16066098  0.01900725 -0.1814755
 -0.12838139 -0.22161512 -0.00749634  0.26762637 -0.20738968 -0.085875
 -0.0401297   0.2765072  -0.2681163   0.28341728  0.15625647 -0.36464217
  0.1364804  -0.29135412 -0.36955887 -0.03564329  0.33625397  0.1507515
  0.00240508 -0.05599285 -0.00340888 -0.323869    0.3481112  -0.14642613
 -0.11769992  0.312607   -0.17820561  0.05731595  0.24385338  0.41541559
 -0.23995419  0.520674    0.04787733  0.05834616 -0.11685461 -0.24486664
 -0.1847879   0.34670573 -0.02185403  0.3039195   0.22808722  0.31143373
 -0.20058432  0.29002178 -0.13666964 -0.5770857  -0.37277004  0.3246476
  0.00832421 -0.4108111   0.5126812  -0.16593562  0.18165083 -0.4060672
  0.2746221  -0.3246849   0.17303386  0.20828936  0.030694   -0.32349133
  0.17342229  0.362675    0.01897696 -0.34025058 -0.10872617  0.22097048
  0.1455799   0.07556392  0.6033536  -0.27384478 -0.20572

# Testing get embedding of word "losing" not in the dictionary but it's word piece existed

In [98]:
embbeding = None
word_pieces = get_word_piece("losing", n_gram=3)
print(word_pieces)
for word_piece in word_pieces:
    print(word_piece, word_to_id[word_piece])
for piece in word_pieces:
    piece_id = word_to_id[piece]
    if embbeding is None:
        embbeding = Word_embeding[piece_id]
    else:
        embbeding += Word_embeding[piece_id]
embbeding = embbeding/len(word_pieces)
print(embbeding)

['<lo', 'los', 'osi', 'sin', 'ing', 'ng>']
<lo 39
los 48
osi 45
sin 53
ing 7
ng> 44
[ 0.5233029  -0.18932785  0.01923718 -0.6361206   0.2346127   0.37669668
 -0.7926662  -0.1076906   0.3485751   0.7007458   1.0222338  -0.981503
 -0.29068643  0.43090343 -0.85105735  0.5052257  -0.5531355   0.23221801
  0.831013    0.18873595  0.1506164   0.28947803  0.5842615   0.5527968
 -0.18634443  1.1940554  -0.12421032  0.3061869   0.02212839  0.09907737
  0.8860903  -0.89186317  0.52304065  0.7043198  -0.21277428 -0.13694693
  0.6182614   0.0039833   0.40062726 -1.059151    0.44069275 -0.20310889
  0.06289941 -0.7391157  -0.6868355  -0.62666565 -0.7596021   0.56353647
  0.53589016 -0.40051767  0.91064864  0.5102177   0.00672227  0.65918183
  1.21952    -0.94920826 -0.9105738   0.75623727  0.01226761  1.3328598
  1.2092388  -0.43106627 -0.31395558 -0.18870063 -0.04480935  0.26839593
 -0.61750036  0.85443044 -0.00763851  1.1012146  -0.61211395  0.59797055
  0.41346368 -0.16219763 -0.47783998 -0.2397