## Word Embeddings with Neural Networks

In [9]:
import re
import nltk
import emoji
import numpy as np
from nltk.tokenize import word_tokenize

In [10]:
def get_dict(data):
    """
    Input: data a list of word indices
    Output: word dict
    """
    words = sorted(list(set(data)))
    n = len(words)
    idx = 0
    
    word2ind = {}
    ind2word = {}
    for k in words:
        word2ind[k] = idx
        ind2word[idx] = k
        idx += 1
    
    return word2ind, ind2word

### Cleaning and Tokenizing Data

In [11]:
corpus = 'Who ❤️ "word embeddings" in 2022? I do!!!'

In [12]:
data = re.sub(r'[,!?;-]+', '.', corpus)
data

'Who ❤️ "word embeddings" in 2022. I do.'

In [13]:
data = word_tokenize(data)
print(f"after the tokenization:{data}")

after the tokenization:['Who', '❤️', '``', 'word', 'embeddings', "''", 'in', '2022', '.', 'I', 'do', '.']


In [14]:
# clean the token
data = [x.lower() for x in data
        if x.isalpha() or x == '.' or emoji.get_emoji_regexp().search(x)]
data

  if x.isalpha() or x == '.' or emoji.get_emoji_regexp().search(x)]


['who', '❤️', 'word', 'embeddings', 'in', '.', 'i', 'do', '.']

In [15]:
def tokenize(corpus):
    data = re.sub(r'[,!?;-]+', '.', corpus)
    data = word_tokenize(data)
    data = [x.lower() for x in data
        if x.isalpha() or x == '.' or emoji.get_emoji_regexp().search(x)]
    
    return data 

### Sliding window of words

In [16]:
def get_windows(words, window_size):
    i = window_size
    while i < len(words)-window_size:
        center = words[i]
        context_words = words[(i-window_size):i] + words[(i+1):(i+window_size+1)]
        yield context_words, center
        i += 1

In [17]:
# test
for x, y in get_windows(['i', 'am', 'happy', 'because', 'i', 'am', 'learning'], 2):
    print(f'{x}\t{y}')

['i', 'am', 'because', 'i']	happy
['am', 'happy', 'i', 'am']	because
['happy', 'because', 'am', 'learning']	i


In [18]:
for x, y in get_windows(tokenize("Now it's your turn: try with your own sentence! This is the last time"), 1):
    print(f'{x}\t{y}')

['now', 'your']	it
['it', 'turn']	your
['your', 'try']	turn
['turn', 'with']	try
['try', 'your']	with
['with', 'own']	your
['your', 'sentence']	own
['own', '.']	sentence
['sentence', 'this']	.
['.', 'is']	this
['this', 'the']	is
['is', 'last']	the
['the', 'time']	last


  if x.isalpha() or x == '.' or emoji.get_emoji_regexp().search(x)]


### Transforming words into vectors

In [19]:
words = 'I am happy because I am learning'
test_token= tokenize(words)
print(test_token)
word2ind, ind2word = get_dict(test_token)  # sorted and indexed
print(word2ind)

['i', 'am', 'happy', 'because', 'i', 'am', 'learning']
{'am': 0, 'because': 1, 'happy': 2, 'i': 3, 'learning': 4}


In [20]:
center_word_vector = np.zeros(len(word2ind))
happy_idx = word2ind['happy']
center_word_vector[happy_idx] = 1 
center_word_vector

array([0., 0., 1., 0., 0.])

In [21]:
def one_hot_vector(word, word2ind):
    vector = np.zeros(len(word2ind))
    vector[word2ind[word]] = 1
    return vector

In [22]:
one_hot_vector('learning', word2ind)

array([0., 0., 0., 0., 1.])

In [23]:
# center word: happy
# context word: ['i', 'am', 'because', 'i']
context_words = ['i', 'am', 'because', 'i']
context_vector = [one_hot_vector(x, word2ind) for x in context_words]
context_vector

[array([0., 0., 0., 1., 0.]),
 array([1., 0., 0., 0., 0.]),
 array([0., 1., 0., 0., 0.]),
 array([0., 0., 0., 1., 0.])]

In [24]:
np.mean(context_vector, axis=0)

array([0.25, 0.25, 0.  , 0.5 , 0.  ])

In [25]:
def context_words_to_vector(context_words, word2ind):
    context_vector = [one_hot_vector(x, word2ind) for x in context_words]
    return np.mean(context_vector, axis=0)
    

In [26]:
# test
context_words_to_vector(['i', 'am', 'because', 'i'], word2ind)

array([0.25, 0.25, 0.  , 0.5 , 0.  ])

### A very simple training set

In [27]:
words

'I am happy because I am learning'

In [28]:
words_token = tokenize(words)
words_token

['i', 'am', 'happy', 'because', 'i', 'am', 'learning']

In [30]:
for context_words, center_word in get_windows(words_token, 2):
    print(f'Context words:  {context_words} -> {context_words_to_vector(context_words, word2ind)}')
    print(f'Center word:  {center_word} -> {one_hot_vector(center_word, word2ind)}')
    print()

Context words:  ['i', 'am', 'because', 'i'] -> [0.25 0.25 0.   0.5  0.  ]
Center word:  happy -> [0. 0. 1. 0. 0.]

Context words:  ['am', 'happy', 'i', 'am'] -> [0.5  0.   0.25 0.25 0.  ]
Center word:  because -> [0. 1. 0. 0. 0.]

Context words:  ['happy', 'because', 'am', 'learning'] -> [0.25 0.25 0.25 0.   0.25]
Center word:  i -> [0. 0. 0. 1. 0.]



In [33]:
def get_training_example(words, c, word2ind):
    for context_words, center_word in get_windows(words, c):
        yield context_words_to_vector(context_words, word2ind), one_hot_vector(center_word, word2ind)

In [35]:
# Print vectors associated to center and context words for corpus using the generator function
for context_words_vector, center_word_vector in get_training_example(words_token, 2, word2ind):
    print(f'Context words vector:  {context_words_vector}')
    print(f'Center word vector:  {center_word_vector}')
    print()

Context words vector:  [0.25 0.25 0.   0.5  0.  ]
Center word vector:  [0. 0. 1. 0. 0.]

Context words vector:  [0.5  0.   0.25 0.25 0.  ]
Center word vector:  [0. 1. 0. 0. 0.]

Context words vector:  [0.25 0.25 0.25 0.   0.25]
Center word vector:  [0. 0. 0. 1. 0.]



### Activation functions 

In [37]:
# we have 'i am happy because i am learning'
# vocabulary size = 5
# z1 = w1 x + b1 w1: 
z1 = 10 * np.random.rand(5, 1) - 5
print(z1.shape, z1)

(5, 1) [[-3.13199432]
 [ 3.15124387]
 [-1.37668968]
 [-3.36197041]
 [ 2.3675512 ]]


In [38]:
h = z1.copy()
h[h<0] = 0 
h

array([[0.        ],
       [3.15124387],
       [0.        ],
       [0.        ],
       [2.3675512 ]])

In [40]:
def relu(z):
    result = z.copy()
    result[result<0] = 0
    return result

In [41]:
# Define a new vector and save it in the 'z' variable
z = np.array([[-1.25459881], [ 4.50714306], [ 2.31993942], [ 0.98658484], [-3.4398136 ]])

# Apply ReLU to it
relu(z)

array([[0.        ],
       [4.50714306],
       [2.31993942],
       [0.98658484],
       [0.        ]])

In [42]:
def softmax(z):
    e_z = np.exp(z)
    sum_ez = np.sum(e_z)
    return e_z/sum_ez

In [43]:
softmax([9, 8, 11, 10, 8.5])

array([0.08276948, 0.03044919, 0.61158833, 0.22499077, 0.05020223])

In [44]:
np.sum(softmax([9, 8, 11, 10, 8.5])) == 1

True

### Training a model

In [45]:
# fix the size of the word embedding vector
N = 3
V = 5

In [47]:
# z = w1x + b, x.shape = 5x1, z.shape = 3x1, then w1 = 3x5
# y = w2h + b2, h.shape = 3x1, w2 = 5x3
W1 = np.random.rand(3, 5)
W1

array([[0.26753195, 0.58445581, 0.98514387, 0.11956682, 0.57574174],
       [0.44381793, 0.75855231, 0.17901368, 0.8219056 , 0.73204009],
       [0.66595379, 0.55649323, 0.18355884, 0.14809481, 0.22912759]])

In [50]:
b1 = np.random.rand(3, 1)
b1

array([[0.84014581],
       [0.75432424],
       [0.35306756]])

In [48]:
W2 = np.random.rand(5, 3)
W2

array([[0.19538232, 0.80931918, 0.10330246],
       [0.3506357 , 0.20557748, 0.45469874],
       [0.50281739, 0.04734632, 0.73640318],
       [0.63200585, 0.18889334, 0.86938257],
       [0.95444699, 0.77347653, 0.30710177]])

In [51]:
b2 = np.random.rand(5, 1)
b2

array([[0.04265582],
       [0.02695106],
       [0.68063716],
       [0.80160903],
       [0.8633817 ]])

In [53]:
words_token

['i', 'am', 'happy', 'because', 'i', 'am', 'learning']

In [54]:
print(word2ind, ind2word)

{'am': 0, 'because': 1, 'happy': 2, 'i': 3, 'learning': 4} {0: 'am', 1: 'because', 2: 'happy', 3: 'i', 4: 'learning'}


In [55]:
training_examples = get_training_example(words_token, 2, word2ind)

In [56]:
x_array, y_array = next(training_examples)
print(x_array, y_array)

[0.25 0.25 0.   0.5  0.  ] [0. 0. 1. 0. 0.]


In [61]:
# reshape the vector
x = x_array.copy()
x.shape = (V, 1)
y = y_array.copy()
y.shape = (V, 1)
print('x:\n', x, '\n', 'y:\n', y)

x:
 [[0.25]
 [0.25]
 [0.  ]
 [0.5 ]
 [0.  ]] 
 y:
 [[0.]
 [0.]
 [1.]
 [0.]
 [0.]]


In [62]:
W1 @ x

array([[0.27278035],
       [0.71154536],
       [0.37965916]])

In [63]:
# np.dot is also matrix multiplication
np.dot(W1, x)

array([[0.27278035],
       [0.71154536],
       [0.37965916]])

In [64]:
z1 = W1 @ x + b1
z1


array([[1.11292616],
       [1.4658696 ],
       [0.73272672]])

In [65]:
h = relu(z1)
h

array([[1.11292616],
       [1.4658696 ],
       [0.73272672]])

In [66]:
z2 = W2 @ h + b2
z2

array([[1.52215077],
       [1.0517024 ],
       [1.8492216 ],
       [2.41889773],
       [3.28444813]])

In [67]:
y_hat = softmax(z2)
y_hat

array([[0.08858147],
       [0.0553388 ],
       [0.12285362],
       [0.21716765],
       [0.51605846]])

In [68]:
ind2word[np.argmax(y_hat)]

'learning'

In [69]:
def cross_entropy_loss(y_predicted, y_actual):
    loss = np.sum(-np.log(y_predicted)*y_actual)  # element wise multiplication 
    return loss

In [70]:
cross_entropy_loss(y_hat, y)

2.096761752309792

### Backpropagation

In [71]:
grad_b2 = y_hat -y
grad_b2

array([[ 0.08858147],
       [ 0.0553388 ],
       [-0.87714638],
       [ 0.21716765],
       [ 0.51605846]])

In [72]:
grad_w2 = np.dot(y_hat - y, h.T)
grad_w2

array([[ 0.09858464,  0.12984888,  0.06490601],
       [ 0.061588  ,  0.08111947,  0.04054822],
       [-0.97619916, -1.28578222, -0.64270859],
       [ 0.24169156,  0.31833946,  0.15912454],
       [ 0.57433496,  0.75647441,  0.37812982]])

In [73]:
grad_b1 = relu(np.dot(W2.T, y_hat - y))
grad_b1

array([[0.22546823],
       [0.48171807],
       [0.        ]])

In [74]:
grad_w1 = np.dot(relu(np.dot(W2.T, y_hat-y)), x.T)
grad_w1

array([[0.05636706, 0.05636706, 0.        , 0.11273412, 0.        ],
       [0.12042952, 0.12042952, 0.        , 0.24085904, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ]])

### Gradient Descent

In [75]:
# learning rate
alpha = 0.03

In [77]:
w1_update = W1 - alpha * grad_w1
print(grad_w1, '\n', w1_update)

[[0.05636706 0.05636706 0.         0.11273412 0.        ]
 [0.12042952 0.12042952 0.         0.24085904 0.        ]
 [0.         0.         0.         0.         0.        ]] 
 [[0.26584094 0.5827648  0.98514387 0.11618479 0.57574174]
 [0.44020505 0.75493943 0.17901368 0.81467983 0.73204009]
 [0.66595379 0.55649323 0.18355884 0.14809481 0.22912759]]
