## Word Embeddings with Neural Networks

In [3]:
import re
import nltk
import emoji
import numpy as np
from nltk.tokenize import word_tokenize

In [27]:
def get_dict(data):
    """
    Build the two vocabulary lookup tables for a tokenized corpus.

    Input:
        data: an iterable of hashable, mutually comparable tokens
              (typically a list of word strings); duplicates are allowed.
    Output:
        word2ind: dict mapping each distinct token to its index
        ind2word: dict mapping each index back to its token

    Indices are assigned in sorted token order, starting at 0, so the
    mapping is deterministic for a given vocabulary.
    """
    # Deduplicate, then sort so that index assignment is reproducible.
    vocabulary = sorted(set(data))

    word2ind = {word: idx for idx, word in enumerate(vocabulary)}
    ind2word = dict(enumerate(vocabulary))

    return word2ind, ind2word

### Cleaning and Tokenizing Data

In [4]:
# Sample sentence mixing an emoji, quotes, digits and repeated punctuation,
# used by the following cells to demonstrate each cleaning step.
corpus = 'Who ❤️ "word embeddings" in 2022? I do!!!'

In [10]:
# Replace every run of the characters , ! ? ; - with a single period.
data = re.sub(r'[,!?;-]+', '.', corpus)
data

'Who ❤️ "word embeddings" in 2022. I do.'

In [11]:
# Split the punctuation-normalized string into a list of tokens with NLTK.
data = word_tokenize(data)
print(f"after the tokenization:{data}")

after the tokenization:['Who', '❤️', '``', 'word', 'embeddings', "''", 'in', '2022', '.', 'I', 'do', '.']


In [13]:
# Clean the tokens: keep alphabetic tokens, periods and tokens containing an
# emoji, and lower-case whatever is kept.
# emoji.emoji_count() is used instead of emoji.get_emoji_regexp(), which is
# deprecated (it emits a warning) and is no longer available in emoji >= 2.0.
data = [x.lower() for x in data
        if x.isalpha() or x == '.' or emoji.emoji_count(x) > 0]
data

  if x.isalpha() or x == '.' or emoji.get_emoji_regexp().search(x)]


['who', '❤️', 'word', 'embeddings', 'in', '.', 'i', 'do', '.']

In [14]:
def tokenize(corpus):
    """
    Turn a raw text string into a list of cleaned, lower-cased tokens.

    Input:
        corpus: the text to tokenize (a string)
    Output:
        data: list of tokens; only alphabetic tokens, '.' and tokens that
              contain an emoji are kept, and every kept token is lower-cased
    """
    # Collapse each run of , ! ? ; - into a single period.
    data = re.sub(r'[,!?;-]+', '.', corpus)
    data = word_tokenize(data)
    # emoji.emoji_count() replaces emoji.get_emoji_regexp(), which is
    # deprecated and no longer available in emoji >= 2.0; it also avoids
    # fetching the emoji pattern again for every token.
    data = [x.lower() for x in data
            if x.isalpha() or x == '.' or emoji.emoji_count(x) > 0]

    return data

### Sliding window of words

In [17]:
def get_windows(words, window_size):
    """
    Slide a window over `words` and yield (context_words, center) pairs.

    Input:
        words: sequence of tokens (sliceable and concatenable, e.g. a list)
        window_size: number of context tokens taken on each side of the center
    Output:
        generator of (context_words, center) tuples, where context_words is
        the `window_size` tokens before the center followed by the
        `window_size` tokens after it

    Only positions with a full window on both sides are used, so the first
    and last `window_size` tokens never appear as a center.
    """
    last_center = len(words) - window_size
    for position in range(window_size, last_center):
        left = words[position - window_size:position]
        right = words[position + 1:position + window_size + 1]
        yield left + right, words[position]

In [18]:
# Test: print every (context words, center word) pair for a half-window of 2.
for x, y in get_windows(['i', 'am', 'happy', 'because', 'i', 'am', 'learning'], 2):
    print(f'{x}\t{y}')

['i', 'am', 'because', 'i']	happy
['am', 'happy', 'i', 'am']	because
['happy', 'because', 'am', 'learning']	i


In [23]:
# Same demo on a raw sentence: tokenize it first, then slide a window with
# one context word on each side of the center.
for x, y in get_windows(tokenize("Now it's your turn: try with your own sentence! This is the last time"), 1):
    print(f'{x}\t{y}')

['now', 'your']	it
['it', 'turn']	your
['your', 'try']	turn
['turn', 'with']	try
['try', 'your']	with
['with', 'own']	your
['your', 'sentence']	own
['own', '.']	sentence
['sentence', 'this']	.
['.', 'is']	this
['this', 'the']	is
['is', 'last']	the
['the', 'time']	last


  if x.isalpha() or x == '.' or emoji.get_emoji_regexp().search(x)]


### Transforming words into vectors

In [36]:
# Tokenize a sample sentence and build its vocabulary lookup tables.
words = 'I am happy because I am learning'
test_token= tokenize(words)
print(test_token)
word2ind, ind2word = get_dict(test_token)  # distinct tokens, sorted, indexed from 0
print(word2ind)

['i', 'am', 'happy', 'because', 'i', 'am', 'learning']
{'am': 0, 'because': 1, 'happy': 2, 'i': 3, 'learning': 4}


In [37]:
# Build the one-hot vector for 'happy' by hand: a zero vector with one entry
# per vocabulary word, then a 1 at the word's index.
center_word_vector = np.zeros(len(word2ind))
happy_idx = word2ind['happy']
center_word_vector[happy_idx] = 1 
center_word_vector

array([0., 0., 1., 0., 0.])

In [38]:
def one_hot_vector(word, word2ind):
    """
    Encode `word` as a one-hot NumPy vector.

    Input:
        word: the token to encode; must be a key of word2ind
        word2ind: dict mapping each vocabulary token to its index
    Output:
        float array of length len(word2ind) that is 0 everywhere except
        for a 1 at position word2ind[word]
    """
    position = word2ind[word]
    encoded = np.zeros(len(word2ind))
    encoded[position] = 1
    return encoded

In [39]:
# Check the helper on 'learning' (the cell displays the returned vector).
one_hot_vector('learning', word2ind)

array([0., 0., 0., 0., 1.])

In [40]:
# Center word: happy
# Context words (window of 2 on each side): ['i', 'am', 'because', 'i']
# Encode each context word as its own one-hot vector.
context_words = ['i', 'am', 'because', 'i']
context_vector = [one_hot_vector(x, word2ind) for x in context_words]
context_vector

[array([0., 0., 0., 1., 0.]),
 array([1., 0., 0., 0., 0.]),
 array([0., 1., 0., 0., 0.]),
 array([0., 0., 0., 1., 0.])]

In [41]:
# Average the one-hot vectors element-wise to get one vector for the whole
# context; a word that occurs twice ('i') contributes twice.
np.mean(context_vector, axis=0)

array([0.25, 0.25, 0.  , 0.5 , 0.  ])

In [42]:
def context_words_to_vector(context_words, word2ind):
    """
    Represent a list of context words as the mean of their one-hot vectors.

    Input:
        context_words: iterable of tokens, each a key of word2ind
        word2ind: dict mapping each vocabulary token to its index
    Output:
        float array of length len(word2ind) holding the element-wise
        average of the one-hot encodings of the context words
    """
    encoded_words = []
    for token in context_words:
        encoded_words.append(one_hot_vector(token, word2ind))
    return np.mean(encoded_words, axis=0)
    