## Word Embeddings with Neural Networks

In [3]:
import re
import nltk
import emoji
import numpy as np
from nltk.tokenize import word_tokenize

In [None]:
def get_dict(data):
    """
    Build forward and reverse vocabulary lookups.

    Input: data an iterable of hashable, mutually comparable tokens
           (e.g. a list of words); duplicates are ignored
    Output: (word2ind, ind2word) where word2ind maps each unique token to
            its position in sorted order and ind2word is the inverse mapping
    """
    word2ind = {}
    ind2word = {}
    # Sorting the unique tokens makes the index assignment deterministic.
    for idx, word in enumerate(sorted(set(data))):
        word2ind[word] = idx
        ind2word[idx] = word

    return word2ind, ind2word

### Cleaning and Tokenizing Data

In [4]:
# Sample sentence mixing an emoji, quotes, digits and repeated punctuation;
# it is the input for the cleaning and tokenizing steps below.
corpus = 'Who ❤️ "word embeddings" in 2022? I do!!!'

In [10]:
# Collapse each run of interrupting punctuation (, ! ? ; -) into one period.
data = re.compile(r'[,!?;-]+').sub('.', corpus)
data

'Who ❤️ "word embeddings" in 2022. I do.'

In [11]:
# Split the normalized string into tokens with NLTK; `data` is rebound from
# a string to the resulting list of tokens.
data = word_tokenize(data)
print(f"after the tokenization:{data}")

after the tokenization:['Who', '❤️', '``', 'word', 'embeddings', "''", 'in', '2022', '.', 'I', 'do', '.']


In [13]:
# Clean the tokens: lowercase them and keep only alphabetic words,
# periods, and tokens that contain at least one emoji.
# emoji.emoji_count() is used instead of emoji.get_emoji_regexp(), which was
# deprecated and then removed in emoji 2.0.
data = [x.lower() for x in data
        if x.isalpha() or x == '.' or emoji.emoji_count(x) > 0]
data

  if x.isalpha() or x == '.' or emoji.get_emoji_regexp().search(x)]


['who', '❤️', 'word', 'embeddings', 'in', '.', 'i', 'do', '.']

In [14]:
def tokenize(corpus):
    """
    Clean a raw text string and split it into lowercase tokens.

    Input: corpus a string of raw text
    Output: list of lowercased tokens containing only alphabetic words,
            periods, and tokens that contain at least one emoji
    """
    # Collapse each run of interrupting punctuation (, ! ? ; -) into one period.
    data = re.sub(r'[,!?;-]+', '.', corpus)
    data = word_tokenize(data)
    # emoji.emoji_count() is used instead of emoji.get_emoji_regexp(), which
    # was deprecated and then removed in emoji 2.0.
    data = [x.lower() for x in data
            if x.isalpha() or x == '.' or emoji.emoji_count(x) > 0]

    return data

### Sliding window of words

In [15]:
def get_windows(words, window_size):
    """
    Yield (context_words, center_word) pairs from a sliding window.

    Input: words a list of tokens
           window_size number of context tokens taken on each side of the
           center word
    Output: generator of (context_words, center_word) tuples, one for every
            position that has window_size tokens on both sides; nothing is
            yielded when words is too short for a full window
    """
    for i in range(window_size, len(words) - window_size):
        # The center moves with the window; indexing with window_size here
        # would return the same word for every window.
        center = words[i]
        context_words = words[i - window_size:i] + words[i + 1:i + window_size + 1]
        yield context_words, center

In [None]:
# Smoke test: print each context window next to its center word.
sample_words = ['i', 'am', 'happy', 'because', 'i', 'am', 'learning']
for context, center in get_windows(sample_words, 2):
    print(f'{context}\t{center}')