In [4]:
import re
import nltk
import emoji
import numpy as np
from nltk.tokenize import word_tokenize
from utils2 import get_dict

In [5]:
# cleaning and tokenization

In [6]:
corpus = 'Who ❤️ "word embeddings" in 2020? I do!!!'

In [8]:
# Show the raw corpus, then replace every run of the characters , ! ? ; -
# with a single '.' so that sentence-level punctuation is normalized to periods.
print(f'Corpus: {corpus}')
data=re.sub(r'[,!?;-]+','.',corpus)
print(f'After cleaning punctuation: {data}')

Corpus: Who ❤️ "word embeddings" in 2020? I do!!!
After cleaning punctuation: Who ❤️ "word embeddings" in 2020. I do.


In [9]:
print(f'Initial string: {data}')
# Split the cleaned string into tokens.
# NOTE: `data` is rebound here from a str to a list of tokens, so re-running this
# cell on its own would hand that list (not the cleaned string) to word_tokenize.
data=nltk.word_tokenize(data)
print(f'After tokenization: {data}')

Initial string: Who ❤️ "word embeddings" in 2020. I do.
After tokenization: ['Who', '❤️', '``', 'word', 'embeddings', "''", 'in', '2020', '.', 'I', 'do', '.']


In [14]:
def _has_emoji(token):
    """Return True if `token` contains at least one emoji."""
    # emoji.get_emoji_regexp() was removed in emoji 2.0; emoji_count() is the
    # replacement available there. Keep the regexp path for older releases.
    if hasattr(emoji, 'get_emoji_regexp'):
        return emoji.get_emoji_regexp().search(token) is not None
    return emoji.emoji_count(token) > 0

# Keep alphabetic tokens, the '.' sentence marker and emoji; lowercase what is kept.
# Everything else (numbers such as '2020', quote tokens) is dropped.
data=[token.lower() for token in data if token.isalpha() or token=='.' or _has_emoji(token)]
print(f'Data after cleaning: {data}')

Data after cleaning: ['who', '❤️', 'word', 'embeddings', 'in', '.', 'i', 'do', '.']


In [15]:
# wrapping the cleaning and tokenization steps into a single function
def _contains_emoji(token):
    """Return True if `token` contains at least one emoji.

    emoji.get_emoji_regexp() only exists in emoji < 2.0 (it was removed in 2.0);
    on newer releases emoji.emoji_count() is used instead, so the check works
    with either version of the package.
    """
    if hasattr(emoji, 'get_emoji_regexp'):
        return emoji.get_emoji_regexp().search(token) is not None
    return emoji.emoji_count(token) > 0


def tokenize(corpus):
    """Clean a raw text corpus and split it into lowercase tokens.

    Steps:
      1. Replace every run of the characters , ! ? ; - with a single '.'.
      2. Tokenize the result with nltk.word_tokenize.
      3. Keep only tokens that are purely alphabetic, are exactly '.', or
         contain an emoji; lowercase the tokens that are kept.

    Args:
        corpus: the text to tokenize (str).

    Returns:
        A list of lowercase token strings.
    """
    data = re.sub(r'[,!?;-]+', '.', corpus)
    data = nltk.word_tokenize(data)
    # The emoji check comes last so it only runs for tokens that are neither
    # alphabetic nor the '.' marker.
    return [token.lower() for token in data
            if token.isalpha()
            or token == '.'
            or _contains_emoji(token)]

In [19]:
# Rebind `corpus` to a short sample sentence and tokenize it with the function above;
# `words` is the token list used by the cells that follow.
corpus="I am happy because I am learning"
words=tokenize(corpus)

In [20]:
print(f'Words (tokens): {words}')

Words (tokens): ['i', 'am', 'happy', 'because', 'i', 'am', 'learning']


In [21]:
# sliding window of words

In [22]:
def get_windows(words, C):
    """Slide a window over `words`, yielding (context_words, center_word) pairs.

    Args:
        words: sequence of tokens.
        C: number of context words taken on each side of the center word.

    Yields:
        A tuple (context_words, center_word), where context_words is the C
        tokens before the center followed by the C tokens after it. Only
        positions that have C tokens on both sides are used as centers.
    """
    for center_idx in range(C, len(words) - C):
        before = words[center_idx - C:center_idx]
        after = words[center_idx + 1:center_idx + C + 1]
        yield before + after, words[center_idx]

In [24]:
# C=2: two context words on each side. x is the list of context words, y the center word.
for x,y in get_windows(words,2):
    print(f'{x}\t{y}')

['i', 'am', 'because', 'i']	happy
['am', 'happy', 'i', 'am']	because
['happy', 'because', 'am', 'learning']	i


In [25]:
# mapping words to indices, and the inverse mapping from indices back to words

In [26]:
# get_dict is imported from utils2 (not shown here). As the outputs below show, it
# returns a word -> index dict and the matching index -> word dict.
word2Ind,Ind2word=get_dict(words)

In [27]:
word2Ind

{'am': 0, 'because': 1, 'happy': 2, 'i': 3, 'learning': 4}

In [28]:
Ind2word

{0: 'am', 1: 'because', 2: 'happy', 3: 'i', 4: 'learning'}

In [29]:
# V is the vocabulary size: the number of entries in the word -> index mapping.
V=len(word2Ind)
print(f'Size of vocabulary: {V}')

Size of vocabulary: 5


In [30]:
# defining a one-hot vector for each indexed word

In [31]:
def word_to_one_hot_vector(word, word2Ind, V):
    """Build the one-hot encoding of `word`.

    Args:
        word: the word to encode; must be a key of `word2Ind`.
        word2Ind: mapping from word to its integer index.
        V: length of the returned vector (the vocabulary size).

    Returns:
        A NumPy array of length V that is 0 everywhere except for a 1 at
        position word2Ind[word].
    """
    encoding = np.zeros(V)
    position = word2Ind[word]
    encoding[position] = 1
    return encoding

In [32]:
word_to_one_hot_vector('happy',word2Ind,V)

array([0., 0., 1., 0., 0.])

In [33]:
# context word vector

In [34]:
context_words=['i','am','because','i']

In [35]:
context_words_vectors=[word_to_one_hot_vector(w,word2Ind,V) for w in context_words]

In [36]:
context_words_vectors

[array([0., 0., 0., 1., 0.]),
 array([1., 0., 0., 0., 0.]),
 array([0., 1., 0., 0., 0.]),
 array([0., 0., 0., 1., 0.])]

In [37]:
np.mean(context_words_vectors,axis=0)

array([0.25, 0.25, 0.  , 0.5 , 0.  ])

In [38]:
def context_words_to_vector(context_words, word2Ind, V):
    """Average the one-hot vectors of the given context words.

    Args:
        context_words: iterable of words, each a key of `word2Ind`.
        word2Ind: mapping from word to its integer index.
        V: length of each one-hot vector (the vocabulary size).

    Returns:
        The element-wise mean of the words' one-hot vectors, so each entry is
        the fraction of the context words that map to that index.
    """
    one_hots = []
    for context_word in context_words:
        one_hots.append(word_to_one_hot_vector(context_word, word2Ind, V))
    # axis=0 averages across the words, giving one vector of length V.
    return np.mean(one_hots, axis=0)

In [39]:
context_words_to_vector(['i', 'am', 'because', 'i'], word2Ind, V)

array([0.25, 0.25, 0.  , 0.5 , 0.  ])