In [None]:
# Data Prep
import re
import nltk
import emoji
import numpy as np
from nltk.tokenize import word_tokenize
from utils2 import get_dict

In [None]:
# Define a corpus
corpus = 'Who ❤️ "word embeddings" in 2020? I do!!!'

# Collapse every run of the punctuation characters , ! ? ; - into a single full stop
data = re.sub(r'[,!?;-]+', '.', corpus)
print(data)

# Split the cleaned string into word tokens
data = nltk.word_tokenize(data)
print(data)

# Keep alphabetic tokens, full stops and tokens containing an emoji; lowercase what is kept.
# emoji.get_emoji_regexp() was removed in emoji 2.0, so emoji_count() is used to detect emoji.
data = [ch.lower() for ch in data if ch.isalpha() or ch == '.' or emoji.emoji_count(ch) > 0]
print(data)

In [None]:
# Define the 'tokenize' function that will include the steps previously seen
def tokenize(corpus):
    """Split a raw text corpus into a list of lowercase tokens.

    Runs of the punctuation characters , ! ? ; - are collapsed into a single
    '.', the text is split with nltk.word_tokenize, and only tokens that are
    alphabetic, equal to '.', or contain an emoji are kept.

    Args:
        corpus: the raw text to tokenize (str).

    Returns:
        list of str: the retained tokens, lowercased, in their original order.
    """
    data = re.sub(r'[,!?;-]+', '.', corpus)
    data = nltk.word_tokenize(data)  # tokenize string to words
    # emoji.get_emoji_regexp() was removed in emoji 2.0; emoji_count() reports
    # how many emoji a token contains, so "> 0" keeps the same filter.
    return [
        token.lower() for token in data
        if token.isalpha()
        or token == '.'
        or emoji.emoji_count(token) > 0
    ]

In [None]:
# Tokenize a short example sentence; 'words' is reused by the cells below
corpus = 'I am happy because I am learning'
words = tokenize(corpus)
print(words)

In [None]:
# Tokenize another sentence; as the last expression of the cell, the returned list is displayed
tokenize("Now it's your turn: try with your own sentence!")

In [None]:
# Sliding window of words
def get_windows(words, C):
    """Yield (context_words, center_word) pairs from a sliding window.

    Every position that has C words on both sides is used as a center; the
    context is the C words before it followed by the C words after it.

    Args:
        words: sequence of tokens (slices are concatenated with +).
        C: context half-size, i.e. number of words taken on each side.

    Yields:
        tuple: (context_words, center_word).
    """
    for center in range(C, len(words) - C):
        before = words[(center - C):center]
        after = words[(center + 1):(center + C + 1)]
        yield before + after, words[center]

# Show every window of half-size 2 over the example sentence
for x, y in get_windows(['i', 'am', 'happy', 'because', 'i', 'am', 'learning'], 2):
    print(f'{x}\t{y}')

In [None]:
# Print 'context_words' and 'center_word' for any sentence with a 'context half-size' of 1
# The raw sentence is tokenized first; each printed line is "<context list><TAB><center word>"
for x, y in get_windows(tokenize("hello, '' here is a crazy sentence!"), 1):
    print(f'{x}\t{y}')

In [None]:
# One hot center words
# Build the two lookup tables from the tokenized corpus.
# NOTE(review): get_dict comes from utils2 (not shown here); judging by the names it
# presumably returns word -> index and index -> word mappings - confirm.
word2Ind, Ind2word = get_dict(words)

In [None]:
# Display the word2Ind lookup table
word2Ind

In [None]:
# Display the Ind2word lookup table
Ind2word

In [None]:
# Build the one-hot vector of the center word 'happy' by hand
V = len(word2Ind)  # vocabulary size = number of entries in word2Ind
center_word_vector = np.zeros(V)
n = word2Ind['happy']  # index that word2Ind assigns to 'happy'
center_word_vector[n] = 1
center_word_vector

In [None]:
def word_to_one_hot_vector(word, word2Ind, V):
    """Encode a word as a one-hot vector.

    Args:
        word: the word to encode; must be a key of word2Ind.
        word2Ind: mapping from word to its index.
        V: length of the returned vector.

    Returns:
        numpy array of V zeros with a 1 at position word2Ind[word].
    """
    encoded = np.zeros(V)
    hot_index = word2Ind[word]
    encoded[hot_index] = 1
    return encoded

# Same result as the manual construction above, now through the helper
word_to_one_hot_vector('happy', word2Ind, V)

In [None]:
# One-hot vector of another vocabulary word
word_to_one_hot_vector('learning', word2Ind, V)

In [None]:
# Context Words
# Context of the center word 'happy' for a context half-size of 2
context_words = ['i', 'am', 'because', 'i']
# One one-hot vector per context word, in the same order
context_words_vectors = [word_to_one_hot_vector(w, word2Ind, V) for w in context_words]

# Print one-hot vectors for each context word
context_words_vectors

In [None]:
# Average the one-hot vectors element-wise (axis=0 averages across the list of vectors)
np.mean(context_words_vectors, axis=0)

In [None]:
def context_words_to_vector(context_words, word2Ind, V):
    """Encode a list of context words as the mean of their one-hot vectors.

    Args:
        context_words: iterable of words, each a key of word2Ind.
        word2Ind: mapping from word to its index.
        V: length of each one-hot vector.

    Returns:
        numpy array: element-wise average of the one-hot vectors.
    """
    one_hots = []
    for context_word in context_words:
        one_hots.append(word_to_one_hot_vector(context_word, word2Ind, V))
    return np.mean(one_hots, axis=0)

# Averaged context vector for the context of 'happy'
context_words_to_vector(['i', 'am', 'because', 'i'], word2Ind, V)

In [None]:
# Averaged context vector for the context of 'because'
context_words_to_vector(['am', 'happy', 'i', 'am'], word2Ind, V)

In [None]:
# Build the training set
# Print vectors associated to center and context words for corpus
# Each window prints the raw words next to their vector encodings, followed by a blank line
for context_words, center_word in get_windows(words, 2):  # reminder: 2 is the context half-size
    print(f'Context words:  {context_words} -> {context_words_to_vector(context_words, word2Ind, V)}')
    print(f'Center word:  {center_word} -> {word_to_one_hot_vector(center_word, word2Ind, V)}')
    print()

In [None]:
# Define generator function
def get_training_example(words, C, word2Ind, V):
    """Yield one (context vector, center one-hot vector) pair per window.

    Args:
        words: tokenized corpus.
        C: context half-size passed to get_windows.
        word2Ind: mapping from word to its index.
        V: length of the vectors produced.

    Yields:
        tuple: (averaged context vector, one-hot center vector).
    """
    for window_context, window_center in get_windows(words, C):
        context_vector = context_words_to_vector(window_context, word2Ind, V)
        center_vector = word_to_one_hot_vector(window_center, word2Ind, V)
        yield context_vector, center_vector

# Walk through every training example of the corpus (context half-size 2) and print both vectors
for context_words_vector, center_word_vector in get_training_example(words, 2, word2Ind, V):
    print(f'Context words vector:  {context_words_vector}')
    print(f'Center word vector:  {center_word_vector}')
    print()

# Notebook 2
Model architecture, activation functions, and working with NumPy

In [None]:
import numpy as np

In [None]:
# Fix the seed so the random values below are the same on every run
np.random.seed(10)

# 5x1 column vector: rand() samples from [0, 1), so the values lie in [-5, 5)
z_1 = 10*np.random.rand(5, 1) - 5

z_1

In [None]:
# Relu
# Work on a copy so z_1 keeps its original values
h = z_1.copy()
# Boolean-mask assignment: every negative entry becomes 0
h[h < 0] = 0
h

In [None]:
def relu(z):
    """Return a copy of z in which every negative entry is replaced by 0.

    Args:
        z: numpy array; it is copied, not modified.

    Returns:
        numpy array with the same shape as z.
    """
    activated = z.copy()
    negative_mask = activated < 0
    activated[negative_mask] = 0
    return activated

# Define a new vector and save it in the 'z' variable
z = np.array([[-1.25459881], [ 4.50714306], [ 2.31993942], [ 0.98658484], [-3.4398136 ]])

# Apply ReLU to it
relu(z)

In [None]:
# Softmax
z = np.array([9, 8, 11, 10, 8.5])
print(z)
# Step by step: exponentiate, sum, then normalise the first entry
e_z = np.exp(z)
print(e_z)
sum_e_z = np.sum(e_z)
print(e_z[0]/sum_e_z)

def softmax(z):
    """Compute the softmax over all entries of z.

    The maximum of z is subtracted before exponentiating. This leaves the
    result mathematically unchanged but keeps np.exp from overflowing to inf
    (and the output from becoming NaN) when the inputs are large.

    Args:
        z: array-like of scores (list or numpy array).

    Returns:
        numpy array with the same shape as z whose entries sum to 1.
    """
    scores = np.asarray(z)
    if scores.size == 0:
        # np.max rejects empty input; return an empty result as before
        return np.exp(scores)
    e_z = np.exp(scores - np.max(scores))
    sum_e_z = np.sum(e_z)
    return e_z / sum_e_z

softmax([9, 8, 11, 10, 8.5])

In [None]:
# Sanity check: add up all entries of the softmax output
np.sum(softmax([9, 8, 11, 10, 8.5]))

In [None]:
# Dimensions

# Define V (size of the vocabulary)
V = 5
# 1-D array of V zeros; its shape is (V,)
x_array = np.zeros(V)
print(x_array)
print(x_array.shape)

# to do matmul, you need column vectors to be matrix with one column
# copy first so the in-place reshape below leaves x_array untouched
x_column_vector = x_array.copy()
x_column_vector.shape = (V, 1)
print(x_column_vector)
print(x_column_vector.shape)

In [None]:
# Display the shape assigned above
x_column_vector.shape

# Training the CBOW model

In [None]:
import numpy as np
from utils2 import get_dict

# Size of the word embedding
N = 3

# Size of the vocabulary
V = 5

# Hard-coded parameter values so that every run of the notebook gives the same numbers

# Define first matrix of weights
W1 = np.array([[ 0.41687358,  0.08854191, -0.23495225,  0.28320538,  0.41800106],
               [ 0.32735501,  0.22795148, -0.23951958,  0.4117634 , -0.23924344],
               [ 0.26637602, -0.23846886, -0.37770863, -0.11399446,  0.34008124]])

# Define second matrix of weights
W2 = np.array([[-0.22182064, -0.43008631,  0.13310965],
               [ 0.08476603,  0.08123194,  0.1772054 ],
               [ 0.1871551 , -0.06107263, -0.1790735 ],
               [ 0.07055222, -0.02015138,  0.36107434],
               [ 0.33480474, -0.39423389, -0.43959196]])

# Define first vector of biases
b1 = np.array([[ 0.09688219],
               [ 0.29239497],
               [-0.27364426]])

# Define second vector of biases
b2 = np.array([[ 0.0352008 ],
               [-0.36393384],
               [-0.12775555],
               [-0.34802326],
               [-0.07017815]])

# Print the actual shapes next to the shapes expected from N and V
print(f'V (vocabulary size): {V}')
print(f'N (embedding size / size of the hidden layer): {N}')
print(f'size of W1: {W1.shape} (NxV)')
print(f'size of b1: {b1.shape} (Nx1)')
print(f'size of W2: {W2.shape} (VxN)')
print(f'size of b2: {b2.shape} (Vx1)')

In [None]:
# Define the tokenized version of the corpus
words = ['i', 'am', 'happy', 'because', 'i', 'am', 'learning']

# Get 'word2Ind' and 'Ind2word' dictionaries for the tokenized corpus
# NOTE(review): get_dict is imported from utils2 and not shown here - confirm the mapping order it uses
word2Ind, Ind2word = get_dict(words)

# Define the 'get_windows' function as seen in a previous notebook
def get_windows(words, C):
    """Yield (context_words, center_word) pairs from a sliding window.

    Args:
        words: sequence of tokens (slices are concatenated with +).
        C: context half-size, i.e. number of words taken on each side.

    Yields:
        tuple: the C words before and the C words after the center, and the center word.
    """
    for center in range(C, len(words) - C):
        before = words[(center - C):center]
        after = words[(center + 1):(center + C + 1)]
        yield before + after, words[center]

# Define the 'word_to_one_hot_vector' function as seen in a previous notebook
def word_to_one_hot_vector(word, word2Ind, V):
    """Encode a word as a one-hot vector.

    Args:
        word: the word to encode; must be a key of word2Ind.
        word2Ind: mapping from word to its index.
        V: length of the returned vector.

    Returns:
        numpy array of V zeros with a 1 at position word2Ind[word].
    """
    encoded = np.zeros(V)
    hot_index = word2Ind[word]
    encoded[hot_index] = 1
    return encoded

# Define the 'context_words_to_vector' function as seen in a previous notebook
def context_words_to_vector(context_words, word2Ind, V):
    """Encode a list of context words as the mean of their one-hot vectors.

    Args:
        context_words: iterable of words, each a key of word2Ind.
        word2Ind: mapping from word to its index.
        V: length of each one-hot vector.

    Returns:
        numpy array: element-wise average of the one-hot vectors.
    """
    one_hots = []
    for context_word in context_words:
        one_hots.append(word_to_one_hot_vector(context_word, word2Ind, V))
    return np.mean(one_hots, axis=0)

# Define the generator function 'get_training_example' as seen in a previous notebook
def get_training_example(words, C, word2Ind, V):
    """Yield one (context vector, center one-hot vector) pair per window.

    Args:
        words: tokenized corpus.
        C: context half-size passed to get_windows.
        word2Ind: mapping from word to its index.
        V: length of the vectors produced.

    Yields:
        tuple: (averaged context vector, one-hot center vector).
    """
    for window_context, window_center in get_windows(words, C):
        context_vector = context_words_to_vector(window_context, word2Ind, V)
        center_vector = word_to_one_hot_vector(window_center, word2Ind, V)
        yield context_vector, center_vector

In [None]:
# Create the generator of training examples (context half-size 2); nothing is computed until next() is called
training_examples = get_training_example(words, 2, word2Ind, V)

In [None]:
# Pull the next example from the generator; re-running this cell advances to the following one
x_array, y_array = next(training_examples)
print(x_array, y_array)

In [None]:
# Convert vectors into matrices to be able to perform matmul
# Copies are reshaped in place to (V, 1) so x_array and y_array keep their 1-D shape
x = x_array.copy()
x.shape = (V,1)
print(x)
y = y_array.copy()
y.shape = (V,1)
print(y)

In [None]:
# Define the 'relu' function as seen in the previous lecture notebook
def relu(z):
    """Return a copy of z in which every negative entry is replaced by 0.

    Args:
        z: numpy array; it is copied, not modified.

    Returns:
        numpy array with the same shape as z.
    """
    activated = z.copy()
    negative_mask = activated < 0
    activated[negative_mask] = 0
    return activated

# Define the 'softmax' function as seen in the previous lecture notebook
def softmax(z):
    """Compute the softmax over all entries of z.

    The maximum of z is subtracted before exponentiating. This leaves the
    result mathematically unchanged but keeps np.exp from overflowing to inf
    (and the output from becoming NaN) when the inputs are large.

    Args:
        z: array-like of scores (list or numpy array).

    Returns:
        numpy array with the same shape as z whose entries sum to 1.
    """
    scores = np.asarray(z)
    if scores.size == 0:
        # np.max rejects empty input; return an empty result as before
        return np.exp(scores)
    e_z = np.exp(scores - np.max(scores))
    sum_e_z = np.sum(e_z)
    return e_z / sum_e_z

In [None]:
# Forward pass, hidden layer: affine transform of the input followed by ReLU
z1 = np.dot(W1, x) + b1
print(z1)
h = relu(z1)
print(h)

In [None]:
# Forward pass, output layer: affine transform of the hidden vector followed by softmax
z2 = np.dot(W2,h) + b2
print(z2)
y_hat = softmax(z2)
print(y_hat)
# Word whose index has the highest predicted value
print(Ind2word[np.argmax(y_hat)])

In [None]:
# Cross-entropy loss
# Show the prediction and the target that the loss compares
print(y_hat)
print(y)



def cross_entropy_loss_1(y_predicted, y_actual):
    """Cross-entropy loss written as a dot product.

    Args:
        y_predicted: numpy array of predicted probabilities.
        y_actual: numpy array of target values, same shape as y_predicted.

    Returns:
        The negated dot product of y_actual.T with log(y_predicted); for
        (V, 1) column vectors this is a (1, 1) array.
    """
    log_predictions = np.log(y_predicted)
    return -1 * np.dot(y_actual.T, log_predictions)

def cross_entropy_loss(y_predicted, y_actual):
    """Cross-entropy loss between a predicted distribution and its target.

    Args:
        y_predicted: numpy array of predicted probabilities.
        y_actual: numpy array of target values, same shape as y_predicted.

    Returns:
        Scalar: sum over all entries of -log(y_predicted) * y_actual.
    """
    # Use the arguments rather than the notebook globals y_hat / y, so the
    # function gives the right answer for whatever pair it is called with.
    return np.sum(-np.log(y_predicted) * y_actual)

# Evaluate both loss implementations on the current prediction and target
print(cross_entropy_loss_1(y_hat, y))
print(cross_entropy_loss(y_hat, y))

In [None]:
# Backprop
# Gradient of the softmax + cross-entropy loss with respect to z2 (and therefore b2)
grad_b2 = y_hat - y
print(grad_b2)
# z2 = W2 h + b2, so the gradient with respect to W2 is the outer product with h
grad_W2 = np.dot(grad_b2, h.T)
print(grad_W2)
# Backpropagate through W2 to the hidden activations, then through the ReLU.
# The ReLU derivative is 1 where the unit was active (h > 0) and 0 elsewhere, so the
# incoming gradient is masked by h > 0. Applying relu() to the gradient itself would
# instead drop negative gradients of active units and keep gradients of inactive ones.
grad_b1 = np.dot(W2.T, grad_b2) * (h > 0)
print(grad_b1)
# z1 = W1 x + b1, so the gradient with respect to W1 is the outer product with x
grad_W1 = np.dot(grad_b1, x.T)
print(grad_W1)

In [None]:
# Print each gradient's shape next to the shape expected from N and V
print(f'V (vocabulary size): {V}')
print(f'N (embedding size / size of the hidden layer): {N}')
print(f'size of grad_W1: {grad_W1.shape} (NxV)')
print(f'size of grad_b1: {grad_b1.shape} (Nx1)')
print(f'size of grad_W2: {grad_W2.shape} (VxN)')
print(f'size of grad_b2: {grad_b2.shape} (Vx1)')

In [None]:
# Gradient-descent step on W1; alpha scales the gradient before it is subtracted
alpha = 0.03
W1_new = W1 - alpha * grad_W1
# Print the matrix before and after the update for comparison
print('old value of W1:')
print(W1)
print()
print('new value of W1:')
print(W1_new)

In [None]:
# Same update rule as for W1, applied to the remaining parameters

# Compute updated W2
W2_new = W2 - alpha * grad_W2

# Compute updated b1
b1_new = b1 - alpha * grad_b1

# Compute updated b2
b2_new = b2 - alpha * grad_b2


# Print the updated parameters
print('W2_new')
print(W2_new)
print()
print('b1_new')
print(b1_new)
print()
print('b2_new')
print(b2_new)

# Word Embeddings
Extracting word embedding vectors from a model

In [None]:
import numpy as np
from utils2 import get_dict

# Tokenized corpus used to build the vocabulary dictionaries
words = ['i', 'am', 'happy', 'because', 'i', 'am', 'learning']
V = 5 # size of the vocabulary

# NOTE(review): get_dict is imported from utils2 and not shown here - confirm the mapping order it uses
word2Ind, Ind2word = get_dict(words)

# Hard-coded parameter values used below to extract embedding vectors

# Define first matrix of weights
W1 = np.array([[ 0.41687358,  0.08854191, -0.23495225,  0.28320538,  0.41800106],
               [ 0.32735501,  0.22795148, -0.23951958,  0.4117634 , -0.23924344],
               [ 0.26637602, -0.23846886, -0.37770863, -0.11399446,  0.34008124]])

# Define second matrix of weights
W2 = np.array([[-0.22182064, -0.43008631,  0.13310965],
               [ 0.08476603,  0.08123194,  0.1772054 ],
               [ 0.1871551 , -0.06107263, -0.1790735 ],
               [ 0.07055222, -0.02015138,  0.36107434],
               [ 0.33480474, -0.39423389, -0.43959196]])

# Define first vector of biases
b1 = np.array([[ 0.09688219],
               [ 0.29239497],
               [-0.27364426]])

# Define second vector of biases
b2 = np.array([[ 0.0352008 ],
               [-0.36393384],
               [-0.12775555],
               [-0.34802326],
               [-0.07017815]])

In [None]:
# Option 1: extract embedding vectors from W1
# Display W1; it has one column per vocabulary index
W1

In [None]:
# Print the word stored at each index 0..V-1 of Ind2word
for i in range(V):
    print(Ind2word[i])

In [None]:
# For every word, take the column of W1 at that word's index as its embedding vector
for word in word2Ind:
    word_embedding_vector = W1[:,word2Ind[word]]
    print(word, word_embedding_vector)

In [None]:
# Option 2: Extract from rows in W2
# Column j of W2.T is row j of W2, so the transpose can be indexed the same way as W1
print(W2.T)

for word in word2Ind:
    word_embedding_vector = W2.T[:,word2Ind[word]]
    print(word, word_embedding_vector)

In [None]:
# Option 3: average W1 and W2

# W2 is transposed so that both matrices have one column per vocabulary index
W3 = (W1 + W2.T)/2
print(W3)

# Take the column of W3 at each word's index as its embedding vector.
# The result must be stored under the same name that is printed; otherwise the
# loop prints the stale vector left over from the previous cell.
for word in word2Ind:
    word_embedding_vector = W3[:,word2Ind[word]]
    print(word, word_embedding_vector)