In [1]:
import numpy as np
from utils2 import get_dict

In [2]:
N=3 # embedding size
V=5 # vocabulary size

In [3]:
# model weights initialization

In [4]:
# defining weights for input layer -> hidden layer connection [N x V]
W1 = np.array([[ 0.41687358,  0.08854191, -0.23495225,  0.28320538,  0.41800106],
               [ 0.32735501,  0.22795148, -0.23951958,  0.4117634 , -0.23924344],
               [ 0.26637602, -0.23846886, -0.37770863, -0.11399446,  0.34008124]])

# defining weights for hidden layer -> output layer connection [V x N]
W2 = np.array([[-0.22182064, -0.43008631,  0.13310965],
               [ 0.08476603,  0.08123194,  0.1772054 ],
               [ 0.1871551 , -0.06107263, -0.1790735 ],
               [ 0.07055222, -0.02015138,  0.36107434],
               [ 0.33480474, -0.39423389, -0.43959196]])

# defining bias for input layer -> hidden layer connection [N x 1]
b1 = np.array([[ 0.09688219],
               [ 0.29239497],
               [-0.27364426]])

# defining bias for hidden layer -> output layer connection [V x 1]
b2 = np.array([[ 0.0352008 ],
               [-0.36393384],
               [-0.12775555],
               [-0.34802326],
               [-0.07017815]])

In [5]:
# defining some useful functions

In [6]:
words = ['i', 'am', 'happy', 'because', 'i', 'am', 'learning']

word2Ind, Ind2word = get_dict(words)

def get_windows(words, C):
    i = C
    while i < len(words) - C:
        center_word = words[i]
        context_words = words[(i - C):i] + words[(i+1):(i+C+1)]
        yield context_words, center_word
        i += 1

def word_to_one_hot_vector(word, word2Ind, V):
    one_hot_vector = np.zeros(V)
    one_hot_vector[word2Ind[word]] = 1
    return one_hot_vector

def context_words_to_vector(context_words, word2Ind, V):
    context_words_vectors = [word_to_one_hot_vector(w, word2Ind, V) for w in context_words]
    context_words_vectors = np.mean(context_words_vectors, axis=0)
    return context_words_vectors

def get_training_example(words, C, word2Ind, V):
    for context_words, center_word in get_windows(words, C):
        yield context_words_to_vector(context_words, word2Ind, V), word_to_one_hot_vector(center_word, word2Ind, V)

In [7]:
# creating generator -> sliding window of words

In [8]:
training_examples=get_training_example(words,2,word2Ind,V)

In [9]:
x_array,y_array=next(training_examples)

In [10]:
x_array

array([0.25, 0.25, 0.  , 0.5 , 0.  ])

In [11]:
y_array

array([0., 0., 1., 0., 0.])

In [12]:
x=x_array.copy()

In [13]:
x=x.reshape(-1,1)

In [14]:
x.shape

(5, 1)

In [15]:
y=y_array.copy()

In [16]:
y=y.reshape(-1,1)

In [17]:
y.shape

(5, 1)

In [18]:
# defining activation functions: ReLU and softmax

In [19]:
def relu(z):
    result=z.copy()
    result[result<0]=0
    return result

def softmax(z):
    e_z=np.exp(z)
    sum_e_z=np.sum(e_z)
    return e_z/sum_e_z

In [20]:
# computing output of hidden layer

In [21]:
z1=np.dot(W1,x)+b1

In [22]:
z1

array([[ 0.36483875],
       [ 0.63710329],
       [-0.3236647 ]])

In [23]:
h=relu(z1)

In [24]:
h

array([[0.36483875],
       [0.63710329],
       [0.        ]])

In [25]:
# computing result of output layer

In [26]:
z2=np.dot(W2,h)+b2

In [27]:
z2

array([[-0.31973737],
       [-0.28125477],
       [-0.09838369],
       [-0.33512159],
       [-0.19919612]])

In [28]:
z2

array([[-0.31973737],
       [-0.28125477],
       [-0.09838369],
       [-0.33512159],
       [-0.19919612]])

In [29]:
y_hat=softmax(z2)

In [30]:
y_hat

array([[0.18519074],
       [0.19245626],
       [0.23107446],
       [0.18236353],
       [0.20891502]])

In [31]:
# defining cross-entropy loss

In [32]:
def cross_entropy(y_predicted,y_actual):
    return -np.sum(y_actual*np.log(y_predicted))

In [33]:
cross_entropy(y_hat,y)

1.4650152923611106

In [34]:
# learning phase - computing gradients for backpropagation

In [35]:
# gradient of bias_2

In [36]:
grad_b2=y_hat-y

In [37]:
grad_b2

array([[ 0.18519074],
       [ 0.19245626],
       [-0.76892554],
       [ 0.18236353],
       [ 0.20891502]])

In [40]:
# gradient of weights_2

In [38]:
grad_W2=np.dot(y_hat-y,h.T)

In [39]:
grad_W2

array([[ 0.06756476,  0.11798563,  0.        ],
       [ 0.0702155 ,  0.12261452,  0.        ],
       [-0.28053384, -0.48988499, -0.        ],
       [ 0.06653328,  0.1161844 ,  0.        ],
       [ 0.07622029,  0.13310045,  0.        ]])

In [41]:
# gradient of bias_1

In [42]:
grad_b1=relu(np.dot(W2.T,y_hat-y))

In [43]:
grad_b1

array([[0.        ],
       [0.        ],
       [0.17045858]])

In [44]:
# gradient of weights_1

In [45]:
grad_W1=np.dot(relu(np.dot(W2.T,y_hat-y)),x.T)

In [46]:
grad_W1

array([[0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.04261464, 0.04261464, 0.        , 0.08522929, 0.        ]])

In [47]:
# defining alpha hyperparameter

In [48]:
alpha=.03

In [49]:
W1_new=W1-alpha*grad_W1
b1_new=b1-alpha*grad_W1
W2_new=W2-alpha*grad_W2
b2_new=b2-alpha*grad_W2

In [52]:
print('W1: ',W1)
print('b1: ',b1)
print('W2: ',W2)
print('b2: ',b2)

W1:  [[ 0.41687358  0.08854191 -0.23495225  0.28320538  0.41800106]
 [ 0.32735501  0.22795148 -0.23951958  0.4117634  -0.23924344]
 [ 0.26637602 -0.23846886 -0.37770863 -0.11399446  0.34008124]]
b1:  [[ 0.09688219]
 [ 0.29239497]
 [-0.27364426]]
W2:  [[-0.22182064 -0.43008631  0.13310965]
 [ 0.08476603  0.08123194  0.1772054 ]
 [ 0.1871551  -0.06107263 -0.1790735 ]
 [ 0.07055222 -0.02015138  0.36107434]
 [ 0.33480474 -0.39423389 -0.43959196]]
b2:  [[ 0.0352008 ]
 [-0.36393384]
 [-0.12775555]
 [-0.34802326]
 [-0.07017815]]
