In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

from miniml.ann import *
from miniml.ann.common import *
from miniml.ann.utils import *

**Data**

In [2]:
c = 2        # context size: number of leading words of each sentence fed to the model
d = 2        # embedding size
n_hidden = 2 # hidden size

sentences = ["i like games", "i love programming", "i hate raining"]

# Build the vocabulary. Sorting makes the word <-> index mapping deterministic:
# iterating a bare set of strings can yield a different order on every kernel
# start because Python randomizes string hashes per process.
word_list = sorted(set(" ".join(sentences).split()))
word_dict = {w: i for i, w in enumerate(word_list)}    # word  -> index
number_dict = {i: w for i, w in enumerate(word_list)}  # index -> word
n_class = len(word_dict)  # size of vocabulary

**Transform to training format**

In [3]:
# Tokenize once; every sentence contributes one (context, target) pair:
# all words but the last are the input, the last word is the one to predict.
tokenized = [sentence.split() for sentence in sentences]

input_batch = np.array(
    [[word_dict[token] for token in tokens[:-1]] for tokens in tokenized]
)
target_batch = np.array([word_dict[tokens[-1]] for tokens in tokenized])

# One-hot encode the targets because the network is trained with an MSE loss:
# row k of the identity matrix is the one-hot vector for class k.
target_batch = np.eye(n_class)[target_batch]

**Initialize weights, biases, and embedding table**

In [4]:
# Seed NumPy's global RNG so the initial parameters below are identical on
# every run (and every time this cell is re-executed).
SEED = 42
np.random.seed(SEED)

# Hidden layer parameters: (c*d, n_hidden) weight and (n_hidden,) bias.
W1 = np.random.randn(c*d, n_hidden)
b1 = np.random.randn(n_hidden)

# Output parameters: W2 maps the hidden activations to vocabulary scores,
# W3 maps the c*d-wide input straight to vocabulary scores, and b23 is the
# single bias shared by both paths.
W2 = np.random.randn(n_hidden, n_class)
W3 = np.random.randn(c*d, n_class)
b23 = np.random.randn(n_class)

# Embedding table: one d-dimensional row per vocabulary word.
embedding = np.random.randn(n_class, d)

**Build Neural Network Language Model**

In [5]:
# Graph inputs: x_node is fed the context word indices, y_node the one-hot targets.
x_node = Placeholder()
y_node = Placeholder()

# Parameters also enter the graph as placeholders; their values are supplied
# through the feed dict.
table = Placeholder()
W1_node, b1_node = Placeholder(), Placeholder()
W2_node, W3_node, b23_node = Placeholder(), Placeholder(), Placeholder()

# Shared input of both paths: embedding lookup of the context words.
# NOTE(review): W1/W3 have c*d rows, so Embedding presumably returns the
# context embeddings flattened to width c*d -- confirm in miniml.
em_node = Embedding(x_node, table)

# Hidden path: Linear -> Tanh -> Linear (no bias on the second Linear).
l1 = Linear(em_node, W1_node, b1_node)
a1 = Tanh(l1)
l2 = Linear(a1, W2_node, None)

# Direct path from the embeddings to the output (no bias).
l3 = Linear(em_node, W3_node, None)

# Output: both paths combined with b23_node, then compared with the targets.
add = Add(l2, l3, b23_node)
mse = MSE(y_node, add)

**Feed initial values and topologically sort the graph to get the node order for the forward and backward passes**

In [6]:
# Nodes whose values are updated by optimize() in the training loop.
trainables = [table, W1_node, b1_node, W2_node, W3_node, b23_node]

# Initial value for every placeholder, listed as (node, value) pairs:
# data first, then the embedding table, then the layer parameters.
feed_dict = dict([
    (x_node, input_batch),
    (y_node, target_batch),
    (table, embedding),
    (W1_node, W1),
    (b1_node, b1),
    (W2_node, W2),
    (W3_node, W3),
    (b23_node, b23),
])

# Build the network graph from the fed placeholders, then order its nodes so
# that iterating forward (or in reverse) visits them in dependency order.
graph = feed_dict_2_graph(feed_dict)
sorted_graph = topology(graph)

**Training**

In [7]:
optimizer = 'Adam'
lr = 1e-3
epochs = 5000
batch_size = 3

# Ceiling division so a trailing partial batch is still trained on. Floor
# division would silently drop those samples, and would yield 0 steps (then a
# ZeroDivisionError when averaging the loss) whenever batch_size exceeds the
# number of samples.
steps_per_epoch = (len(input_batch) + batch_size - 1) // batch_size
losses = []  # average epoch loss, recorded every 500 epochs

# global step counter passed to optimize(); only used for Adam
it = 0

for i in range(1, epochs+1):
    loss = 0
    for j in range(steps_per_epoch):
        it += 1
        # Step 4.1: sample a batch of examples and reset the input values
        # (slicing past the end simply yields a shorter final batch)
        x_node.value = input_batch[j*batch_size:(j+1)*batch_size]
        y_node.value = target_batch[j*batch_size:(j+1)*batch_size]

        # Step 4.2: forward pass in dependency order
        for n in sorted_graph:
            n.forward()

        # Step 4.3: backward pass in reverse dependency order
        for n in sorted_graph[::-1]:
            n.backward()

        # Step 4.4: optimization
        for t in trainables:
            t.optimize(optimizer=optimizer, lr=lr, it=it)

        # Step 5: accumulate the loss held by the last node of the sorted graph
        loss += sorted_graph[-1].value

    if i % 500 == 0:
        print("Epoch: {}, Loss: {:.4f}".format(i, loss/steps_per_epoch))
        losses.append(loss/steps_per_epoch)

Epoch: 500, Loss: 0.1286
Epoch: 1000, Loss: 0.0583
Epoch: 1500, Loss: 0.0518
Epoch: 2000, Loss: 0.0485
Epoch: 2500, Loss: 0.0468
Epoch: 3000, Loss: 0.0449
Epoch: 3500, Loss: 0.0371
Epoch: 4000, Loss: 0.0177
Epoch: 4500, Loss: 0.0029
Epoch: 5000, Loss: 0.0002


**Predict**

In [8]:
x_node.value = input_batch

# Forward pass through everything except the last node of the sorted graph
# (the loss, which is not needed for prediction).
for n in sorted_graph[:-1]:
    n.forward()

# Read the scores from the output node itself rather than sorted_graph[-2]:
# a valid topological order may place y_node (the one-hot targets) in that
# slot, which would silently report the targets as predictions.
preds = [number_dict[idx] for idx in np.argmax(add.value, axis=1)]

print(f"inputs\tpredicts\n")
for sen, pred in zip(sentences, preds):
    print(f"{' '.join(sen.split()[:-1])} -> {pred}")

inputs	predicts

i like -> games
i love -> programming
i hate -> raining
