In [1]:
import numpy as np

In [2]:
np.random.seed(0)

In [3]:
data = "hello world"
chars = list(set(data))
vocab_size = len(chars)

chars_to_indices = {char: i for i, char in enumerate(chars)}
indices_to_chars = {i: char for i, char in enumerate(chars)}

chars_to_indices, indices_to_chars

({'l': 0, 'h': 1, ' ': 2, 'd': 3, 'r': 4, 'o': 5, 'e': 6, 'w': 7},
 {0: 'l', 1: 'h', 2: ' ', 3: 'd', 4: 'r', 5: 'o', 6: 'e', 7: 'w'})

In [4]:
# RNN configs
embedding_size = vocab_size
hidden_size = 64
output_size = vocab_size

In [5]:
# Matrices
Wxh = np.random.randn(embedding_size, hidden_size)
Whh = np.random.randn(hidden_size, hidden_size)
bh = np.random.randn(1, hidden_size)

Why = np.random.randn(hidden_size, output_size)
by = np.random.randn(1, output_size)

In [6]:
def softmax(z):
    e_z = np.exp(z)
    return e_z / e_z.sum(axis=1)

def rnn(x_t, h_prev):
    assert x_t.shape == (1, embedding_size)
    assert h_prev.shape == (1, hidden_size)

    h_next = np.tanh(x_t @ Wxh + h_prev @ Whh + bh)
    probs = softmax(h_next @ Why + by)
    
    return probs, h_next

def sample(char, n):
    # sample rnn n times starting with the first char
    x = np.zeros((1, vocab_size))
    idx = chars_to_indices[char]
    x[:, idx] = 1 # one hot encoding
    h =  np.zeros((1, hidden_size)) # hidden state
    idxes = []
    
    for i in range(n):
        probs, h = rnn(x, h) # xt, hprev -> rnn -> probs, hnext
        idx = np.random.choice(vocab_size, p=probs.ravel()) # .ravel returns a 1d array
        
        x = np.zeros((1, vocab_size))
        x[:, idx] = 1
        idxes.append(idx)

    chars = "".join([indices_to_chars[i] for i in idxes])
    return chars

In [7]:
x_t = np.random.randn(1, embedding_size)
h_prev = np.random.randn(1, hidden_size)

probs, h_next = rnn(x_t, h_prev)
probs.shape, h_next.shape

((1, 8), (1, 64))

In [8]:
inp = [chars_to_indices[c] for c in data[:-1]]
target = [chars_to_indices[c] for c in data[1:]]

inp, target

([1, 6, 0, 0, 5, 2, 7, 5, 4, 0], [6, 0, 0, 5, 2, 7, 5, 4, 0, 3])

In [9]:
sample("h", 4)

'oleo'

In [12]:
lr = 1e-2
for e in range(100):
    # compute loss
    loss = 0
    xs, ps, hs = {}, {}, {}
    h = np.zeros((1, hidden_size))
    hs[-1] = h
    for t in range(len(inp)):
        idx = inp[t]
        x = np.zeros((1, vocab_size))
        x[:, idx] = 1
        probs, h = rnn(x, h)
        xs[t] = x
        ps[t] = probs # save probs (we'll use this in backprop)
        hs[t] = h

        # cross entropy
        pred = probs[0, target[t]]
        loss += -np.log(pred)

    # backprop, calculate gradients
    dL_dWxh = np.zeros_like(Wxh)
    dL_dWhh = np.zeros_like(Whh)
    dL_dbh = np.zeros_like(bh)
    dL_Why = np.zeros_like(Why)
    dL_dby = np.zeros_like(by)
    dF_dh = np.zeros((1, hidden_size))

    for t in reversed(range(len(inp))):
        dL_dz2 = np.copy(ps[t])
        target_idx = target[t]
        dL_dz2[:, target_idx] -= 1

        # 2nd layer
        dL_Why += dL_dz2 * hs[t].T
        dL_dby += dL_dz2

        # 1st layer
        dh_dz1 = 1 - hs[t] ** 2
        dL_dh = (dL_dz2 @ Why.T) + dF_dh 

        dL_dWxh += dL_dh * dh_dz1 * xs[t].T
        dL_dWhh += dL_dh * dh_dz1 * hs[t - 1].T
        dL_dbh += dL_dh * dh_dz1

        # dF_dh = dL_dh * (dh_dz1 @ Whh.T)
        dF_dh = dL_dh @ (dh_dz1 * Whh.T)

    # clip gradients
    for gradient in [dL_dWxh, dL_dWhh, dL_Why, dL_dbh, dL_dby]:
        np.clip(gradient, -1, 1, out=gradient)

    # gradient descent
    for weights, gradient in zip([Wxh, Whh, Why, bh, by], [dL_dWxh, dL_dWhh, dL_Why, dL_dbh, dL_dby]):
        weights -= lr * gradient

    # break
    
    print(f"[Loss]: {loss}")

[Loss]: 0.17451268609323645
[Loss]: 0.17375909108688162
[Loss]: 0.13385497918001746
[Loss]: 0.14383886885469332
[Loss]: 0.1391847800572504
[Loss]: 0.1366983881199949
[Loss]: 0.26720541103775375
[Loss]: 0.0909076572658633
[Loss]: 0.0758726817403176
[Loss]: 0.06507753096845878
[Loss]: 0.05915160656921544
[Loss]: 0.055875382546293886
[Loss]: 0.049564737177157
[Loss]: 0.04413006234874034
[Loss]: 0.04079030082078297
[Loss]: 0.03835732821817593
[Loss]: 0.035424863811461454
[Loss]: 0.03373623720239437
[Loss]: 0.03309506088049098
[Loss]: 0.03279287847058787
[Loss]: 0.032736693861392045
[Loss]: 0.03284918538597191
[Loss]: 0.03305240385748944
[Loss]: 0.033283815686461216
[Loss]: 0.033497110560828484
[Loss]: 0.03365878784091868
[Loss]: 0.03374674201520487
[Loss]: 0.03374628801761764
[Loss]: 0.0336458873208233
[Loss]: 0.03343422471885246
[Loss]: 0.03309917331253082
[Loss]: 0.032628814544167266
[Loss]: 0.03201460456391382
[Loss]: 0.0312566165990592
[Loss]: 0.03037071240746767
[Loss]: 0.029396024419

In [13]:
sample("h", 10)

'ello world'