In [1]:
import numpy as np

In [2]:
np.random.seed(0)

In [3]:
# data = "hello world"
data = open("./data/howtogetrich.txt", "r").read()[:1000]
chars = list(set(data))
vocab_size = len(chars)

chars_to_indices = {char: i for i, char in enumerate(chars)}
indices_to_chars = {i: char for i, char in enumerate(chars)}

chars_to_indices, indices_to_chars

({'M': 0,
  't': 1,
  ':': 2,
  'i': 3,
  'W': 4,
  'N': 5,
  'p': 6,
  'b': 7,
  'l': 8,
  '.': 9,
  'U': 10,
  '“': 11,
  'd': 12,
  'e': 13,
  'H': 14,
  'n': 15,
  '”': 16,
  ' ': 17,
  ')': 18,
  'c': 19,
  'r': 20,
  'k': 21,
  '’': 22,
  '\n': 23,
  'E': 24,
  ',': 25,
  's': 26,
  'G': 27,
  'y': 28,
  'V': 29,
  'o': 30,
  'P': 31,
  'Y': 32,
  'm': 33,
  'h': 34,
  '—': 35,
  'L': 36,
  'R': 37,
  'u': 38,
  '-': 39,
  'T': 40,
  'S': 41,
  'f': 42,
  'x': 43,
  'I': 44,
  'a': 45,
  'g': 46,
  'v': 47,
  'A': 48,
  'w': 49,
  '(': 50},
 {0: 'M',
  1: 't',
  2: ':',
  3: 'i',
  4: 'W',
  5: 'N',
  6: 'p',
  7: 'b',
  8: 'l',
  9: '.',
  10: 'U',
  11: '“',
  12: 'd',
  13: 'e',
  14: 'H',
  15: 'n',
  16: '”',
  17: ' ',
  18: ')',
  19: 'c',
  20: 'r',
  21: 'k',
  22: '’',
  23: '\n',
  24: 'E',
  25: ',',
  26: 's',
  27: 'G',
  28: 'y',
  29: 'V',
  30: 'o',
  31: 'P',
  32: 'Y',
  33: 'm',
  34: 'h',
  35: '—',
  36: 'L',
  37: 'R',
  38: 'u',
  39: '-',
  40: 'T',
  41:

In [4]:
# RNN configs
embedding_size = vocab_size
hidden_size = 64
output_size = vocab_size

In [5]:
# Matrices
Wxh = np.random.randn(hidden_size, embedding_size) * 0.01
Whh = np.random.randn(hidden_size, hidden_size) * 0.01
bh = np.random.randn(1, hidden_size) * 0.01

Why = np.random.randn(output_size, hidden_size)
by = np.random.randn(1, output_size)

In [6]:
def softmax(z):
    e_z = np.exp(z)
    return e_z / e_z.sum(axis=1)

def rnn(x_t, h_prev):
    assert x_t.shape == (1, embedding_size)
    assert h_prev.shape == (1, hidden_size)

    h_next = np.tanh(x_t @ Wxh.T + h_prev @ Whh.T + bh)
    probs = softmax(h_next @ Why.T + by)
    
    return probs, h_next

def sample(char, n):
    # sample rnn n times starting with the first char
    x = np.zeros((1, vocab_size))
    idx = chars_to_indices[char]
    x[:, idx] = 1 # one hot encoding
    h =  np.zeros((1, hidden_size)) # hidden state
    idxes = []
    
    for i in range(n):
        probs, h = rnn(x, h) # xt, hprev -> rnn -> probs, hnext
        idx = np.random.choice(vocab_size, p=probs.ravel()) # .ravel returns a 1d array
        
        x = np.zeros((1, vocab_size))
        x[:, idx] = 1
        idxes.append(idx)

    chars = "".join([indices_to_chars[i] for i in idxes])
    return chars

In [7]:
x_t = np.random.randn(1, embedding_size)
h_prev = np.random.randn(1, hidden_size)

probs, h_next = rnn(x_t, h_prev)
probs.shape, h_next.shape

((1, 51), (1, 64))

In [8]:
inputs = [chars_to_indices[c] for c in data]
# target = [chars_to_indices[c] for c in data[1:]]

# inp, target

In [9]:
sample("h", 4)

'YoH”'

In [13]:
lr = 1e-1
p = 0
h = np.zeros((1, hidden_size))
seq_length = 25
for e in range(1000):
    if p + seq_length + 1 >= len(data):
        p = 0
        h = np.zeros((1, hidden_size))

    batch = inputs[p:p+seq_length]
    inp = batch[:-1]
    target = batch[1:]

    p += seq_length

    # compute loss
    loss = 0
    xs, ps, hs = {}, {}, {}
    # h = np.zeros((1, hidden_size))
    hs[-1] = h
    for t in range(len(inp)):
        idx = inp[t]
        x = np.zeros((1, vocab_size))
        x[:, idx] = 1
        probs, h = rnn(x, hs[t- 1])
        xs[t] = x
        ps[t] = probs # save probs (we'll use this in backprop)
        hs[t] = h

        # cross entropy
        pred = probs[0, target[t]]
        loss += -np.log(pred)

    # backprop, calculate gradients
    dL_dWxh = np.zeros_like(Wxh)
    dL_dWhh = np.zeros_like(Whh)
    dL_dbh = np.zeros_like(bh)
    dL_Why = np.zeros_like(Why)
    dL_dby = np.zeros_like(by)
    dF_dh = np.zeros((1, hidden_size))

    for t in reversed(range(len(inp))):
        dL_dz2 = np.copy(ps[t])
        target_idx = target[t]
        dL_dz2[:, target_idx] -= 1

        # print(dL_dz2.shape)

        a = hs[t] * dL_dz2.T
        # print(a.shape)

        # 2nd layer
        dL_Why += hs[t] * dL_dz2.T
        dL_dby += dL_dz2


        # 1st layer
        dh_dz1 = 1 - hs[t] ** 2
        dL_dh = dF_dh + (dL_dz2 @ Why) 

        # print((xs[t] * dh_dz1.T * dL_dh.T).shape)

        dL_dWxh += xs[t] * dh_dz1.T * dL_dh.T
        dL_dWhh +=  hs[t - 1] * dh_dz1.T * dL_dh.T
        dL_dbh += dh_dz1 * dL_dh

        # raise RuntimeError

        # dF_dh = dL_dh * (dh_dz1 @ Whh)
        dF_dh = dL_dh @ (dh_dz1 * Whh)

    # clip gradients
    for gradient in [dL_dWxh, dL_dWhh, dL_Why, dL_dbh, dL_dby]:
        np.clip(gradient, -1, 1, out=gradient)
        # np.clip(gradient, -5, 5, out=gradient)


    # gradient descent
    for weights, gradient in zip([Wxh, Whh, Why, bh, by], [dL_dWxh, dL_dWhh, dL_Why, dL_dbh, dL_dby]):
        weights -= lr * gradient

    # break
    
    print(f"[Loss]: {loss}")

[Loss]: 165.29609583758327
[Loss]: 162.19564827589093
[Loss]: 130.0120428850633
[Loss]: 96.23740301875846
[Loss]: 135.66817444730617
[Loss]: 201.12614287733717
[Loss]: 138.59423168708048
[Loss]: 146.7547486142861
[Loss]: 114.15315129369311
[Loss]: 197.59617179270822
[Loss]: 151.9297861475886
[Loss]: 135.2299863736409
[Loss]: 111.48716862075415
[Loss]: 118.20026470102833
[Loss]: 117.84926007951525
[Loss]: 103.16663302439181
[Loss]: 142.84386865951416
[Loss]: 109.05663475541046
[Loss]: 151.21655924540642
[Loss]: 167.99258749973256
[Loss]: 108.90479532764436
[Loss]: 134.35120736515935
[Loss]: 189.49936727029214
[Loss]: 140.0927227525816
[Loss]: 149.3860071822229
[Loss]: 127.02618264429321
[Loss]: 129.60954232275876
[Loss]: 114.47390634749546
[Loss]: 191.74134158599506
[Loss]: 161.91176803900333
[Loss]: 113.84198990648437
[Loss]: 129.16365543317713
[Loss]: 116.28906218884096
[Loss]: 153.17639007841996
[Loss]: 133.35736737613124
[Loss]: 165.93473546640033
[Loss]: 123.58575339465948
[Loss]: 

In [14]:
print(sample("S", 100))

eeH hia te tga te tra te tba te tba te tVa te tra te tba tiatba te tba te tia te tia te tva te tLa t
