In [1]:
import numpy as np

In [2]:
np.random.seed(0)

In [3]:
# data = "hello world"
data = open("./data/stevejobs_short.txt", "r").read()
chars = list(set(data))
vocab_size = len(chars)

chars_to_indices = {char: i for i, char in enumerate(chars)}
indices_to_chars = {i: char for i, char in enumerate(chars)}

chars_to_indices, indices_to_chars

({'k': 0,
  'z': 1,
  'b': 2,
  'i': 3,
  'S': 4,
  'w': 5,
  'n': 6,
  "'": 7,
  'v': 8,
  'a': 9,
  'u': 10,
  'f': 11,
  '―': 12,
  'J': 13,
  'c': 14,
  'Y': 15,
  'q': 16,
  'g': 17,
  'A': 18,
  ' ': 19,
  ',': 20,
  'r': 21,
  'l': 22,
  'B': 23,
  '\n': 24,
  's': 25,
  't': 26,
  'y': 27,
  '.': 28,
  'e': 29,
  'p': 30,
  'd': 31,
  'H': 32,
  'm': 33,
  'T': 34,
  'o': 35,
  '"': 36,
  'h': 37},
 {0: 'k',
  1: 'z',
  2: 'b',
  3: 'i',
  4: 'S',
  5: 'w',
  6: 'n',
  7: "'",
  8: 'v',
  9: 'a',
  10: 'u',
  11: 'f',
  12: '―',
  13: 'J',
  14: 'c',
  15: 'Y',
  16: 'q',
  17: 'g',
  18: 'A',
  19: ' ',
  20: ',',
  21: 'r',
  22: 'l',
  23: 'B',
  24: '\n',
  25: 's',
  26: 't',
  27: 'y',
  28: '.',
  29: 'e',
  30: 'p',
  31: 'd',
  32: 'H',
  33: 'm',
  34: 'T',
  35: 'o',
  36: '"',
  37: 'h'})

In [4]:
# RNN configs
embedding_size = vocab_size
hidden_size = 64
output_size = vocab_size

In [5]:
# Matrices
Wxh = np.random.randn(hidden_size, embedding_size) * 0.01
Whh = np.random.randn(hidden_size, hidden_size) * 0.01
bh = np.random.randn(1, hidden_size) * 0.01

Why = np.random.randn(output_size, hidden_size)
by = np.random.randn(1, output_size)

In [6]:
def softmax(z):
    e_z = np.exp(z)
    return e_z / e_z.sum(axis=1)

def rnn(x_t, h_prev):
    assert x_t.shape == (1, embedding_size)
    assert h_prev.shape == (1, hidden_size)

    h_next = np.tanh(x_t @ Wxh.T + h_prev @ Whh.T + bh)
    probs = softmax(h_next @ Why.T + by)
    
    return probs, h_next

def sample(char, n):
    # sample rnn n times starting with the first char
    x = np.zeros((1, vocab_size))
    idx = chars_to_indices[char]
    x[:, idx] = 1 # one hot encoding
    h =  np.zeros((1, hidden_size)) # hidden state
    idxes = []
    
    for i in range(n):
        probs, h = rnn(x, h) # xt, hprev -> rnn -> probs, hnext
        idx = np.random.choice(vocab_size, p=probs.ravel()) # .ravel returns a 1d array
        
        x = np.zeros((1, vocab_size))
        x[:, idx] = 1
        idxes.append(idx)

    chars = "".join([indices_to_chars[i] for i in idxes])
    return chars

In [7]:
x_t = np.random.randn(1, embedding_size)
h_prev = np.random.randn(1, hidden_size)

probs, h_next = rnn(x_t, h_prev)
probs.shape, h_next.shape

((1, 38), (1, 64))

In [8]:
inp = [chars_to_indices[c] for c in data[:-1]]
target = [chars_to_indices[c] for c in data[1:]]

inp, target

([36,
  32,
  29,
  21,
  29,
  7,
  25,
  19,
  26,
  35,
  19,
  26,
  37,
  29,
  19,
  14,
  21,
  9,
  1,
  27,
  19,
  35,
  6,
  29,
  25,
  28,
  19,
  34,
  37,
  29,
  19,
  33,
  3,
  25,
  11,
  3,
  26,
  25,
  28,
  19,
  34,
  37,
  29,
  19,
  21,
  29,
  2,
  29,
  22,
  25,
  28,
  19,
  34,
  37,
  29,
  19,
  26,
  21,
  35,
  10,
  2,
  22,
  29,
  33,
  9,
  0,
  29,
  21,
  25,
  28,
  19,
  34,
  37,
  29,
  19,
  21,
  35,
  10,
  6,
  31,
  19,
  30,
  29,
  17,
  25,
  19,
  3,
  6,
  19,
  26,
  37,
  29,
  19,
  25,
  16,
  10,
  9,
  21,
  29,
  19,
  37,
  35,
  22,
  29,
  25,
  28,
  19,
  34,
  37,
  29,
  19,
  35,
  6,
  29,
  25,
  19,
  5,
  37,
  35,
  19,
  25,
  29,
  29,
  19,
  26,
  37,
  3,
  6,
  17,
  25,
  19,
  31,
  3,
  11,
  11,
  29,
  21,
  29,
  6,
  26,
  22,
  27,
  28,
  19,
  34,
  37,
  29,
  27,
  7,
  21,
  29,
  19,
  6,
  35,
  26,
  19,
  11,
  35,
  6,
  31,
  19,
  35,
  11,
  19,
  21,
  10,
  22,
  29,
  25,
  28,
  1

In [9]:
sample("h", 4)

'mtzp'

In [30]:
lr = 1e-2
for e in range(100):
    # compute loss
    loss = 0
    xs, ps, hs = {}, {}, {}
    h = np.zeros((1, hidden_size))
    hs[-1] = h
    for t in range(len(inp)):
        idx = inp[t]
        x = np.zeros((1, vocab_size))
        x[:, idx] = 1
        probs, h = rnn(x, h)
        xs[t] = x
        ps[t] = probs # save probs (we'll use this in backprop)
        hs[t] = h

        # cross entropy
        pred = probs[0, target[t]]
        loss += -np.log(pred)

    # backprop, calculate gradients
    dL_dWxh = np.zeros_like(Wxh)
    dL_dWhh = np.zeros_like(Whh)
    dL_dbh = np.zeros_like(bh)
    dL_Why = np.zeros_like(Why)
    dL_dby = np.zeros_like(by)
    dF_dh = np.zeros((1, hidden_size))

    for t in reversed(range(len(inp))):
        dL_dz2 = np.copy(ps[t])
        target_idx = target[t]
        dL_dz2[:, target_idx] -= 1

        # print(dL_dz2.shape)

        a = hs[t] * dL_dz2.T
        # print(a.shape)

        # 2nd layer
        dL_Why += hs[t] * dL_dz2.T
        dL_dby += dL_dz2


        # 1st layer
        dh_dz1 = 1 - hs[t] ** 2
        dL_dh = dF_dh + (dL_dz2 @ Why) 

        # print((xs[t] * dh_dz1.T * dL_dh.T).shape)

        dL_dWxh += xs[t] * dh_dz1.T * dL_dh.T
        dL_dWhh +=  hs[t - 1] * dh_dz1.T * dL_dh.T
        dL_dbh += dh_dz1 * dL_dh

        # raise RuntimeError

        # dF_dh = dL_dh * (dh_dz1 @ Whh)
        dF_dh = dL_dh @ (dh_dz1 * Whh)

    # clip gradients
    for gradient in [dL_dWxh, dL_dWhh, dL_Why, dL_dbh, dL_dby]:
        np.clip(gradient, -1, 1, out=gradient)
        # np.clip(gradient, -5, 5, out=gradient)

    # gradient descent
    for weights, gradient in zip([Wxh, Whh, Why, bh, by], [dL_dWxh, dL_dWhh, dL_Why, dL_dbh, dL_dby]):
        weights -= lr * gradient

    # break
    
    print(f"[Loss]: {loss}")

[Loss]: 55.55355196225796
[Loss]: 85.0918929458568
[Loss]: 69.58526108270716
[Loss]: 79.39248099119558
[Loss]: 79.25217452283492
[Loss]: 89.79288567727295
[Loss]: 71.62141747028227
[Loss]: 61.35284201684629
[Loss]: 60.701402391943006
[Loss]: 58.414376924308115
[Loss]: 86.09058295458632
[Loss]: 63.35048173800201
[Loss]: 73.68126706076407
[Loss]: 79.53445035862798
[Loss]: 84.0346631802875
[Loss]: 106.4881720691656
[Loss]: 63.61150568630106
[Loss]: 72.97427076552509
[Loss]: 56.01840460882438
[Loss]: 80.09338423292067
[Loss]: 72.96176470412452
[Loss]: 68.77088054511606
[Loss]: 101.68800835146175
[Loss]: 112.36573980867038
[Loss]: 80.29500182674695
[Loss]: 142.06938189680025
[Loss]: 117.99171213128606
[Loss]: 83.7469772964498
[Loss]: 94.75889904171497
[Loss]: 122.46755245083614
[Loss]: 93.55284713817247
[Loss]: 90.41154866838698
[Loss]: 100.5708685501675
[Loss]: 92.29834487603705
[Loss]: 86.48660750872584
[Loss]: 80.15325204130366
[Loss]: 105.53504464635222
[Loss]: 79.52934083079991
[Loss]:

In [46]:
def sample(h, seed_ix, n):
    """
    sample a sequence of integers from the model
    h is memory state, seed_ix is seed letter for first time step
    """
    x = np.zeros((1, vocab_size))
    x[0, seed_ix] = 1
    ixes = []
    for t in range(n):
        h = np.tanh(np.dot(x, Wxh.T) + np.dot(h, Whh.T) + bh)
        y = np.dot(h, Why.T) + by
        p = np.exp(y / 0.05) / np.sum(np.exp(y / 0.05))
        ix = np.random.choice(range(vocab_size), p=p.ravel())
        x = np.zeros((1, vocab_size))
        x[0, ix] = 1
        ixes.append(ix)
    return ixes

In [47]:
a = sample(np.zeros((1, hidden_size)), chars_to_indices['"'], 500)
b = "".join([indices_to_chars[i] for i in a])

print(b)

Here'h to the srazy sth the orev they can change the world, are the ones whin them, glorify or viffirw ts tre crazy enough to them. Because the wo."
― Steve Jobse they chane whgy tu the squghe tish to the srazy einlY. The ore them. BeA. Them,mand ongs. The ore them. Because they change things. They push the cnaveforwh d, igfyround ."Ty ones. The cro "he wor do inis them ag thing you can't do is ignot fom quly thing ore crazy en quare hing you can't do is ign to te they can the Yope s. The tro bs
