In [1]:
import numpy as np

In [2]:
np.random.seed(0)

In [3]:
# data = "hello world"
data = open("./data/howtogetrich.txt", "r").read()[:1000]
chars = list(set(data))
vocab_size = len(chars)

char_to_ix = {char: i for i, char in enumerate(chars)}
ix_to_char = {i: char for i, char in enumerate(chars)}

vocab_size

51

In [4]:
# RNN configs
embedding_size = vocab_size
hidden_size = 64
output_size = vocab_size

In [5]:
# Matrices
Wxh = np.random.randn(hidden_size, embedding_size) * 0.01
Whh = np.random.randn(hidden_size, hidden_size) * 0.01
bh = np.random.randn(1, hidden_size) * 0.01

Why = np.random.randn(output_size, hidden_size)
by = np.random.randn(1, output_size)

In [6]:
def softmax(z):
    e_z = np.exp(z)
    return e_z / e_z.sum(axis=1)

def rnn(x_t, h_prev):
    assert x_t.shape == (1, embedding_size)
    assert h_prev.shape == (1, hidden_size)

    h_next = np.tanh(x_t @ Wxh.T + h_prev @ Whh.T + bh)
    probs = softmax(h_next @ Why.T + by)
    
    return probs, h_next

def sample(char, n):
    # sample rnn n times starting with the first char
    x = np.zeros((1, vocab_size))
    idx = char_to_ix[char]
    x[:, idx] = 1 # one hot encoding
    h =  np.zeros((1, hidden_size)) # hidden state
    idxes = []
    
    for i in range(n):
        probs, h = rnn(x, h) # xt, hprev -> rnn -> probs, hnext
        idx = np.random.choice(vocab_size, p=probs.ravel()) # .ravel returns a 1d array
        
        x = np.zeros((1, vocab_size))
        x[:, idx] = 1
        idxes.append(idx)

    chars = "".join([ix_to_char[i] for i in idxes])
    return chars

In [7]:
inputs = [char_to_ix[c] for c in data]

In [11]:
def lossFun(inputs, targets, hprev):
    """
    inputs,targets are both list of integers.
    hprev is Hx1 array of initial hidden state
    returns the loss, gradients on model parameters, and last hidden state
    """
    xs, hs, ys, ps = {}, {}, {}, {}
    hs[-1] = np.copy(hprev)
    loss = 0
    # forward pass
    for t in range(len(inputs)):
        xs[t] = np.zeros((1, vocab_size))  # encode in 1-of-k representation
        xs[t][0, inputs[t]] = 1
        hs[t] = np.tanh(
            np.dot(xs[t], Wxh.T) + np.dot(hs[t - 1], Whh.T) + bh
        )  # hidden state
        ys[t] = (
            np.dot(hs[t], Why.T) + by
        )  # unnormalized log probabilities for next chars
        ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t]))  # probabilities for next chars
        loss += -np.log(ps[t][0, targets[t]])  # softmax (cross-entropy loss)
    # backward pass: compute gradients going backwards
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    dhnext = np.zeros_like(hs[0])
    for t in reversed(range(len(inputs))):
        dy = np.copy(ps[t])
        dy[
            0, targets[t]
        ] -= 1  # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
        dWhy += np.dot(dy.T, hs[t])
        dby += dy

        dh = np.dot(dy, Why) + dhnext  # backprop into h
        dhraw = (1 - hs[t] * hs[t]) * dh  # backprop through tanh nonlinearity

        dbh += dhraw
        dWxh += np.dot(dhraw.T, xs[t])
        dWhh += np.dot(dhraw.T, hs[t - 1])


        dhnext = np.dot(dhraw, Whh)
    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam)  # clip to mitigate exploding gradients
    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs) - 1]

In [12]:
seq_length = 25
learning_rate = 1e-1

n, p = 0, 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by)  # memory variables for Adagrad
smooth_loss = -np.log(1.0 / vocab_size) * seq_length  # loss at iteration 0
while True:
    # prepare inputs (we're sweeping from left to right in steps seq_length long)
    if p + seq_length + 1 >= len(data) or n == 0:
        hprev = np.zeros((1, hidden_size))  # reset RNN memory
        p = 0  # go from start of data
    inputs = [char_to_ix[ch] for ch in data[p : p + seq_length]]
    targets = [char_to_ix[ch] for ch in data[p + 1 : p + seq_length + 1]]

    # sample from the model now and then
    # if n % 100 == 0:
        # sample_ix = sample(hprev, inputs[0], 200)
        # txt = "".join(ix_to_char[ix] for ix in sample_ix)
        # print("----\n %s \n----" % (txt,))

    # forward seq_length characters through the net and fetch gradient
    loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
    smooth_loss = smooth_loss * 0.999 + loss * 0.001
    if n % 100 == 0:
        # print("iter %d, loss: %f" % (n, smooth_loss))  # print progress
        print("iter %d, loss: %f" % (n, smooth_loss))  # print progress

    # perform parameter update with Adagrad
    for param, dparam, mem in zip(
        [Wxh, Whh, Why, bh, by],
        [dWxh, dWhh, dWhy, dbh, dby],
        [mWxh, mWhh, mWhy, mbh, mby],
    ):
        mem += dparam * dparam
        param += -learning_rate * dparam / np.sqrt(mem + 1e-8)  # adagrad update

    p += seq_length  # move data pointer
    n += 1  # iteration counter

iter 0, loss: 98.197775
iter 100, loss: 99.640193
iter 200, loss: 96.690042
iter 300, loss: 93.293468
iter 400, loss: 89.683637
iter 500, loss: 86.037154
iter 600, loss: 82.389573
iter 700, loss: 78.699222
iter 800, loss: 75.065776
iter 900, loss: 71.609614
iter 1000, loss: 68.104286
iter 1100, loss: 64.818270
iter 1200, loss: 61.567906
iter 1300, loss: 58.500815
iter 1400, loss: 55.599214
iter 1500, loss: 52.773560
iter 1600, loss: 50.126014
iter 1700, loss: 47.531117
iter 1800, loss: 45.055649
iter 1900, loss: 42.680517
iter 2000, loss: 40.464256
iter 2100, loss: 38.364957
iter 2200, loss: 36.392113
iter 2300, loss: 34.515801
iter 2400, loss: 32.704976
iter 2500, loss: 30.976247
iter 2600, loss: 29.331113
iter 2700, loss: 27.801525
iter 2800, loss: 26.335281
iter 2900, loss: 24.972826
iter 3000, loss: 23.703123
iter 3100, loss: 22.509711
iter 3200, loss: 21.455430
iter 3300, loss: 20.409453
iter 3400, loss: 19.399847
iter 3500, loss: 18.421561
iter 3600, loss: 17.593056
iter 3700, lo

KeyboardInterrupt: 

In [9]:
print(sample("S", 100))

twererorobeccoutwingeepl at hecckerytuntiterowitt aly ava an gere topriknderitwavan avanst. abaval a
