In [5]:
import numpy as np

In [6]:
def softmax(x):
    """Column-wise softmax: normalise along axis 0.

    Each slice along axis 0 (e.g. each column of a 2-D array) is turned
    into a probability distribution that sums to 1.

    Args:
        x: Array-like with at least one dimension.

    Returns:
        Array of the same shape as ``x`` whose entries sum to 1 along axis 0.
    """
    x = np.asarray(x)
    # Shift by the maximum of each column rather than the global maximum.
    # With a global shift, a column far below the global maximum underflows
    # to all zeros in exp() and the normalisation becomes 0/0 = NaN.
    shifted = x - np.max(x, axis=0, keepdims=True)
    e_x = np.exp(shifted)
    return e_x / e_x.sum(axis=0, keepdims=True)

### RNN Network of one cell for 1 timestep

In [2]:
def rnn_cell_forward(xt, a_prev, parameters):
    """Run a single RNN cell for one time step.

    The next hidden state is ``tanh(Waa . a_prev + Wax . xt + ba)`` and the
    prediction is ``softmax(Wya . a_next + by)``.

    Args:
        xt: Input for the current time step; must be compatible with
            ``np.dot(Wax, xt)``.
        a_prev: Hidden state from the previous time step; must be compatible
            with ``np.dot(Waa, a_prev)``.
        parameters: Dict holding the entries "Wax", "Waa", "Wya", "ba"
            and "by".

    Returns:
        Tuple ``(a_next, yt_pred)``: the new hidden state and the softmax
        prediction for this time step. The hidden state must be returned
        as well, because the caller needs it as ``a_prev`` for the
        following time step.
    """
    # Retrieve parameters from "parameters"
    Wax = parameters["Wax"]
    Waa = parameters["Waa"]
    Wya = parameters["Wya"]
    ba = parameters["ba"]
    by = parameters["by"]

    # Next activation state: tanh of the recurrent term, the input term and the bias
    a_next = np.tanh(np.dot(Waa, a_prev) + np.dot(Wax, xt) + ba)
    # Output of the current cell: softmax over the projected hidden state
    yt_pred = softmax(np.dot(Wya, a_next) + by)

    return a_next, yt_pred

### RNN Network for more than one time step


    Create a vector of zeros ($a$) that will store all the hidden states computed by the RNN.
    Initialize the "next" hidden state as $a_0$ (the initial hidden state).
    Start looping over each time step; the incremental index is $t$:
        Update the "next" hidden state and the cache by running rnn_cell_forward
        Store the "next" hidden state in $a$ (at the $t^{th}$ position)
        Store the prediction in $y$
        Add the cache to the list of caches
    Return $a$, $y$ and caches


In [10]:
def rnn_forward(x, a0, parameters):
    """Unroll the RNN cell over every time step of ``x``.

    Args:
        x: 3-D input array; its shape is unpacked as (n_x, m, T_x) and the
            last axis is iterated as the time axis.
        a0: Initial hidden state, fed to the cell at the first time step.
        parameters: Dict of weights passed through to ``rnn_cell_forward``;
            "Wya" is read here to obtain the sizes (n_y, n_a).

    Returns:
        Tuple ``(y_pred, a)``: predictions of shape (n_y, m, T_x) and hidden
        states of shape (n_a, m, T_x).
    """
    # Sizes come from the input tensor and from the output weight matrix.
    _, batch_size, num_steps = x.shape
    n_y, n_a = parameters["Wya"].shape

    # Pre-allocate storage for every hidden state and every prediction.
    hidden_states = np.zeros((n_a, batch_size, num_steps))
    predictions = np.zeros((n_y, batch_size, num_steps))

    # The state carried from one step to the next starts at a0.
    hidden = a0
    for step in range(num_steps):
        # rnn_cell_forward is expected to yield (next hidden state, prediction).
        hidden, step_pred = rnn_cell_forward(x[:, :, step], hidden, parameters)
        hidden_states[:, :, step] = hidden
        predictions[:, :, step] = step_pred

    return predictions, hidden_states
    

In [None]:
# Fix the seed so the random inputs and weights below are reproducible.
# The order of the randn calls matters: reordering them changes every value.
np.random.seed(1)
# Input: rnn_forward unpacks x.shape as (n_x, m, T_x) = (3, 10, 4).
x = np.random.randn(3,10,4)
# Initial hidden state of shape (n_a, m) = (5, 10).
a0 = np.random.randn(5,10)
# Weights and biases, sized for n_a = 5 hidden units, n_x = 3 inputs, n_y = 2 outputs.
Waa = np.random.randn(5,5)
Wax = np.random.randn(5,3)
Wya = np.random.randn(2,5)
ba = np.random.randn(5,1)
by = np.random.randn(2,1)
parameters = {"Waa": Waa, "Wax": Wax, "Wya": Wya, "ba": ba, "by": by}

# Run the full forward pass and inspect one slice of the predictions and their shape.
y_pred,a = rnn_forward(x, a0, parameters)
print("y_pred[1][3] =", y_pred[1][3])
print("y_pred.shape = ", y_pred.shape)

In [14]:
# Read the whole corpus; the context manager guarantees the file handle is closed.
with open('names.txt', 'r') as names_file:
    data = names_file.read()
# Lower-case everything so the vocabulary does not distinguish case.
data = data.lower()
# Unique characters, sorted so the vocabulary order is identical on every run
# (plain set iteration order for strings can differ between interpreter runs).
chars = sorted(set(data))
data_size, vocab_size = len(data), len(chars)
print('There are %d total characters and %d unique characters in your data.' % (data_size, vocab_size))

There are 33 total characters and 15 unique characters in your data.


In the cell below, we create a Python dictionary (i.e., a hash table) to map each character to an index. This will help you figure out which index corresponds to which character in the probability distribution output by the softmax layer.

In [23]:
# Map every character to its position in the alphabetically sorted vocabulary.
char_to_ix = dict(zip(sorted(chars), range(len(chars))))
print(char_to_ix)

{'\n': 0, 'a': 1, 'b': 2, 'd': 3, 'e': 4, 'g': 5, 'h': 6, 'i': 7, 'l': 8, 'n': 9, 'o': 10, 'p': 11, 's': 12, 't': 13, 'w': 14}


In [24]:
# Inverse lookup: position in the alphabetically sorted vocabulary -> character.
ix_to_char = dict(enumerate(sorted(chars)))
print(ix_to_char)

{0: '\n', 1: 'a', 2: 'b', 3: 'd', 4: 'e', 5: 'g', 6: 'h', 7: 'i', 8: 'l', 9: 'n', 10: 'o', 11: 'p', 12: 's', 13: 't', 14: 'w'}


### Clip gradients to avoid exploding gradients

In [25]:
def clip(gradients, maxValue):
    """Clamp every gradient entry to the interval [-maxValue, maxValue].

    The arrays stored in ``gradients`` are clipped in place, and a new dict
    that references those same arrays is returned.

    Args:
        gradients: Dict with the entries "dWaa", "dWax", "dWya", "db", "dby".
        maxValue: Bound applied symmetrically around zero.

    Returns:
        New dict with the keys "dWaa", "dWax", "dWya", "db", "dby" mapping to
        the clipped arrays.
    """
    # Look up all five gradients first, so a missing key fails before any
    # array has been modified.
    picked = {key: gradients[key] for key in ("dWaa", "dWax", "dWya", "db", "dby")}

    # Clip in place to mitigate exploding gradients.
    for key in ("dWax", "dWaa", "dWya", "db", "dby"):
        grad = picked[key]
        np.clip(grad, -maxValue, maxValue, out=grad)

    return picked

In [27]:
# Fix the seed so the example gradients are reproducible; the order of the
# randn calls below determines every value.
np.random.seed(3)
# Standard-normal draws scaled by 10, so some entries fall outside the
# [-10, 10] range used when clip(gradients, 10) is called further down.
dWax = np.random.randn(5,3)*10
dWaa = np.random.randn(5,5)*10
dWya = np.random.randn(2,5)*10
db = np.random.randn(5,1)*10
dby = np.random.randn(2,1)*10
# Bundle the arrays under the key names that clip() reads.
gradients = {"dWax": dWax, "dWaa": dWaa, "dWya": dWya, "db": db, "dby": dby}

In [28]:
# Spot-check one entry of dWax before clip(gradients, 10) is applied.
dWax[1][2]

-3.5475897926898674

In [29]:
# Spot-check another entry of dWax before clipping; the same entry is printed again after clip(gradients, 10).
dWax[3][1]

-13.138647533626822

In [30]:
# Clip every gradient to [-10, 10]. clip() works in place (np.clip with out=),
# so the module-level arrays dWax, dWaa, dWya, db and dby are modified as well.
gradients = clip(gradients, 10)
# Print one entry per gradient to check the result of the clipping.
print("gradients[\"dWaa\"][1][2] =", gradients["dWaa"][1][2])
print("gradients[\"dWax\"][3][1] =", gradients["dWax"][3][1])
print("gradients[\"dWya\"][1][2] =", gradients["dWya"][1][2])
print("gradients[\"db\"][4] =", gradients["db"][4])
print("gradients[\"dby\"][1] =", gradients["dby"][1])

gradients["dWaa"][1][2] = 10.0
gradients["dWax"][3][1] = -10.0
gradients["dWya"][1][2] = 0.2971381536101662
gradients["db"][4] = [10.]
gradients["dby"][1] = [8.45833407]


### Sampling to predict future character