In this notebook, we will attempt to generate reasonable-sounding dinosaur names using a character-level language model.


In [7]:
import numpy as np

The training data is contained in the file 'dinos.txt', which holds 1536 dinosaur names.


In [8]:
# Read the whole corpus inside a context manager so the file handle is closed
# as soon as the text has been loaded, then normalise it to lower case.
with open('dinos.txt', 'r') as corpus_file:
    train_data = corpus_file.read().lower()
alphabet = list(set(train_data))      # Unique characters (set order is arbitrary)
doc_size = len(train_data)            # Total number of characters in the corpus
alphabet_size = len(alphabet)         # Number of distinct characters
print(doc_size, alphabet_size)




19909 27


We now create two dictionaries - char_dict maps an index to a character and inverse_char_dict maps a character to its index.

In [9]:
# Build both lookup tables from a single sorted pass over the alphabet:
# char_dict maps index -> character, inverse_char_dict maps character -> index.
char_dict = dict(enumerate(sorted(alphabet)))
inverse_char_dict = {symbol: position for position, symbol in char_dict.items()}
print(char_dict, inverse_char_dict)

{0: '\n', 1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z'} {'\n': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26}


Next, we define a function that performs gradient clipping in order to prevent exploding gradients. We accomplish this using the numpy.clip function. https://numpy.org/doc/stable/reference/generated/numpy.clip.html

In [10]:
def clip_gradients(gradients, max_val):
    """
    Clip the parameter gradients element-wise to the range [-max_val, max_val].

    Arguments:
    gradients -- dictionary containing at least the gradients dwax, dwaa, dwya, dba, dby
    max_val -- clipping threshold; every entry is limited to [-max_val, max_val]
    Returns:
    gradients -- new dictionary holding only dwax, dwaa, dwya, dba, dby; the arrays
                 are the same objects that were passed in, clipped in place
    """
    clipped_names = ('dwax', 'dwaa', 'dwya', 'dba', 'dby')

    # Look every gradient up first so a missing key fails before anything is modified.
    selected = {name: gradients[name] for name in clipped_names}

    for array in selected.values():
        # out=array makes numpy overwrite the existing buffer instead of allocating a copy.
        np.clip(array, a_min=-max_val, a_max=max_val, out=array)

    return selected

In [12]:
def gen_sample(params, inverse_char_dict, seed, max_len=50):
    """
    Generate a sequence of characters according to the probability distribution output from the RNN.

    Sampling stops as soon as the newline character is drawn, or after max_len
    characters, in which case a newline is appended as a terminator.

    Arguments:
    params -- dictionary containing the parameters wax, waa, wya, ba, by.
    inverse_char_dict -- dictionary mapping each character to its index
    seed -- seed for the randomizer
    max_len -- maximum number of characters to sample before forcing termination

    Returns:
    indices -- list containing the indices of the sampled characters; it always
               ends with exactly one newline index
    """
    wax, waa, wya = params['wax'], params['waa'], params['wya']
    ba, by = params['ba'], params['by']

    alphabet_size = by.shape[0]
    act_size = waa.shape[1]
    newline_idx = inverse_char_dict['\n']

    # Row k of the softmax output is the probability of character index k, so we
    # sample positions 0..alphabet_size-1 directly. Using the dict's values here
    # would silently mis-pair probabilities whenever the dict is not in index order.
    candidate_indices = np.arange(alphabet_size)

    X = np.zeros((alphabet_size, 1))   # first input is the all-zero vector
    a = np.zeros((act_size, 1))        # initial hidden state

    indices = []
    cur_idx = -1
    count = 0

    while cur_idx != newline_idx and count < max_len:
        # One forward step of the RNN followed by a numerically stable softmax.
        a = np.tanh(np.matmul(wax, X) + np.matmul(waa, a) + ba)
        y = np.matmul(wya, a) + by
        expn = np.exp(y - np.max(y))
        y_predict = expn / expn.sum(axis=0)

        # The global RNG is re-seeded at every step; together with the two
        # increments below the effective seed advances by 2 per character.
        # Kept as-is so previously recorded samples stay reproducible.
        np.random.seed(count + seed)
        cur_idx = np.random.choice(candidate_indices, p=y_predict.flatten())
        indices.append(cur_idx)

        # Feed the sampled character back in as a one-hot vector.
        X = np.zeros((alphabet_size, 1))
        X[cur_idx] = 1
        seed += 1
        count += 1

    # Terminate the name only if sampling did not already end on a newline;
    # this avoids a doubled newline when the last allowed character is '\n'.
    if cur_idx != newline_idx:
        indices.append(newline_idx)

    return indices

In [13]:
def rnn_forward_step(x_t, a, params):
    """
    Advance the RNN by a single time step.

    Arguments:
    x_t -- input vector at time step t
    a -- activation from the previous time step
    params -- dictionary containing the parameters wax, waa, wya, ba, by
    Returns:
    a_new -- activation from the current time step
    y_predict_t -- output prediction from the current time step
    """
    # Hidden state: tanh of the input and recurrent contributions plus bias.
    pre_activation = params['wax'] @ x_t + params['waa'] @ a + params['ba']
    a_new = np.tanh(pre_activation)

    # Output layer followed by a softmax; the max is subtracted before
    # exponentiating so large logits cannot overflow.
    logits = params['wya'] @ a_new + params['by']
    shifted_exp = np.exp(logits - np.max(logits))
    y_predict_t = shifted_exp / shifted_exp.sum(axis=0)

    return a_new, y_predict_t

In [14]:
def rnn_forward(X, Y, a0, params, alphabet_size=None):
    """
    Run forward propagation through the RNN

    Arguments:
    X -- input vector containing character indices; an entry of None stands for
         "no character" and is encoded as an all-zero input vector
    Y -- same as X, but shifted one position to the left
    a0 -- activation at time step 0
    params -- dictionary containing the parameters wax, waa, wya, ba, by
    alphabet_size -- number of unique characters; when omitted it is taken from
                     the number of columns of params['wax'], so the one-hot
                     inputs always match the weight matrix
    Returns:
    loss -- cross entropy loss
    cache --  cache to be used during backpropagation
    """
    if alphabet_size is None:
        # wax multiplies the one-hot input, so its column count is the input size.
        alphabet_size = params['wax'].shape[1]

    x = {}
    a = {}
    y_predict = {}
    loss = 0
    # Store the initial activation under key -1 so a[t-1] is valid for t == 0.
    a[-1] = np.copy(a0)

    for t in range(len(X)):
        x[t] = np.zeros((alphabet_size, 1))
        # Identity check: None marks the empty first input, any index is a real character.
        if X[t] is not None:
            x[t][X[t]] = 1
        a[t], y_predict[t] = rnn_forward_step(x[t], a[t-1], params)
        # Accumulate the negative log-probability assigned to the target character.
        loss -= np.log(y_predict[t][Y[t], 0])
    cache = (y_predict, a, x)

    return loss, cache

In [21]:
def rnn_backward_step(params, gradients, x, a, a_new, dy):
    """
    Backpropagate through a single time step, accumulating into gradients.

    Arguments:
    params -- dictionary containing the parameters wax, waa, wya, ba, by
    gradients -- dictionary containing the gradients dwya, dby, dba, dwax, dwaa, da
    x -- input vector at time step t
    a -- activation at time step t-1
    a_new -- activation at time step t
    dy -- initial gradient
    Returns:
    gradients -- the same dictionary, updated in place: the parameter gradients
                 are accumulated and da is replaced by the gradient to pass on
                 to the previous time step
    """
    # Gradient reaching this step's activation: from the output layer plus the
    # contribution carried back from the following time step.
    d_activation = gradients['da'] + np.dot(params['wya'].T, dy)
    # Backprop through tanh, whose derivative is 1 - tanh^2.
    d_pre_tanh = (1 - np.square(a_new)) * d_activation

    # Output-layer parameters.
    gradients['dwya'] += np.dot(dy, a_new.T)
    gradients['dby'] += dy

    # Hidden-layer parameters.
    gradients['dba'] += d_pre_tanh
    gradients['dwax'] += np.dot(d_pre_tanh, x.T)
    gradients['dwaa'] += np.dot(d_pre_tanh, a.T)

    # Overwritten rather than accumulated: this is the value handed to step t-1.
    gradients['da'] = np.dot(params['waa'].T, d_pre_tanh)

    return gradients

In [16]:
def rnn_backward(X, y, params, cache):
    """
    Backpropagate through every time step of the sequence.

    Arguments:
    X -- input vector containing character indices
    y -- same as X, but shifted one position to the left
    params -- dictionary containing the parameters wax, waa, wya, ba, by
    cache --  cache to be used during backpropagation
    Returns:
    gradients -- dictionary containing the gradients dwya, dby, dba, dwax, dwaa, da
    a -- the activations taken from the cache, keyed by time step
    """
    y_predict, a, x = cache

    # One zero accumulator per parameter, shaped like the parameter itself.
    gradients = {
        'd' + name: np.zeros_like(params[name])
        for name in ('wya', 'waa', 'wax', 'by', 'ba')
    }
    # Gradient flowing back into the activation; nothing comes from beyond the last step.
    gradients['da'] = np.zeros_like(a[0])

    # Walk the sequence from the last time step back to the first.
    for t in range(len(X) - 1, -1, -1):
        # Start from a copy of the prediction and subtract 1 at the target index.
        dy = np.copy(y_predict[t])
        dy[y[t]] -= 1
        gradients = rnn_backward_step(params, gradients, x[t], a[t-1], a[t], dy)

    return gradients, a

In [17]:
def update_params(params, gradients, learning_rate):
    """
    Apply one gradient-descent update to every parameter.

    Arguments:
    params -- dictionary containing the parameters wax, waa, wya, ba, by
    gradients -- dictionary containing the gradients dwya, dwaa, dwax, dby, dba
    learning_rate -- step size applied to each gradient
    Returns:
    params -- the same dictionary, with each parameter array updated in place
    """
    for name in ('wya', 'waa', 'wax', 'by', 'ba'):
        # The gradient for parameter "name" is stored under "d" + name.
        params[name] -= learning_rate * gradients['d' + name]

    return params

In [18]:
def optimize(X, y, a, params, learning_rate):
    """
    Perform one training step on a single sequence: forward pass, backward
    pass, gradient clipping and parameter update.

    Arguments:
    X -- input vector containing character indices
    y -- same as X, but shifted one position to the left
    a -- activation used as the initial state for this sequence
    params -- dictionary containing the parameters wax, waa, wya, ba, by (updated in place)
    learning_rate -- step size for the parameter update
    Returns:
    loss -- loss returned by the forward pass
    gradients -- the clipped gradients
    a -- activation at the last time step of the sequence
    """
    loss, cache = rnn_forward(X, y, a, params)
    gradients, activations = rnn_backward(X, y, params, cache)

    # Limit every gradient entry to [-5, 5] before applying the update.
    gradients = clip_gradients(gradients, 5)
    update_params(params, gradients, learning_rate)

    final_step = len(X) - 1
    return loss, gradients, activations[final_step]

In [19]:
def model(train_data, char_dict, inverse_char_dict, alphabet_size, num_samples, h_size, num_iterations, learning_rate=0.01):
    """
    Train the character-level RNN one name at a time and periodically print sampled names.

    Arguments:
    train_data -- training corpus as a single string with one name per line
    char_dict -- dictionary mapping each index to its character
    inverse_char_dict -- dictionary mapping each character to its index
    alphabet_size -- number of unique characters (input and output size of the network)
    num_samples -- number of names to sample every time progress is printed
    h_size -- number of hidden units
    num_iterations -- number of training steps (one example per step)
    learning_rate -- step size passed to optimize
    Returns:
    params -- dictionary containing the trained parameters wya, waa, wax, by, ba
    """
    # Fixed seed so the initial weights are reproducible; the draw order
    # (wya, waa, wax) determines the resulting values.
    np.random.seed(1)
    params = {
        'wya': np.random.randn(alphabet_size, h_size) * 0.01,
        'waa': np.random.randn(h_size, h_size) * 0.01,
        'wax': np.random.randn(h_size, alphabet_size) * 0.01,
        'by': np.zeros((alphabet_size, 1)),
        'ba': np.zeros((h_size, 1)),
    }

    a = np.zeros((h_size, 1))

    # Starting value for the smoothed loss that is reported during training.
    loss = -np.log(1.0/alphabet_size) * num_samples

    # Build the examples from the corpus that was passed in rather than
    # re-reading a hard-coded file, so the function trains on its argument.
    examples = [line.lower().strip() for line in train_data.splitlines()]
    if not examples:
        raise ValueError('train_data does not contain any training examples')

    np.random.seed(0)
    np.random.shuffle(examples)

    for i in range(num_iterations):

        # Cycle through the shuffled examples.
        cur_idx = i % len(examples)
        # The input starts with None (encoded as a zero vector); the target is the
        # same sequence shifted left by one and terminated with a newline.
        X = [None] + [inverse_char_dict[char] for char in examples[cur_idx]]
        Y = X[1:] + [inverse_char_dict['\n']]

        # The activation returned for one name is used as the initial state for the next.
        cur_loss, _, a = optimize(X, Y, a, params, learning_rate=learning_rate)
        # Exponentially smoothed loss.
        loss = loss * 0.999 + cur_loss * 0.001

        if i % 2000 == 0:

            print('Loss after iteration %d : %f' % (i, loss))

            seed = 0

            for _ in range(num_samples):

                indices = gen_sample(params, inverse_char_dict, seed)
                name = ''.join(char_dict[idx] for idx in indices)
                print('%s' % (name))

                seed += 1
                print('\n')

    return params

In [22]:
# Train for 35,000 single-name update steps with a 50-unit hidden layer;
# 7 sample names are printed every 2,000 iterations.
params = model(train_data, char_dict, inverse_char_dict, alphabet_size=27, num_samples=7, h_size=50, num_iterations=35000)

Loss after iteration 0 : 23.087338
nkzxwtdmfqoeyhsqwasjkjvu



kneb



kzxwtdmfqoeyhsqwasjkjvu



neb



zxwtdmfqoeyhsqwasjkjvu



eb



xwtdmfqoeyhsqwasjkjvu



Loss after iteration 2000 : 27.913822
liustolldoravgpsrarinhsianthymechalujdhangaloltonp



hmcaberteecltksatotleeycertactorapherosaurus



hytrpclfppeurusacroresianusvihandes



lecalpsamanthrgdjus



xusjciloraurus



acalpsalanthrfenwneeycerun



troligoraurus



Loss after iteration 4000 : 25.842094
liusisaurus



inga



iusosaurus



madalosaurus



xuskchgosaurus



baagosaurus



tosaurus



Loss after iteration 6000 : 24.572623
onyuselmeroptis



kigcalosaurus



lytromonosaurus



olealosaurus



wussaurus



eeaisope



tosaurus



Loss after iteration 8000 : 24.087631
onxusberatons



loma



lytrodon



olaadosaurus



wussaurus



eeahron



trodon



Loss after iteration 10000 : 23.862407
nivusaurus



jnecaisaurosaurus



lustreodon



nea



vuslanesaurus



daadosaurus



trocherogurosaurus



Loss after iter

After 35,000 iterations, we are able to generate reasonably dinosaur-sounding names such as 'trohakosaurus', 'vustarasaurus', and 'maytosaurus'.