# char-RNNs for baby name learning

So the motivation behind this is learning more about character-level RNNs and implementing my first for the purpose of randomly generating names for my amusement (sad I know ...). Inspiration is drawn from [Andrej Karpathy's excellent blog post](http://karpathy.github.io/2015/05/21/rnn-effectiveness/).

In [86]:
import numpy as np
import tensorflow as tf
from tensorflow.contrib import rnn
import pandas as pd
from __future__ import division 
import matplotlib.pyplot as plt
plt.style.use("fivethirtyeight")
%matplotlib inline

# A minimal character-level Vanilla RNN model with fixed input length

In [2]:
# read the data
baby_names = pd.read_csv("data/NationalNames.csv")
baby_names.head()

Unnamed: 0,Id,Name,Year,Gender,Count
0,1,Mary,1880,F,7065
1,2,Anna,1880,F,2604
2,3,Emma,1880,F,2003
3,4,Elizabeth,1880,F,1939
4,5,Minnie,1880,F,1746


In [17]:
# let's just keep the names for now
names = baby_names.Name.values
length_five = names[np.array([len(n) for n in names])==5]
chars = "".join(length_five) # gonna keep uppercase, lowercase for now
vocab = set(list(chars))
vocab_size = len(vocab)

print("Number of characters: {}".format(len(chars)))
print("Number of unique characters: {}".format(vocab_size)) # As many as English alphabet both upper and lowercase

Number of characters: 1912720
Number of unique characters: 52


In [18]:
# make a mapping from indices to characters and back
dictionary = dict(enumerate(vocab))
reverse_dictionary = {value:key for key, value in dictionary.iteritems()}

In [19]:
# some hyperparameters
hidden_size = 100 # size of the hidden state of the RNN  
seq_length = 5 # number of steps to unroll RNN
lr = 1e-2

In [20]:
# Define placeholders
inputs = tf.placeholder(tf.float32, [None, vocab_size], name='inputs') # ohe representation of the character
targets = tf.placeholder(tf.float32, [None, vocab_size], name='outputs') # ohe representation of the character
init_state = tf.placeholder(shape=[1, hidden_size], dtype=tf.float32, name='input_state')

In [21]:
# Define some variables
Whh = tf.Variable(tf.truncated_normal([hidden_size, hidden_size], stddev=0.01))
Wxh = tf.Variable(tf.truncated_normal([vocab_size, hidden_size], stddev=0.01))
Why = tf.Variable(tf.truncated_normal([hidden_size, vocab_size], stddev=0.01))
bh = tf.Variable(tf.constant(0.1, dtype=tf.float32, shape=(1, hidden_size)))
by = tf.Variable(tf.constant(0.1, dtype=tf.float32, shape=(1, vocab_size)))

In [22]:
# Feed input and update hidden state
# We pass one name of length 5 at a time

preds = []
hidden_state = init_state
for t, x_t in enumerate(tf.split(inputs, seq_length, 0)):
    # update hidden state
    hidden_state = tf.tanh(tf.matmul(x_t, Wxh) + tf.matmul(hidden_state, Whh) + bh)
    
    # update the output, only the last thing will predict something
    output = tf.matmul(hidden_state, Why) + by
    preds.append(output)  

preds = preds[:-1]
outputs = tf.concat(preds, axis=0)

In [23]:
# Setup the loss

# your regular cross-entropy as the loss
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=targets, logits=outputs))

# Adam for starters
train_step = tf.train.AdamOptimizer().minimize(loss)

In [24]:
def word2mat(w):
    "Given a name, it returns a <name length> by <vocab size> matrix"
    mat = np.zeros([len(w), vocab_size])
    
    for ix in range(len(w)):
        char = w[ix]
        char_id = reverse_dictionary[char]
        mat[ix, char_id] = 1
    return mat

def mat2word(mat):
    return "".join([dictionary[ix] for ix in np.argmax(mat, 1)])

In [25]:
# train the network
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run([init])

l = []

n_iter = len(length_five)
h_last = np.zeros([1, hidden_size])
n_epochs = 20

for e in range(n_epochs):
    # shuffle training set in between epochs
    #length_five = np.random.permutation(length_five)
    for ix in range(n_iter):
        w = length_five[ix]
        mat = word2mat(w)
        _, l_new, h_last = sess.run([train_step, loss, hidden_state], feed_dict={inputs:mat, targets:mat[1:, :],
                                                                               init_state:h_last})
        l.append(l_new)

        if ix%10000==0:
            print("Step:{} - Loss: {}".format(ix, np.mean(l[:-10000]))) # track the mean loss to see if we are learning anything
            # generate 10 names and print them
    #         names = [generate_random_name() for _ in range(5)]
    #         print("Names generated: {}".format(" ".join(names)))
            out_ = sess.run(outputs, feed_dict={inputs:mat, targets:mat[1:, :], init_state:h_last})
            print("Wanted to predict: ", mat2word(mat[:, :]))
            print("Got: ", mat2word(mat[:, :])[0]+mat2word(out_))
            print("In dataset: {}".format(mat2word(mat[:, :])[0]+mat2word(out_) in length_five))

Step:0 - Loss: nan
('Wanted to predict: ', 'Alice')
('Got: ', 'Aiiii')
In dataset: False
Step:10000 - Loss: 3.94866299629
('Wanted to predict: ', 'Fritz')
('Got: ', 'Frach')
In dataset: False
Step:20000 - Loss: 2.14521241188
('Wanted to predict: ', 'Lexie')
('Got: ', 'Looie')
In dataset: False
Step:30000 - Loss: 1.98657679558
('Wanted to predict: ', 'Alter')
('Got: ', 'Aldon')
In dataset: True
Step:40000 - Loss: 1.94798660278
('Wanted to predict: ', 'Claud')
('Got: ', 'Caard')
In dataset: False
Step:50000 - Loss: 1.93534576893
('Wanted to predict: ', 'Benno')
('Got: ', 'Berny')
In dataset: True
Step:60000 - Loss: 1.92206263542
('Wanted to predict: ', 'Dawne')
('Got: ', 'Daria')
In dataset: True
Step:70000 - Loss: 1.90611064434
('Wanted to predict: ', 'Norva')
('Got: ', 'Nenia')
In dataset: True
Step:80000 - Loss: 1.88536524773
('Wanted to predict: ', 'Toula')
('Got: ', 'Tesia')
In dataset: True
Step:90000 - Loss: 1.86433625221
('Wanted to predict: ', 'Enoch')
('Got: ', 'Evnrk')
In data

In [69]:
# Generate names of length 5! DOES NOT WORK CURRENTLY
import string

def softmax(distribution):
    s = np.sum(np.e**distribution)
    return [(np.e**i)/s for i in distribution]

def get_next_letter(params, letter):
    """
    Takes in all the necessary weights and an OHE letter.
    Returns output distribution and updated internal state
    """
    [whh, wxh, why, Bh, By, h_state] = params
    
    h_state = np.tanh(np.dot(letter, wxh) + np.dot(h_state, whh) + Bh)
    
    # update the output, only the last thing will predict something
    out = np.dot(h_state, why) + By
    
    return softmax(out), h_state

def generate_random_name():
    capitals = list(string.ascii_uppercase)
    
    start_letter = np.random.choice(capitals, 1)[0]
    starting_vec = word2mat(start_letter)
    
    # get all the trained weights from the graph
    [whh, wxh, why, Bh, By]  = sess.run([Whh, Wxh, Why, bh, by])
    h_state = h_last
    vec = starting_vec
    name = start_letter
    for _ in range(4):
        vec, h_state = get_next_letter([whh, wxh, why, Bh, By, h_state], vec)
        name+=dictionary[np.argmax(vec, 1)[0]]
    return name

In [93]:
generated_names = [generate_random_name() for _ in range(100)]
prop_in_dataset = np.sum([name in length_five for name in list(set(generated_names))])/len(generated_names) 

print("Proportion in original dataset: {}".format(np.round(prop_in_dataset, 2)))
print("Number of unique names: {}".format(len(list(set(generated_names)))))

Proportion in original dataset: 0.1
Number of unique names: 26


In [97]:
list(set(generated_names))

['Harsm',
 'Kyran',
 'Osair',
 'Amain',
 'Quyid',
 'Izaak',
 'Viddy',
 'Wyldd',
 'Bowen',
 'Tyran',
 'Uzien',
 'Lanin',
 'Parro',
 'Yasin',
 'Niras',
 'Daven',
 'Colen',
 'Samar',
 'Emren',
 'Romdn',
 'Gargn',
 'Makin',
 'Javin',
 'Xidgh',
 'Faiah',
 'Zaran']

Cool! Now that we made our first RNN, it is time to come make some more fancy RNNs. Perhaps we can also start using some More complex architectures using preexisting layers from Tensorflow!