In [1]:
import pandas as pd
import numpy as np
from get_vocabulary import get_vocabulary
from get_max_len import get_max_len

In [2]:
names_df = pd.read_csv('data/names.txt', sep="\n", header=None)
names_df.columns=['input']
names_df.head()

Unnamed: 0,input
0,John
1,William
2,James
3,Charles
4,George


### Preprocess names dataset

In [3]:
# Insert a tab in front of all the names
names_df['input'] = names_df['input'].apply(lambda x : '\t' + x)

# Append a newline at the end of every name
# We already appended a tab in front, so the target word should start at index 1
names_df['target'] = names_df['input'].apply(lambda x : x[1:len(x)] + '\n')
names_df.head()

Unnamed: 0,input,target
0,\tJohn,John\n
1,\tWilliam,William\n
2,\tJames,James\n
3,\tCharles,Charles\n
4,\tGeorge,George\n


- Now we have a DataFrame with two columns containing the names with the start and end tokens appended. The next step is to encode these as numeric values because machine learning models only accept numeric inputs.
- create two dictionaries, char_to_idx and idx_to_char, that will contain mappings of characters to integers, e.g., {'\t': 0, '\n': 1, 'a': 2, 'b': 3, ...} and the reverse mappings of integers to characters, e.g, {0: '\t', 1: '\n', 2: 'a', 3: 'b', ...}.

In [4]:
# Get the vocabulary
vocabulary = get_vocabulary(names_df['input'])

# Sort the vocabulary
vocabulary_sorted = sorted(vocabulary)

# Create the mapping of the vocabulary chars to integers
char_to_idx = { char : idx for idx, char in enumerate(vocabulary_sorted) }

# Create the mapping of the integers to vocabulary chars
idx_to_char = { idx : char for idx, char in enumerate(vocabulary_sorted) }

# Print the dictionaries
print(char_to_idx)
print(idx_to_char)

{'\t': 0, '\n': 1, 'A': 2, 'B': 3, 'C': 4, 'D': 5, 'E': 6, 'F': 7, 'G': 8, 'H': 9, 'I': 10, 'J': 11, 'K': 12, 'L': 13, 'M': 14, 'N': 15, 'O': 16, 'P': 17, 'Q': 18, 'R': 19, 'S': 20, 'T': 21, 'U': 22, 'V': 23, 'W': 24, 'X': 25, 'Y': 26, 'Z': 27, 'a': 28, 'b': 29, 'c': 30, 'd': 31, 'e': 32, 'f': 33, 'g': 34, 'h': 35, 'i': 36, 'j': 37, 'k': 38, 'l': 39, 'm': 40, 'n': 41, 'o': 42, 'p': 43, 'q': 44, 'r': 45, 's': 46, 't': 47, 'u': 48, 'v': 49, 'w': 50, 'x': 51, 'y': 52, 'z': 53}
{0: '\t', 1: '\n', 2: 'A', 3: 'B', 4: 'C', 5: 'D', 6: 'E', 7: 'F', 8: 'G', 9: 'H', 10: 'I', 11: 'J', 12: 'K', 13: 'L', 14: 'M', 15: 'N', 16: 'O', 17: 'P', 18: 'Q', 19: 'R', 20: 'S', 21: 'T', 22: 'U', 23: 'V', 24: 'W', 25: 'X', 26: 'Y', 27: 'Z', 28: 'a', 29: 'b', 30: 'c', 31: 'd', 32: 'e', 33: 'f', 34: 'g', 35: 'h', 36: 'i', 37: 'j', 38: 'k', 39: 'l', 40: 'm', 41: 'n', 42: 'o', 43: 'p', 44: 'q', 45: 'r', 46: 's', 47: 't', 48: 'u', 49: 'v', 50: 'w', 51: 'x', 52: 'y', 53: 'z'}


In [68]:
print(vocabulary)

{'V', 'O', 'X', 'p', 'v', 'c', 'L', 'U', 'Q', 'A', 'z', 'Z', 'R', 'g', 'W', 'D', 'j', 'o', 'a', 'y', 'f', 's', 'Y', 'B', 'b', 'd', 'x', 'i', 'h', 'e', 'G', 'I', 'M', 'T', 't', 'K', 'w', 'l', 'm', 'u', 'S', 'E', 'k', 'J', 'P', 'q', 'r', 'F', 'H', '\t', 'n', '\n', 'N', 'C'}


### Create input and target tensors
- The input is a list containing all the names in the dataset. So, the first dimension of the input tensor will be the number of names in the dataset. Each name can be thought of as a string having length equal to the length of the longest name and each character in each name is a one-hot encoded vector of size vocabulary. So, the second and third dimensions of the input tensor will be the length of the longest name and the size of the vocabulary. Similar is the case for the target tensor.

In [23]:
# Find the length of longest name
max_len = get_max_len(names_df['input'])

# Initialize the input vector
input_data = np.zeros((len(names_df['input']), max_len+1, len(vocabulary)), dtype='float32')

# Initialize the target vector
target_data = np.zeros((len(names_df['input']), max_len+1, len(vocabulary)), dtype='float32')

print(input_data.shape, target_data.shape, max_len)

(258000, 13, 54) (258000, 13, 54) 12


##### Now we have two vectors of appropriate shape which we can fill up with actual data. These vectors can then be fed to the recurrent neural network.

#### Initialize input and target vectors with values
- We created the input and target tensors of appropriate shape containing all zeros. Now, we'll fill these with actual values. The input and target tensors contain all the names in the dataset. Each name can be thought of as a string having length equal to the length of the longest name and each character in each name is a one-hot encoded vector of size vocabulary.
- The tensors can be filled-in as follows: `input_data[n_idx, p_idx, char_to_idx[char]]` will be set to 1 whenever the index of the name in the dataset is `n_idx` and it contains the character char in position `p_idx`

#### Example 3-d array

In [33]:
a = np.zeros((11,12,12))

In [36]:
a[1,1,10] = 1
a[1]

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [37]:
# Iterate for each name in the dataset
for n_idx, name in enumerate(names_df['input']):
  # Iterate over each character and convert it to a one-hot encoded vector
  for c_idx, char in enumerate(name):
    input_data[n_idx, c_idx, char_to_idx[char]] = 1

# Iterate for each name in the dataset
for n_idx, name in enumerate(names_df['target']):
  # Iterate over each character and convert it to a one-hot encoded vector
  for c_idx, char in enumerate(name):
    target_data[n_idx, c_idx, char_to_idx[char]] = 1

##### Now we have the input and target vectors of appropriate shape. We can use these vectors to train the recurrent neural network.

### Build and compile RNN network
- We completed all the data preprocessing steps and have the input and target vectors ready. It is time to build the recurrent neural network. We'll create a small network architecture that will have 50 simple RNN nodes in the first layer followed by a dense layer. The dense layer will generate a probability distribution over the vocabulary for the next character. So, the size of the dense layer will be the same as the size of the vocabulary.

In [41]:
from keras.models import Sequential
from keras.layers import TimeDistributed, SimpleRNN, Dense, Activation

In [42]:
# Create a Sequential model
model = Sequential()

# Add SimpleRNN layer of 50 units
model.add(SimpleRNN(50, input_shape=(max_len+1, len(vocabulary)), return_sequences=True))

# Add a TimeDistributed Dense layer of size same as the vocabulary
model.add(TimeDistributed(Dense(len(vocabulary), activation='softmax')))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Print the model summary
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn (SimpleRNN)       (None, 13, 50)            5250      
_________________________________________________________________
time_distributed (TimeDistri (None, 13, 54)            2754      
Total params: 8,004
Trainable params: 8,004
Non-trainable params: 0
_________________________________________________________________


##### Built and compiled the recurrent neural network model successfully! This model can be trained now.

### Train RNN model and start predictions
- The output name will be generated character by character. The first character in each name was the start token \t. We'll feed the start token to the trained model to output a probability distribution over the vocabulary which can be sampled to generate the next character.

In [43]:
# Fit the model for 5 epochs using a batch size of 128 
model.fit(input_data, target_data, batch_size=128, epochs=5)

# Create a 3-D zero vector and initialize it with the start token
output_seq = np.zeros((1, max_len+1, len(vocabulary)))
output_seq[0, 0, char_to_idx['\t']] = 1

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [59]:
probs = model.predict_proba(output_seq, verbose=0)

In [64]:
probs

array([[[6.44886222e-06, 2.25875177e-04, 7.77585059e-02, 4.79646400e-02,
         7.70975202e-02, 6.86370805e-02, 6.54280111e-02, 2.64541302e-02,
         4.15442139e-02, 2.86503807e-02, 1.86169166e-02, 7.68048912e-02,
         3.57826762e-02, 7.41821826e-02, 9.27876681e-02, 2.50487886e-02,
         1.75261572e-02, 2.10567955e-02, 1.74338289e-03, 6.52880743e-02,
         4.95353378e-02, 3.43088247e-02, 1.88910135e-03, 2.04495974e-02,
         2.21899226e-02, 4.37302078e-04, 2.35137180e-03, 4.66359593e-03,
         1.70402011e-04, 4.63752513e-05, 4.88802034e-05, 2.01726834e-05,
         3.96271898e-05, 7.40747564e-05, 4.06480176e-05, 1.35272276e-04,
         4.36124901e-05, 4.85940518e-06, 4.83805816e-05, 5.77948558e-05,
         5.62153728e-05, 6.16438410e-05, 1.60677344e-04, 1.38425574e-04,
         1.91800991e-05, 2.40046847e-05, 8.55584512e-05, 5.45861585e-05,
         6.26141991e-05, 8.33620470e-06, 3.04365258e-05, 3.50303562e-05,
         5.91788739e-05, 4.45097103e-05],
        [

In [65]:
# Get the probabilities for the first character
probs = model.predict_proba(output_seq, verbose=0)[:,1,:] # 1st row

In [66]:
probs

array([[7.4102850e-06, 2.2606064e-04, 6.2644875e-05, 6.6788671e-05,
        7.1073460e-05, 4.5492612e-05, 6.9765549e-05, 5.0482333e-05,
        4.3558102e-05, 4.6585606e-05, 4.6930520e-05, 6.6061453e-05,
        4.4910106e-05, 5.6830148e-05, 5.1274965e-05, 3.6663372e-05,
        4.4961020e-05, 4.5103086e-05, 2.3888684e-05, 6.3395280e-05,
        5.5656961e-05, 6.8474779e-05, 3.7249261e-05, 3.9502847e-05,
        6.7801571e-05, 1.6330923e-05, 2.5700385e-05, 4.3702363e-05,
        3.0706814e-01, 5.1340560e-04, 1.3367083e-03, 1.0707421e-03,
        2.0620199e-01, 5.2815951e-05, 2.1020668e-04, 1.1072043e-02,
        1.1760117e-01, 2.0949004e-04, 2.5907057e-04, 1.7947897e-02,
        6.4343127e-04, 9.8069827e-04, 1.4186715e-01, 1.7647898e-04,
        9.0394031e-05, 5.7070293e-02, 9.1224239e-04, 1.0422906e-03,
        9.9432737e-02, 7.6193165e-04, 5.1733088e-03, 1.2483254e-04,
        2.6241088e-02, 4.1514487e-04]], dtype=float32)

### numpy.random.choice

```
 numpy.random.choice(a, size=None, replace=True, p=None)¶
 
Parameters

    a :1-D array-like or int
        If an ndarray, a random sample is generated from its elements. If an int, the random sample is generated as if a were np.arange(a) sizeint or tuple of ints, optional

Output shape : If the given shape is, e.g., (m, n, k), then m * n * k samples are drawn. Default is None, in which case a single value is returned.
    replace: boolean, optional

        Whether the sample is with or without replacement
    p : 1-D array-like, optional

        The probabilities associated with each entry in a. If not given the sample assumes a uniform distribution over all entries in a.
```


In [67]:
# Sample vocabulary to get first character
first_char = np.random.choice(sorted(vocabulary), replace=False, p=probs.reshape(len(vocabulary)))

# Print the character genaerated
print(first_char)

i


##### Now we know how to train the RNN model and generate the first character given the seed character as input. We'll use this character to generate the next character

### Generate baby names
- generate the second character by feeding the start token and the generated first character again to the trained network. We'll also be generating full names starting from the start token and repeating this process until the end token is found.

In [69]:
# Print the first character which we got last time
print(first_char)

# Update the vector to contain first the character
output_seq[0, 1, char_to_idx[first_char]] = 1

# Get the probabilities for the second character
probs = model.predict_proba(output_seq, verbose=0)[:,2,:]

# Sample vocabulary to get second character
second_char = np.random.choice(sorted(vocabulary), replace=False, p=probs.reshape(len(vocabulary)))

# Print the second character
print(second_char)

i
l


In [72]:
def generate_baby_names(n):
    for i in range(0,n):
        stop=False
        counter=1
        name=''
        # initialize first char of output seq
        output_seq=np.zeros((1,max_len+1,len(vocabulary)))
        output_seq[0,0,char_to_idx['\t']] = 1
        # continue until a newline is generated or max no of chars reached
        while stop==False and counter<10:
            # get prob distribution for next char
            probs = model.predict_proba(output_seq, verbose=0)[:,counter-1,:]
            # sample vocab to get most probable next char
            c = np.random.choice(sorted(list(vocabulary)), replace=False, p=probs.reshape(len(vocabulary)))
            if c=='\n':
                stop=True
            else:
                name = name + c
                output_seq[0,counter,char_to_idx[c]] = 1
                counter+= 1
        print(name)

In [74]:
generate_baby_names(5)

Rudon
Ungarkey
Lyndo
Loley
Edsi
