# Generating names with an RNN

Edited from [tadeaspaule](https://github.com/tadeaspaule/universal-name-generator/blob/master/Universal%20RNN%20for%20name%20generation.ipynb)

---

In [35]:
import csv
import tensorflow
from tensorflow.keras.layers import LSTM, Dense, Input, concatenate, Reshape, Dropout
from tensorflow.keras.models import Model, load_model
import numpy as np

In [36]:
def _read_first_row(path):
    """Return the first record of the CSV file at *path* as a list of strings.

    Only the first record is read; any further rows in the file are ignored.
    """
    # newline='' lets the csv module handle line endings itself, as its
    # documentation recommends for files passed to csv.reader.
    with open(path, 'r', newline='') as f:
        return next(csv.reader(f))


# Each dataset is stored as one CSV record whose fields are the names.
mascots = _read_first_row('web-scraper/mascots.csv')
print('# of mascots:', len(mascots))

cities = _read_first_row('web-scraper/cities.csv')
print('# of cities:', len(cities))

# of mascots: 5841
# of cities: 314


In [64]:
def process_names(names, *, unwanted=('(', ')', '-', '.', '/', '\xa0', '&', '!')):
    """Filter a list of names and build the character vocabulary for them.

    Args:
        names: List of name strings.
        unwanted: Collection of characters to reject. Any name containing one
            of them is dropped, and those characters are left out of the
            vocabulary.

    Returns:
        A ``(names, chars)`` tuple: the names that survived filtering (in
        their original order) and the sorted list of characters, with the
        end-of-name marker appended as the last element.

    Raises:
        ValueError: If no name survives filtering, or if every candidate
            end-of-name marker already occurs in the vocabulary.
    """
    print("Total names:", len(names))
    unwanted = set(unwanted)

    # NOTE: the vocabulary is collected from *all* input names, before the
    # filtering below, so a character that only occurs in dropped names
    # still gets a slot in `chars`.
    chars = sorted(set(''.join(names)))

    names = [name for name in names
             if not any(char in unwanted for char in name)]
    print("Amount of names after removing those with unwanted characters:", len(names))
    chars = [char for char in chars if char not in unwanted]
    print("Using the following characters:", chars)

    if not names:
        raise ValueError("no names left after removing those with unwanted characters")

    name_lengths = [len(name) for name in names]
    print("Longest name is", max(name_lengths), "characters long")
    print("Shortest name is", min(name_lengths), "characters long")

    # endchar indicates the end of the word.
    # Go through unlikely-to-be-used characters and take the first one that
    # is not already part of the vocabulary.
    endchars = '!£$%^&*()-_=+/?.>,<;:@[{}]#~'
    endchar = next((ch for ch in endchars if ch not in chars), None)
    if endchar is None:
        raise ValueError("every candidate end-of-name marker already occurs in the names")
    chars.append(endchar)

    return names, chars

# names = cc.get_city_list(['Germany'])
# Build the name list and character vocabulary for the city dataset.
# unwanted=[] means no name is rejected and no character is excluded.
names, chars = process_names(cities, unwanted=[])

Total names: 314
Amount of names after removing those with unwanted characters: 314
Using the following characters: [' ', '.', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'V', 'W', 'Y', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '–']
Longest name is 16 characters long
Shortest name is 4 characters long


## 2. Getting the X - long sequences
- This model basically works by looking at X characters (in this case 4), and predicting what the next character will be
- Changing this X value affects which patterns the model learns. If X is too big, the model can simply memorize names from the dataset; if it is too small, the model won't be able to accurately predict the next character
- I played around with it a bit and settled on 4, but feel free to try out different values (you should only have to change the value below in seqlen = 4 and the rest of the code will adjust itself based on that)

In [38]:
def make_sequences(names, seqlen, *, endchar=None):
    """Cut names into fixed-length training windows.

    To have the model learn a more macro understanding, each window is paired
    with the length of the name consumed so far.

    Args:
        names: List of name strings.
        seqlen: Number of characters in every window.
        endchar: End-of-name marker used for padding and as the final target.
            When omitted, the last entry of the module-level ``chars`` is
            used, which is what this function always did before the
            parameter existed. Pass the marker explicitly when working with a
            vocabulary other than the module-level one.

    Returns:
        A ``(sequences, lengths, nextchars)`` tuple of parallel lists: the
        window, the number of characters of the name covered up to the end of
        that window, and the character that follows it (``endchar`` when the
        window reaches the end of the name).
    """
    if endchar is None:
        endchar = chars[-1]

    sequences, lengths, nextchars = [], [], []

    for name in names:
        if len(name) <= seqlen:
            # Short name: a single window, padded on the right with the marker.
            sequences.append(name + endchar * (seqlen - len(name)))
            nextchars.append(endchar)
            lengths.append(len(name))
            continue

        for start in range(len(name) - seqlen + 1):
            stop = start + seqlen
            sequences.append(name[start:stop])
            # The last window has nothing after it, so its target is the marker.
            nextchars.append(name[stop] if stop < len(name) else endchar)
            lengths.append(stop)

    print(len(sequences), "sequences of length", seqlen, "made")
    return sequences, lengths, nextchars

# Window size: how many characters the model sees when predicting the next one.
seqlen = 4
# Training windows, prefix lengths and next-character targets for `names`.
sequences, lengths, nextchars = make_sequences(names, seqlen)

34948 sequences of length 4 made


## 3. One hot encoding the sequences, word lengths, and next characters
- One hot encoding means that, for example, if you have 5 characters that can appear, you turn the first character into [1 0 0 0 0], the second into [0 1 0 0 0], and so on
- We do it because this format is easy for the model to read (and we need to somehow turn the sequence strings into number values)

In [39]:
def make_onehots(*, sequences, lengths, nextchars, chars):
    """One-hot encode the windows, the prefix lengths and the target characters.

    Args:
        sequences: Windows of characters; the first one fixes the window width.
        lengths: Positive integer length for each window.
        nextchars: Target character for each window.
        chars: Vocabulary; a character's position in it is its one-hot index.

    Returns:
        ``(x, x2, y)`` where ``x`` is a float32 array of shape
        ``(len(sequences), len(sequences[0]), len(chars))``, ``x2`` has shape
        ``(len(lengths), max(lengths))`` with column ``l - 1`` set for length
        ``l``, and ``y`` has shape ``(len(nextchars), len(chars))``. ``x2``
        and ``y`` use NumPy's default float dtype.
    """
    vocab_size = len(chars)

    # Sequence tensor: (sample, position in window, character index).
    x = np.zeros(shape=(len(sequences), len(sequences[0]), vocab_size), dtype='float32')
    x2 = np.zeros(shape=(len(lengths), max(lengths)))

    for row, window in enumerate(sequences):
        for pos, symbol in enumerate(window):
            x[row, pos, chars.index(symbol)] = 1.

    # A length l lights up column l - 1 of its row.
    x2[np.arange(len(lengths)), np.asarray(lengths) - 1] = 1.

    y = np.zeros(shape=(len(nextchars), vocab_size))
    target_cols = np.array([chars.index(symbol) for symbol in nextchars], dtype=np.intp)
    y[np.arange(len(nextchars)), target_cols] = 1.

    return x, x2, y

# One-hot encode the windows (x), the prefix lengths (x2) and the
# next-character targets (y) using the vocabulary built above.
x, x2, y = make_onehots(sequences=sequences,
                        lengths=lengths,
                        nextchars=nextchars,
                        chars=chars)

## 4. Method for generating random starting sequences
- Looks at how often letters follow each other at each position (for example, how often 'a' is third when 'f' is second, compared with the other letters that follow a second 'f')
- We will use this later to make brand new names

In [40]:
def get_dictchars(names, seqlen):
    """Count, per position, which character follows which in the first ``seqlen`` characters.

    Names shorter than ``seqlen`` are ignored.

    Returns:
        A list of ``seqlen`` dicts. Entry 0 maps a first character to how
        many names start with it. Entry ``i`` (``i >= 1``) maps the character
        at position ``i - 1`` to a dict counting the characters seen at
        position ``i`` right after it.
    """
    tables = [{} for _ in range(seqlen)]

    for name in names:
        if len(name) < seqlen:
            continue

        first_counts = tables[0]
        first_counts[name[0]] = first_counts.get(name[0], 0) + 1

        for pos in range(1, seqlen):
            previous, current = name[pos - 1], name[pos]
            followers = tables[pos].setdefault(previous, {})
            followers[current] = followers.get(current, 0) + 1

    return tables
                
# Positional character-transition counts for `names`, used to draw starting sequences.
dictchars = get_dictchars(names, seqlen)
                
'''
What is dictchars?
Basically, stores how often a letter occurs after another letter at a specific spot in a name

dictchars[0] just stores how often each letter is first, {a: 3, b:4, etc}

dictchars[1+] store which letters (and how often) come after a certain letter.
For example, if dictchars[1]['a'] = {b:4,c:1}, that means that if 'a' was first, 
b followed 4 times, while c followed only once.

This is used in the method below to generate plausible-sounding starting sequences.
'''
    

def generate_start_seq(dictchars):
    """Draw a random starting sequence, one character per entry of ``dictchars``.

    The first character is drawn in proportion to the counts in
    ``dictchars[0]``; each later character is drawn in proportion to the
    counts recorded after the previously drawn character at that position.

    Args:
        dictchars: Positional count tables as produced by ``get_dictchars``.

    Returns:
        The generated starting string.
    """
    def weighted_pick(counts):
        # Draw r uniformly from [0, total occurrences) and return the key
        # whose cumulative-count interval contains r.
        total = sum(counts.values())
        r = np.random.randint(0, total)
        upper = 0
        for symbol, count in counts.items():
            lower, upper = upper, upper + count
            if lower <= r < upper:
                return symbol
        return ""

    seq = weighted_pick(dictchars[0])

    for table in dictchars[1:]:
        previous = seq[-1]
        if table.get(previous, 0) == 0:
            # Nothing recorded after this character at this position:
            # continue from a uniformly chosen known character instead.
            known = list(table.keys())
            previous = known[np.random.randint(0, len(known))]
        seq += weighted_pick(table[previous])

    return seq


## 5. Methods for generating text
- The methods below basically take care of 'I give X letters, I get the full name', so that we can easily monitor the progress of the model (and combined with the above-declared method that makes random starting sequences, we won't even need to provide anything to get brand new names)
- The methods below use a concept called <i>temperature</i>: a measure of how much randomness is involved in selecting the next letter. At 0 there is no randomness and the most likely letter is always picked; at 1 each letter is sampled according to the probability the model assigns to it
- Adjusting this changes how your generated names look, typically the closer to 0 you are the more coherent and closely resembling the training data the output is, and the closer to 1 you are the more novel but sometimes also less coherent the output is. This mainly affects large generated texts though, not so much names. Nevertheless, I tend to go for ~0.4 temperature usually, but feel free to try out different values

In [41]:
def sample(preds, temperature=0.4):
    """Pick an index from a probability vector, sharpened or flattened by ``temperature``.

    Args:
        preds: Sequence of probabilities.
        temperature: 0 returns the index of the largest probability.
            Otherwise each probability is raised to ``1 / temperature``, the
            result is renormalised, and one index is drawn from it.

    Returns:
        The selected index.
    """
    probs = np.asarray(preds).astype('float64')

    if temperature == 0:
        # Greedy choice; also avoids dividing by zero below.
        return np.argmax(probs)

    rescaled = np.exp(np.log(probs) / temperature)
    distribution = rescaled / np.sum(rescaled)
    # A single multinomial trial yields a one-hot row; its hot position is the draw.
    draw = np.random.multinomial(1, distribution, 1)
    return np.argmax(draw)

def generate_name(model, start, *, chars=chars, temperature=0.4):
    """Extend ``start`` one character at a time until the model predicts the end marker.

    Args:
        model: Trained model taking ``[sequence one-hot, length one-hot]``.
            NOTE(review): the input sizes are read from ``model.layers[0]``
            and ``model.layers[3]``, which assumes the layer order produced
            by ``make_model`` (sequence input first, length input fourth);
            confirm before using a different architecture.
        start: Starting characters. Only the last ``seqlen`` characters are
            fed to the model, so a longer start is accepted.
        chars: Vocabulary whose last entry is the end-of-name marker. The
            default is the module-level ``chars`` as it was when this
            function was defined.
        temperature: Sampling temperature passed to ``sample``.

    Returns:
        The generated name, title-cased.
    """
    maxlength = model.layers[3].input.shape[1]
    seqlen = int(model.layers[0].input.shape[1])
    end_index = len(chars) - 1

    def encode(window):
        # One-hot encode up to seqlen characters; unused rows stay all-zero.
        onehot = np.zeros(shape=(1, seqlen, len(chars)))
        for pos, char in enumerate(window):
            onehot[0, pos, chars.index(char)] = 1.
        return onehot

    result = start

    length_input = np.zeros(shape=(1, maxlength))
    length_input[0, len(result) - 1] = 1.

    while True:
        # Always encode just the trailing window, for the first prediction as
        # well, so a start longer than seqlen no longer overruns the input.
        prediction = model.predict(x=[encode(result[-seqlen:]), length_input])[0]
        char_index = sample(prediction, temperature)

        # Stop on the end marker or once the name reaches the maximum length.
        if char_index >= end_index or len(result) >= maxlength:
            break

        result += chars[char_index]

        # Move the hot length slot forward by one.
        length_input[0, len(result) - 2] = 0.
        length_input[0, len(result) - 1] = 1.

    return result.title()

def generate_random_name(model, *, chars=chars, dictchars=dictchars, temperature=0.4):
    """Generate a name from a randomly drawn starting sequence.

    The start comes from ``generate_start_seq(dictchars)`` and is completed
    by ``generate_name`` with the given vocabulary and temperature.
    """
    return generate_name(
        model,
        generate_start_seq(dictchars),
        chars=chars,
        temperature=temperature,
    )

## 6. Building the model
- Here is where you can experiment and try out different approaches
- After some testing, I went with the below setup:
    - 2 Inputs (the sequence, and the one-hot-encoded length of the name at the end of that sequence)
    - 2 parallel LSTM layers, one normal with relu, the other backwards with tanh, both with dropout 0.3
    - Concatenate the LSTM outputs with the one-hot-encoded length
    - Dense output layer with softmax activation

In [42]:
def make_model(x, x2, chars):
    """Build and compile the two-input next-character model.

    Args:
        x: Sequence one-hot array; only ``x.shape[1:]`` is used, as the shape
            of the sequence input.
        x2: Length one-hot array; only ``x2.shape[1:]`` is used, as the shape
            of the length input.
        chars: Vocabulary; its size sets the LSTM width and the output size.

    Returns:
        A compiled Keras ``Model`` mapping ``[sequence, length]`` to a
        softmax distribution over ``chars``.
    """
    units = len(chars)

    inp1 = Input(shape=x.shape[1:])  # sequence input
    inp2 = Input(shape=x2.shape[1:])  # length input

    # Two views of the same window: a forward relu LSTM and a backward LSTM
    # with the layer's default activation.
    lstm = LSTM(units, activation='relu', dropout=0.3)(inp1)
    lstm2 = LSTM(units, dropout=0.3, go_backwards=True)(inp1)

    concat = concatenate([lstm, lstm2, inp2])
    dense = Dense(units, activation='softmax')(concat)

    model = Model([inp1, inp2], dense)
    # The targets are one-hot vectors over mutually exclusive characters and
    # the output is a softmax, so the matching loss is categorical
    # cross-entropy rather than the per-unit binary cross-entropy.
    model.compile(optimizer='adam', loss='categorical_crossentropy')
    return model

# Build the model sized for the module-level x, x2 and chars prepared above.
model = make_model(x, x2, chars)

## 7. Method for training a model and monitoring its progress
- Using this makes it easy to try out different model architectures and see what names they are able to generate
- For fast prototyping, just build and compile a model, then simply:
```python
try_model(model)
```

In [43]:
def try_model(model, *, x=x, x2=x2, y=y, chars=chars, dictchars=dictchars, total_epochs=180, print_every=60, temperature=0.4, verbose=True):
    """Train ``model`` in chunks of epochs and print sample names along the way.

    Args:
        model: Compiled model accepting ``[x, x2]`` as inputs.
        x, x2, y: Training arrays. The defaults (like those of ``chars`` and
            ``dictchars``) are the module-level objects that existed when
            this function was defined.
        chars: Vocabulary used when generating sample names.
        dictchars: Transition counts used to draw starting sequences.
        total_epochs: Total number of epochs to train. All of them are run,
            including a final shorter chunk when ``total_epochs`` is not a
            multiple of ``print_every``.
        print_every: Number of epochs per training chunk / progress report.
        temperature: Sampling temperature for every generated sample name.
        verbose: If true, print losses and 10 names after each chunk;
            otherwise print 20 names once training is finished.

    Raises:
        ValueError: If ``print_every`` is smaller than 1.
    """
    if print_every < 1:
        raise ValueError("print_every must be at least 1")

    epochs_done = 0
    while epochs_done < total_epochs:
        # The last chunk may be shorter so that exactly total_epochs are run.
        chunk = min(print_every, total_epochs - epochs_done)
        history = model.fit([x, x2],
                            y,
                            epochs=chunk,
                            batch_size=64,
                            validation_split=0.05,
                            verbose=0)
        epochs_done += chunk

        if verbose:
            print("\nEpoch", epochs_done)
            print("First loss:            %1.4f" % (history.history['loss'][0]))
            print("Last loss:             %1.4f" % (history.history['loss'][-1]))
            print("First validation loss: %1.4f" % (history.history['val_loss'][0]))
            print("Last validation loss:  %1.4f" % (history.history['val_loss'][-1]))
            print("\nGenerating random names:")
            for _ in range(10):
                print(generate_random_name(model, chars=chars, dictchars=dictchars, temperature=temperature))

    if not verbose:
        print("Model training complete, here are some generated names:")
        for _ in range(20):
            # Honour the caller's temperature here too instead of a fixed 0.4.
            print(generate_random_name(model, chars=chars, dictchars=dictchars, temperature=temperature))

## 8. And finally, training the model and seeing how it does

In [44]:
# try_model(model)

---
# Putting it all together to showcase other datasets
### This method returns a model and a method for generating more names
```python
# usage example
names = load_name_data() # load your desired name dataset
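# Bind the returned generator to a fresh name: calling it `generate_name`
# would shadow the notebook's own generate_name() helper.
model, generate = train_model(names)

new_names = []
for _ in range(1000):
    new_name = generate()
    new_names.append(new_name)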
```

In [45]:
def train_model(names, *, seqlen=4, unwanted=('(', ')', '-', '.', '/'), verbose=True, temperature=0.4):
    """Run the whole pipeline on ``names`` and return the model plus a name generator.

    Args:
        names: List of name strings to learn from.
        seqlen: Window size, in characters, fed to the model.
        unwanted: Characters that disqualify a name (see ``process_names``).
        verbose: Passed to ``try_model`` to control progress output.
        temperature: Sampling temperature used for the names printed during
            training and for the returned generator.

    Returns:
        ``(model, generate)``: the trained model and a zero-argument function
        that returns one newly generated name per call.
    """
    names, chars = process_names(names, unwanted=unwanted)

    # NOTE(review): make_sequences is called without this dataset's end
    # marker, so it pads with the module-level `chars[-1]`; that only matches
    # when both vocabularies end in the same marker. Verify for new datasets.
    sequences, lengths, nextchars = make_sequences(names, seqlen)

    x, x2, y = make_onehots(sequences=sequences,
                            lengths=lengths,
                            nextchars=nextchars,
                            chars=chars)

    dictchars = get_dictchars(names, seqlen)

    model = make_model(x, x2, chars)

    try_model(model, x=x, x2=x2, y=y, chars=chars, dictchars=dictchars,
              temperature=temperature, verbose=verbose)

    def generate():
        # Closes over this dataset's model, vocabulary and transition counts.
        return generate_random_name(model, chars=chars, dictchars=dictchars, temperature=temperature)

    print('Done training.')
    return model, generate

In [46]:
# Train the mascot model; names containing any of the listed characters are dropped.
# verbose=False prints only the final batch of sample names.
mascot_model, mascot_gen = train_model(mascots, unwanted=['(', ')', '-', '.', '/', '\xa0', '&', '!'], verbose=False)

Total names: 5841
Amount of names after removing those with unwanted characters: 5677
Using the following characters: [' ', "'", '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
Longest name is 27 characters long
Shortest name is 2 characters long
34948 sequences of length 4 made
Model training complete, here are some generated names:
Hurbons
Steffclowbors
Pells
Rolchts
Fubrons
Kag Hornets
Daves
Lighting Blues
Caskers
Zighinas
Chewks
Haspbuts
Grain Bears
Ziver Pirates
Spants
Noyralds
Sorballers
Paeers
Grints
Groomers
Done training.


TypeError: generate_name() missing 2 required positional arguments: 'model' and 'start'

In [65]:
# Train the city model; unwanted=[] keeps every city name.
city_model, city_gen = train_model(cities, unwanted=[], verbose=False)

Total names: 314
Amount of names after removing those with unwanted characters: 314
Using the following characters: [' ', '.', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'V', 'W', 'Y', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '–']
Longest name is 16 characters long
Shortest name is 4 characters long
1811 sequences of length 4 made
Model training complete, here are some generated names:
Oxn Valley
Sprrngfield
Burinas
Mantion
Miley
Galand Rapids
Mexington
Shester
Nolida
Ellton
Rorwalk
Scernsville
Atlentown
Poille
Hompton
Dorwalk
Dalarmon
Nen Orlen
Norrane
Facoland
Done training.


In [56]:
# Persist the trained mascot model to an HDF5 file and print its architecture.
mascot_model.save('mascot_model.h5')
mascot_model.summary()
# To reload it later (load_model is imported at the top of the notebook):
# new_model = load_model('mascot_model.h5')

In [66]:
# Persist the trained city model to an HDF5 file and print its architecture.
city_model.save('city_model.h5')
city_model.summary()
# To reload it later (load_model is imported at the top of the notebook):
# new_model = load_model('city_model.h5')

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_9 (InputLayer)            [(None, 4, 52)]      0                                            
__________________________________________________________________________________________________
lstm_8 (LSTM)                   (None, 52)           21840       input_9[0][0]                    
__________________________________________________________________________________________________
lstm_9 (LSTM)                   (None, 52)           21840       input_9[0][0]                    
__________________________________________________________________________________________________
input_10 (InputLayer)           [(None, 16)]         0                                            
____________________________________________________________________________________________

In [141]:
# Draw 20 fresh mascot names from the trained generator and print one per line.
new_mascots = [mascot_gen() for _ in range(20)]
print(*new_mascots, sep="\n")

Reaners
Poorest Lakes
Righlanders
Cooses
Brockers
Bichards
Rars
Rostling Train
Late Vikes
Frest
Guld Riders
Genicans
Fiolden Warriors
Angulas
Jalos
Rowks
Dritters
Oridgets
Glken Devils
Arimen


In [138]:
# Draw 20 fresh city names from the trained generator and print one per line.
new_cities = [city_gen() for _ in range(20)]
print(*new_cities, sep="\n")

Kiten
Kano
Couna Valle
Piringfield
Lemerton
Falens
Virkson
Lavelley
Daison
Vincester
Canta Rouge
Starkan
Bartin
Danco
Noshas
Lelburgh
Ellley
Harwalk
Grlleton
Woleyword
